Neural network help: AI doesn't improve

Hi, i made a very simple and basic (i thought, but might as well be skynet…) Neural Network. I am quite familiar with how it should work, although i never made one myself (though i copied a machine learning system, which is where i got the data set from, from an online tutorial).

The AI should get trained to recognize handwritten digits from 0-9. It has 194 input neurons, 20 hidden layers with each 20 neurons and 1 output with 10 neurons. They are all connected as they should be and should work… but it doesn’t.

Now, the problem is that it just doesn’t improve… I set up 2 AIs where the worse one will get replaced with the better one and then be slightly changed (only the newly replaced one, so that no degeneration can happen, or at least not much…). And this alone should make it improve… overtime the replaced one has to become even slightly better than the original one and thus replace it, but it just doesn’t improve… I had it stuck on 11% correct… now somehow i got it to get to 17%, but it still doesn’t improve for some reason… i mean how can it be that it is stuck at 17% but doesn’t improve… My only though is that it may be because the targetNeurons should be swapped out too sometime, but i feel that would just cause more noise than help… And it should still be improving even just a bit, but it doesn’t…

Now i assume you’ll need some code, but it’s 500 lines +some for loading the data, and you’d need the data to run it effectively, but i’ll just post the code below… would be good if we could include data too…

Main
Network a;
Network b;

int right = 1;
int wrong = 1;
boolean done = true;

float[] numbers = new float[1024];

void setup() {
  size(1024, 512);
  frameRate(5);
  loadData();

  a = new Network(194, 20, 10);
  b = new Network(194, 20, 10);

  //visualize(a, 10, 10, 500, 500);
  //visualize(b, 522, 10, 500, 500);

  //noLoop();
}

void draw() {



  if (frameCount%5 == 0)
    makeGraph();

  if (done) {
    println(millis());
    thread("runAll");
    println(right, " : ", wrong, " = ", (float)right/wrong);
  }



  //visualize(a, 10, 10, 500, 500);
  //visualize(b, 522, 10, 500, 500);
}

int aCorrect;
int bCorrect;
int aBetter;
int bBetter;
void runAll() {
  done = false;


  //just testing
  aCorrect = 0;
  bCorrect = 0;
  aBetter = 0;
  bBetter = 0;



  for (int n = 0; n < 100; n++) {
    run();
  }
  correct();
  done = true;
}


void run() {
  int rnd = (int) random(0, training_set.length);
  for (int i = 0; i < a.getInputLength(); i++) {
    a.triggerInput(i, training_set[rnd].inputs[i]);
    b.triggerInput(i, training_set[rnd].inputs[i]);
  }

  a.runHidden();
  b.runHidden();
  a.callOutput();
  b.callOutput();

  checkOutput(training_set[rnd].output);
}

//just testing
void correct() {
  if (aCorrect >= bCorrect) {
    b = a.clone();
    b.mutate();
  } else if (bCorrect > aCorrect) {
    a = b.clone();
    a.mutate();
  } else if (aBetter >= bBetter) {
    b = a.clone();
    b.mutate();
  } else if (bBetter > aBetter) {
    a = b.clone();
    b.mutate();
  }
}

//just testing
void checkOutput(int desired) {
  if (isCorrect(a, desired)) {
    aCorrect++;
    right++;
    //b = a.clone();
    //b.mutate();
  } else if (isCorrect(b, desired)) {
    bCorrect++;
    right++;
    //a = b.clone();
    //a.mutate();
  } else if (a.output[desired] > b.output[desired]) {
    aBetter++;
    wrong++;
    //  b = a.clone();
    //  b.mutate();
  } else if (b.output[desired] > a.output[desired]) {
    bBetter++;
    wrong++;
    //  a = b.clone();
    //  a.mutate();
  } else {
    println("Error!!!!!!!!!!!!!!!!!", isCorrect(a, desired), isCorrect(b, desired), a.output[desired], b.output[desired]);
  }
  a.restart();
  b.restart();
}

//void checkOutput(int desired) {
//  if (a.output[desired] > b.output[desired]) {
//    if (isCorrect(a, desired))
//      right++; 
//    else
//      wrong++;
//    b = a.clone();
//    b.mutate();//might need to do this somewhere else, to avoid to strong changes being accepted because they werent of the same output number as the one that was checked... but no idea where...
//    //println("A Won finding : ", training_set[desired].output);
//  } else {
//    if (isCorrect(b, desired))
//      right++;
//    else
//      wrong++;
//    a = b.clone();
//    a.mutate();//same reason as the b.mutate() 6 lines above should be changed
//    //println("B Won finding : ", training_set[desired].output);
//  }

//  a.restart();
//  b.restart();
//}

boolean isCorrect(Network net, int desired) {
  int highestIdx = 0;
  for (int i = 1; i < net.output.length; i++)
    if ((float)net.output[i] > (float)net.output[highestIdx]) 
      highestIdx = i;
  if (highestIdx == desired)
    return true;
  else
    return false;
}

void visualize(Network net, float offsetX, float offsetY, float w, float h) {
  for (int i = 0; i < net.layers.length; i++) {
    int x = floor((i*(w/net.layers.length))+offsetX);
    for (int j = 0; j < net.layers[i].length; j++) {
      int y = floor((j*floor(h/net.layers[i].length))+offsetY);
      int extraOffsetY = (int)(h%floor(((net.layers[i].length-1)*floor(h/net.layers[i].length))+offsetY));
      y+=extraOffsetY/2;
      for (int k = 0; k < net.layers[i][j].targets.length; k++) {
        stroke(i == 0 || i == net.layers.length-1? 255 : map(net.layers[i][j].biases[k], 0, 1, 0, 255));
        strokeWeight(0);
        ellipse(x, y, 2, 2);
        line(x, y, floor((net.layers[i][j].targets[k].x*(w/net.layers.length)) + offsetX), floor((net.layers[i][j].targets[k].y*floor(h/net.layers[(int)net.layers[i][j].targets[k].x].length)) + offsetY) +
          ((int)(h%floor(((net.layers[(int)net.layers[i][j].targets[k].x].length-1)*floor(h/net.layers[(int)net.layers[i][j].targets[k].x].length)) + offsetY))));
      }
    }
  }
}

void makeGraph() {
  background(255);
  stroke(0);
  noFill();
  beginShape();
  for (int i = 0; i < numbers.length; i++) {
    vertex(i, 512-numbers[i]);
  }
  endShape();
  for (int i = 1; i < numbers.length; i++) {
    numbers[i-1] = numbers[i];
  }
  numbers[numbers.length-1] = (float)right/wrong*512;
  line(0, 512-numbers[numbers.length-1], 1024, 512-numbers[numbers.length-1]);
  fill(0);
  text("Average Correct : " + (float)right/wrong, 10, 512-numbers[numbers.length-1]);
}
LoadData
Card [] testing_set; // the set we use to test (2000)
Card [] training_set; // the set we use to train (8000)

class Card { // This class contains all the functions to format and save the data

  float [] inputs;
  float [] outputs;
  int output;

  Card() {
    inputs = new float [196]; // the images are a grid of 14x14 pixels which makes for a total of 196
    outputs = new float[10]; // the number of possible outputs; from 0 to 9
  }

  void imageLoad(byte [] images, int offset) { // Images is an array of 1,960,000 bytes, each one representing a pixel (0-255) of the 10,000 * 14x14 (196) images
    // We know one image consists of 196 bytes so the location is: offset*196
    for (int i = 0; i < 196; i++) {
      inputs[i] = int(images[i+offset]) / 128.0 - 1.0; // We then store each pixel in the array inputs[] after converting it from (0 - 255) to (+1 - -1) as they vary on the greyscale
    }
  }

  void labelLoad(byte [] labels, int offset) {  // Labels is an array of 10,000 bytes, each representing the answer of each image

    output = int(labels[offset]);

    for (int i = 0; i < 10; i++) {  // We then set the correct index in output[] to +1 if it corresponds to the ouput and -1 if not
      if (i == output) {
        outputs[i] = 1.0;
      } else {
        outputs[i] = -1.0;
      }
    }
  }
}

void loadData() { // In this function we initialise all out data in two seperate arrays, training[] and test[]

  byte [] images = loadBytes("t10k-images-14x14.idx3-ubyte");
  byte [] labels = loadBytes("t10k-labels.idx1-ubyte");
  training_set = new Card [8000];
  int tr_pos = 0;
  testing_set = new Card [2000];
  int te_pos = 0;
  for (int i = 0; i < 10000; i++) {
    if (i % 5 != 0) { 
      training_set[tr_pos] = new Card();
      training_set[tr_pos].imageLoad(images, 16 + i * 196); // There is an offset of 16 bytes
      training_set[tr_pos].labelLoad(labels, 8 + i);  // There is an offset of 8 bytes
      tr_pos++;
    } else {
      testing_set[te_pos] = new Card();
      testing_set[te_pos].imageLoad(images, 16 + i * 196);  // There is an offset of 16 bytes 
      testing_set[te_pos].labelLoad(labels, 8 + i);  // There is an offset of 8 bytes
      te_pos++;
    }
  }
}
Network
class Network { //<>// //<>//
  Neuron[][] layers;
  float[] output;

  Network(int inputSize, int hiddenSize, int outputSize) {
    layers = new Neuron[hiddenSize+2][];

    layers[0] = new Neuron[inputSize];

    for (int i = 1; i < layers.length-1; i++) {
      layers[i] = new Neuron[hiddenSize];
    }

    layers[layers.length-1] = new Neuron[outputSize];
    
    init();

    output = new float[outputSize];
  }

  void init() {
    for (int i = 0; i < layers.length-1; i++) {
      for (int j = 0; j < layers[i].length; j++) {
        layers[i][j] = new Neuron(this);
      }
    }
    for (int i = 0; i < layers[layers.length-1].length; i++) {
      layers[layers.length-1][i] = new OutputNeuron(this);
    }
  }

  void triggerInput(int idx, float strength) {
    layers[0][idx].trigger(strength);
    layers[0][idx].send();
  }

  int getInputLength() {
    return layers[0].length;
  }

  void runHidden() {
    for (int n = 0; n < 1000; n++) {
      for (int i = 1; i < layers.length-1; i++) {
        for (int j = 0; j < layers[i].length; j++) {
          layers[i][j].send();
        }
      }
    }
  }

  void callOutput() {
    for (int i = 0; i < layers[layers.length-1].length; i++) {
      ((OutputNeuron)layers[layers.length-1][i]).send(i);
    }
  }

  Neuron getOutput(int idx) { //can maybe be deleted... but not sure, need to check //second look and can still be deleted...
    return layers[layers.length-1][idx];
  }

  Network clone() {
    Network result = new Network(layers[0].length, layers.length-2, layers[layers.length-1].length);
    for (int i = 0; i < result.layers.length; i++) {
      for (int j = 0; j < result.layers[i].length; j++) {
        result.layers[i][j] = layers[i][j].clone(result);
      }
    }
    return result;
  }

  void mutate() {
    for (int i = 0; i < layers.length-1; i++) {
      for (int j = 0; j < layers[i].length; j++) {
        layers[i][j].mutateBias(); 
        layers[i][j].mutateGate();
      }
    }
  }

  void restart() {
    for (int i = 0; i < layers.length; i++) {
      for (int j = 0; j < layers[i].length; j++) {
        layers[i][j].restart();
      }
    }
  }
}
Neuron
class Neuron {
  //maybe add self pvector to set initial targets closeby...
  Network parent;
  float value = 0;
  float gate = 0;
  PVector[] targets;
  float[] biases;
  boolean active = false;
  boolean wasActive = false;
  final float factor = 0.1f;

  Neuron(Network par) {
    parent = par;
    int rnd = (int) random(1, 20);
    targets = new PVector[rnd];
    for (int i = 0; i < rnd; i++) {
      targets[i] = new PVector();
      setTargetRandom(i);
    }
    biases = new float[targets.length];
    for (int i = 0; i < biases.length; i++) {
      biases[i] = random(-0.5, 0.5);
    }
  }

  void setTargetRandom(int idx) {
    targets[idx].x = (int) random(1, parent.layers.length);
    targets[idx].y = (int) random(0, parent.layers[(int)targets[idx].x].length);
  }


  void trigger(float strength) {
    value += strength;
    if (value > abs(gate) || value < -abs(gate)) {
      active = true; 
      wasActive = true;
    }
  }

  void send() {
    if (active) {
      value = constrain(value, -100, 100);
      for (int i = 0; i < targets.length; i++) {
        parent.layers[(int)targets[i].x][(int)targets[i].y].trigger(value*biases[i]);
      }
      active = false;
      value = 0;
    }
  }

  void mutateBias() {
    //if (wasActive) {
    for (int i = 0; i < biases.length; i++) {
      if (random(0, 10) < 1) {
        biases[i]+=random(-factor, factor);
        biases[i] = constrain(biases[i], -0.5, 0.5);
      }
      // }
    }
  }

  void mutateBias1() {
    if (wasActive) {
      for (int i = 0; i < biases.length; i++) {
        if (random(0, 10) < 1) {
          biases[i] += random(0, factor);
          biases[i] = constrain(biases[i], -0.5, 0.5);
        }
      }
    } else {
      for (int i = 0; i < biases.length; i++) {
        if (random(0, 10) < 1) {
          biases[i] += random(-factor/100, 0);
          biases[i] = constrain(biases[i], -0.5, 0.5);
        }
      }
    }
  }

  void mutateGate() {
    if (wasActive)
      if (random(0, 100) < 1) {
        gate += random(-factor, factor);
      }
  }

  //ignore this for now... may be used later to change targets, but will cause completely new behaviour
  void mutateTarget() {
    if (random(0, 500) < 1) {
      setTargetRandom((int)random(0, targets.length));
    }
  }

  void restart() {
    value = 0; 
    active = false;
    wasActive = false;
  }

  Neuron clone(Network targetParent) {
    Neuron result = new Neuron(targetParent);
    result.value = value;
    result.gate = gate;
    result.targets = targets.clone();
    result.biases = biases.clone();
    result.wasActive = wasActive;
    return result;
  }
}

class OutputNeuron extends Neuron {
  OutputNeuron(Network par) {
    super(par);
  }

  @Override
    void send() {
  }

  void send(int idx) {
    parent.output[idx] = value;
  }

  float getValue() {
    return value;
  }

  Neuron clone(Network targetParent) {
    Neuron result = new OutputNeuron(targetParent);
    result.value = value;
    result.gate = gate;
    result.targets = targets.clone();
    result.biases = biases.clone();
    result.wasActive = wasActive;
    return result;
  }
}

I know this might not be the most efficient way (or in any way beautiful), but i just want to get it to do what it should this way. Also, if anyone has improvements for the code, i’d gladly try it :blush: But mainly performance things, not something that would just plain change how the code works, cause as i said, wann try it this way :sweat_smile:

So, if anyone knows where the problem lies, please tell me :sweat_smile:

Oh and note that i’m not too sure about the terminology, but that’s just because the terminology itself is not sure about itself :sweat_smile:

Also note, that if you wanna run it, the code is very slow, so it takes around 1 second to run the 2 Networks 100 times (so around 5millis per run)…which might not be too bad… considering its my first selfmade attempt.

Edit: I tried changing the hidden Layer size to 5 * 5, and also to connect to all Neurons, But still no improvement… so it‘s not that the connections Need to be mutated too… also tried to change it to a rnn like structure (Layer only Sends to Next Layer), But still nothing…

1 Like
some thing i want to say, pls. not angry
code needs still be formatted
by </>
as usual

Sorry :sweat_smile: Didn’t realize that it was not formatted in there :sweat_smile:

did you ever manage to find a solution to this, I’m currently working on my own implementation using an example code I found online. The code would train and test using the mnist dataset, and i added a pgraphics element to test user input, which works ok ish, but stuck on improvements.

pretty sure we used the same resource, i recognise the loadData() function.

1 Like