SwitchNet Neural network examples

Neural networks using fast transforms as a core component.

You can view the fast transform as a synthetic weight matrix that is very fast to compute, combined with parametric activation functions as the adjustable part.
Or you can view the fast transform as a one-to-all connection algorithm that links multiple small-width layers together into a single complete wide layer, gaining both a speed advantage and a parameter-reduction advantage.
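
In code terms, one stage of the network below looks roughly like this (a sketch only; whtN() is the normalized fast Walsh-Hadamard transform defined further down, and slopes is just a stand-in name for the per-element parameter pairs):

// One SwitchNet stage: a 2-slope switch per element (the adjustable part)
// followed by the fast transform, which connects every element to every
// other element in O(n log n) time instead of the O(n^2) of a weight matrix.
void switchStage(float[] vec, float[] slopes) { // slopes holds 2 values per element
  for (int i = 0; i < vec.length; i++) {
    vec[i] *= (vec[i] < 0f) ? slopes[2 * i] : slopes[2 * i + 1];
  }
  whtN(vec);
}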

// Test with Lissajous curves
color c1;
float rate;
int epochs;
float[][] ex;
float[] work = new float[256];
SWNet net;

void setup() {
  size(400, 400);
  net = new SWNet(256, 4, 4, 0.1); //vecLen, widthBy, depth, learning rate
  rate=0.05f; //Learning rate (unused here; the net uses the rate passed to the constructor)
  epochs=20; //Epochs per frame
  c1 = color(0xff, 0xd7, 00);
  ex=new float[8][256];
  for (int i = 0; i < 127; i++) {
    // Training data
    float t = (i * 2 * PI) / 127;
    ex[0][2 * i] = sin(t);
    ex[0][2 * i + 1] = sin(2 * t);
    ex[1][2 * i] = sin(2 * t);
    ex[1][2 * i + 1] = sin(t);
    ex[2][2 * i] = sin(2 * t);
    ex[2][2 * i + 1] = sin(3 * t);
    ex[3][2 * i] = sin(3 * t);
    ex[3][2 * i + 1] = sin(2 * t);
    ex[4][2 * i] = sin(3 * t);
    ex[4][2 * i + 1] = sin(4 * t);
    ex[5][2 * i] = sin(4 * t);
    ex[5][2 * i + 1] = sin(3 * t);
    ex[6][2 * i] = sin(2 * t);
    ex[6][2 * i + 1] = sin(5 * t);
    ex[7][2 * i] = sin(5 * t);
    ex[7][2 * i + 1] = sin(2 * t);
  }
  textSize(16);
}

void draw() {
  background(0);
  //loadPixels();
  for (int k=0; k<epochs; k++) {
    for (int i = 0; i < ex.length; i++) {
      net.train(ex[i], ex[i]);
    }
  }
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 255; j += 2) {
      set(25 + i * 40 + int(18 * ex[i][j]), 44 + int(18 * ex[i][j + 1]), c1);
    }
  }
  for (int i = 0; i < 8; i++) {
    net.recall(work, ex[i]);
    for (int j = 0; j < 255; j += 2) {
      set(25 + i * 40 +int( 18 * work[j]), 104 + int(18 * work[j + 1]), c1);
    }
  }
  //updatePixels();
  text("Training Data", 5, 20);
  text("Autoassociative recall", 5, 80);
  text("Iterations: " + frameCount*epochs, 5, 150);
}

final class SWNet {
  final int vecLen;
  final int widthBy;
  final int depth;
  final float[] params;
  final float[] surface;
  final float[] work;
  final float[] flipsIn;
  final float[] flipsOut;
  float rate;
  SWNet(int vecLen, int widthBy, int depth, float rate) {
    this.vecLen = vecLen;
    this.widthBy=widthBy; // Increase the width to allow full information flow through each layer.
    this.depth = depth;
    this.rate=rate;
    params = new float[2*vecLen*widthBy*depth];
    surface=new float[vecLen*widthBy*depth];
    work=new float[vecLen*widthBy];
    flipsIn=new float[vecLen*widthBy];
    flipsOut=new float[vecLen];
    for (int i = 0; i < params.length; i++) {
      params[i] = randomGaussian();
    }
    int r = 0x9E3779B9;
    for (int i=0; i<vecLen*widthBy; i++) {
      r=r*1664525+1013904223; // Numerical Recipes LCG random number generator
      flipsIn[i] = (r & (1<<16)) == 0 ? -1f : 1f; // bit 16, higher bits are more random
    }
    for (int i=0; i<vecLen; i++) {
      r=r*1664525+1013904223;
      flipsOut[i] = (r & (1<<16)) == 0 ? -1f : 1f;
    }
  }

  void recall(float[] result, float[] input) {
    final int n=vecLen*widthBy;
    for (int i=0; i<n; i++) {
      work[i]=flipsIn[i]*input[i&(vecLen-1)]; //vecLen-1 acts as a binary mask
    }
    whtN(work); // Quasi-random projection of the input; the spectral response aspect of the transform is not needed here
    int paIdx = 0;
    for (int i = 0; i < depth; i++) {
      for (int j = 0; j < n; j ++) {
        float x= work[j];
        if (x < 0f) {
          work[j]=x*params[paIdx];
        } else {
          work[j]=x*params[paIdx+1];
        }
        paIdx +=2;
      }
      whtN(work);
    }
    for(int i=0;i<vecLen;i++){
      result[i]=work[i]*flipsOut[i];
    }  
  }

  void train(float[] target, float[] input) {
    final int n=vecLen*widthBy; // Forward pass
    for (int i=0; i<n; i++) {
      work[i]=flipsIn[i]*input[i&(vecLen-1)];
    }
    whtN(work);
    int paIdx = 0;
    for (int i = 0; i < depth; i++) {
      System.arraycopy(work, 0, surface, i*n, n); // store layer outputs 
      for (int j = 0; j < n; j ++) {
        float x= work[j];
        if (x < 0) {
          work[j]=x*params[paIdx];
        } else {
          work[j]=x*params[paIdx+1];
        }
        paIdx +=2;
      }
      whtN(work);
    }
    for(int i=0;i<vecLen;i++){
      work[i]=work[i]*flipsOut[i];
    }  
    for (int i=0; i<vecLen; i++) {
      work[i]=(target[i]-work[i])*rate; //error by learning rate
    }
    for(int i=0;i<vecLen;i++){
      work[i]=work[i]*flipsOut[i];
    } 
    for (int i=vecLen; i<n; i++) {    // clear upper
      work[i]=0f;
    }
    for (int i = depth-1; i >=0; i--) {
      whtN(work);  // invert error back
      paIdx=i*n*2;
      int surIdx=i*n;
      for (int j = 0; j < n; j ++,paIdx+=2) {
        float x=surface[surIdx+j];
        float e=work[j]; // error
        if (x < 0f) {    
          params[paIdx]+=x*e;
          work[j]*=params[paIdx];
        } else {
          params[paIdx+1]+=x*e;
          work[j]*=params[paIdx+1];
        }
      }
    }
  }
}


// Fast Walsh Hadamard Transform (normalized for scale.)
void whtN(float[] vec) {
  final int n = vec.length;
  int hs = 1;
  while (hs < n) {
    int i = 0;
    while (i < n) {
      final int j = i + hs;  // final here is a good hint to HotSpot
      while (i < j) {
        float a = vec[i];
        float b = vec[i + hs];
        vec[i] = a + b;
        vec[i + hs] = a - b;
        i += 1;
      }
      i += hs;
    }
    hs += hs;
  }
  float sc=1f/sqrt(n);  // scale to leave overall vector magnitude unchanged
  for (int i=0; i<n; i++) {
    vec[i]*=sc;
  }
}

Further experiments:
https://archive.org/details/switch-nets

This looks really interesting, thanks for sharing!

I’m trying to get a clearer picture of how this could be applied in practice. Could you share some concrete examples of where this technique might be useful, especially in ways that are easy to grasp for someone without a deep background in neural networks?

By the way, have you considered turning it into a Processing library? We have a library template and detailed instructions on creating a library.

Thanks for sharing.
It's fascinating that this converges with such a simple 'neuron'; an analog electronic version (5 resistors and 2 transistors per node, I think) would be so fast!
(The computer version is so efficient that I didn't realize it at first; I added frameRate(1); to see the learning progression.)

Edit: I found a patent on a 1972 implementation: US3775602A - Real time walsh-hadamard transformation of two-dimensional discrete pictures - Google Patents
And there look to be recent ASIC implementations too...
Thanks again.

I might put it in a library since the suggestion has been made.
I think next I will do a JavaScript (Processing p5.js) version.
I tidied up the code a bit:

color c1;
int epochs;
float[][] ex;
float[] work = new float[256];
SWNet net;

void setup() {
  size(400, 400);
  net = new SWNet(256, 4, 4, 0.1);//vecLen,widthBy,depth,learning rate
  epochs=25; //Epochs per frame
  c1 = color(255, 255, 255);
  ex=new float[8][256];
  for (int i = 0; i < 127; i++) {
    // Training data
    float t = (i * 2 * PI) / 127;
    ex[0][2 * i] = sin(t);
    ex[0][2 * i + 1] = sin(2 * t);
    ex[1][2 * i] = sin(2 * t);
    ex[1][2 * i + 1] = sin(t);
    ex[2][2 * i] = sin(2 * t);
    ex[2][2 * i + 1] = sin(3 * t);
    ex[3][2 * i] = sin(3 * t);
    ex[3][2 * i + 1] = sin(2 * t);
    ex[4][2 * i] = sin(3 * t);
    ex[4][2 * i + 1] = sin(4 * t);
    ex[5][2 * i] = sin(4 * t);
    ex[5][2 * i + 1] = sin(3 * t);
    ex[6][2 * i] = sin(2 * t);
    ex[6][2 * i + 1] = sin(5 * t);
    ex[7][2 * i] = sin(5 * t);
    ex[7][2 * i + 1] = sin(2 * t);
  }
  textSize(16);
  // frameRate(5);
}

void draw() {
  background(0);
  for (int k=0; k<epochs; k++) {
    for (int i = 0; i < ex.length; i++) {
      net.train(ex[i], ex[i]);
    }
  }
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 255; j += 2) {
      set(25 + i * 40 + int(18 * ex[i][j]), 44 + int(18 * ex[i][j + 1]), c1);
    }
  }
  for (int i = 0; i < 8; i++) {
    net.recall(work, ex[i]);
    for (int j = 0; j < 255; j += 2) {
      set(25 + i * 40 +int( 18 * work[j]), 104 + int(18 * work[j + 1]), c1);
    }
  }
  text("Training Data", 5, 20);
  text("Autoassociative recall", 5, 80);
  text("Iterations: " + frameCount*epochs, 5, 150);
}

final class SWNet {
  final int vecLen;
  final int widthBy;
  final int depth;
  final float[] params;
  final float[] surface;
  final float[] work;
  final float[] flipsIn;
  final float[] flipsOut;
  float rate;
  SWNet(int vecLen, int widthBy, int depth, float rate) {
    this.vecLen = vecLen;
    this.widthBy=widthBy;
    this.depth = depth;
    this.rate=rate;
    params = new float[2*vecLen*widthBy*depth];
    surface=new float[vecLen*widthBy*depth];
    work=new float[vecLen*widthBy];
    flipsIn=new float[vecLen*widthBy];
    flipsOut=new float[vecLen];
    for (int i = 0; i < params.length; i++) {
      params[i] = randomGaussian();
    }
    int r = 0x9E3779B9;
    for (int i=0; i<vecLen*widthBy; i++) {
      r=r*1664525+1013904223; // Numerical Recipes LCG random number generator
      flipsIn[i] = (r & (1<<16)) == 0 ? -1f : 1f; // bit 16, higher bits more random
    }
    for (int i=0; i<vecLen; i++) {
      r=r*1664525+1013904223;
      flipsOut[i] = (r & (1<<16)) == 0 ? -1f : 1f;
    }
  }

  void recall(float[] result, float[] input) {
    final int n=vecLen*widthBy;
    final boolean t=result==work;  // training mode: train() passes the net's own work buffer as result
    for (int i=0; i<n; i++) {
      work[i]=flipsIn[i]*input[i&(vecLen-1)]; //vecLen-1 acts as a binary mask
    }
    whtN(work);
    for (int i = 0, paIdx=0; i < depth; i++) {
      if(t) System.arraycopy(work, 0, surface, i*n, n);
      for (int j = 0; j < n; j ++, paIdx+=2) {
        if (work[j] < 0f) {
          work[j] *=params[paIdx];
        } else {
          work[j] *=params[paIdx+1];
        }
      }
      whtN(work);
    }
    if(t) return;
    for (int i=0; i<vecLen; i++) {
      result[i]=work[i]*flipsOut[i];
    }
  }

  void train(float[] target, float[] input) {
    final int n=vecLen*widthBy;
    recall(work,input);
    for (int i=0; i<vecLen; i++) {
      work[i]=(target[i]*flipsOut[i]-work[i])*rate;
    }
    for (int i=vecLen; i<n; i++) {    // clear upper
      work[i]=0f;
    }
    for (int i = depth-1; i >=0; i--) {
      whtN(work);
      for (int j = 0, surIdx=i*n, paIdx=2*i*n; j < n; j ++, paIdx+=2) {
        float x=surface[surIdx+j];
        if (x < 0f) {
          params[paIdx]+=x*work[j];
          work[j] *=params[paIdx];
        } else {
          params[paIdx+1]+=x*work[j];
          work[j] *=params[paIdx+1];
        }
      }
    }
  }
}

// Fast Walsh Hadamard Transform (normalized for scale.)
void whtN(float[] vec) {
  final int n = vec.length;
  int hs = 1;
  while (hs < n) {
    int i = 0;
    while (i < n) {
      final int j = i + hs;  // final here is a good hint to HotSpot
      while (i < j) {
        float a = vec[i];
        float b = vec[i + hs];
        vec[i] = a + b;
        vec[i + hs] = a - b;
        i += 1;
      }
      i += hs;
    }
    hs += hs;
  }
  float sc=1f/sqrt(n);  // scale to leave overall vector magnitude unchanged
  for (int i=0; i<n; i++) {
    vec[i]*=sc;
  }
}

I think I will create some other examples. I haven't really done much coding for a year or so, and I don't want to sit coding for 8 hours non-stop per day either. I think a little bit every day, while improving the quality of the code I create, is the way to go.
I have a little website with some background information:
https://sites.google.com/view/algorithmshortcuts/switchnet

I should say that vecLen, the length of the input, output and target arrays, should be a positive integer power of 2, like 64, 128, 256, ...
Likewise widthBy should be 1, 2, 4, 8, 16, ...
The (learning) rate can be around 0.1 for small networks, and substantially smaller for deep nets, like 0.001.
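
A hypothetical guard along those lines that you could add at the top of the SWNet constructor (Integer.bitCount(x) == 1 is true exactly when x is a power of 2):

// Hypothetical constructor checks for the constraints above.
if (vecLen < 2 || Integer.bitCount(vecLen) != 1) {
  throw new IllegalArgumentException("vecLen must be a power of 2: 64, 128, 256, ...");
}
if (widthBy < 1 || Integer.bitCount(widthBy) != 1) {
  throw new IllegalArgumentException("widthBy must be a power of 2: 1, 2, 4, 8, ...");
}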

One application would be as very wide input layers for a neural network dealing with, say, super-high-resolution images.
Or where you need very fast neural networks for real-time control of some machine or whatever.
I'm not really sure how to start explaining it at a very basic level.
I think you start with the idea that the ReLU neural network activation function is a switch:
https://sites.google.com/view/algorithmshortcuts/relu-as-a-switch
And then explain how weighted sums can be connected up as weighted-sum switching networks, and then go through the (simple) linear algebra of that.
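
A tiny illustration of that viewpoint (the two-slope version matches the params[paIdx] / params[paIdx+1] pairs in the SWNet code above; the function and parameter names here are just for illustration):

// ReLU as a switch: the sign of x selects between "connect" (slope 1)
// and "disconnect" (slope 0).
float relu(float x) {
  return (x < 0f) ? 0f : x;
}
// The SwitchNet activation keeps the same switching decision but makes
// both branches adjustable slopes, as in the layer loop of SWNet above.
float switchAct(float x, float slopeNeg, float slopePos) {
  return (x < 0f) ? slopeNeg * x : slopePos * x;
}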

Maybe it is simpler to say that weighting the inputs to a fast transform allows that transform to approximate a full neural network weight matrix quite well. If the internal width of the network is, say, 256, then there are 2 to the power of 256 different switching decisions. That is an absolutely astronomical number of possible weight-matrix approximations, and surely one of them will approximately do for whatever task you are training the network on.
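
A small sketch of that claim, reusing whtN() from the code above: feeding unit basis vectors through (diagonal weights, then the transform) recovers the equivalent matrix, and that matrix is dense even though only n adjustable weights were used.

// Build the effective matrix of: scale by n weights, then apply whtN().
int n = 8;
float[] w = new float[n];
for (int k = 0; k < n; k++) w[k] = randomGaussian();
float[][] effective = new float[n][n];
for (int j = 0; j < n; j++) {
  float[] e = new float[n];
  e[j] = w[j];  // diag(w) applied to the j-th unit basis vector
  whtN(e);      // mix with the fast transform
  for (int k = 0; k < n; k++) effective[k][j] = e[k]; // column j of the equivalent matrix
}
// In general every entry of 'effective' is nonzero: a full n x n weight
// matrix has been synthesized from n parameters plus a fast transform.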

I did a Processing p5.js version with 256 training examples.
It is trained to recall a sine wave given a cosine wave.
https://editor.p5js.org/seanhaddps/sketches/TlfJQFFxU
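
The training call for that kind of hetero-associative pair looks like this with the SWNet class from earlier in the thread (the p5.js sketch differs in detail; the variable names here are illustrative):

// One cosine-input / sine-target training pair for a vecLen = 256 network.
float[] inputWave = new float[256];
float[] targetWave = new float[256];
for (int i = 0; i < 256; i++) {
  float t = (i * 2 * PI) / 256;
  inputWave[i] = cos(t);
  targetWave[i] = sin(t);
}
net.train(targetWave, inputWave); // learn to recall the sine given the cosine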

Also, I tried to explain how fast transforms can turn simple weighting of a vector into rotations.
https://sites.google.com/view/algorithmshortcuts/uniform-point-picking-on-a-sphere
That is only a mathematical outline that some computer scientist could fill in (or disprove!). Up to them.
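
One small sanity check behind that rotation view: the normalized transform in whtN() is orthogonal, so it leaves vector length unchanged.

// Check that whtN() preserves the length (norm) of a random vector.
float[] v = new float[256];
for (int i = 0; i < v.length; i++) v[i] = randomGaussian();
float before = 0f, after = 0f;
for (int i = 0; i < v.length; i++) before += v[i] * v[i];
whtN(v);
for (int i = 0; i < v.length; i++) after += v[i] * v[i];
println("norm before: " + sqrt(before) + "  norm after: " + sqrt(after)); // should be equal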

I have done a dense-layer version of the SwitchNet neural network.
This lets you exceed the n-squared parameter limit of a conventional neural network layer, or alternatively make the layers sparser than a conventional layer.
Either way you get far more switching per parameter than in a conventional, say ReLU-based, neural network.
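
For scale, here is a rough parameter-count comparison using the SWNet from earlier in this thread (the dense-layer variant on the archive page below is organized differently):

// Adjustable parameters in the SWNet used above versus one dense layer.
int vecLen = 256, widthBy = 4, depth = 4;
int switchNetParams = 2 * vecLen * widthBy * depth; // 8192 slope parameters
int denseLayerParams = vecLen * vecLen;             // 65536 weights in one 256x256 matrix
println("SwitchNet: " + switchNetParams + "  dense layer: " + denseLayerParams);
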
Switch Net DL : Sean O'Connor : Free Download, Borrow, and Streaming : Internet Archive
I definitely struggled writing the code, partly from not having coded anything much for a long time. I would call the code alpha stage.
It is unpaid work, so things tend to proceed very slowly. Like years of delay, literally.
There could be entities out there that are not so delayed.

I wrote a p5.js online version.
https://sites.google.com/view/algorithmshortcuts/switchnet-dense-layer
