
Saturday, August 17, 2024

More than doubled the speed of my neural network training using cblas and SSE.

This was actually very easy and straightforward to do.
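
The code below only needs the SSE2 intrinsics header and the CBLAS interface. A minimal compile line, assuming OpenBLAS is the CBLAS provider on your system (adjust the link flag for whichever BLAS you have installed):

#include <emmintrin.h>  // SSE2 intrinsics: __m128d, _mm_loadu_pd, _mm_add_pd, ...
#include <cblas.h>      // CBLAS interface: cblas_dgemv, cblas_dger, cblas_daxpy

// gcc -O2 -msse2 net.c -lopenblas -o net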




// Add biases to the layer output two doubles at a time using SSE2.
// A __m128d register holds two doubles, so the loop advances by 2;
// a scalar tail handles an odd output_size.
void add_biases_simd(double *output, double *biases, int output_size) {
    int i;
    for (i = 0; i + 1 < output_size; i += 2) {
        __m128d voutput = _mm_loadu_pd(&output[i]);
        __m128d vbiases = _mm_loadu_pd(&biases[i]);
        voutput = _mm_add_pd(voutput, vbiases);
        _mm_storeu_pd(&output[i], voutput);
    }
    for (; i < output_size; ++i) {  // scalar tail for odd sizes
        output[i] += biases[i];
    }
}

void forward_layer(double *input, double *weights, double *biases, int input_size, int output_size,
                   double *output, activation_function activate, ActivationFunction activation) {
    // output = weights * input via BLAS (row-major, output_size x input_size)
    cblas_dgemv(CblasRowMajor, CblasNoTrans, output_size, input_size, 1.0,
                weights, input_size, input, 1, 0.0, output, 1);
    // Add biases using SIMD
    add_biases_simd(output, biases, output_size);
    // Apply activation function
    if (activation == ACTIVATION_SOFTMAX) {
        softmax(output, output, output_size);
    } else if (activate) {
        for (int i = 0; i < output_size; ++i) {
            output[i] = activate(output[i]);
        }
    }
}
void forward_layer_simple(double *input, double *weights, double *biases, int input_size, int output_size,
                          double *output, activation_function activate, ActivationFunction activation) {
    for (int i = 0; i < output_size; ++i) {
        output[i] = 0.0;
        for (int j = 0; j < input_size; ++j) {
            output[i] += input[j] * weights[i * input_size + j];
        }
        output[i] += biases[i];
    }

    if (activation == ACTIVATION_SOFTMAX) {
        softmax(output, output, output_size);
    } else if (activate) {
        for (int i = 0; i < output_size; ++i) {
            output[i] = activate(output[i]);
        }
    }
}
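
If you want to check the speedup on your own machine, here is a rough timing harness for the two forward passes. The layer sizes, the iteration count, and the ACTIVATION_NONE value are made up for this sketch; substitute whatever your non-softmax enum value actually is.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Rough benchmark: times the naive path against the BLAS/SSE path.
// Sizes are arbitrary; the activation pointer is NULL so neither
// branch of the activation code runs.
int main(void) {
    int input_size = 512, output_size = 512, iters = 10000;
    double *input   = malloc(input_size * sizeof(double));
    double *weights = malloc((size_t)output_size * input_size * sizeof(double));
    double *biases  = malloc(output_size * sizeof(double));
    double *output  = malloc(output_size * sizeof(double));
    for (int i = 0; i < input_size; ++i) input[i] = (double)rand() / RAND_MAX;
    for (int i = 0; i < output_size * input_size; ++i) weights[i] = (double)rand() / RAND_MAX;
    for (int i = 0; i < output_size; ++i) biases[i] = (double)rand() / RAND_MAX;

    clock_t t0 = clock();
    for (int n = 0; n < iters; ++n)
        forward_layer_simple(input, weights, biases, input_size, output_size,
                             output, NULL, ACTIVATION_NONE);  // ACTIVATION_NONE is hypothetical
    clock_t t1 = clock();
    for (int n = 0; n < iters; ++n)
        forward_layer(input, weights, biases, input_size, output_size,
                      output, NULL, ACTIVATION_NONE);
    clock_t t2 = clock();

    printf("simple: %.2fs  blas/sse: %.2fs\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC, (double)(t2 - t1) / CLOCKS_PER_SEC);
    free(input); free(weights); free(biases); free(output);
    return 0;
}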


This is an improvement to the backward pass. I still need to implement my optimizations in terms of SSE and CBLAS. I am not concerned about keeping the old functions around, because I won't be running the backward pass on embedded machines.


void backward_pass(NeuralNet *net, double *input, double *expected, double *output) {
    int i, j, k;
    Layer *output_layer = &net->layers[net->num_layers - 1];

    // Calculate output layer errors
    if (output_layer->activation == ACTIVATION_SOFTMAX) {
        // Softmax with cross-entropy reduces to (output - expected)
        for (i = 0; i < output_layer->output_size; ++i) {
            output_layer->errors[i] = output[i] - expected[i];
        }
    } else {
        for (i = 0; i < output_layer->output_size; ++i) {
            output_layer->errors[i] = net->calculate_error_derivative(net, output[i], expected[i]) *
                                      output_layer->activate_derivative(output_layer->activations[i]);
        }
    }

    // Backpropagate errors and update weights
    for (i = net->num_layers - 1; i > 0; --i) {
        Layer *current_layer = &net->layers[i];
        Layer *prev_layer = &net->layers[i - 1];

        // Calculate gradients and update weights and biases
        for (j = 0; j < current_layer->output_size; ++j) {
            for (k = 0; k + 1 < current_layer->input_size; k += 2) {
                int weight_index = j * current_layer->input_size + k;
                double gradient0 = current_layer->errors[j] * prev_layer->activations[k] / net->batchsize;
                double gradient1 = current_layer->errors[j] * prev_layer->activations[k + 1] / net->batchsize;

                double jitter0 = jitter(net);
                double jitter1 = jitter(net);

                // Use SSE2 to update weights in pairs: w -= lr * (grad - jitter)
                __m128d grad = _mm_set_pd(gradient1 - jitter1, gradient0 - jitter0);
                __m128d weight = _mm_loadu_pd(&current_layer->weights[weight_index]);
                __m128d update = _mm_sub_pd(weight, _mm_mul_pd(grad, _mm_set1_pd(net->learningrate)));

                _mm_storeu_pd(&current_layer->weights[weight_index], update);
            }
            // Scalar tail in case input_size is odd
            for (; k < current_layer->input_size; ++k) {
                int weight_index = j * current_layer->input_size + k;
                double gradient = current_layer->errors[j] * prev_layer->activations[k] / net->batchsize;
                current_layer->weights[weight_index] -= net->learningrate * (gradient - jitter(net));
            }
            // Update biases with jitter
            double bias_update = current_layer->errors[j] / net->batchsize - jitter(net);
            current_layer->biases[j] -= net->learningrate * bias_update;
        }

        // Calculate errors for previous layer (if not input layer):
        // prev_errors = W^T * errors via BLAS, then scale by the activation derivative
        if (i > 1) {
            cblas_dgemv(CblasRowMajor, CblasTrans, current_layer->output_size, current_layer->input_size, 1.0,
                        current_layer->weights, current_layer->input_size, current_layer->errors, 1, 0.0,
                        prev_layer->errors, 1);

            for (j = 0; j < prev_layer->output_size; ++j) {
                prev_layer->errors[j] *= prev_layer->activate_derivative(prev_layer->activations[j]);
            }
        }
    }
}


Now, I did lose the optimized updates to the weights and biases. They will have to be rewritten in terms of SSE and CBLAS; a sketch of what that could look like is below.
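
This is not the lost code, just one way the same per-layer update could be expressed with CBLAS: the weight update is a rank-1 (outer-product) update via cblas_dger, and the bias update is a scaled vector add via cblas_daxpy. It folds the learning rate and batch size into the BLAS alpha and leaves out the jitter term.

// Sketch: per-layer weight and bias update expressed with CBLAS.
// W -= (lr / batchsize) * errors * activations^T   (rank-1 update)
// b -= (lr / batchsize) * errors                   (scaled vector add)
void update_layer_cblas(Layer *layer, double *prev_activations,
                        double learningrate, double batchsize) {
    double alpha = -learningrate / batchsize;

    // cblas_dger: A := alpha * x * y^T + A, with A row-major
    // (output_size x input_size), x = errors, y = previous activations.
    cblas_dger(CblasRowMajor, layer->output_size, layer->input_size, alpha,
               layer->errors, 1, prev_activations, 1,
               layer->weights, layer->input_size);

    // cblas_daxpy: y := alpha * x + y, applied to the biases.
    cblas_daxpy(layer->output_size, alpha, layer->errors, 1, layer->biases, 1);
}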




 
