This was actually very easy and straightforward to do.
// Add biases to the layer output two doubles at a time using SSE2.
void add_biases_simd(double *output, double *biases, int output_size) {
    int i;
    // An __m128d holds two doubles, so advance two elements per iteration.
    for (i = 0; i + 1 < output_size; i += 2) {
        __m128d voutput = _mm_loadu_pd(&output[i]);
        __m128d vbiases = _mm_loadu_pd(&biases[i]);
        voutput = _mm_add_pd(voutput, vbiases);
        _mm_storeu_pd(&output[i], voutput);
    }
    // Handle a trailing element when output_size is odd.
    for (; i < output_size; ++i) {
        output[i] += biases[i];
    }
}
void forward_layer(double *input, double *weights, double *biases, int input_size, int output_size, double *output, activation_function activate, ActivationFunction activation) {
    // Use BLAS for the matrix-vector product: output = weights * input
    cblas_dgemv(CblasRowMajor, CblasNoTrans, output_size, input_size, 1.0, weights, input_size, input, 1, 0.0, output, 1);
    // Add biases using SIMD
    add_biases_simd(output, biases, output_size);
    // Apply activation function
    if (activation == ACTIVATION_SOFTMAX) {
        softmax(output, output, output_size);
    } else if (activate) {
        for (int i = 0; i < output_size; ++i) {
            output[i] = activate(output[i]);
        }
    }
}
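For context, here is a minimal sketch of how both versions are meant to be called. The layer sizes, the sigmoid helper, and the ACTIVATION_SIGMOID value are assumptions made up for illustration, not names taken from the real network code.
// Hypothetical usage sketch: sigmoid and ACTIVATION_SIGMOID are assumed for illustration.
double input[3]       = {0.5, -0.2, 0.8};
double weights[2 * 3] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6};  // 2 outputs x 3 inputs, row-major
double biases[2]      = {0.01, -0.02};
double output[2];

// Both paths should produce the same result; the BLAS/SSE one is just faster on larger layers.
forward_layer_simple(input, weights, biases, 3, 2, output, sigmoid, ACTIVATION_SIGMOID);
forward_layer(input, weights, biases, 3, 2, output, sigmoid, ACTIVATION_SIGMOID);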
void forward_layer_simple(double *input, double *weights, double *biases, int input_size, int output_size, double *output, activation_function activate, ActivationFunction activation) {
    for (int i = 0; i < output_size; ++i) {
        output[i] = 0.0;
        for (int j = 0; j < input_size; ++j) {
            output[i] += input[j] * weights[i * input_size + j];
        }
        output[i] += biases[i];
    }
    if (activation == ACTIVATION_SOFTMAX) {
        softmax(output, output, output_size);
    } else if (activate) {
        for (int i = 0; i < output_size; ++i) {
            output[i] = activate(output[i]);
        }
    }
}
This is an improvement of the backward pass. I now need to implement my optimizations in terms of SSE and CBLAS. I am not concerned about keeping these old functions around, because I won't be running the backward pass on embedded machines.
void backward_pass(NeuralNet *net, double *input, double *expected, double *output) {
    int i, j, k;
    Layer *output_layer = &net->layers[net->num_layers - 1];
    // Calculate output layer errors
    if (output_layer->activation == ACTIVATION_SOFTMAX) {
        // Softmax with cross-entropy: the error is simply (output - expected)
        for (i = 0; i < output_layer->output_size; ++i) {
            output_layer->errors[i] = output[i] - expected[i];
        }
    } else {
        for (i = 0; i < output_layer->output_size; ++i) {
            output_layer->errors[i] = net->calculate_error_derivative(net, output[i], expected[i]) *
                                      output_layer->activate_derivative(output_layer->activations[i]);
        }
    }
    // Backpropagate errors and update weights
    for (i = net->num_layers - 1; i > 0; --i) {
        Layer *current_layer = &net->layers[i];
        Layer *prev_layer = &net->layers[i - 1];
        // Calculate gradients and update weights and biases
        for (j = 0; j < current_layer->output_size; ++j) {
            for (k = 0; k + 1 < current_layer->input_size; k += 2) {
                int weight_index = j * current_layer->input_size + k;
                double gradient0 = current_layer->errors[j] * prev_layer->activations[k] / net->batchsize;
                double gradient1 = current_layer->errors[j] * prev_layer->activations[k + 1] / net->batchsize;
                double jitter0 = jitter(net);
                double jitter1 = jitter(net);
                // Use SSE2 to update weights in pairs
                __m128d grad = _mm_set_pd(gradient1 - jitter1, gradient0 - jitter0);
                __m128d weight = _mm_loadu_pd(&current_layer->weights[weight_index]);
                __m128d update = _mm_sub_pd(weight, _mm_mul_pd(grad, _mm_set1_pd(net->learningrate)));
                _mm_storeu_pd(&current_layer->weights[weight_index], update);
            }
            // Scalar update for the trailing weight when input_size is odd
            for (; k < current_layer->input_size; ++k) {
                int weight_index = j * current_layer->input_size + k;
                double gradient = current_layer->errors[j] * prev_layer->activations[k] / net->batchsize;
                current_layer->weights[weight_index] -= net->learningrate * (gradient - jitter(net));
            }
            // Update biases with jitter
            double bias_update = current_layer->errors[j] / net->batchsize - jitter(net);
            current_layer->biases[j] -= net->learningrate * bias_update;
        }
        // Calculate errors for previous layer (if not input layer)
        if (i > 1) {
            // prev_errors = weights^T * errors, then scale by the activation derivative
            cblas_dgemv(CblasRowMajor, CblasTrans, current_layer->output_size, current_layer->input_size, 1.0,
                        current_layer->weights, current_layer->input_size, current_layer->errors, 1, 0.0,
                        prev_layer->errors, 1);
            for (j = 0; j < prev_layer->output_size; ++j) {
                prev_layer->errors[j] *= prev_layer->activate_derivative(prev_layer->activations[j]);
            }
        }
    }
}
Now, I did lose the optimized updates to weights and biases. They still have to be rewritten in terms of SSE and CBLAS.
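As a starting point, here is a minimal sketch of what that rewrite could look like: the per-layer weight update expressed as a rank-1 update with cblas_dger, and the bias update with cblas_daxpy. The jitter term is deliberately left out for clarity, the helper name update_layer_cblas is made up, and the field names are taken from the structures above; treat it as an outline under those assumptions, not the final implementation.
// Sketch only: weights -= (lr / batchsize) * errors * activations^T, jitter omitted for clarity.
void update_layer_cblas(NeuralNet *net, Layer *current_layer, Layer *prev_layer) {
    double scale = -net->learningrate / net->batchsize;
    // cblas_dger performs A += alpha * x * y^T on the row-major weight matrix.
    cblas_dger(CblasRowMajor, current_layer->output_size, current_layer->input_size,
               scale, current_layer->errors, 1, prev_layer->activations, 1,
               current_layer->weights, current_layer->input_size);
    // cblas_daxpy performs biases += alpha * errors for the bias vector.
    cblas_daxpy(current_layer->output_size, scale, current_layer->errors, 1,
                current_layer->biases, 1);
}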