
Saturday, August 17, 2024

More than doubled the speed of my neural network training using cblas and SSE.

This was actually very easy and straightforward to do.
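
The code below only needs the SSE2 intrinsics header and the CBLAS interface. A minimal compile line, assuming OpenBLAS is the CBLAS provider on your system (adjust the link flag for whichever BLAS you have installed):

#include <emmintrin.h>  // SSE2 intrinsics: __m128d, _mm_loadu_pd, _mm_add_pd, ...
#include <cblas.h>      // CBLAS interface: cblas_dgemv, cblas_dger, cblas_daxpy

// gcc -O2 -msse2 net.c -lopenblas -o net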




// Add biases to the layer output two doubles at a time using SSE2.
// A __m128d register holds two doubles, so the loop advances by 2;
// a scalar tail handles an odd output_size.
void add_biases_simd(double *output, double *biases, int output_size) {
    int i;
    for (i = 0; i + 1 < output_size; i += 2) {
        __m128d voutput = _mm_loadu_pd(&output[i]);
        __m128d vbiases = _mm_loadu_pd(&biases[i]);
        voutput = _mm_add_pd(voutput, vbiases);
        _mm_storeu_pd(&output[i], voutput);
    }
    for (; i < output_size; ++i) {  // scalar tail for odd sizes
        output[i] += biases[i];
    }
}

void forward_layer(double *input, double *weights, double *biases, int input_size, int output_size,
                   double *output, activation_function activate, ActivationFunction activation) {
    // output = weights * input via BLAS (row-major, output_size x input_size)
    cblas_dgemv(CblasRowMajor, CblasNoTrans, output_size, input_size, 1.0,
                weights, input_size, input, 1, 0.0, output, 1);
    // Add biases using SIMD
    add_biases_simd(output, biases, output_size);
    // Apply activation function
    if (activation == ACTIVATION_SOFTMAX) {
        softmax(output, output, output_size);
    } else if (activate) {
        for (int i = 0; i < output_size; ++i) {
            output[i] = activate(output[i]);
        }
    }
}
void forward_layer_simple(double *input, double *weights, double *biases, int input_size, int output_size,
                          double *output, activation_function activate, ActivationFunction activation) {
    for (int i = 0; i < output_size; ++i) {
        output[i] = 0.0;
        for (int j = 0; j < input_size; ++j) {
            output[i] += input[j] * weights[i * input_size + j];
        }
        output[i] += biases[i];
    }

    if (activation == ACTIVATION_SOFTMAX) {
        softmax(output, output, output_size);
    } else if (activate) {
        for (int i = 0; i < output_size; ++i) {
            output[i] = activate(output[i]);
        }
    }
}
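
If you want to check the speedup on your own machine, here is a rough timing harness for the two forward passes. The layer sizes, the iteration count, and the ACTIVATION_NONE value are made up for this sketch; substitute whatever your non-softmax enum value actually is.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Rough benchmark: times the naive path against the BLAS/SSE path.
// Sizes are arbitrary; the activation pointer is NULL so neither
// branch of the activation code runs.
int main(void) {
    int input_size = 512, output_size = 512, iters = 10000;
    double *input   = malloc(input_size * sizeof(double));
    double *weights = malloc((size_t)output_size * input_size * sizeof(double));
    double *biases  = malloc(output_size * sizeof(double));
    double *output  = malloc(output_size * sizeof(double));
    for (int i = 0; i < input_size; ++i) input[i] = (double)rand() / RAND_MAX;
    for (int i = 0; i < output_size * input_size; ++i) weights[i] = (double)rand() / RAND_MAX;
    for (int i = 0; i < output_size; ++i) biases[i] = (double)rand() / RAND_MAX;

    clock_t t0 = clock();
    for (int n = 0; n < iters; ++n)
        forward_layer_simple(input, weights, biases, input_size, output_size,
                             output, NULL, ACTIVATION_NONE);  // ACTIVATION_NONE is hypothetical
    clock_t t1 = clock();
    for (int n = 0; n < iters; ++n)
        forward_layer(input, weights, biases, input_size, output_size,
                      output, NULL, ACTIVATION_NONE);
    clock_t t2 = clock();

    printf("simple: %.2fs  blas/sse: %.2fs\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC, (double)(t2 - t1) / CLOCKS_PER_SEC);
    free(input); free(weights); free(biases); free(output);
    return 0;
}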


This is an improvement to the backward pass. I still need to implement my optimizations in terms of SSE and CBLAS. I am not concerned about keeping the old functions around, because I won't be running the backward pass on embedded machines.


void backward_pass(NeuralNet *net, double *input, double *expected, double *output) {
    int i, j, k;
    Layer *output_layer = &net->layers[net->num_layers - 1];

    // Calculate output layer errors
    if (output_layer->activation == ACTIVATION_SOFTMAX) {
        // Softmax with cross-entropy reduces to (output - expected)
        for (i = 0; i < output_layer->output_size; ++i) {
            output_layer->errors[i] = output[i] - expected[i];
        }
    } else {
        for (i = 0; i < output_layer->output_size; ++i) {
            output_layer->errors[i] = net->calculate_error_derivative(net, output[i], expected[i]) *
                                      output_layer->activate_derivative(output_layer->activations[i]);
        }
    }

    // Backpropagate errors and update weights
    for (i = net->num_layers - 1; i > 0; --i) {
        Layer *current_layer = &net->layers[i];
        Layer *prev_layer = &net->layers[i - 1];

        // Calculate gradients and update weights and biases
        for (j = 0; j < current_layer->output_size; ++j) {
            for (k = 0; k + 1 < current_layer->input_size; k += 2) {
                int weight_index = j * current_layer->input_size + k;
                double gradient0 = current_layer->errors[j] * prev_layer->activations[k] / net->batchsize;
                double gradient1 = current_layer->errors[j] * prev_layer->activations[k + 1] / net->batchsize;

                double jitter0 = jitter(net);
                double jitter1 = jitter(net);

                // Use SSE2 to update weights in pairs: w -= lr * (grad - jitter)
                __m128d grad = _mm_set_pd(gradient1 - jitter1, gradient0 - jitter0);
                __m128d weight = _mm_loadu_pd(&current_layer->weights[weight_index]);
                __m128d update = _mm_sub_pd(weight, _mm_mul_pd(grad, _mm_set1_pd(net->learningrate)));

                _mm_storeu_pd(&current_layer->weights[weight_index], update);
            }
            // Scalar tail in case input_size is odd
            for (; k < current_layer->input_size; ++k) {
                int weight_index = j * current_layer->input_size + k;
                double gradient = current_layer->errors[j] * prev_layer->activations[k] / net->batchsize;
                current_layer->weights[weight_index] -= net->learningrate * (gradient - jitter(net));
            }
            // Update biases with jitter
            double bias_update = current_layer->errors[j] / net->batchsize - jitter(net);
            current_layer->biases[j] -= net->learningrate * bias_update;
        }

        // Calculate errors for previous layer (if not input layer):
        // prev_errors = W^T * errors via BLAS, then scale by the activation derivative
        if (i > 1) {
            cblas_dgemv(CblasRowMajor, CblasTrans, current_layer->output_size, current_layer->input_size, 1.0,
                        current_layer->weights, current_layer->input_size, current_layer->errors, 1, 0.0,
                        prev_layer->errors, 1);

            for (j = 0; j < prev_layer->output_size; ++j) {
                prev_layer->errors[j] *= prev_layer->activate_derivative(prev_layer->activations[j]);
            }
        }
    }
}


Now, I did lose the optimized updates to the weights and biases. They will have to be rewritten in terms of SSE and CBLAS; a sketch of what that could look like is below.
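
This is not the lost code, just one way the same per-layer update could be expressed with CBLAS: the weight update is a rank-1 (outer-product) update via cblas_dger, and the bias update is a scaled vector add via cblas_daxpy. It folds the learning rate and batch size into the BLAS alpha and leaves out the jitter term.

// Sketch: per-layer weight and bias update expressed with CBLAS.
// W -= (lr / batchsize) * errors * activations^T   (rank-1 update)
// b -= (lr / batchsize) * errors                   (scaled vector add)
void update_layer_cblas(Layer *layer, double *prev_activations,
                        double learningrate, double batchsize) {
    double alpha = -learningrate / batchsize;

    // cblas_dger: A := alpha * x * y^T + A, with A row-major
    // (output_size x input_size), x = errors, y = previous activations.
    cblas_dger(CblasRowMajor, layer->output_size, layer->input_size, alpha,
               layer->errors, 1, prev_activations, 1,
               layer->weights, layer->input_size);

    // cblas_daxpy: y := alpha * x + y, applied to the biases.
    cblas_daxpy(layer->output_size, alpha, layer->errors, 1, layer->biases, 1);
}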




 
