Start with the MNIST database of handwritten digits; it's a common entry point and you'll find countless tutorials.
http://yann.lecun.com/exdb/mnist/
Here, for instance, is an ANN with forward propagation that I wrote for this challenge. It was a bazillion years ago, so forgive the poor code quality. You need to install cblas, hdf5, and gsl.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <math.h>
#include <hdf5.h>
#include <hdf5_hl.h>
#include <gsl/gsl_cblas.h>
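/* pick m distinct random indices in [0, n) (Floyd's sampling algorithm) */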
void random_numbers(int n, int m, int *num)
{
int i, in, im, *is_used;
srand((unsigned int) time(NULL));
is_used = malloc(n * sizeof(int));
for (i = 0; i < n; ++i) is_used[i] = 0;
im = 0;
for (in = n - m; in < n && im < m; ++in) {
int r = rand() % (in + 1);
if (is_used[r]) r = in;
assert(!is_used[r]);
num[im++] = r;
is_used[r] = 1;
}
assert(im == m);
free(is_used);
}
/* read a named float dataset from an HDF5 file into a preallocated buffer */
herr_t load_matrix(char *filename, char *matrix_name, float *data)
{
hid_t file_id;
herr_t status;
file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
if (file_id < 0) return -1;
status = H5LTread_dataset_float(file_id, matrix_name, data);
H5Fclose(file_id);
return status;
}
int main()
{
size_t X_size, y_size;
float *X, *y, *Theta1, *Theta2, *L2, *L3, *test;
int i, n, max, *rand_num;
int errors = 0;
X_size = 5000 * 400;
X = malloc(X_size * sizeof(float));
load_matrix("data/dataset.h5", "X", X);
y_size = 5000;
y = malloc(y_size * sizeof(float));
load_matrix("data/dataset.h5", "y", y);
/* choose 100 random images in X */
rand_num = malloc(100 * sizeof(int));
random_numbers(5000, 100, rand_num);
/* buffers for the forward pass (the intent was to test the 100 random digits, but the loop below goes through all 5000) */
test = malloc(401 * sizeof(float));
Theta1 = malloc(401 * 25 * sizeof(float));
Theta2 = malloc(26 * 10 * sizeof(float));
L2 = malloc(26 * sizeof(float));
L3 = malloc(10 * sizeof(float));
/* load the weights once: Theta1 (401x25, hidden layer) and Theta2 (26x10, output layer) */
load_matrix("data/weights.h5", "Theta1", Theta1);
load_matrix("data/weights.h5", "Theta2", Theta2);
for (n = 0; n < 5000; ++n) {
/* build the 401x1 input vector: bias unit + image n (pixels are strided by 5000 in X) */
test[0] = 1;
for (i = 0; i < 400; ++i) {
test[i+1] = X[n + i * 5000];
}
/* layer 2 is 26x1 (bias + 25 units): L2(1..25) = Theta1' * test */
cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, 25, 1, 401, 1.0, Theta1, 25, test, 1, 0.0, L2 + 1, 1);
/* sigmoid */
for(i = 1; i < 26; ++i) {
L2[i] = 1 / (1 + exp(-L2[i]));
}
/* add the bias unit */
L2[0] = 1;
/* Layer 3 */
/* output layer is 10x1: L3 = Theta2' * L2 (no sigmoid needed before the argmax) */
cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, 10, 1, 26, 1.0, Theta2, 10, L2, 1, 0.0, L3, 1);
/* index of the largest output activation */
max = 0;
for (i = 0; i < 10; ++i) {
if (L3[i] > L3[max]) max = i;
}
/* predicted class is max+1 in 1..10; the % 10 maps class 10 to the digit 0 */
if ((max + 1) % 10 != (int) y[n] % 10) {
printf("%4d - predicted: %d | digit is: %d\n", n, (max + 1) % 10, (int) y[n] % 10);
errors++;
}
}
printf("accuracy: %f\n", 100 * (5000 - (float) errors) / 5000);
free(X);
free(y);
free(test);
free(L2);
free(L3);
free(Theta1);
free(Theta2);
free(rand_num);
return 0;
}
I've put the MNIST dataset in HDF5, you can download it here: https://send.firefox.com/download/7f815eb602434217/#CjjlBzTawTx8CFGP_uGpxg (link will expire in 7 days)
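In case the link has expired: I don't remember exactly how I built that file, but the HDF5 "lite" API makes it a handful of calls. Here's a rough sketch; the dataset shapes and the element order are my assumptions, chosen so that load_matrix() and the X[n + i * 5000] indexing in main() read the data back correctly:
#include <hdf5.h>
#include <hdf5_hl.h>

/* Sketch only: write X and y so that the program above can read them back.
 * X_buf must hold pixel i of image n at X_buf[i * 5000 + n]; the 400x5000
 * shape is an assumption, not necessarily what the original file uses. */
int save_dataset(const char *filename, const float *X_buf, const float *y_buf)
{
    hsize_t x_dims[2] = {400, 5000};
    hsize_t y_dims[2] = {5000, 1};
    hid_t file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    if (file_id < 0) return -1;
    H5LTmake_dataset_float(file_id, "X", 2, x_dims, X_buf);
    H5LTmake_dataset_float(file_id, "y", 2, y_dims, y_buf);
    H5Fclose(file_id);
    return 0;
}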
gcc -Wall -pedantic fw_propagation.c -lhdf5 -lhdf5_hl -lgslcblas -ldl -lm
./a.out
... (list of errors)
accuracy: 97.519997
Not awful, but nothing to boast about. I found this code on an old hard disk and it doesn't seem to choose digits randomly. It's either bit rot or I got bored with it without ever finishing. I don't remember; I was surprised it still compiles.
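For what it's worth, the intent was clearly to run the forward pass only on the 100 sampled images. Here's an untested sketch of what that loop was probably meant to look like (k would be an extra int index declared with the others, and the accuracy denominator becomes 100):
/* untested sketch: drop-in replacement for the "for (n = 0; n < 5000; ++n)" loop above */
for (k = 0; k < 100; ++k) {
    n = rand_num[k];
    /* ... same forward propagation and error check as above ... */
}
printf("accuracy: %f\n", 100 * (100 - (float) errors) / 100);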