#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

/*
 * Executes the x86 CPUID instruction with EAX = 4 and ECX = id and derives
 * a cache line size from the registers it returns.
 *
 * id: value loaded into ECX before CPUID runs; it selects which cache
 *     sub-leaf is queried.
 *
 * Returns ( EBX & 0xFFF ) + 1. Per Intel's documented layout for CPUID
 * leaf 4, bits 11:0 of EBX hold the coherency line size minus one, so this
 * is the line size in bytes.
 *
 * If the low five bits of EAX (the cache type field) are zero, the function
 * returns -1; because the return type is unsigned, callers observe UINT_MAX.
 *
 * NOTE(review): relies on GCC/Clang extended asm and an x86 target; the code
 * does not check that the CPU actually implements leaf 4 -- confirm for the
 * processors this benchmark is run on.
 */
unsigned int get_cache_line_size ( int id ) {
    uint32_t eax = 4;
    uint32_t ebx;
    uint32_t ecx = id;
    uint32_t edx;
    
    /*
     * EAX and ECX are read-write operands ("+"): they carry the leaf and
     * sub-leaf in and are overwritten with results. EBX and EDX are
     * write-only outputs; EDX is captured but not used below.
     */
    __asm__ (
        "cpuid":
        "+a" (eax),
        "=b" (ebx),
        "+c" (ecx),
        "=d" (edx)
    );
    
    /* Low five bits of EAX: cache type field. */
    int cache_type = eax & 0x1F;
    
    if ( cache_type == 0 ) {
        /* Converted to UINT_MAX by the unsigned return type. */
        return -1;
    }
    
    /* Low twelve bits of EBX store the line size minus one. */
    return ( ebx & 0xFFF ) + 1;
}

/*
 * Reads a matrix from a text stream.
 *
 * The stream must contain two integers (row count, then column count)
 * followed by rows * columns integers in row-major order, all separated by
 * whitespace.
 *
 * input_file: stream opened for reading, positioned at the dimensions.
 * rows:       receives the row count read from the stream.
 * columns:    receives the column count read from the stream.
 *
 * Returns a malloc'd row-major array of (*rows) * (*columns) ints. The
 * caller owns the buffer and must free() it.
 *
 * Terminates the process with exit status 4 when the dimensions or any
 * element are missing or malformed, or when a dimension is negative, and
 * with exit status 3 when the buffer cannot be allocated.
 */
int* read_matrix ( FILE *input_file, int *rows, int *columns ) {
    if ( fscanf ( input_file, "%d %d", rows, columns ) != 2 ) {
        fprintf ( stderr, "read_matrix: missing or malformed dimensions\n" );
        exit ( 4 );
    }

    if ( *rows < 0 || *columns < 0 ) {
        fprintf ( stderr, "read_matrix: negative dimension (%d x %d)\n", *rows, *columns );
        exit ( 4 );
    }

    size_t row_count    = (size_t) *rows;
    size_t column_count = (size_t) *columns;

    /* Reject sizes whose byte count would not fit in size_t. */
    if ( column_count != 0 && row_count > SIZE_MAX / sizeof ( int ) / column_count ) {
        fprintf ( stderr, "read_matrix: matrix too large (%d x %d)\n", *rows, *columns );
        exit ( 3 );
    }

    size_t element_count = row_count * column_count;

    /*
     * malloc ( 0 ) may legitimately return NULL; always request at least one
     * element so an empty matrix is not mistaken for an allocation failure.
     */
    int *matrix = malloc ( ( element_count != 0 ? element_count : 1 ) * sizeof *matrix );

    if ( matrix == NULL ) {
        fprintf ( stderr, "read_matrix: out of memory\n" );
        exit ( 3 );
    }

    /* Elements are stored row-major, so a single flat pass fills the buffer. */
    for ( size_t i = 0; i < element_count; ++i ) {
        if ( fscanf ( input_file, "%d", &matrix[i] ) != 1 ) {
            fprintf ( stderr, "read_matrix: missing or malformed element %zu\n", i );
            free ( matrix );
            exit ( 4 );
        }
    }

    return matrix;
}

/*
 * Naive matrix product using the i-j-k loop order.
 *
 * a: row-major matrix with a_rows rows and a_columns columns.
 * b: row-major matrix with b_columns columns (b_rows is not consulted).
 * c: row-major a_rows x b_columns matrix; every product term is ADDED to the
 *    value already stored there, so the caller decides the starting contents.
 */
void multiply_matrices_v0 ( int *a, int a_rows, int a_columns, int *b, int b_rows, int b_columns, int *c ) {
    for ( unsigned int row = 0; row < a_rows; ++row ) {
        /* Offsets of the current row inside a and c. */
        const unsigned int a_base = row * a_columns;
        const unsigned int c_base = row * b_columns;

        for ( unsigned int col = 0; col < b_columns; ++col ) {
            const unsigned int c_index = c_base + col;

            /* Walk along row `row` of a and down column `col` of b. */
            for ( unsigned int inner = 0; inner < a_columns; ++inner ) {
                c[c_index] += a[a_base + inner] * b[inner * b_columns + col];
            }
        }
    }
}

/*
 * Matrix product using the i-k-j loop order, so the innermost loop moves
 * through consecutive elements of one row of b and one row of c.
 *
 * a: row-major matrix with a_rows rows and a_columns columns.
 * b: row-major matrix with b_columns columns (b_rows is not consulted).
 * c: row-major a_rows x b_columns matrix; every product term is ADDED to the
 *    value already stored there, so the caller decides the starting contents.
 */
void multiply_matrices_v1 ( int *a, int a_rows, int a_columns, int *b, int b_rows, int b_columns, int *c ) {
    for ( unsigned int row = 0; row < a_rows; ++row ) {
        /* Offsets of the current row inside a and c. */
        const unsigned int a_base = row * a_columns;
        const unsigned int c_base = row * b_columns;

        for ( unsigned int inner = 0; inner < a_columns; ++inner ) {
            const unsigned int a_index = a_base + inner;
            const unsigned int b_base  = inner * b_columns;

            /* Scale row `inner` of b by a[row][inner] and add it to row `row` of c. */
            for ( unsigned int col = 0; col < b_columns; ++col ) {
                c[c_base + col] += a[a_index] * b[b_base + col];
            }
        }
    }
}

/*
 * Tiled matrix product: the row (i) and column (j) dimensions are processed
 * in tiles of `stride` elements, where stride is the number of ints that fit
 * in the line size reported by get_cache_line_size ( 0 ).
 *
 * a: row-major matrix with a_rows rows and a_columns columns.
 * b: row-major matrix with b_columns columns (b_rows is not consulted).
 * c: row-major a_rows x b_columns matrix; every product term is ADDED to the
 *    value already stored there, so the caller decides the starting contents.
 *
 * Does nothing when any of a_rows, a_columns or b_columns is not positive.
 */
void multiply_matrices_v2 ( int *a, int a_rows, int a_columns, int *b, int b_rows, int b_columns, int *c ) {
    enum { FALLBACK_LINE_BYTES = 64 };

    if ( a_rows <= 0 || a_columns <= 0 || b_columns <= 0 ) {
        return;
    }

    /*
     * get_cache_line_size reports failure as -1 converted to unsigned; use a
     * fixed tile size in that case. The tile size only affects traversal
     * order, not the values accumulated into c.
     */
    unsigned int line_bytes = get_cache_line_size ( 0 );

    if ( line_bytes == (unsigned int) -1 ) {
        line_bytes = FALLBACK_LINE_BYTES;
    }

    size_t stride = line_bytes / sizeof ( int );

    /* A zero stride would never advance the tile loops. */
    if ( stride == 0 ) {
        stride = 1;
    }

    const size_t row_count    = (size_t) a_rows;
    const size_t inner_count  = (size_t) a_columns;
    const size_t column_count = (size_t) b_columns;

    for ( size_t i = 0; i < row_count; i += stride ) {
        /* Clamp the tile to the last row so a partial tile stays in bounds. */
        const size_t i_end = ( row_count - i < stride ) ? row_count : i + stride;

        for ( size_t k = 0; k < inner_count; ++k ) {
            for ( size_t j = 0; j < column_count; j += stride ) {
                /* Same clamping for the last, possibly partial, column tile. */
                const size_t j_end = ( column_count - j < stride ) ? column_count : j + stride;

                for ( size_t m = i; m < i_end; ++m ) {
                    for ( size_t l = j; l < j_end; ++l ) {
                        c[m * column_count + l] += a[m * inner_count + k] * b[k * column_count + l];
                    }
                }
            }
        }
    }
}

int main ( int argc, char **argv ) {
    int type       = atoi ( argv[1] );
    int iterations = atoi ( argv[2] );
    
    FILE *a_matrix_file = fopen ( argv[3], "r" );
    FILE *b_matrix_file = fopen ( argv[4], "r" );
    
    if ( a_matrix_file == NULL || b_matrix_file == NULL ) {
        exit ( 1 );
    }
    
    int a_rows = 0, a_columns = 0;
    int b_rows = 0, b_columns = 0;
    
    int *a = read_matrix ( a_matrix_file, &a_rows, &a_columns );
    int *b = read_matrix ( b_matrix_file, &b_rows, &b_columns );
    
    int c_rows    = a_rows;
    int c_columns = b_columns;
    int *c = calloc ( c_rows * c_columns, sizeof ( int ) );
    
    if ( c == NULL ) {
        exit ( 2 );
    }
    
    clock_t start = clock ( );
    for ( int i = 0; i < iterations; ++i ) {
        switch ( type ) {
            case 0: {
                multiply_matrices_v0 ( a, a_rows, a_columns, b, b_rows, b_columns, c );
                break;
            }
            case 1: {
                multiply_matrices_v1 ( a, a_rows, a_columns, b, b_rows, b_columns, c );
                break;
            }
            case 2: {
                multiply_matrices_v2 ( a, a_rows, a_columns, b, b_rows, b_columns, c );
                break;
            }
        }
    }
    
    clock_t end = clock ( );
    
    double elapsed_in_ms = ( end - start ) * 1. / CLOCKS_PER_SEC * 1000;
    
    printf ( "ELAPSED (ms) = %lf\n", elapsed_in_ms );
    
    fclose ( a_matrix_file );
    fclose ( b_matrix_file );
    free ( a );
    free ( b );
    free ( c );
    
    return 0;
}