#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

/*
 * Ask the CPU, via the x86 CPUID instruction (leaf 4), about the cache
 * selected by sub-leaf `id` and return that cache's line size in bytes.
 *
 * id       value loaded into ECX before CPUID executes (cache sub-leaf index)
 *
 * returns  ( EBX[11:0] + 1 ) when the cache-type field EAX[4:0] is non-zero;
 *          otherwise -1 converted to unsigned int (i.e. UINT_MAX), which
 *          callers must treat as "no such cache".
 */
unsigned int get_cache_line_size ( int id ) {
    uint32_t reg_a = 4;     /* CPUID leaf number */
    uint32_t reg_b;
    uint32_t reg_c = id;    /* sub-leaf: which cache to describe */
    uint32_t reg_d;

    /* EAX and ECX are both inputs and outputs; EBX and EDX are outputs only. */
    __asm__ (
        "cpuid"
        : "+a" (reg_a), "=b" (reg_b), "+c" (reg_c), "=d" (reg_d)
    );

    /* The low five bits of EAX carry the cache type; zero means "none". */
    const int type_field = reg_a & 0x1F;

    /* The low twelve bits of EBX hold the line size minus one. */
    const unsigned int line_bytes = ( reg_b & 0xFFF ) + 1;

    return ( type_field != 0 ) ? line_bytes : (unsigned int) -1;
}

/*
 * Read a whitespace-separated integer matrix from `input_file`.
 *
 * The stream must contain the row count, the column count, and then
 * rows * columns integers in row-major order.
 *
 * input_file  open, readable stream positioned at the row count
 * rows        out: number of rows read from the stream
 * columns     out: number of columns read from the stream
 *
 * returns     a malloc'ed array of (*rows) * (*columns) ints in row-major
 *             order; the caller owns it and must free() it.
 *
 * This function never returns NULL. It terminates the process instead:
 *   exit ( 3 )  the matrix cannot be allocated
 *   exit ( 4 )  the dimensions or a cell cannot be parsed, or a dimension
 *               is not positive
 */
int* read_matrix ( FILE *input_file, int *rows, int *columns ) {
    /* fscanf reports how many items it converted; anything short of both
     * dimensions would leave *rows / *columns unusable. */
    if ( fscanf ( input_file, "%d %d", rows, columns ) != 2 ) {
        fprintf ( stderr, "read_matrix: cannot read matrix dimensions\n" );
        exit ( 4 );
    }

    if ( *rows <= 0 || *columns <= 0 ) {
        fprintf ( stderr, "read_matrix: invalid dimensions %d x %d\n", *rows, *columns );
        exit ( 4 );
    }

    const size_t row_count    = (size_t) *rows;
    const size_t column_count = (size_t) *columns;

    /* Do the size arithmetic in size_t and refuse sizes that cannot be
     * represented, instead of overflowing a signed int product. */
    if ( row_count > SIZE_MAX / sizeof ( int ) / column_count ) {
        exit ( 3 );
    }

    const size_t cell_count = row_count * column_count;

    int *matrix = malloc ( cell_count * sizeof *matrix );

    if ( matrix == NULL ) {
        exit ( 3 );
    }

    /* Row-major storage means the cells arrive in plain index order. */
    for ( size_t k = 0; k < cell_count; ++k ) {
        if ( fscanf ( input_file, "%d", &matrix[k] ) != 1 ) {
            fprintf ( stderr, "read_matrix: cannot read cell %zu of %zu\n", k, cell_count );
            free ( matrix );
            exit ( 4 );
        }
    }

    return matrix;
}

/*
 * Baseline kernel: apply the 5-point Laplace stencil
 *     next = up + down + left + right - 4 * center
 * to every interior cell, walking the matrix row by row.
 *
 * previous  input matrix, rows x columns ints in row-major order (only read)
 * next      output matrix of the same shape; only interior cells are
 *           written, the outermost rows and columns are left untouched
 * rows      number of rows
 * columns   number of columns
 *
 * A matrix with fewer than 3 rows or fewer than 3 columns has no interior
 * cell, so nothing is written.
 */
void laplace_v0 ( int *previous, int* next, int rows, int columns ) {
    /* Reject degenerate shapes up front: a dimension <= 0 would otherwise
     * turn "dimension - 1" into a huge unsigned loop bound. */
    if ( rows < 3 || columns < 3 ) {
        return;
    }

    /* size_t indices cannot wrap for any matrix that fits in memory. */
    const size_t width       = (size_t) columns;
    const size_t last_row    = (size_t) rows - 1;
    const size_t last_column = width - 1;

    for ( size_t i = 1; i < last_row; ++i ) {
        for ( size_t j = 1; j < last_column; ++j ) {
            const size_t center = i * width + j;
            const size_t up     = center - width;
            const size_t down   = center + width;
            const size_t left   = center - 1;
            const size_t right  = center + 1;

            next[center] = previous[up] + previous[down]
                         + previous[left] + previous[right]
                         - 4 * previous[center];
        }
    }
}

/*
 * Tiled kernel: same 5-point Laplace stencil as the row-by-row version
 *     next = up + down + left + right - 4 * center
 * but the interior is visited in square tiles. The tile edge, in cells, is
 * get_cache_line_size ( 0 ) / sizeof ( int ).
 *
 * previous  input matrix, rows x columns ints in row-major order (only read)
 * next      output matrix of the same shape; only interior cells are
 *           written, the outermost rows and columns are left untouched
 * rows      number of rows
 * columns   number of columns
 *
 * A matrix with fewer than 3 rows or fewer than 3 columns has no interior
 * cell, so nothing is written.
 */
void laplace_v1 ( int *previous, int* next, int rows, int columns ) {
    /* Reject degenerate shapes up front: a dimension <= 0 would otherwise
     * turn "dimension - 1" into a huge unsigned loop bound. */
    if ( rows < 3 || columns < 3 ) {
        return;
    }

    /* When get_cache_line_size reports its (unsigned) -1 sentinel the stride
     * is simply enormous and the whole interior becomes one tile. A reported
     * size below sizeof ( int ) would give 0 and never advance, so clamp. */
    size_t stride = get_cache_line_size ( 0 ) / sizeof ( int );
    if ( stride == 0 ) {
        stride = 1;
    }

    const size_t width      = (size_t) columns;
    const size_t row_end    = (size_t) rows - 1;     /* one past last interior row    */
    const size_t column_end = width - 1;             /* one past last interior column */

    for ( size_t i_block = 1; i_block < row_end; i_block += stride ) {
        /* Clip the tile to the interior without computing an overflowing sum. */
        const size_t i_limit = ( row_end - i_block < stride ) ? row_end : i_block + stride;

        for ( size_t j_block = 1; j_block < column_end; j_block += stride ) {
            const size_t j_limit = ( column_end - j_block < stride ) ? column_end : j_block + stride;

            for ( size_t i = i_block; i < i_limit; ++i ) {
                for ( size_t j = j_block; j < j_limit; ++j ) {
                    const size_t center = i * width + j;
                    const size_t up     = center - width;
                    const size_t down   = center + width;
                    const size_t left   = center - 1;
                    const size_t right  = center + 1;

                    next[center] = previous[up] + previous[down]
                                 + previous[left] + previous[right]
                                 - 4 * previous[center];
                }
            }
        }
    }
}

/*
 * Compute the 5-point Laplace stencil for the single cell ( i, j ):
 *     next = up + down + left + right - 4 * center
 * The caller guarantees that ( i, j ) is an interior cell.
 */
static inline void laplace_v2_cell ( const int *previous, int *next, size_t width, size_t i, size_t j ) {
    const size_t center = i * width + j;

    next[center] = previous[center - width] + previous[center + width]
                 + previous[center - 1] + previous[center + 1]
                 - 4 * previous[center];
}

/*
 * Tiled and unrolled kernel: the interior is visited in square tiles whose
 * edge, in cells, is get_cache_line_size ( 0 ) / sizeof ( int ), and inside a
 * tile each row is processed eight cells per loop iteration.
 *
 * previous  input matrix, rows x columns ints in row-major order (only read)
 * next      output matrix of the same shape; only interior cells are
 *           written, the outermost rows and columns are left untouched
 * rows      number of rows
 * columns   number of columns
 *
 * A matrix with fewer than 3 rows or fewer than 3 columns has no interior
 * cell, so nothing is written.
 */
void laplace_v2 ( int *previous, int* next, int rows, int columns ) {
    enum { LAPLACE_V2_UNROLL = 8 };

    /* Reject degenerate shapes up front: a dimension <= 0 would otherwise
     * turn "dimension - 1" into a huge unsigned loop bound. */
    if ( rows < 3 || columns < 3 ) {
        return;
    }

    /* When get_cache_line_size reports its (unsigned) -1 sentinel the stride
     * is simply enormous and the whole interior becomes one tile. A reported
     * size below sizeof ( int ) would give 0 and never advance, so clamp. */
    size_t stride = get_cache_line_size ( 0 ) / sizeof ( int );
    if ( stride == 0 ) {
        stride = 1;
    }

    const size_t width      = (size_t) columns;
    const size_t row_end    = (size_t) rows - 1;     /* one past last interior row    */
    const size_t column_end = width - 1;             /* one past last interior column */

    for ( size_t i_block = 1; i_block < row_end; i_block += stride ) {
        /* Clip the tile to the interior without computing an overflowing sum. */
        const size_t i_limit = ( row_end - i_block < stride ) ? row_end : i_block + stride;

        for ( size_t j_block = 1; j_block < column_end; j_block += stride ) {
            const size_t j_limit = ( column_end - j_block < stride ) ? column_end : j_block + stride;

            for ( size_t i = i_block; i < i_limit; ++i ) {
                size_t j = j_block;

                /* Unrolled part: only entered while a full group of eight
                 * cells still fits inside the tile, so no cell past the last
                 * interior column is ever read or written. */
                for ( ; j_limit - j >= LAPLACE_V2_UNROLL; j += LAPLACE_V2_UNROLL ) {
                    laplace_v2_cell ( previous, next, width, i, j + 0 );
                    laplace_v2_cell ( previous, next, width, i, j + 1 );
                    laplace_v2_cell ( previous, next, width, i, j + 2 );
                    laplace_v2_cell ( previous, next, width, i, j + 3 );
                    laplace_v2_cell ( previous, next, width, i, j + 4 );
                    laplace_v2_cell ( previous, next, width, i, j + 5 );
                    laplace_v2_cell ( previous, next, width, i, j + 6 );
                    laplace_v2_cell ( previous, next, width, i, j + 7 );
                }

                /* Remainder: the fewer-than-eight cells left in this row of
                 * the tile, one at a time. */
                for ( ; j < j_limit; ++j ) {
                    laplace_v2_cell ( previous, next, width, i, j );
                }
            }
        }
    }
}


/*
 * Benchmark driver.
 *
 * Usage: <program> <type> <iterations> <matrix file>
 *   type         which kernel to time: 0 = laplace_v0, 1 = laplace_v1,
 *                2 = laplace_v2
 *   iterations   how many times the kernel is invoked; a value <= 0 runs it
 *                zero times
 *   matrix file  text file in the format read_matrix expects
 *
 * Prints the CPU time spent in the kernel loop, in milliseconds.
 *
 * Exit status:
 *   0  success
 *   1  the matrix file cannot be opened
 *   2  the output matrix cannot be allocated
 *   4  the matrix dimensions are not positive
 *   5  bad command line
 */
int main ( int argc, char **argv ) {
    if ( argc < 4 ) {
        fprintf ( stderr, "usage: %s <type: 0|1|2> <iterations> <matrix file>\n",
                  ( argc > 0 && argv[0] != NULL ) ? argv[0] : "laplace" );
        exit ( 5 );
    }

    char *end = NULL;

    /* strtol, unlike atoi, lets us tell "0" apart from "not a number". */
    const long type = strtol ( argv[1], &end, 10 );
    if ( end == argv[1] || *end != '\0' || type < 0 || type > 2 ) {
        fprintf ( stderr, "invalid type '%s' (expected 0, 1 or 2)\n", argv[1] );
        exit ( 5 );
    }

    const long iterations = strtol ( argv[2], &end, 10 );
    if ( end == argv[2] || *end != '\0' ) {
        fprintf ( stderr, "invalid iteration count '%s'\n", argv[2] );
        exit ( 5 );
    }

    FILE *matrix_file = fopen ( argv[3], "r" );

    if ( matrix_file == NULL ) {
        perror ( argv[3] );
        exit ( 1 );
    }

    int rows    = 0;
    int columns = 0;

    int *previous = read_matrix ( matrix_file, &rows, &columns );

    fclose ( matrix_file );

    if ( rows <= 0 || columns <= 0 ) {
        fprintf ( stderr, "invalid matrix dimensions %d x %d\n", rows, columns );
        exit ( 4 );
    }

    /* Size arithmetic is done in size_t so the product cannot overflow int. */
    if ( (size_t) rows > SIZE_MAX / sizeof ( int ) / (size_t) columns ) {
        exit ( 2 );
    }

    /* The kernels never write the outer border of `next`; calloc keeps those
     * cells at a defined value instead of leaving them uninitialised. */
    int *next = calloc ( (size_t) rows * (size_t) columns, sizeof *next );

    if ( next == NULL ) {
        exit ( 2 );
    }

    /* Every iteration recomputes `next` from the same `previous`; the buffers
     * are not swapped, so the loop times the kernel on fixed input. */
    clock_t start = clock ( );
    for ( long i = 0; i < iterations; ++i ) {
        switch ( type ) {
            case 0: {
                laplace_v0 ( previous, next, rows, columns );
                break;
            }
            case 1: {
                laplace_v1 ( previous, next, rows, columns );
                break;
            }
            case 2: {
                laplace_v2 ( previous, next, rows, columns );
                break;
            }
            default: {
                /* Unreachable: `type` was validated above. */
                break;
            }
        }
    }
    clock_t end_time = clock ( );

    double elapsed_in_ms = ( end_time - start ) * 1. / CLOCKS_PER_SEC * 1000;

    printf ( "ELAPSED (ms) = %lf\n", elapsed_in_ms );

    free ( previous );
    free ( next );

    return 0;
}