#include #include #include #include #include double gettime(void) { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + 1e-6 * tv.tv_usec; } void matfill(long N, double *mat, double val) { long i, j; for(i = 0; i < N; i ++){ for(j = 0; j < N; j ++){ mat[i * N + j] = val; } } } void matmul(long N, double *a, double *b, double *c) { /************************************************************************* *** Matrix-Matrix Multiplication *** *** (cache blocking,loop unrolling,OpenMP tasks,Strassen algorithm) *** *** *** *** HP-SEE Computing Challenge *** *** http://www.hp-see.eu/hp-see-computing-challenge *** *** *** *** (C) Alexandros Sokratis Papadakis 2012 *** *** papadakis@ceid.upatras.gr *** *************************************************************************/ /******************************************************* * internal constants * *******************************************************/ #define NUM_THREADS 8 #define BLOCK_SIZE 50 //IMPORTANT: if BLOCK_SIZE!=50, correct the loop unrolling of core_mul,core_add,core_sub function #define THRESHOLD_MATRIX 7000 int THRESHOLD_MULT; /******************************************************* * internal functions * *******************************************************/ void matmul2(long N, double *X, double *Y, double *Z) { // S=X*Y classic // X,Y square matrices of size N int i,j,k; int tol1=N/NUM_THREADS; int thread; double *b_t=(double *)malloc(N*N*sizeof(double)); if(b_t==NULL){ printf("\nnot enough memory\n"); exit(1); } // save matrix b as b-transpose to perform better dot products (C language: row wise memory) for(thread=0;thread= 0; i--) free(P[i]); } /******************************************************* * function's code * *******************************************************/ #pragma omp parallel num_threads(NUM_THREADS) { #pragma omp single { if(N<=THRESHOLD_MATRIX) THRESHOLD_MULT=180; else THRESHOLD_MULT=1200; matmul0(N, a, b, c); } } } int main(int argc, char **argv) { long N; double *A, *B, *C, t; N = atoi(argv[1]); A = (double *) malloc(N * N * sizeof(double)); B = (double *) malloc(N * N * sizeof(double)); C = (double *) malloc(N * N * sizeof(double)); matfill(N, A, 1.0); matfill(N, B, 2.0); matfill(N, C, 0.0); t = gettime(); matmul(N, A, B, C); t = gettime() - t; fprintf(stdout, "%ld\t%le\t%le\n", N, t, (2 * N - 1) * N * N / t); fflush(stdout); free(A); free(B); free(C); return EXIT_SUCCESS; }