#include <iostream>
#include <math.h>
using namespace std;

// Timing routine
#ifdef _MSC_VER 
#include <sys\timeb.h>
#else
#include <sys/timeb.h>
#endif

// realtime() reads the system clock and returns the 
// time in milliseconds.
 
double realtime()
{
	struct timeb tp;
	ftime(&tp);
	return((double)(tp.time)*1000+(double)(tp.millitm));
}

//
// BLAS level 2 routine dgemm. This is the f2c translation of level 3 blas routine dgemm.f.
// Alternately one can use a pre-translated version from the 
// "cblas" package that is part of clapack. See www.netllib.org.
//
// Use of these routines requires an understanding of column and row major ordering
// of two dimensional arrays. 

/*  C := alpha*op( A )*op( B ) + beta*C, */

extern  "C"    int dgemm_(char* transa, char* transb, long* m, long* n, long* k,
double* alpha, double* a, long* lda, double* b, long* ldb,
double* beta, double* c, long* ldc, short f1, short f2);

//########################################################################
//                          Main  
//########################################################################

int main()
{
    long i; long j; long k;
    long Nsize;

    cout << " Enter Matrix Size " << endl;
    cin  >> Nsize;
//
//  Dynamically create an Nsize by Nsize matrix 
// 
    long M = Nsize; long N = Nsize;

    double* aData = new double[M*N];   // matrix data 
    double* bData = new double[M*N];
    double* cData = new double[M*N];

    double** A    = new double*[M];    // access arrays
    double** B    = new double*[M];
    double** C    = new double*[M];

    for(i = 0; i < M; i++) A[i] = &aData[i*N];
    for(i = 0; i < M; i++) B[i] = &bData[i*N];
    for(i = 0; i < M; i++) C[i] = &cData[i*N];
//
//  Initialize matrices with data 
//
    for(i = 0; i < M; i++)
    { 
    for(j = 0; j < N; j++)
    {
       C[i][j] = 0.0;
       A[i][j] = 1.0;
       B[i][j] = 1.0;
    }}

    // make A non-symmetric so we have a better test case

    A[1][N] = 0.0;

    double timeStart = realtime();

    char transa = 'T'; // Tronspose flag since A row major ordering
    char transb = 'T'; // Tronspose flag since B row major ordering
                       
    long K = M;
    double alpha = 1.0;
    double beta  = 0.0;
    short f1 = 1;
    short f2 = 1;

    /* dgemm: 

        C := alpha*op( A )*op( B ) + beta*C, 
 
        C will contain the result in column major order so the transpose
        of C will be the result in row major order 
    */

    dgemm_(&transa, &transb, &M, &N, &K,&alpha, aData, &M, bData, &M,
						   &beta, cData ,&M, f1, f2);

    double timeEnd  = realtime();

    cout << "Computation Done" << endl;
    cout << (timeEnd - timeStart)/1000 << " Seconds " << endl;

#ifdef ERR_CHECK
//
//  Verify that computation was performed correctly if this
//  routine is compiled with compiler define ERR_CHECK
//
    // data for alternate computation

    double* c2Data = new double[M*N];
    double** C2    = new double*[M];
    for(i = 0; i < M; i++) C2[i] = &c2Data[i*N];

    // alternate computation

    for(i = 0; i < M; i++)
    { 
    for(j = 0; j < N; j++)
    {
       C2[i][j] = 0.0;
       for(k = 0; k < N; k++)
       {
       C2[i][j] += A[i][k]*B[k][j];
       }
    }}

   // Evaluate difference. The result computed directly, C2, is compared 
   // with the the transpose of C, since dgemm returns results in column
   // major order.

   double maxDiff = 0.0;
   double diff;
   for(i = 0; i < M; i++)
   { 
   for(j = 0; j < N; j++)
   {
       // compare C2 to transpose of C 

       diff = fabs(C2[i][j] - C[j][i]);        
       maxDiff = (diff > maxDiff) ? diff : maxDiff;
   }}

    cout << "Maximum Difference : " << maxDiff << endl;
    delete [] c2Data; 
    delete [] C2; 

#endif

//
//  Clean up
//
    delete [] aData; delete [] bData; delete [] cData;
    delete [] A;     delete [] B;     delete [] C;    

    return 0;
}
