#include "atlas_misc.h"

void ATL_USERMM
   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
 * lda=0, ldb=0, ldc=0, mu=4, nu=4, ku=6
 *
 * Accumulates products of A and B into C, one 4x4 tile of C at a time,
 * walking the K dimension six elements per step.  For every processed
 * (m, n) the routine performs
 *
 *    C[n*ldc + m] += sum over k of A[m*lda + k] * B[n*ldb + k]
 *
 * Parameters:
 *    M, N, K   problem dimensions.  Only the leading Mb = 4*(M/4) values of
 *              m, Nb = 4*(N/4) values of n and Kb = 6*(K/6) values of k are
 *              processed; any remainder is ignored by this routine
 *              (presumably handled by the caller -- confirm).
 *    alpha     not referenced; the product is used unscaled.
 *    A, lda    A operand; consecutive k are contiguous, consecutive m are
 *              lda elements apart.
 *    B, ldb    B operand; consecutive k are contiguous, consecutive n are
 *              ldb elements apart.
 *    beta      not referenced; the existing contents of C are used unscaled.
 *    C, ldc    result, updated in place; consecutive m are contiguous,
 *              consecutive n are ldc elements apart.
 *
 * If there is not at least one complete 4x4x6 block (Mb, Nb or Kb is not
 * positive) the routine returns without reading or writing any matrix.
 *
 * Register naming: rAk_m holds A for K-offset k and tile row m, rBk_n holds
 * B for K-offset k and tile column n, rCm_n holds the accumulator for tile
 * row m and tile column n.
 */
{
   const int Mb = (M>>2)<<2;              /* M rounded down to a multiple of 4 */
   const int Nb = (N>>2)<<2;              /* N rounded down to a multiple of 4 */
   const int Kb = (K/6)*6;                /* K rounded down to a multiple of 6 */
   const TYPE *stM = A + (lda*Mb);        /* value of pA0 that ends the M-loop */
   const TYPE *stN = B + (ldb*Nb);        /* value of pB0 that ends the N-loop */
   /* incAk/incBk are not referenced below; the K loop uses the literal 6. */
   #define incAk 6
   /* incAm: after Kb elements were consumed, move to the next 4 rows of A.
      incAn: rewind A to its start for the next group of 4 columns.        */
   const int incAm = ((((lda) << 2)) - Kb), incAn = -(Mb*lda);
   #define incBk 6
   /* incBm: rewind B by the Kb elements consumed so the same 4 columns are
      reused for the next tile row.  incBn: advance to the next 4 columns. */
   const int incBm = -(Kb), incBn = (((ldb) << 2));
   #define incCm 4
   /* incCn: step 4 columns forward and undo the Mb elements walked in M. */
   const int incCn = (((ldc) << 2)) - (Mb);
   TYPE *pC0=C, *pC1=pC0+(ldc), *pC2=pC1+(ldc), *pC3=pC2+(ldc);
   const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda);
   const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb);
   register int k;
   register TYPE rA0_0, rA1_0, rA2_0, rA3_0, rA4_0, rA5_0,
                 rA0_1, rA1_1, rA2_1, rA3_1, rA4_1, rA5_1,
                 rA0_2, rA1_2, rA2_2, rA3_2, rA4_2, rA5_2,
                 rA0_3, rA1_3, rA2_3, rA3_3, rA4_3, rA5_3;
   register TYPE rB0_0, rB1_0, rB2_0, rB3_0, rB4_0, rB5_0,
                 rB0_1, rB1_1, rB2_1, rB3_1, rB4_1, rB5_1,
                 rB0_2, rB1_2, rB2_2, rB3_2, rB4_2, rB5_2,
                 rB0_3, rB1_3, rB2_3, rB3_3, rB4_3, rB5_3;
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0,
                 rC0_1, rC1_1, rC2_1, rC3_1,
                 rC0_2, rC1_2, rC2_2, rC3_2,
                 rC0_3, rC1_3, rC2_3, rC3_3;

   /*
    * Without at least one full 4x4x6 block the loops below cannot terminate
    * correctly: the do/while loops compare against stM/stN only after a
    * full tile step, and the k counter would start at -6 and never hit 0.
    */
   if (Mb <= 0 || Nb <= 0 || Kb <= 0) return;

   do /* N-loop */
   {
      do /* M-loop */
      {
         /* Load the current 4x4 tile of C into the accumulators. */
         rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
         rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];
         rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];
         rC0_3 = *pC3; rC1_3 = pC3[1]; rC2_3 = pC3[2]; rC3_3 = pC3[3];

         /* Preload A and B for K-offsets 0..2 of the first 6-step. */
         rA0_0 = *pA0;   rA0_1 = *pA1;   rA0_2 = *pA2;   rA0_3 = *pA3;
         rA1_0 = pA0[1]; rA1_1 = pA1[1]; rA1_2 = pA2[1]; rA1_3 = pA3[1];
         rA2_0 = pA0[2]; rA2_1 = pA1[2]; rA2_2 = pA2[2]; rA2_3 = pA3[2];

         rB0_0 = *pB0;   rB0_1 = *pB1;   rB0_2 = *pB2;   rB0_3 = *pB3;
         rB1_0 = pB0[1]; rB1_1 = pB1[1]; rB1_2 = pB2[1]; rB1_3 = pB3[1];
         rB2_0 = pB0[2]; rB2_1 = pB1[2]; rB2_2 = pB2[2]; rB2_3 = pB3[2];

         /*
          * All 6-steps except the last.  While offsets 0..2 are multiplied,
          * offsets 3..5 are loaded; while offsets 3..5 are multiplied,
          * offsets 0..2 of the NEXT 6-step are loaded.
          */
         for (k=Kb-6; k; k -= 6) /* easy loop to unroll */
         {
            /* K-offset 0; loads offset 3 */
            rC0_0 += rA0_0 * rB0_0; rB3_0 = pB0[3];
            rC1_0 += rA0_1 * rB0_0;
            rC2_0 += rA0_2 * rB0_0; rA3_0 = pA0[3];
            rC3_0 += rA0_3 * rB0_0;
            rC0_1 += rA0_0 * rB0_1; rA3_1 = pA1[3];
            rC1_1 += rA0_1 * rB0_1;
            rC2_1 += rA0_2 * rB0_1; rA3_2 = pA2[3];
            rC3_1 += rA0_3 * rB0_1;
            rC0_2 += rA0_0 * rB0_2; rA3_3 = pA3[3];
            rC1_2 += rA0_1 * rB0_2;
            rC2_2 += rA0_2 * rB0_2; rB3_1 = pB1[3];
            rC3_2 += rA0_3 * rB0_2;
            rC0_3 += rA0_0 * rB0_3; rB3_2 = pB2[3];
            rC1_3 += rA0_1 * rB0_3;
            rC2_3 += rA0_2 * rB0_3; rB3_3 = pB3[3];
            rC3_3 += rA0_3 * rB0_3;

            /* K-offset 1; loads offset 4 */
            rC0_0 += rA1_0 * rB1_0; rB4_0 = pB0[4];
            rC1_0 += rA1_1 * rB1_0;
            rC2_0 += rA1_2 * rB1_0; rA4_2 = pA2[4];
            rC3_0 += rA1_3 * rB1_0;
            rC0_1 += rA1_0 * rB1_1; rA4_3 = pA3[4];
            rC1_1 += rA1_1 * rB1_1;
            rC2_1 += rA1_2 * rB1_1; rA4_0 = pA0[4];
            rC3_1 += rA1_3 * rB1_1;
            rC0_2 += rA1_0 * rB1_2; rA4_1 = pA1[4];
            rC1_2 += rA1_1 * rB1_2;
            rC2_2 += rA1_2 * rB1_2; rB4_1 = pB1[4];
            rC3_2 += rA1_3 * rB1_2;
            rC0_3 += rA1_0 * rB1_3; rB4_2 = pB2[4];
            rC1_3 += rA1_1 * rB1_3;
            rC2_3 += rA1_2 * rB1_3; rB4_3 = pB3[4];
            rC3_3 += rA1_3 * rB1_3;

            /* K-offset 2; loads offset 5, then advances every A/B pointer */
            rC0_0 += rA2_0 * rB2_0; rB5_0 = pB0[5]; pB0 += 6;
            rC1_0 += rA2_1 * rB2_0;
            rC2_0 += rA2_2 * rB2_0; rA5_0 = pA0[5]; pA0 += 6;
            rC3_0 += rA2_3 * rB2_0;
            rC0_1 += rA2_0 * rB2_1; rA5_1 = pA1[5]; pA1 += 6;
            rC1_1 += rA2_1 * rB2_1;
            rC2_1 += rA2_2 * rB2_1; rA5_2 = pA2[5]; pA2 += 6;
            rC3_1 += rA2_3 * rB2_1;
            rC0_2 += rA2_0 * rB2_2; rA5_3 = pA3[5]; pA3 += 6;
            rC1_2 += rA2_1 * rB2_2;
            rC2_2 += rA2_2 * rB2_2; rB5_1 = pB1[5]; pB1 += 6;
            rC3_2 += rA2_3 * rB2_2;
            rC0_3 += rA2_0 * rB2_3; rB5_2 = pB2[5]; pB2 += 6;
            rC1_3 += rA2_1 * rB2_3;
            rC2_3 += rA2_2 * rB2_3; rB5_3 = pB3[5]; pB3 += 6;
            rC3_3 += rA2_3 * rB2_3;

            /* K-offset 3; loads offset 0 of the next 6-step */
            rC0_0 += rA3_0 * rB3_0; rB0_0 = *pB0;
            rC1_0 += rA3_1 * rB3_0;
            rC2_0 += rA3_2 * rB3_0; rA0_0 = *pA0;
            rC3_0 += rA3_3 * rB3_0;
            rC0_1 += rA3_0 * rB3_1; rA0_1 = *pA1;
            rC1_1 += rA3_1 * rB3_1;
            rC2_1 += rA3_2 * rB3_1; rA0_2 = *pA2;
            rC3_1 += rA3_3 * rB3_1;
            rC0_2 += rA3_0 * rB3_2; rA0_3 = *pA3;
            rC1_2 += rA3_1 * rB3_2;
            rC2_2 += rA3_2 * rB3_2; rB0_1 = *pB1;
            rC3_2 += rA3_3 * rB3_2;
            rC0_3 += rA3_0 * rB3_3; rB0_2 = *pB2;
            rC1_3 += rA3_1 * rB3_3;
            rC2_3 += rA3_2 * rB3_3; rB0_3 = *pB3;
            rC3_3 += rA3_3 * rB3_3;

            /* K-offset 4; loads offset 1 of the next 6-step */
            rC0_0 += rA4_0 * rB4_0; rB1_0 = pB0[1];
            rC1_0 += rA4_1 * rB4_0;
            rC2_0 += rA4_2 * rB4_0; rA1_0 = pA0[1];
            rC3_0 += rA4_3 * rB4_0;
            rC0_1 += rA4_0 * rB4_1; rA1_1 = pA1[1];
            rC1_1 += rA4_1 * rB4_1;
            rC2_1 += rA4_2 * rB4_1; rA1_2 = pA2[1];
            rC3_1 += rA4_3 * rB4_1;
            rC0_2 += rA4_0 * rB4_2; rA1_3 = pA3[1];
            rC1_2 += rA4_1 * rB4_2;
            rC2_2 += rA4_2 * rB4_2; rB1_1 = pB1[1];
            rC3_2 += rA4_3 * rB4_2;
            rC0_3 += rA4_0 * rB4_3; rB1_2 = pB2[1];
            rC1_3 += rA4_1 * rB4_3;
            rC2_3 += rA4_2 * rB4_3; rB1_3 = pB3[1];
            rC3_3 += rA4_3 * rB4_3;

            /* K-offset 5; loads offset 2 of the next 6-step */
            rC0_0 += rA5_0 * rB5_0; rB2_0 = pB0[2];
            rC1_0 += rA5_1 * rB5_0;
            rC2_0 += rA5_2 * rB5_0; rA2_0 = pA0[2];
            rC3_0 += rA5_3 * rB5_0;
            rC0_1 += rA5_0 * rB5_1; rA2_1 = pA1[2];
            rC1_1 += rA5_1 * rB5_1;
            rC2_1 += rA5_2 * rB5_1; rA2_2 = pA2[2];
            rC3_1 += rA5_3 * rB5_1;
            rC0_2 += rA5_0 * rB5_2; rA2_3 = pA3[2];
            rC1_2 += rA5_1 * rB5_2;
            rC2_2 += rA5_2 * rB5_2; rB2_1 = pB1[2];
            rC3_2 += rA5_3 * rB5_2;
            rC0_3 += rA5_0 * rB5_3; rB2_2 = pB2[2];
            rC1_3 += rA5_1 * rB5_3;
            rC2_3 += rA5_2 * rB5_3; rB2_3 = pB3[2];
            rC3_3 += rA5_3 * rB5_3;
         }
            /*
             * Final 6-step: same arithmetic as the loop body, but offsets
             * 3..5 are multiplied without loading anything for a following
             * step, so nothing is read beyond the Kb elements.
             */
            /* K-offset 0; loads offset 3 */
            rC0_0 += rA0_0 * rB0_0; rB3_0 = pB0[3];
            rC1_0 += rA0_1 * rB0_0;
            rC2_0 += rA0_2 * rB0_0; rA3_0 = pA0[3];
            rC3_0 += rA0_3 * rB0_0;
            rC0_1 += rA0_0 * rB0_1; rA3_1 = pA1[3];
            rC1_1 += rA0_1 * rB0_1;
            rC2_1 += rA0_2 * rB0_1; rA3_2 = pA2[3];
            rC3_1 += rA0_3 * rB0_1;
            rC0_2 += rA0_0 * rB0_2; rA3_3 = pA3[3];
            rC1_2 += rA0_1 * rB0_2;
            rC2_2 += rA0_2 * rB0_2; rB3_1 = pB1[3];
            rC3_2 += rA0_3 * rB0_2;
            rC0_3 += rA0_0 * rB0_3; rB3_2 = pB2[3];
            rC1_3 += rA0_1 * rB0_3;
            rC2_3 += rA0_2 * rB0_3; rB3_3 = pB3[3];
            rC3_3 += rA0_3 * rB0_3;

            /* K-offset 1; loads offset 4 */
            rC0_0 += rA1_0 * rB1_0; rB4_0 = pB0[4];
            rC1_0 += rA1_1 * rB1_0;
            rC2_0 += rA1_2 * rB1_0; rA4_2 = pA2[4];
            rC3_0 += rA1_3 * rB1_0;
            rC0_1 += rA1_0 * rB1_1; rA4_3 = pA3[4];
            rC1_1 += rA1_1 * rB1_1;
            rC2_1 += rA1_2 * rB1_1; rA4_0 = pA0[4];
            rC3_1 += rA1_3 * rB1_1;
            rC0_2 += rA1_0 * rB1_2; rA4_1 = pA1[4];
            rC1_2 += rA1_1 * rB1_2;
            rC2_2 += rA1_2 * rB1_2; rB4_1 = pB1[4];
            rC3_2 += rA1_3 * rB1_2;
            rC0_3 += rA1_0 * rB1_3; rB4_2 = pB2[4];
            rC1_3 += rA1_1 * rB1_3;
            rC2_3 += rA1_2 * rB1_3; rB4_3 = pB3[4];
            rC3_3 += rA1_3 * rB1_3;

            /* K-offset 2; loads offset 5, then advances every A/B pointer */
            rC0_0 += rA2_0 * rB2_0; rB5_0 = pB0[5]; pB0 += 6;
            rC1_0 += rA2_1 * rB2_0;
            rC2_0 += rA2_2 * rB2_0; rA5_0 = pA0[5]; pA0 += 6;
            rC3_0 += rA2_3 * rB2_0;
            rC0_1 += rA2_0 * rB2_1; rA5_1 = pA1[5]; pA1 += 6;
            rC1_1 += rA2_1 * rB2_1;
            rC2_1 += rA2_2 * rB2_1; rA5_2 = pA2[5]; pA2 += 6;
            rC3_1 += rA2_3 * rB2_1;
            rC0_2 += rA2_0 * rB2_2; rA5_3 = pA3[5]; pA3 += 6;
            rC1_2 += rA2_1 * rB2_2;
            rC2_2 += rA2_2 * rB2_2; rB5_1 = pB1[5]; pB1 += 6;
            rC3_2 += rA2_3 * rB2_2;
            rC0_3 += rA2_0 * rB2_3; rB5_2 = pB2[5]; pB2 += 6;
            rC1_3 += rA2_1 * rB2_3;
            rC2_3 += rA2_2 * rB2_3; rB5_3 = pB3[5]; pB3 += 6;
            rC3_3 += rA2_3 * rB2_3;

            /* K-offset 3 */
            rC0_0 += rA3_0 * rB3_0;
            rC1_0 += rA3_1 * rB3_0;
            rC2_0 += rA3_2 * rB3_0;
            rC3_0 += rA3_3 * rB3_0;
            rC0_1 += rA3_0 * rB3_1;
            rC1_1 += rA3_1 * rB3_1;
            rC2_1 += rA3_2 * rB3_1;
            rC3_1 += rA3_3 * rB3_1;
            rC0_2 += rA3_0 * rB3_2;
            rC1_2 += rA3_1 * rB3_2;
            rC2_2 += rA3_2 * rB3_2;
            rC3_2 += rA3_3 * rB3_2;
            rC0_3 += rA3_0 * rB3_3;
            rC1_3 += rA3_1 * rB3_3;
            rC2_3 += rA3_2 * rB3_3;
            rC3_3 += rA3_3 * rB3_3;

            /* K-offset 4 */
            rC0_0 += rA4_0 * rB4_0;
            rC1_0 += rA4_1 * rB4_0;
            rC2_0 += rA4_2 * rB4_0;
            rC3_0 += rA4_3 * rB4_0;
            rC0_1 += rA4_0 * rB4_1;
            rC1_1 += rA4_1 * rB4_1;
            rC2_1 += rA4_2 * rB4_1;
            rC3_1 += rA4_3 * rB4_1;
            rC0_2 += rA4_0 * rB4_2;
            rC1_2 += rA4_1 * rB4_2;
            rC2_2 += rA4_2 * rB4_2;
            rC3_2 += rA4_3 * rB4_2;
            rC0_3 += rA4_0 * rB4_3;
            rC1_3 += rA4_1 * rB4_3;
            rC2_3 += rA4_2 * rB4_3;
            rC3_3 += rA4_3 * rB4_3;

            /* K-offset 5 */
            rC0_0 += rA5_0 * rB5_0;
            rC1_0 += rA5_1 * rB5_0;
            rC2_0 += rA5_2 * rB5_0;
            rC3_0 += rA5_3 * rB5_0;
            rC0_1 += rA5_0 * rB5_1;
            rC1_1 += rA5_1 * rB5_1;
            rC2_1 += rA5_2 * rB5_1;
            rC3_1 += rA5_3 * rB5_1;
            rC0_2 += rA5_0 * rB5_2;
            rC1_2 += rA5_1 * rB5_2;
            rC2_2 += rA5_2 * rB5_2;
            rC3_2 += rA5_3 * rB5_2;
            rC0_3 += rA5_0 * rB5_3;
            rC1_3 += rA5_1 * rB5_3;
            rC2_3 += rA5_2 * rB5_3;
            rC3_3 += rA5_3 * rB5_3;

         /* Write the finished 4x4 tile back to C. */
         *pC0 = rC0_0;
         pC0[1] = rC1_0;
         pC0[2] = rC2_0;
         pC0[3] = rC3_0;
         *pC1 = rC0_1;
         pC1[1] = rC1_1;
         pC1[2] = rC2_1;
         pC1[3] = rC3_1;
         *pC2 = rC0_2;
         pC2[1] = rC1_2;
         pC2[2] = rC2_2;
         pC2[3] = rC3_2;
         *pC3 = rC0_3;
         pC3[1] = rC1_3;
         pC3[2] = rC2_3;
         pC3[3] = rC3_3;
         /* Next tile row: C down 4, A to its next 4 rows, B rewound. */
         pC0 += incCm;
         pC1 += incCm;
         pC2 += incCm;
         pC3 += incCm;
         pA0 += incAm;
         pA1 += incAm;
         pA2 += incAm;
         pA3 += incAm;
         pB0 += incBm;
         pB1 += incBm;
         pB2 += incBm;
         pB3 += incBm;
      }
      while(pA0 != stM);
      /* Next tile column: C and B forward 4 columns, A back to its start. */
      pC0 += incCn;
      pC1 += incCn;
      pC2 += incCn;
      pC3 += incCn;
      pA0 += incAn;
      pA1 += incAn;
      pA2 += incAn;
      pA3 += incAn;
      pB0 += incBn;
      pB1 += incBn;
      pB2 += incBn;
      pB3 += incBn;
   }
   while(pB0 != stN);
}
/*
 * Drop any helper macros so they cannot leak into code that follows.
 * #undef of a name that is not currently a macro is ignored, so no
 * #ifdef guard is needed around each directive.
 */

/* per-matrix pointer increments: A, then B, then C */
#undef incAm
#undef incAn
#undef incAk
#undef incBm
#undef incBn
#undef incBk
#undef incCm
#undef incCn
#undef incCk

/* rounded-down dimension names */
#undef Mb
#undef Nb
#undef Kb
