/*
 *             Automatically Tuned Linear Algebra Software v3.1.2Dev
 **************** THIS IS AN UNSUPPORTED DEVELOPER RELEASE *****************
 *                      (C) Copyright 1999 Camm Maguire                      
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

/*
 * Build-time tuning parameters.
 *
 * Exactly one of SREAL / SCPLX / DREAL / DCPLX is expected to be defined by
 * the build; each selects a STRIDE and an NDPM value, optionally depending
 * on NO_TRANSPOSE.  Both macros are consumed further down in this file.
 *
 * The guard below is an empty placeholder: it supplies no default STRIDE.
 */
#ifndef STRIDE
#endif

#ifdef SREAL
   #define STRIDE 1
   #define NDPM 2
#endif

#ifdef SCPLX
   #define STRIDE 1
#ifdef NO_TRANSPOSE
   #define NDPM 3
#else
   #define NDPM 4
#endif
#endif

#ifdef DREAL
   #define STRIDE 1
#ifdef NO_TRANSPOSE
   #define NDPM 4
#else
   #define NDPM 2
#endif
#endif

#ifdef DCPLX
#ifdef NO_TRANSPOSE
   #define STRIDE 1
#else
   #define STRIDE 12
#endif
   #define NDPM 2
#endif

/*
 * NDPM is pinned to 1 for every precision, overriding the per-precision
 * values chosen above.  The #undef makes this a clean definition instead of
 * an incompatible macro redefinition (which compilers diagnose); #undef of a
 * macro that is not defined is harmless.
 */
#undef NDPM
#define NDPM 1

#if NDPM > 4
#error Max NDPM is 4 for gemvT
#endif

#include <stdio.h>
#include <stdlib.h>

#include "camm_util.h"

#if defined(BETAX) || defined(BETAXI0)
#include "camm_scale.h"
#endif


/*
 * Kernel instantiation.
 *
 * "camm_dpa.h" is included several times, each time with NDP and EXT
 * redefined first.  Presumably every inclusion emits one kernel whose name is
 * built from "dp" and EXT (the driver in this file calls Mjoin(dp,EXT),
 * Mjoin(dp,EXT1), ...) -- confirm against camm_dpa.h.
 *
 * BLC is not defined in this file; it is expected to come from the build or
 * from an included header.
 *
 * The NDP = 4, 3 and 2 variants are only instantiated when NDPM is at least
 * that large; the NDP = 1 variant is always instantiated.
 */
#if NDPM >= 4
#define EXT4 Mjoin(4dp,BLC)
#undef NDP
#define NDP 4
#undef EXT
#define EXT EXT4
#include "camm_dpa.h"
#endif

#if NDPM >= 3
#define EXT3 Mjoin(3dp,BLC)
#undef NDP
#define NDP 3
#undef EXT
#define EXT EXT3
#include "camm_dpa.h"
#endif

#if NDPM >= 2
#define EXT2 Mjoin(2dp,BLC)
#undef NDP
#define NDP 2
#undef EXT
#define EXT EXT2
#include "camm_dpa.h"
#endif

/* Unconditional NDP = 1 variant. */
#define EXT1 Mjoin(1dp,BLC)
#undef NDP
#define NDP 1
#undef EXT
#define EXT EXT1
#include "camm_dpa.h"


/*
 * Final inclusion: NDP = NDPM with an extra "m" suffix appended to the name.
 * EXT is left set to this name, so later uses of Mjoin(dp,EXT) refer to the
 * variant instantiated here.
 */
#undef NDP
#define NDP NDPM
#undef EXT
#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m)
#include "camm_dpa.h"

/*
 * FEXT: operation tag spliced into the generated routine name.
 *
 *   NO_TRANSPOSE and Conj_  -> Nc
 *   NO_TRANSPOSE only       -> N
 *   Conj_ only              -> C
 *   neither                 -> T
 */
#if defined(NO_TRANSPOSE) && defined(Conj_)
#define FEXT Nc
#elif defined(NO_TRANSPOSE)
#define FEXT N
#elif defined(Conj_)
#define FEXT C
#else
#define FEXT T
#endif

/*
 * Disabled (never compiled) plain-C gemv loop, kept as a readable reference.
 * For each row i it first initialises Y[i] (zero under BETA0, scaled by beta
 * under BETAX, left untouched otherwise) and then accumulates
 * A[i + j*lda] * X[j] over all j.  alpha, incX and incY are accepted but not
 * used by this loop.
 */
#if 0
#include "atlas_misc.h"
void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,gemvN),NM),_x1),BNM),_y1)
   (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
    const TYPE *X, const int incX, const SCALAR beta, TYPE *Y, const int incY)
{
   int i, j;
   for (i=0; i != M; i++)
   {
      #ifdef BETA0
         Y[i] = ATL_rzero;
      #elif defined(BETAX)
         Y[i] *= beta;
      #endif
      for (j=0; j != N; j++) Y[i] += A[i+j*lda] * X[j];
   }
}
#endif

/* Name of the routine defined below: ATL_ PREC gemv FEXT _a1_x1_ BL _y1. */
#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1))))

/*
 * gemv driver: walks the matrix `a` one lda-sized step at a time and hands
 * each step to a dp* kernel instantiated from camm_dpa.h.
 *
 *   m, n   problem dimensions.  With NO_TRANSPOSE the kernel length is m and
 *          n steps are taken; otherwise the length is n and m steps are taken.
 *   alpha  not used by this routine (the routine name carries "_a1").
 *   a      matrix data; step k starts at a + k*lda.
 *   lda    distance, in TYPE elements, between consecutive steps of a.
 *   b      input vector; advanced once per step when NO_TRANSPOSE is defined.
 *   binc   not used by this routine (the routine name carries "_x1").
 *   beta   scale applied to c[0..m-1] before any kernel call.
 *   c      output vector; advanced once per step when NO_TRANSPOSE is not
 *          defined.
 *   cinc   not used by this routine (the routine name carries "_y1").
 *
 * What each kernel call computes is defined in camm_dpa.h, not here.
 */
void 
FN(int m,int n, SCALAR alpha,const TYPE *a,
   int lda,const TYPE *b,int binc,
   SCALAR beta,TYPE *c,int cinc) {

  int i,nn;
#if NDPM != 1
  int mm;          /* number of nn-sized chunks left for the tail kernels */
#endif
  const TYPE *ae;
#ifdef NO_TRANSPOSE
  int len=m,w=n;
#define zz b
#else
  int len=n,w=m;
#define zz c
#endif

  /* Accepted for interface compatibility only; never read. */
  (void)alpha;
  (void)binc;
  (void)cinc;

  /*
   * Beta pre-pass.  The active branch handles beta at run time: zero c when
   * beta is 0, otherwise scale it.  The disabled branch is the compile-time
   * (BETA0 / BETAX / BETAXI0) variant built on SCALE from camm_scale.h.
   *
   * NOTE(review): the disabled branch passes `beta` to SCALE without '&' for
   * SCPLX/DCPLX, which suggests beta is already a pointer in complex builds;
   * if so, the run-time comparison and multiply below do not type-check for
   * those builds -- confirm before enabling complex precisions.
   */
#if 1
  if (beta == 0.0) for (i=0; i < m; i++) c[i] = 0.0;
  else for (i=0; i < m; i++) c[i] *= beta;

#else

#if defined(NO_TRANSPOSE) && defined(BETA0)
/*  memset(c,0,m*sizeof(*c)); */
  for (i=0; i < m; i++) c[i] = 0.0;
#endif

#if defined(BETAX) || defined(BETAXI0)
#if defined(SCPLX) || defined(DCPLX)
  SCALE(beta,c,m);
#endif
#if defined(SREAL) || defined(DREAL)
  SCALE(&beta,c,m);
#endif
#endif

#endif

  ae=a+w*lda;      /* one past the last step of a */
  nn=STRIDE*lda;   /* element distance handed to the kernels */


#if NDPM == 1
  /* One kernel call per step. */
  for (;a<ae;a+=lda,zz++)
    Mjoin(dp,EXT)(b,a,nn,c,STRIDE,len);

#else

  /* Main part: as long as NDPM full chunks of nn elements remain. */
  while (a+NDPM*nn<=ae) {
    for (i=0;i<STRIDE;i++,a+=lda,zz++) 
      Mjoin(dp,EXT)(b,a,nn,c,STRIDE,len);

    /* Skip what the NDPM-wide kernel calls above already covered. */
    a+=(NDPM-1)*nn;
    zz+=(NDPM-1)*STRIDE;
  }

  /* Tail: pick the kernel matching the number of chunks still left. */
  for (i=0;a<ae && i<STRIDE;i++,a+=lda,zz++) {

    mm=(ae-a)/nn;
#if STRIDE > 1
    /* Round up when the remaining steps are not a multiple of STRIDE. */
    if (((ae-a)/lda)%STRIDE)
      mm++;
#endif
    
    if (mm == 1)
      Mjoin(dp,EXT1)(b,a,nn,c,STRIDE,len);

#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2
    else if (mm == 2)
      Mjoin(dp,EXT2)(b,a,nn,c,STRIDE,len);
#endif

#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3
    else if (mm == 3)
      Mjoin(dp,EXT3)(b,a,nn,c,STRIDE,len);
#endif

#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4
    else if (mm == 4)
      Mjoin(dp,EXT4)(b,a,nn,c,STRIDE,len);
#endif


  }

#endif

}

