/*
 * Benchmark driver comparing a host-side sparse matrix-vector product with
 * CUDPP's cudppSparseMatrixVectorMultiply().
 *
 * What the visible code does:
 *   - zero(n, v): __global__ kernel; each thread starts at
 *     pid = threadIdx.x + blockIdx.x*blockDim.x and also computes
 *     np = blockDim.x*gridDim.x. The rest of its loop is not visible here.
 *   - gettime(): reads gettimeofday() and returns tv_sec + 1e-6*tv_usec,
 *     i.e. a time stamp in seconds as a double.
 *   - main(): sets n = 10000 and nnz = n*n/100, allocates first (n+1 ints),
 *     cidx and A (nnz entries each) and x, y1, y2 (n floats each) with
 *     malloc, and calls sprand(n,n,nnz,first,cidx,A) to fill the matrix
 *     arrays. It then times 5 calls of spmul(n,n,first,cidx,A,x,y1) and
 *     prints "(CPU) flops" as 2.0*nnz*5/t plus the time per iteration in ms.
 *     Next it builds a CUDPP sparse-matrix handle (CUDPP_FLOAT,
 *     CUDPP_SPMVMULT, options 0) from A/first/cidx, returning 1 with a
 *     message on stderr if cudppSparseMatrix() fails, copies x to the device
 *     buffer x_g, runs the GPU multiply into y_g, prints flops as
 *     2.0*nnz*niter/(t1-t0), and copies y_g back into y2.
 *
 * NOTE(review): this text looks damaged and will not compile as-is.
 *   - All of the source sits on one physical line, so the line comment that
 *     begins "Run it once ..." swallows everything after it, including the
 *     warm-up launch, the timed loop and the final printf/cudaMemcpy.
 *   - The #include directives have lost their header names (only "cudpp.h"
 *     survives), zero() breaks off at "for(i = pid; i" and runs straight into
 *     gettime(), the loops that initialise x and that time the GPU path have
 *     lost their conditions, and main() ends in the middle of a for
 *     statement. Presumably text between angle brackets was stripped;
 *     restore from the original file before editing further.
 *   - sprand() and spmul() are called but not defined in the visible text.
 *   - The timed GPU loop presumably runs niter (= 100) iterations, since the
 *     printed figures divide by niter; confirm against the original.
 *
 * NOTE(review): return values of cudaMalloc/cudaMemcpy and of
 * cudppSparseMatrixVectorMultiply are not checked, and no cudaGetLastError()
 * follows the zero<<<...>>> launch. cudaThreadSynchronize() is deprecated in
 * favour of cudaDeviceSynchronize(). No free/cudaFree calls are visible, but
 * the end of main() is missing, so cleanup may simply be out of view.
 */
#include #include #include "cudpp.h" __global__ static void zero(int n,float v[]) { const int pid = threadIdx.x+blockIdx.x*blockDim.x; const int np = blockDim.x*gridDim.x; int i; for(i = pid; i double gettime() { struct timeval tv; gettimeofday(&tv,NULL); return tv.tv_sec + 1e-6*tv.tv_usec; } int main(void) { int n = 10000,nnz = n*n/100; int *first,*cidx; float *A,*x,*y1,*y2,*x_g,*y_g; int i,iter,niter = 100; first = (int *) malloc(sizeof(int) * (n+1)); cidx = (int *) malloc(sizeof(int) * nnz); A = (float *) malloc(sizeof(float) * nnz); sprand(n,n,nnz,first,cidx,A); x = (float *) malloc(sizeof(float) * n); y1 = (float *) malloc(sizeof(float) * n); y2 = (float *) malloc(sizeof(float) * n); for(i = 0; i 0.5); } double t = gettime(); for(iter = 0; iter<5; iter++) spmul(n,n,first,cidx,A,x,y1); t = gettime() - t; printf("(CPU) flops = %.3e, time per iteration = %.3fms\n", 2.0*nnz*5/t,t/5*1e3); CUDPPConfiguration config; config.datatype = CUDPP_FLOAT; config.options = (CUDPPOption)0; config.algorithm = CUDPP_SPMVMULT; CUDPPHandle sparseMatrixHandle; CUDPPResult result = CUDPP_SUCCESS; result = cudppSparseMatrix(&sparseMatrixHandle, config, nnz, n, (void *) A, (unsigned int *) first, (unsigned int *) cidx); if (result != CUDPP_SUCCESS) { fprintf(stderr, "Error creating Sparse matrix object\n"); return 1; } cudaMalloc((void **) &x_g,sizeof(float)*n); cudaMalloc((void **) &y_g,sizeof(float)*n); cudaMemcpy(x_g,x,sizeof(float)*n,cudaMemcpyHostToDevice); // Run it once to avoid timing startup overhead zero<<<14*6,128>>>(n,y_g); cudppSparseMatrixVectorMultiply(sparseMatrixHandle, y_g, x_g); cudaThreadSynchronize(); double t0 = gettime(); for(iter = 0; iter>>(n,y_g); cudppSparseMatrixVectorMultiply(sparseMatrixHandle, y_g, x_g); } cudaThreadSynchronize(); double t1 = gettime(); printf(" flops = %.3e, time per iteration = %.3fms\n", 2.0*nnz*niter/(t1-t0),(t1-t0)/niter*1e3); cudaMemcpy(y2,y_g,sizeof(float)*n,cudaMemcpyDeviceToHost); for(i = 0; i