Monday, 3 June 2013

CUDA 5.0 first calculations

OK - now, since I have my 'Hello World' program done, I think it would make sense to write something that actually uses CUDA processing power. Inspired by the CUDA introductory video, I want to write a simple vector addition kernel and run a quick test on how it compares to the CPU processing. I haven't mentioned it earlier, but I'm writing and running this code on my GeForce GTX 660M laptop GPU. Let's get started!

My first piece of code runs on the CPU and its only purpose is to add two vectors (of size one million, 1,000,000) together.

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <time.h>

#pragma comment(lib, "cudart") 

// A float buffer paired with its element count.
typedef struct pjVector
{
 // First element of the buffer; the pointer itself cannot be reseated after initialisation.
 float * const content;

 // Number of float elements reachable through `content`.
 const unsigned int size;
} pjVector_t;

// Element-wise CPU addition: resultVector[i] = firstVector[i] + secondVector[i].
// Declared here so that main can call it; the definition follows main.
void AddVectors(const pjVector_t * const firstVector, const pjVector_t * const secondVector, pjVector_t * const resultVector);

// Fills two 1,000,000-element vectors, adds them on the CPU with AddVectors
// and prints the time the addition took, measured with clock().
// Returns 0 on success and 1 when the vectors cannot be allocated.
int main(void)
{
 unsigned int vectorSize = 1000000;
 double cpuTime;
 clock_t begin, end;
 pjVector_t firstVector = { (float *)calloc(vectorSize, sizeof(float)), vectorSize };
 pjVector_t secondVector = { (float *)calloc(vectorSize, sizeof(float)), vectorSize };
 pjVector_t resultVector = { (float *)calloc(vectorSize, sizeof(float)), vectorSize };

 // calloc returns NULL on failure; stop before any buffer is written through.
 if (firstVector.content == NULL || secondVector.content == NULL || resultVector.content == NULL)
 {
  fprintf(stderr, "Failed to allocate the vectors\n");

  // free(NULL) is a no-op, so the buffers that were allocated are released safely.
  free(firstVector.content);
  free(secondVector.content);
  free(resultVector.content);

  return 1;
 }

 for (unsigned int i = 0; i < vectorSize; i++)
 {
  firstVector.content[i] = 1.0f;
  secondVector.content[i] = 2.0f;
 }

 // Only the addition itself is timed, not the allocation or the initialisation.
 begin = clock();

 AddVectors(&firstVector, &secondVector, &resultVector);

 end = clock();
 cpuTime = (double)(end - begin) / CLOCKS_PER_SEC;

 printf("Result vector calculated in: %f[sec]\n", cpuTime);

 // Keeps the console window open until a key is pressed.
 getchar();


 free(firstVector.content);
 free(secondVector.content);
 free(resultVector.content);
 
 return 0;
}

// Adds two vectors element by element on the CPU.
//
// firstVector, secondVector - input operands, only read
// resultVector              - receives firstVector[i] + secondVector[i]
//
// Only the elements present in all three vectors are processed, so a shorter
// operand or result buffer is never indexed past its end. Vectors of equal
// size are processed in full.
void AddVectors(const pjVector_t * const firstVector, const pjVector_t * const secondVector, pjVector_t * const resultVector)
{
 // Clamp the element count to the shortest of the three vectors.
 unsigned int count = firstVector -> size;

 if (secondVector -> size < count)
 {
  count = secondVector -> size;
 }

 if (resultVector -> size < count)
 {
  count = resultVector -> size;
 }

 for (unsigned int i = 0; i < count; i++)
 {
  resultVector -> content[i] = firstVector -> content[i] + secondVector -> content[i];
 }
}


Even though it's single-threaded, it's pretty fast: it takes 5 to 6 milliseconds to calculate the result vector.

My second piece of code combines GPU and CPU code to compare the performance. A couple of things before I show the code though:

  • To simplify passing parameters to my kernel, I ditched the structures and chose plain float arrays.
  • Although I'm sure it can be done a lot more efficiently, I'm running my CUDA code using 1000 blocks of 1000 threads each (1,000 * 1,000 = 1,000,000) just to show how the problem is divided into sections.
  • Performance is measured by an external tool, NVIDIA Visual Profiler which comes with the CUDA Toolkit.
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <time.h>

#pragma comment(lib, "cudart") 

// A float buffer paired with its element count.
typedef struct pjVector
{
 // First element of the buffer.
 float *content;

 // Number of float elements reachable through `content`.
 const unsigned int size;
} pjVector_t;

// Element-wise vector addition, one element per thread.
//
// Each thread derives its element index from the x components of blockIdx,
// blockDim and threadIdx, then stores the sum of the two inputs at that index.
//
// There is no bounds check: the total number of launched threads must not
// exceed the length of the three arrays, and elements beyond the last
// launched thread are left untouched.
__global__ void AddVectorsKernel(float *firstVector, float *secondVector, float *resultVector)
{
 const unsigned int blockOffset = blockIdx.x * blockDim.x;
 const unsigned int element = blockOffset + threadIdx.x;
 const float sum = firstVector[element] + secondVector[element];

 resultVector[element] = sum;
}

// Element-wise CPU addition: resultVector[i] = firstVector[i] + secondVector[i].
// Declared here so that main can call it; the definition follows main.
void AddVectors(const pjVector_t * const firstVector, const pjVector_t * const secondVector, pjVector_t * const resultVector);

// Prints a diagnostic on stderr when a CUDA runtime call has failed.
// Returns true when `status` is cudaSuccess and false otherwise.
static bool CudaCallSucceeded(const cudaError_t status, const char * const description)
{
 if (status != cudaSuccess)
 {
  fprintf(stderr, "%s failed: %s\n", description, cudaGetErrorString(status));
  return false;
 }

 return true;
}

// Adds two 1,000,000-element vectors first on the CPU (timed with clock())
// and then on the GPU with AddVectorsKernel, and prints the CPU time.
// Returns 0 on success and 1 when a host allocation, a CUDA call or the
// check of the GPU result fails.
int main(void)
{
 const unsigned int vectorLength = 1000000;
 const unsigned int blocks = 1000;
 const unsigned int threads = 1000;
 const unsigned int vectorSize = sizeof(float) * vectorLength;
 double cpuTime;
 clock_t begin, end;
 int exitCode = 0;

 pjVector_t firstVector = { (float *)calloc(vectorLength, sizeof(float)), vectorLength };
 pjVector_t secondVector = { (float *)calloc(vectorLength, sizeof(float)), vectorLength };
 pjVector_t resultVector = { (float *)calloc(vectorLength, sizeof(float)), vectorLength };

 // Start from NULL so that cleanup is safe even when an allocation below is never reached.
 float *d_firstVector = NULL;
 float *d_secondVector = NULL;
 float *d_resultVector = NULL;

 // calloc returns NULL on failure; stop before any buffer is written through.
 if (firstVector.content == NULL || secondVector.content == NULL || resultVector.content == NULL)
 {
  fprintf(stderr, "Failed to allocate the host vectors\n");

  // free(NULL) is a no-op, so the buffers that were allocated are released safely.
  free(firstVector.content);
  free(secondVector.content);
  free(resultVector.content);

  return 1;
 }

 // && short-circuits, so nothing further is attempted after the first failing call.
 bool gpuOk =
  CudaCallSucceeded(cudaMalloc((void **)&d_firstVector, vectorSize), "cudaMalloc (first vector)") &&
  CudaCallSucceeded(cudaMalloc((void **)&d_secondVector, vectorSize), "cudaMalloc (second vector)") &&
  CudaCallSucceeded(cudaMalloc((void **)&d_resultVector, vectorSize), "cudaMalloc (result vector)");

 for (unsigned int i = 0; i < vectorLength; i++)
 {
  firstVector.content[i] = 1.0f;
  secondVector.content[i] = 2.0f;
 }

 // CPU calculations
 begin = clock();

 AddVectors(&firstVector, &secondVector, &resultVector);

 end = clock();
 cpuTime = (double)(end - begin) / CLOCKS_PER_SEC;
 // - CPU calculations

 // GPU calculations
 gpuOk = gpuOk &&
  CudaCallSucceeded(cudaMemcpy(d_firstVector, firstVector.content, vectorSize, cudaMemcpyHostToDevice), "cudaMemcpy (first vector to device)") &&
  CudaCallSucceeded(cudaMemcpy(d_secondVector, secondVector.content, vectorSize, cudaMemcpyHostToDevice), "cudaMemcpy (second vector to device)");

 if (gpuOk)
 {
  // The kernel has no bounds check: blocks * threads has to match vectorLength.
  AddVectorsKernel<<<blocks, threads>>>(d_firstVector, d_secondVector, d_resultVector);

  // A launch returns no status itself; configuration errors are read back with
  // cudaGetLastError(), and errors raised while the kernel runs are returned by
  // the blocking copy that follows.
  gpuOk =
   CudaCallSucceeded(cudaGetLastError(), "AddVectorsKernel launch") &&
   CudaCallSucceeded(cudaMemcpy(resultVector.content, d_resultVector, vectorSize, cudaMemcpyDeviceToHost), "cudaMemcpy (result to host)");
 }

 if (gpuOk)
 {
  // Every element must be 1.0f + 2.0f; operands and sum are exactly representable,
  // so an exact comparison is valid here.
  for (unsigned int i = 0; i < vectorLength; i++)
  {
   if (resultVector.content[i] != 3.0f)
   {
    fprintf(stderr, "GPU result mismatch at index %u: %f\n", i, resultVector.content[i]);
    gpuOk = false;
    break;
   }
  }
 }
 // - GPU calculations

 if (!gpuOk)
 {
  exitCode = 1;
 }

 free(firstVector.content);
 free(secondVector.content);
 free(resultVector.content);

 // cudaFree(NULL) is a no-op, so pointers that were never allocated are fine here.
 cudaFree(d_firstVector);
 cudaFree(d_secondVector);
 cudaFree(d_resultVector);
 cudaDeviceReset();

 printf("CPU result vector calculated in: %f[ms]\n", cpuTime * 1000.0);

 // Keeps the console window open until a key is pressed.
 getchar();
 
 return exitCode;
}

// Adds two vectors element by element on the CPU.
//
// firstVector, secondVector - input operands, only read
// resultVector              - receives firstVector[i] + secondVector[i]
//
// Only the elements present in all three vectors are processed, so a shorter
// operand or result buffer is never indexed past its end. Vectors of equal
// size are processed in full.
void AddVectors(const pjVector_t * const firstVector, const pjVector_t * const secondVector, pjVector_t * const resultVector)
{
 // Clamp the element count to the shortest of the three vectors.
 unsigned int count = firstVector -> size;

 if (secondVector -> size < count)
 {
  count = secondVector -> size;
 }

 if (resultVector -> size < count)
 {
  count = resultVector -> size;
 }

 for (unsigned int i = 0; i < count; i++)
 {
  resultVector -> content[i] = firstVector -> content[i] + secondVector -> content[i];
 }
}



This time the magic really happened: vector addition takes only 792.166[µs], which is roughly 15% of the previous value. That's simply amazing for the first try. Let me show some screenshots from the profiler:



But what really made me smile was the warning messages from the profiler:



It actually complains that copying the data takes longer than the calculations themselves! And see how much time cudaMalloc needed? As it turns out, adding two vectors of size 1,000,000 is the smallest problem here! I am really pleased with my first real CUDA test and it certainly inspires me even more.

No comments:

Post a Comment