Peikun's blog: CUDA编程小窥

并行计算课程可选部分有CUDA编程，这两周在系里的集群上感受了一下，非常不错。

一个字总结，好用易上手。

远程SSH上学校的主机，ssh peikun.zhou@129.175.237.164，然后再用申请的临时账号ssh进GPU集群，ssh -X zhou@tp-jetson.

拷代码过去，scp helloworld.cu zhou@tp-jetson:/home/zhou/

1.先来一个简单的例子，helloworld.cu

#include <stdio.h>

/*
 * Message printed by the hello kernel, one character per thread.
 * Declared as an array (not a pointer to a literal) so the characters are
 * stored in the device variable itself and the length can be derived from
 * it with sizeof instead of being typed in by hand.
 */
__device__ const char STR[] = "HELLO WORLD!";

/* Number of characters in STR, excluding the terminating '\0'. */
const int STR_LENGTH = sizeof(STR) - 1;

/*
 * Kernel: every thread prints a single character of STR followed by a
 * newline. The character is selected by the thread's x index within its
 * block, wrapped around at STR_LENGTH.
 */
__global__ void hello()
{
    const unsigned int pos = threadIdx.x % STR_LENGTH;
    const char letter = STR[pos];

    printf("%c\n", letter);
}

/*
 * Host entry point for the hello-world example.
 *
 * Launches the hello kernel with a single block of STR_LENGTH threads,
 * waits for the device to finish, and reports any CUDA error on stderr.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
    int num_threads = STR_LENGTH;
    int num_blocks = 1;
    cudaError_t err;

    hello<<<num_blocks, num_threads>>>();

    /* A kernel launch has no return value; a bad launch configuration is
       only visible through cudaGetLastError(). */
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "hello launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    /* Block until the kernel has finished; errors raised while it was
       running are reported by this call. */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "hello execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}

1).__global__表明该函数（即kernel）在GPU Device上运行，并由Host端的代码调用。

2).hello<<<num_blocks,num_threads>>>();

Triple angle brackets mark a call from host code to device code, also called a “kernel launch”.

这部分是并行计算的代码，即GPU并行计算的代码。每一个block里有若干个thread，这句的意思是有num_blocks个block，每一个block里有num_threads个thread。

我们须知，在GPU里，每一个Block都是并行计算的。

3).nVIDIA的编译器，即nvcc可以编译常规的C语言代码（Host Code非Device code）

用 nvcc helloworld.cu -o hello 编译之（不加 -o hello 的话，生成的可执行文件名是 a.out），然后运行 ./hello，结果如下，

zhou@tegra-ubuntu:~$ ./hello

这是12个并行输出（计算）的结果。

2.向量加法运算

#include <stdio.h>

/*
 * Kernel: element-wise integer addition, c[i] = a[i] + b[i].
 *
 * Each thread handles exactly one element, whose position is its global
 * x index (block offset plus thread offset). There is no bounds check, so
 * the launch must create exactly as many threads as there are elements.
 *
 * a, b: input vectors; c: output vector. All three are dereferenced in
 * device code, so they must be device-accessible pointers.
 */
__global__ void vector_add(int *a, int *b, int *c)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int sum = a[i] + b[i];

    c[i] = sum;
}

/*
 * N is the number of int elements in each vector; experiment with it.
 * main() launches N/THREADS_PER_BLOCK blocks and vector_add has no bounds
 * check, so keep N a multiple of THREADS_PER_BLOCK: otherwise the integer
 * division drops the remainder and the last elements are never computed.
 */

/* how large can it be? */

#define N (2048*2048)

/* Number of threads in each block of the vector_add launch. */

#define THREADS_PER_BLOCK 512

/*
 * Host driver for the vector_add example.
 *
 * Fills two N-element host arrays with a[i] = b[i] = i, copies them to the
 * device, launches vector_add with one thread per element, copies the sum
 * back and prints the first and last results.
 *
 * Returns 0 on success, 1 if N does not fit the launch configuration or if
 * any allocation, copy or kernel launch fails (the reason goes to stderr).
 */
int main()
{
    /* host copies of a, b, c */
    int *a = NULL, *b = NULL, *c = NULL;
    /* device copies of a, b, c */
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    /* size_t so that larger values of N do not overflow an int byte count */
    size_t size = (size_t)N * sizeof( int );
    cudaError_t err = cudaSuccess;
    int status = 1;

    /* vector_add does no bounds checking and the grid below is computed
       with integer division, so N must be an exact multiple of the block
       size or the tail of the vectors would be silently skipped. */
    if( N % THREADS_PER_BLOCK != 0 )
    {
        fprintf( stderr, "N (%d) must be a multiple of THREADS_PER_BLOCK (%d)\n",
                 N, THREADS_PER_BLOCK );
        return 1;
    }

    /* allocate space for device copies of a, b, c */
    err = cudaMalloc( (void **) &d_a, size );
    if( err != cudaSuccess ) goto cuda_fail;
    err = cudaMalloc( (void **) &d_b, size );
    if( err != cudaSuccess ) goto cuda_fail;
    err = cudaMalloc( (void **) &d_c, size );
    if( err != cudaSuccess ) goto cuda_fail;

    /* allocate space for host copies of a, b, c and setup input values */
    a = (int *)malloc( size );
    b = (int *)malloc( size );
    c = (int *)malloc( size );
    if( a == NULL || b == NULL || c == NULL )
    {
        fprintf( stderr, "host allocation of %lu bytes failed\n",
                 (unsigned long)size );
        goto cleanup;
    }

    for( int i = 0; i < N; i++ )
    {
        a[i] = b[i] = i;
        c[i] = 0;
    }

    /* copy inputs to device */
    err = cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
    if( err != cudaSuccess ) goto cuda_fail;
    err = cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );
    if( err != cudaSuccess ) goto cuda_fail;

    /* launch the kernel on the GPU: one thread per element */
    vector_add<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c );

    /* a launch returns nothing; an invalid configuration (for example a
       grid that is too large) is only visible through cudaGetLastError() */
    err = cudaGetLastError();
    if( err != cudaSuccess ) goto cuda_fail;

    /* copy result back to host; this blocking copy also reports errors
       raised while the kernel was running */
    err = cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost );
    if( err != cudaSuccess ) goto cuda_fail;

    /* print the computed values (the original passed a stray literal 0 that
       was printed in place of c[0]) */
    printf( "c[%d] = %d\n", 0, c[0] );
    printf( "c[%d] = %d\n", N-1, c[N-1] );

    status = 0;
    goto cleanup;

cuda_fail:
    fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( err ) );

cleanup:
    /* clean up; free(NULL) and cudaFree(NULL) are no-ops, so this is safe
       no matter how far the function got */
    free(a);
    free(b);
    free(c);
    cudaFree( d_a );
    cudaFree( d_b );
    cudaFree( d_c );

    return status;
} /* end main */

注意看英文注释。

我们看CUDA内置的很多函数，都是C/C++的影子，如这里的cudaMemcpy()，cudaMalloc()，用法大同小异，非常容易上手。

左总结右总结，想梳理一下CUDA并行计算的过程，还是说不好，既然这样就偷懒把官方的入门文档放一下。

Peikun's blog

2015年11月26日星期四

CUDA编程小窥

没有评论:

发表评论