Threads within a block cooperate via shared memory, atomic operations and barrier synchronization
Threads in different blocks do not interact
blockIdx: 1D, 2D, or 3D (3D grids supported since CUDA 4.0); threadIdx: 1D, 2D, or 3D
Memory management
cudaMalloc()
cudaFree()
cudaMemcpy()
usage is just like C's malloc/free/memcpy, but with some differences
cudaMemcpyAsync is asynchronous (it returns immediately and the actual copy is done later); plain cudaMemcpy blocks the host until the copy has completed
1 2 3 4 5 6 7 8 9 10 11 12
voidvecAdd(float* h_A, float* h_B, float* h_C, int n ) { int size = n * sizeof (float); float * d_A, d_B, d_c; cudaMalloc((void **) &d_A, size); cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); cudaMalloc((void **) &d_B, size); cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); cudaMalloc((void **) &d_C, size); // Kernel invocation code - to be shown later cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_B); cudaFree (d_C); }
1 2 3 4 5 6
cudaError_t err = cudaMalloc ((void * *) &d_A, size); if (err != cudaSuccess) { printf("%s in % s at line % d backslash n", cudaGetErrorString(err), __FILE__, __LINE__); exit(EXIT_FAILURE); }
1 2 3 4 5 6 7 8
// Device kernel: one thread per output element, C[i] = A[i] + B[i].
// Expected launch: 1D grid of 1D blocks covering at least n threads.
__global__ void vecAddKernel(float* A, float* B, float* C, int n) {
    // Flat global index: thread offset within the block plus block offset.
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)  // boundary check: the grid is usually rounded up past n
        C[i] = A[i] + B[i];
}
1 2 3 4 5 6
int vecAdd (float* $h \_A, f l o a t * h \_B, f l o a t * h \_C$, int $n$ ) \{ dim3DimGrid( $(\mathrm{n}-1) /256+1,1,1) ; \quad$ Host Code dim3DimBlock $(256,1,1)$; vecAddKernnel<<<DimGrid,DimBlock>》>(d_A, d_B, d_C, n) ; \}
A kernel function must return void
__device__ and __host__ can be used together
__host__ is optional if used alone
Post title:Notes for Heterogeneous Parallel Programming By W. Hwu
Post author:Jackcui
Create time:2024-03-01 15:00:05
Post link:https://jackcuii.github.io/2024/03/01/HPP/
Copyright Notice:All articles in this blog are licensed under BY-NC-SA unless stating additionally.