Hello CUDA World!

Simplest CUDA Kernel

Output: prints 42 once.

#include <stdio.h>

// Stores the constant 42 into the int that `out` points to.
//
// out: pointer the kernel writes through; main passes a buffer obtained
//      from cudaMalloc.
//
// Every thread that executes this kernel performs the same store; main
// launches it with one block of one thread.
__global__ void kernel(int* out) {
  out[0] = 42;
}

// Launch configuration used by main: a single block holding a single thread.
#define THREADS_PER_BLOCK 1
#define BLOCKS_PER_GRID   1

// Allocates one int on the device, launches `kernel` to fill it, copies the
// value back to the host and prints it.
//
// Returns 0 on success. If any CUDA call or the kernel launch fails, the
// error string is written to stderr, the device buffer is released and 1 is
// returned instead of printing an uninitialized value.
int main(void) {
  int* device = NULL;
  cudaError_t status = cudaMalloc((void **)&device, sizeof(int));
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  kernel<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(device);
  // A kernel launch has no return value; launch errors (e.g. an invalid
  // configuration) must be fetched explicitly.
  status = cudaGetLastError();
  if (status != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
    cudaFree(device);
    return 1;
  }

  int host = 0;
  // cudaMemcpy blocks until the kernel has finished, so errors raised while
  // the kernel was running are reported by this call.
  status = cudaMemcpy(&host, device, sizeof(int), cudaMemcpyDeviceToHost);
  // The device buffer is no longer needed whether or not the copy succeeded.
  cudaFree(device);
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  printf("%d\n", host);

  // Wait for a key press before exiting.
  getchar();
  return 0;
}

Multiple Threads

Output: prints 42 N times, one value written by each of the N threads in a single block.

#include <stdio.h>

// Each thread stores 42 into the element of `out` selected by its index
// within the block (threadIdx.x).
//
// out: array the kernel writes into; main passes a cudaMalloc'd buffer of
//      N ints.
//
// There is no bounds check, so `out` must have at least blockDim.x elements.
// Only threadIdx.x is used: launched with several blocks, every block would
// write the same elements.
__global__ void kernel(int* out) {
  const unsigned int slot = threadIdx.x;
  out[slot] = 42;
}

// Number of ints to fill.
#define N                 512
// Launch configuration used by main: one block with one thread per element.
#define THREADS_PER_BLOCK   N
#define BLOCKS_PER_GRID     1

// Allocates N ints on the device, launches `kernel` with one block of N
// threads to fill them, copies the array back to the host and prints every
// element separated by spaces.
//
// Returns 0 on success. If any CUDA call or the kernel launch fails, the
// error string is written to stderr, the device buffer is released and 1 is
// returned instead of printing uninitialized data.
int main(void) {
  int* device = NULL;
  cudaError_t status = cudaMalloc((void **)&device, N * sizeof(int));
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  kernel<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(device);
  // A kernel launch has no return value; launch errors (e.g. a block size
  // the device cannot run) must be fetched explicitly.
  status = cudaGetLastError();
  if (status != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
    cudaFree(device);
    return 1;
  }

  int host[N];
  // cudaMemcpy blocks until the kernel has finished, so errors raised while
  // the kernel was running are reported by this call.
  status = cudaMemcpy(host, device, N * sizeof(int), cudaMemcpyDeviceToHost);
  // The device buffer is no longer needed whether or not the copy succeeded.
  cudaFree(device);
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  for (int i = 0; i < N; i++)
    printf("%d ", host[i]);

  // Wait for a key press before exiting.
  getchar();
  return 0;
}

Multiple Blocks

Output: prints 42 N times, with the N elements split across several blocks of threads.

#include <stdio.h>

// Each thread stores 42 into one element of `out`, selected by the thread's
// global index: its block's offset (blockIdx.x * blockDim.x) plus its index
// within the block (threadIdx.x).
//
// out: array the kernel writes into; main passes a cudaMalloc'd buffer of
//      N ints.
//
// There is no bounds check, so `out` must have at least
// gridDim.x * blockDim.x elements.
__global__ void kernel(int* out) {
  const unsigned int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
  out[globalIdx] = 42;
}

// Number of ints to fill.
#define N                 1024
// Threads launched in each block.
#define THREADS_PER_BLOCK  512
// Number of blocks needed to cover N elements.
// Parenthesized so the quotient stays intact when the macro is expanded
// inside a larger expression (e.g. `x / BLOCKS_PER_GRID`).
// Integer division truncates, so N must be a multiple of THREADS_PER_BLOCK
// for the grid to cover every element.
#define BLOCKS_PER_GRID   (N / THREADS_PER_BLOCK)

// Allocates N ints on the device, launches `kernel` over BLOCKS_PER_GRID
// blocks of THREADS_PER_BLOCK threads to fill them, copies the array back to
// the host and prints every element separated by spaces.
//
// Returns 0 on success. If any CUDA call or the kernel launch fails, the
// error string is written to stderr, the device buffer is released and 1 is
// returned instead of printing uninitialized data.
int main(void) {
  int* device = NULL;
  cudaError_t status = cudaMalloc((void **)&device, N * sizeof(int));
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  kernel<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(device);
  // A kernel launch has no return value; launch errors (e.g. an invalid
  // grid or block size) must be fetched explicitly.
  status = cudaGetLastError();
  if (status != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
    cudaFree(device);
    return 1;
  }

  int host[N];
  // cudaMemcpy blocks until the kernel has finished, so errors raised while
  // the kernel was running are reported by this call.
  status = cudaMemcpy(host, device, N * sizeof(int), cudaMemcpyDeviceToHost);
  // The device buffer is no longer needed whether or not the copy succeeded.
  cudaFree(device);
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  for (int i = 0; i < N; i++)
    printf("%d ", host[i]);

  // Wait for a key press before exiting.
  getchar();
  return 0;
}

หากจะนำข้อความไปใช้ ต้องแสดงที่มา และห้ามใช้ในเชิงพาณิชย์

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s