【CUDA】Memory Coalescing（内存合并）

内存合并的基础是DRAM Burst。DRAM的地址空间被分成不同的Burst段，当处理器从DRAM访问数据时，位于相同段的其他数据也会被传送到处理器，如下图所示：

利用这一特性，当我们从全局内存搬运数据到共享内存或者寄存器时，相邻的线程应该尽量访问相邻的数据，从而达到内存合并的目的，如下图所示：

如下为实现内存合并的一般性规则：

即，访问元素的id以能以 (X + threadIdx.x)的形式表示，其中X是与threadIdx.x无关的量。

如下是两组实验，main函数详见博客：https://blog.csdn.net/xll_bit/article/details/117700829?spm=1001.2014.3001.5501

本文只粘贴核函数的实现：

//单个线程对数组a按列访问，每个线程在同一指令下访问同一行中相邻元素，能进行内存合并访问

__global__ void kernel_globalx(my_type *a, my_type *b, my_type *c,

const int M, const int N, const int K){

int x = blockIdx.x * blockDim.x + threadIdx.x;

int y = blockIdx.y * blockDim.y + threadIdx.y;

if(x < K && y < M){

my_type tmp = 0;

for(int n = 0; n < N; n++){

tmp += a[y * N + n];

}

//c[y * K + x] = tmp;

c[0] = tmp;

}

//单个线程对数组a按行访问，每个线程在同一指令下访问同一列中相邻元素，不能进行内存合并

__global__ void kernel_globaly(my_type *a, my_type *b, my_type *c,

const int M, const int N, const int K){

int x = blockIdx.x * blockDim.x + threadIdx.x;

int y = blockIdx.y * blockDim.y + threadIdx.y;

if(x < K && y < M){

my_type tmp = 0;

for(int n = 0; n < N; n++){

tmp += a[n * K + x];

}

//c[y * K + x] = tmp;

c[0] = tmp;

}

/* 运行结果

gpu globalx: 16.0523

gpu globaly: 18.9172

从运行时间来看，globaly的运行时间长于globalx，和上面理论一致。

另外，在使用tile形式将全局内存搬到共享内存的过程中时，对两个矩阵的元素访问是都可以进行内存合并的

以下为相关的核函数以及结果：

__global__ void kernel_shared1(my_type *a, my_type *b, my_type *c,

const int M, const int N, const int K){

__shared__ my_type s_a[TILE_SIZE][TILE_SIZE];

__shared__ my_type s_b[TILE_SIZE][TILE_SIZE];

int x = blockIdx.x * blockDim.x + threadIdx.x;

int y = blockIdx.y * blockDim.y + threadIdx.y;

int tx = threadIdx.x;

int ty = threadIdx.y;

my_type tmp = 0;

for(int n = 0; n < N; n += TILE_SIZE){

s_a[ty][tx] = 0;

int x1 = y *N + n + tx;

int x2 = (n + ty) * K + x;

if( y < M && n + tx < N){

s_a[ty][tx] = a[ x1];

}

__syncthreads();

for(int i = 0; i < TILE_SIZE; i ++){

tmp += s_a[ty][i];

}

__syncthreads();

}

__global__ void kernel_shared2(my_type *a, my_type *b, my_type *c,

const int M, const int N, const int K){

__shared__ my_type s_a[TILE_SIZE][TILE_SIZE];

__shared__ my_type s_b[TILE_SIZE][TILE_SIZE];

int x = blockIdx.x * blockDim.x + threadIdx.x;

int y = blockIdx.y * blockDim.y + threadIdx.y;

int tx = threadIdx.x;

int ty = threadIdx.y;

my_type tmp = 0;

for(int n = 0; n < N; n += TILE_SIZE){

s_b[ty][tx] = 0;

int x1 = y *N + n + tx;

int x2 = (n + ty) * K + x;

if (x < K && n + ty < N){

s_b[ty][tx] = b[ x2 ];

}

__syncthreads();

for(int i = 0; i < TILE_SIZE; i ++){

tmp += s_b[i][tx];

}

__syncthreads();

}

/* 运行结果

gpu shared1: 3.42726

gpu shared2: 4.54283

相关发现

QQ是怎么算等级的一颗星星是几级啊

[其他]手机QQ哪个版本好用

3DMAX怎么渲染图-3DM软件

Limit手表品牌 Limit手表怎么样？

自然伙伴