CNSS Recruit

我要入门

向量求和 - CUDA

最开始就写了个最普通的版本：

__global__ void vector_add(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ C,
                           int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

感觉这么写已经并行的不行了，那就应该关注一下其他的问题了。比如能不能保证正确地执行，在上面这种情况下向量多长，就要用多少个 thread，那 thread 没分配够不是炸了，所以修改一下：

__global__ void vector_add(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ C,
                           int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int step = blockDim.x * gridDim.x;

    for (int i = idx; i < N; i += step) {
        C[i] = A[i] + B[i];
    }
}

这个 kernel 应该就能处理 thread 没分配够的问题了。单纯加法也没有数据依赖，应该就这样了吧。

ncu 的性能分析工具感觉好强，强到看不太懂指标（，但是直接生成的报告还是很容易看懂的，甚至还有优化提示。那次跑出来大概是：

Memory Throughput: 93.77%
Achieved Occupancy: 90.80%
提示还是先去看 DRAM

还是访存有问题，又试了一种分块的方法：

__global__ void vector_add(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ C,
                           int N) {
    const int VEC_SIZE = 4;
    int block_start_idx = blockIdx.x * blockDim.x * VEC_SIZE;
    int thread_start_idx = block_start_idx + threadIdx.x;
    int stride = gridDim.x * blockDim.x * VEC_SIZE;

    for (int i = thread_start_idx; i < N; i += stride) {
        if (i + (VEC_SIZE - 1) < N) {
            float a[VEC_SIZE];
            float b[VEC_SIZE];
            for (int j = 0; j < VEC_SIZE; ++j) {
                a[j] = A[i + j * blockDim.x];
                b[j] = B[i + j * blockDim.x];
            }
            for (int j = 0; j < VEC_SIZE; ++j) {
                C[i + j * blockDim.x] = a[j] + b[j];
            }
        } else {
            for (int j = 0; j < VEC_SIZE; ++j) {
                int current_idx = i + j * blockDim.x;
                if (current_idx < N) {
                    C[current_idx] = A[current_idx] + B[current_idx];
                }
            }
        }
    }
}

但是实际性能还不如原版。

Softmax - CPU

照着代码把合适的编译指示填上去：

void vanilla_softmax(float *input, float *output, int N) {
    if (N <= 0) {
        return;
    }

    float max_val = input[0];
    float sum_exp = 0.0f;

#pragma omp parallel
    {
#pragma omp for reduction(max : max_val)
        for (int i = 1; i < N; i++) {
            if (input[i] > max_val) {
                max_val = input[i];
            }
        }

#pragma omp for simd reduction(+: sum_exp)
        for (int i = 0; i < N; i++) {
            float exp_val = expf(input[i] - max_val);
            output[i] = exp_val;
            sum_exp += exp_val;
        }

#pragma omp for simd
        for (int i = 0; i < N; i++) {
            output[i] /= sum_exp;
        }
    }
}

进出并行区域是有开销的，没有数据依赖的循环可以用 simd 继续优化，编译指示全部糊上去就有很好的效果了。

如果要再优化，就是在 if 判断的时候加一点分支预测，也就是：

if (input[i] > max_val) [[unlikely]] {
    max_val = input[i];
}

假设数据是随机的，那不更新的数学期望是大于更新的，所以用 unlikely。

Softmax - CUDA

要把这个函数转成 CUDA 代码，最先卡住的是同步问题。__syncthreads() 只能做到块内同步，如果有多个 block 还是会有问题。之前看文档看到过 cuda::barrier 能做 grid 级别同步，但是那个 API 在 cuda/barrier 里，那么问题就是不用外部库的时候怎么确保多个块是同步的。

kernel 结束的时候肯定是同步的，这应该是最基本的约束。那就让 solve() 里只起一个 block，用块内同步先逃课：

#include <cuda_runtime.h>
#include <float.h>

__global__ void reduceSoftMax(const float* input, float* output, int N) {
    __shared__ float s_data[1024];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    float blockMax = input[idx];
    for (int i = tid; i < N; i += blockDim.x) {
        blockMax = fmaxf(blockMax, input[i]);
    }
    s_data[tid] = blockMax;
    __syncthreads();

    for (int step = blockDim.x / 2; step > 0; step /= 2) {
        if (tid < step) {
            s_data[tid] = fmaxf(s_data[tid], s_data[tid + step]);
        }
        __syncthreads();
    }
    const float MAX = s_data[0];

    float blockSum = 0.0f;
    for (int i = tid; i < N; i += blockDim.x) {
        float val = expf(input[i] - MAX);
        output[i] = val;
        blockSum += val;
    }
    s_data[tid] = blockSum;
    __syncthreads();

    for (int step = blockDim.x / 2; step > 0; step /= 2) {
        if (tid < step) {
            s_data[tid] += s_data[tid + step];
        }
        __syncthreads();
    }
    float SUM = s_data[0];

    for (int i = tid; i < N; i += blockDim.x) {
        output[i] /= SUM;
    }
}

extern "C" void solve(const float* input, float* output, int N) {
    reduceSoftMax<<<1, 1024>>>(input, output, N);
    cudaDeviceSynchronize();
}

优化空间就是让这个函数能用多个块并且保证能正确使用，然后一个 warp 内的 thread 竟然可以直接读寄存器，那一直 reduce 下来就能 warp/block 一路 reduce 下来，寄存器太超模了肯定会有提升。而且还有 __shfl_down_sync 这种比较方便的 API，但是工作量太大了写不动了（

__device__ float warpReduceMax(float val) {
    for (int step = warpSize / 2; step > 0; step /= 2) {
        val = fmaxf(val, __shfl_down_sync(0xffffffff, val, step));
    }
    return val;
}

__device__ float warpReduceSum(float val) {
    for (int step = warpSize / 2; step > 0; step /= 2) {
        val += __shfl_down_sync(0xffffffff, val, step);
    }
    return val;
}

矩阵乘法 - CPU

第一版的代码就是查了一下 omp 的用法，先实现一个普通的乘法。最开始甚至还是 vector<vector<float>>：

第一版：vector 套 vector

#pragma omp parallel for shared(A, B, C)
for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
        for (int k = 0; k < K; ++k) {
            C[i][j] += A[i][k] * B[k][j];
        }
    }
}

当时跑出来可以粗略估计一下，只用 1/6 左右的时间完成了相同的计算，差别只是添加了一行 #pragma omp parallel for。

但是用时还是太长。要优化，那么根据计组，现代处理器为了减少访存时间肯定是有 cache 的，要发挥局部性的优势，矩阵用 vector 表示显然效率不够高，所以换成 C 风格数组分配连续内存。

为了确认是缓存未命中导致的，当时还用 perf 跑了一下，能看到访存占了大部分时间。

再往下就是顺着 cache 继续想。第二个矩阵 B 在运算的时候是按列取的，如果让这里也访问连续地址就还能优化，这个时候就要狠狠地转置这个矩阵：

float A[2048][1024];
float B[1024][2048];
float B_T[2048][1024];
float C[2048][2048];

void trans_B() {
#pragma omp parallel for
    for (int i = 0; i < K; ++i) {
        for (int j = 0; j < N; ++j) {
            B_T[j][i] = B[i][j];
        }
    }
}

void multi_with_tran() {
#pragma omp parallel for
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < K; ++k) {
                sum += A[i][k] * B_T[j][k];
            }
            C[i][j] = sum;
        }
    }
}

最后的测试逻辑就是先测无转置，再测有转置：

reset_matrix_C();
double start_time_no_transpose = omp_get_wtime();
multi();
double end_time_no_transpose = omp_get_wtime();
printf("无转置乘法: %.6f 秒\n", end_time_no_transpose - start_time_no_transpose);

reset_matrix_C();
double transpose_start = omp_get_wtime();
trans_B();
double transpose_end = omp_get_wtime();

double start_time_with_transpose = omp_get_wtime();
multi_with_tran();
double end_time_with_transpose = omp_get_wtime();

printf("转置乘法: %.6f 秒\n", end_time_with_transpose - start_time_with_transpose);
printf("转置: %.6f 秒\n", transpose_end - transpose_start);

还是比较有提升的。

那么顺着这个思路继续嗯优化缓存，就是分块了，但是感觉和多线程没有很大关系了（，再按照我电脑的 L1、L2 分块应该就到头了。

但是为什么之前的 flag 没分 😭😭😭

矩阵乘法 - CUDA

这题一开始就是最普通的二维线程映射：

__global__ void matrix_multiplication_kernel(const float* A, const float* B,
                                             float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < K) {
        float sum = 0.0f;
        for (int i = 0; i < N; ++i) {
            sum += A[row * N + i] * B[i * K + col];
        }
        C[row * K + col] = sum;
    }
}

看了下文档，索引和线程 ID 的计算关系为 ID = x + y * Dx。对于大小为 (Dx, Dy, Dz) 的三维块，索引为 (x, y, z) 的线程，其线程 ID 是 x + y * Dx + z * Dx * Dy。也就是按 x 索引到 Dx 之后 y 索引进一，y 索引到 Dy 之后 z 进一。

也就是说矩阵用二维线程索引的并行性和一维索引应该是等价的，但是这个时候就要求 <<<>>> 配置的参数要和 kernel 逻辑对得上，不然整套索引就乱了。好在 solve() 已经给出了这部分逻辑：

extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);
    cudaDeviceSynchronize();
}

然后看文档里 shared memory 的例子，刚好也是矩阵乘法。不过这个优化后来好像在另外一题用了，所以这里先去试了 float4。

之前交 CPU 的时候提到了 SIMD，GPU 肯定也是有这种东西的，于是尝试用 float4 弄了一个版本：

__global__ void matrix_multiplication_kernel(const float* A, const float* B,
                                             float* C, int M, int N, int K) {
    const int VEC_SIZE = 4;

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col_vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int matrix_col_index = col_vec_idx * VEC_SIZE;

    if (row < M && matrix_col_index < K) {
        float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

        for (int i = 0; i < N; ++i) {
            const float a_val = A[row * N + i];
            const float4 b_vals = ((const float4*)B)[i * (K / VEC_SIZE) + col_vec_idx];

            sum.x += a_val * b_vals.x;
            sum.y += a_val * b_vals.y;
            sum.z += a_val * b_vals.z;
            sum.w += a_val * b_vals.w;
        }

        ((float4*)C)[row * (K / VEC_SIZE) + col_vec_idx] = sum;
    }
}

基本就是同时处理四个元素，很直观的并行。每个 thread 都完整地执行代码，那实际上就是增大单个 thread 的负载然后降低访存压力。把原版设置为 baseline 跑了一下 ncu，吞吐量大概 +118%，时间也少了 60%。

但是直接类型强转，传个非对齐的数据就不行了。为了适配不对齐数据的向量加载，要引入一堆 if 判断，性能疑似会暴跌，阻塞很严重。那不如改成 16 字节对齐的时候用 float4，不对齐就用原来的版本：

__global__ void matrix_multiplication_kernel(const float* A, const float* B,
                                             float* C, int M, int N, int K) {
    const int VEC_SIZE = 4;
    __const__ bool is_k_aligned = (K % VEC_SIZE == 0);

    if (is_k_aligned) {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col_vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
        int matrix_col_index = col_vec_idx * VEC_SIZE;

        if (row < M && matrix_col_index < K) {
            float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

            for (int i = 0; i < N; ++i) {
                const float a_val = A[row * N + i];
                const float4 b_vals = ((const float4*)B)[i * (K / VEC_SIZE) + col_vec_idx];

                sum.x += a_val * b_vals.x;
                sum.y += a_val * b_vals.y;
                sum.z += a_val * b_vals.z;
                sum.w += a_val * b_vals.w;
            }

            ((float4*)C)[row * (K / VEC_SIZE) + col_vec_idx] = sum;
        }
    } else {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;

        if (row < M && col < K) {
            float sum = 0.0f;
            for (int i = 0; i < N; ++i) {
                sum += A[row * N + i] * B[i * K + col];
            }
            C[row * K + col] = sum;
        }
    }
}

感觉写得太乱了，而且鼠鼠找不到比较好的参考实现，所以只能文档看到哪优化到哪了 😢😢😢😢

共享家园

这题其实就是在找 bug。参考文档给出的 shared memory 矩阵乘法例子，很容易就能发现少了一个 __syncthreads()，计算后不同步就会导致 RAW 问题，产生数据冒险。

改完之后大概是这样：

__global__ void incorrect_matrix_multiplication_kernel(const float* A, const float* B,
                                                       float* C, int M, int N, int K) {
    int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    int ltid_x = threadIdx.x;
    int ltid_y = threadIdx.y;

    __shared__ float sA[TILE_SIZE][TILE_SIZE];
    __shared__ float sB[TILE_SIZE][TILE_SIZE];
    float acc = 0.0f;

    for (int n = 0; n < N; n += TILE_SIZE) {
        sA[ltid_y][ltid_x] = (tid_y < M && ltid_x + n < N) ? A[tid_y * N + ltid_x + n] : 0.0f;
        sB[ltid_y][ltid_x] = (n + ltid_y < N && tid_x < K) ? B[(n + ltid_y) * K + tid_x] : 0.0f;
        __syncthreads();

        if (tid_y < M && tid_x < K) {
            for (int t = 0; t < TILE_SIZE; t++) {
                acc += sA[ltid_y][t] * sB[t][ltid_x];
            }
        }

        __syncthreads();
    }

    if (tid_y < M && tid_x < K) {
        C[tid_y * K + tid_x] = acc;
    }
}

另一个问题是 __syncthreads() 的作用域是整个 block。也就是说如果一个 block 中的一部分 thread 先 return 了，其他线程还在等同步，运算结果就会出问题。理论上应该死锁，但是这个 kernel 还是能正常结束，有点奇诡（

Flash Attention

昨晚写完 md 没检查（，这里换成一个能直接丢 leetgpu 上跑的版本。

核心就是把逻辑维度拉成 (1, N, h) 的 grid，把 attention 拆成 h 个头上 N 行 softmax 计算。Q * K^T、softmax、Attention * V 这几步都还是比较典型的并行划分，只是 head 上的数据在内存里是交错的，所以偏移量计算更烦一点。

#include <cuda_runtime.h>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cmath>

__device__ inline float warpReduceMax(float val) {
    for (int step = 16; step > 0; step /= 2) {
        val = fmaxf(val, __shfl_down_sync(0xffffffff, val, step));
    }
    return __shfl_sync(0xffffffff, val, 0);
}

__device__ inline float warpReduceSum(float val) {
    for (int step = 16; step > 0; step /= 2) {
        val += __shfl_down_sync(0xffffffff, val, step);
    }
    return __shfl_sync(0xffffffff, val, 0);
}

__global__ void attention_fused_kernel(const float* Q, const float* K, const float* V,
                                       float* output,
                                       int N, int d_model, int h, int Br, int Bc) {
    int head = blockIdx.z;
    int row_block = blockIdx.y;
    if (head >= h || row_block >= (N + Br - 1) / Br) return;

    const int w = d_model / h;
    const float scale = 1.0f / sqrtf((float)w);
    int col_off = head * w;

    extern __shared__ float sram[];
    int tid = threadIdx.x;

    float* Qp = sram;
    float* Kp = Qp + Br * w;
    float* Vp = Kp + Bc * w;
    float* M_i = Vp + Bc * w;
    float* L_i = M_i + Br;
    float* O_i = L_i + Br;
    float* Sp = O_i + Br * w;
    float* warpReduce = Sp + Br * Bc;

    const int brTileRow = row_block * Br;

    for (int i = tid; i < Br; i += blockDim.x) {
        M_i[i] = -INFINITY;
        L_i[i] = 0.0f;
    }
    for (int i = tid; i < Br * w; i += blockDim.x) {
        O_i[i] = 0.0f;
    }
    __syncthreads();

    for (int i = tid; i < Br * w; i += blockDim.x) {
        int r = i / w;
        int c = i % w;
        if (brTileRow + r < N) {
            Qp[i] = Q[(brTileRow + r) * d_model + col_off + c];
        } else {
            Qp[i] = 0.0f;
        }
    }
    __syncthreads();

    int Tc = (N + Bc - 1) / Bc;

    for (int tile_batch = 0; tile_batch < Tc; tile_batch++) {
        int bcTileRow = tile_batch * Bc;

        for (int i = tid; i < Bc * w; i += blockDim.x) {
            int r = i / w;
            int c = i % w;
            if (bcTileRow + r < N) {
                Kp[i] = K[(bcTileRow + r) * d_model + col_off + c];
                Vp[i] = V[(bcTileRow + r) * d_model + col_off + c];
            } else {
                Kp[i] = 0.0f;
                Vp[i] = 0.0f;
            }
        }
        __syncthreads();

        for (int i = tid; i < Br * Bc; i += blockDim.x) {
            int r = i / Bc;
            int c = i % Bc;
            float acc = 0.0f;

            if (brTileRow + r < N && bcTileRow + c < N) {
                for (int k = 0; k < w; ++k) {
                    acc += Qp[r * w + k] * Kp[c * w + k];
                }
                acc *= scale;
            } else {
                acc = -INFINITY;
            }
            Sp[i] = acc;
        }
        __syncthreads();

        for (int r = 0; r < Br; r++) {
            if (brTileRow + r >= N) continue;

            float row_max = -INFINITY;
            for (int c = tid; c < Bc; c += blockDim.x) {
                row_max = fmaxf(row_max, Sp[r * Bc + c]);
            }

            __shared__ float s_max[32];
            int warp_id = tid / 32;
            int lane_id = tid % 32;

            for (int offset = 16; offset > 0; offset /= 2) {
                row_max = fmaxf(row_max, __shfl_down_sync(0xFFFFFFFF, row_max, offset));
            }
            if (lane_id == 0) s_max[warp_id] = row_max;
            __syncthreads();

            if (tid == 0) {
                float final_max = s_max[0];
                for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
                    final_max = fmaxf(final_max, s_max[i]);
                }
                warpReduce[r] = final_max;
            }
            __syncthreads();

            float m_ij = warpReduce[r];
            float m_i_old = M_i[r];
            float m_i_new = fmaxf(m_i_old, m_ij);

            if (tid == 0) M_i[r] = m_i_new;
            __syncthreads();

            float row_sum = 0.0f;
            for (int c = tid; c < Bc; c += blockDim.x) {
                float val = expf(Sp[r * Bc + c] - m_i_new);
                Sp[r * Bc + c] = val;
                row_sum += val;
            }

            for (int offset = 16; offset > 0; offset /= 2) {
                row_sum += __shfl_down_sync(0xFFFFFFFF, row_sum, offset);
            }
            if (lane_id == 0) s_max[warp_id] = row_sum;
            __syncthreads();

            if (tid == 0) {
                float final_sum = s_max[0];
                for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
                    final_sum += s_max[i];
                }
                warpReduce[r] = final_sum;
            }
            __syncthreads();

            float l_ij = warpReduce[r];
            float scale_factor = expf(m_i_old - m_i_new);

            if (tid == 0) {
                L_i[r] = L_i[r] * scale_factor + l_ij;
            }
            __syncthreads();

            for (int c = tid; c < w; c += blockDim.x) {
                O_i[r * w + c] *= scale_factor;
                float acc = 0.0f;
                for (int k = 0; k < Bc; k++) {
                    acc += Sp[r * Bc + k] * Vp[k * w + c];
                }
                O_i[r * w + c] += acc;
            }
            __syncthreads();
        }
    }

    for (int i = tid; i < Br * w; i += blockDim.x) {
        int r = i / w;
        int c = i % w;
        if (brTileRow + r < N && L_i[r] > 0.0f) {
            float final_val = O_i[i] / L_i[r];
            output[(brTileRow + r) * d_model + col_off + c] = final_val;
        }
    }
}

extern "C" void solve(const float* Q, const float* K, const float* V, float* output,
                      int N, int d_model, int h) {
    const int Br = 32;
    const int Bc = 32;
    const int w = d_model / h;

    dim3 block(256);
    dim3 grid(1, (N + Br - 1) / Br, h);

    size_t shared_mem =
        (Br * w + 2 * Bc * w + 2 * Br + Br * w + Br * Bc + Br) * sizeof(float);

    attention_fused_kernel<<<grid, block, shared_mem>>>(
        Q, K, V, output, N, d_model, h, Br, Bc
    );
}

其实和之前的题相比，这里主要的难度在维度划分或者说数据依赖分析。矩阵乘法、softmax 都是比较典型的划分，根据数据格式就能比较方便地划分清楚，但是 attention 数据划分到每个 head 上是交错的，所以更偏向一种逻辑维度的划分。

那我们直接拉成 (1, N, h) 的 gridDim，转化成 h 个头的 N 个 softmax 计算。Q * K^T 部分直接从主存拿跨行的数据，那么每个 block 负责一行的 softmax，就比较方便了，直接 reduce 两次。

差不多就是这些了。