Unlocking GPU Performance: How Handwritten PTX Code Enhances CUDA Kernels
Introduction
When it comes to GPU programming, CUDA gives developers an efficient way to harness the power of NVIDIA GPUs. CUDA's abstraction simplifies writing parallel code, but for developers chasing maximum performance, the kernels produced by the CUDA compiler sometimes fall short. Handwritten PTX (Parallel Thread Execution) code, NVIDIA's low-level virtual assembly language, can be the key to closing that gap. In this article, we demonstrate how PTX can speed up matrix multiplication and explore why these optimizations are often beyond the reach of the CUDA compiler.
Benchmarking CUDA Kernels vs. PTX
For our benchmark, we compared the performance of two implementations of 1024x1024 matrix multiplication:
- CUDA kernels written in high-level C++.
- Manually optimized PTX code.
The benchmarks were conducted for both real and complex matrices on an NVIDIA GeForce RTX 2080 Ti GPU. The kernels we compared are listed below.
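As a point of reference, the sketch below shows the kind of timing harness such a comparison typically uses, based on CUDA events. The block size, iteration count, and overall structure here are our assumptions for illustration, not the original benchmark code.

// Timing sketch (assumptions noted above): average kernel time via CUDA events.
#include <cuda_runtime.h>

__global__ void mul_matrix_real(float* c, float* a, float* b, int M, int N, int K); // listing below

float time_mul_matrix_real(float* d_c, float* d_a, float* d_b, int M, int N, int K, int iters)
{
    const int threads = 256;                               // assumed block size
    const int blocks  = (M * N + threads - 1) / threads;   // one thread per output element

    // Warm-up launch so one-time startup costs are not timed.
    mul_matrix_real<<<blocks, threads>>>(d_c, d_a, d_b, M, N, K);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int it = 0; it < iters; ++it) {
        mul_matrix_real<<<blocks, threads>>>(d_c, d_a, d_b, M, N, K);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);                // elapsed time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;                                     // average time per launch
}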
Real Matrix Multiplication
CUDA Kernel for Real Matrix Multiplication
__global__ void mul_matrix_real(float* c, float* a, float* b, int M, int N, int K)
{
    float cc;
    int i, x, y, z;

    // One thread computes one element of the M x N result matrix c = a * b.
    i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= (M * N)) { return; }

    y = i / N;      // row index in c
    x = i % N;      // column index in c

    cc = 0.0f;
    for (z = 0; z < K; ++z)
    {
        cc += a[z + y * K] * b[z * N + x];   // row y of a times column x of b
    }
    c[i] = cc;
}
Handwritten PTX for Real Matrix Multiplication
.version 7.5
.target sm_75, texmode_independent
.address_size 64
.entry ptx_mul_matrix_real (
.param .u64 c_addr,
.param .u64 a_addr,
.param .u64 b_addr,
.param .u32 M,
.param .u32 N,
.param .u32 K)
{
.shared .f32 s_b[1024];
.reg .pred p;
.reg .b64 $c_addr, $a_addr, $b_addr, d_addr;
.reg .b32 $M, $N, $K;
.reg .b32 ii, i, j, y, x;
.reg .f32 a, b, s;
.reg .u32 $ctaid_x, $ntid_x, $tid_x;
LABEL_ptx_mul_matrix_real:
// load kernel parameters
ld.param.u64 $c_addr, [c_addr];
ld.param.u64 $a_addr, [a_addr];
ld.param.u64 $b_addr, [b_addr];
ld.param.s32 $M, [M];
ld.param.s32 $N, [N];
ld.param.s32 $K, [K];
// global thread index: ii = ctaid.x * ntid.x + tid.x
mov.u32 $ctaid_x, %ctaid.x;
mov.u32 $ntid_x, %ntid.x;
mov.u32 $tid_x, %tid.x;
mad.lo.u32 ii, $ctaid_x, $ntid_x, $tid_x;
// exit threads that fall outside the M x N output
mul.lo.u32 j, $M, $N;
setp.hs.u32 p, ii, j;
@p exit;
// y = output row, x = output column
div.u32 y, ii, $N;
rem.u32 x, ii, $N;
// stage b[tid.x * N + x] into shared memory at s_b[x]
mad.lo.u32 i, $tid_x, $N, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $b_addr, d_addr, 4, $b_addr;
ld.global.f32 b, [$b_addr];
st.shared.f32 s_b[x], b;
// make the staged values visible to the whole block
bar.sync 0;
// point $a_addr at a[y * K + x]; zero the accumulator and loop counter
mad.lo.u32 i, y, $K, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $a_addr, d_addr, 4, $a_addr;
mov.f32 s, 0.0;
mov.u32 i, 0;
// inner loop: s += a[...] * s_b[i] for i = 0 .. K-1
Label_while_ptx_mul_matrix_real:
ld.global.f32 a, [$a_addr];
ld.shared.f32 b, s_b[i];
mad.rn.f32 s, a, b, s;
add.u64 $a_addr, $a_addr, 4;
add.u32 i, i, 1;
setp.lo.u32 p, i, $K;
@p bra Label_while_ptx_mul_matrix_real;
// write the accumulated value to c[ii]
cvt.u64.u32 d_addr, ii;
mad.lo.u64 d_addr, d_addr, 4, $c_addr;
st.global.f32 [d_addr], s;
exit;
}
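Unlike a __global__ function, a handwritten .entry is not launched with the <<<...>>> syntax; it has to be loaded and launched through the CUDA Driver API. The sketch below shows one way to do that. The file name, the omitted error handling, and the block size of 1024 (chosen to match N and the 1024-element shared tile) are our assumptions, not part of the original benchmark code.

// Driver API sketch (assumptions noted above): load the PTX text and launch its entry point.
#include <cuda.h>

void launch_ptx_mul_matrix_real(CUdeviceptr d_c, CUdeviceptr d_a, CUdeviceptr d_b,
                                unsigned int M, unsigned int N, unsigned int K)
{
    // Assumes cuInit(0) has been called and a CUDA context is current.
    CUmodule   module;
    CUfunction kernel;
    cuModuleLoad(&module, "mul_matrix_real.ptx");              // assumed file name
    cuModuleGetFunction(&kernel, module, "ptx_mul_matrix_real");

    void* args[] = { &d_c, &d_a, &d_b, &M, &N, &K };            // matches the .param list

    const unsigned int threads = 1024;                          // assumed block size (= N)
    const unsigned int blocks  = (M * N + threads - 1) / threads;
    cuLaunchKernel(kernel,
                   blocks, 1, 1,                                // grid dimensions
                   threads, 1, 1,                               // block dimensions
                   0, nullptr,                                  // dynamic shared memory, stream
                   args, nullptr);
    cuCtxSynchronize();
    cuModuleUnload(module);
}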
Complex Matrix Multiplication
CUDA Kernel for Complex Matrix Multiplication
__global__ void mul_matrix_complex(float* c, float* a, float* b, int M, int N, int K)
{
    float a_r, a_i, b_r, b_i, c_r, c_i;
    int i, x, y, z, o_a, o_b;

    // One thread computes one complex element of the M x N result matrix.
    // Real and imaginary parts are interleaved: element k occupies [2k] and [2k + 1].
    i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= (M * N)) { return; }

    y = i / N;      // row index in c
    x = i % N;      // column index in c

    c_r = 0.0f;
    c_i = 0.0f;
    for (z = 0; z < K; ++z)
    {
        o_a = 2 * (z + y * K);     // offset of a[y][z]
        o_b = 2 * (z * N + x);     // offset of b[z][x]
        a_r = a[o_a];
        a_i = a[o_a + 1];
        b_r = b[o_b];
        b_i = b[o_b + 1];
        c_r += a_r * b_r - a_i * b_i;
        c_i += a_i * b_r + a_r * b_i;
    }
    c[i * 2] = c_r;
    c[i * 2 + 1] = c_i;
}
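With the real and imaginary parts interleaved (element i occupies c[2*i] and c[2*i + 1]), layout mistakes are easy to make. The host-side sketch below is one way to validate the GPU output against a naive CPU reference; it is illustrative code with an arbitrary tolerance, not part of the benchmark.

// CPU reference check (illustrative helper): compares the GPU result for an
// interleaved complex matrix product against a straightforward CPU loop.
#include <cmath>
#include <vector>

bool check_complex_result(const std::vector<float>& a,   // M x K, interleaved re/im
                          const std::vector<float>& b,   // K x N, interleaved re/im
                          const std::vector<float>& c,   // M x N, interleaved re/im (from GPU)
                          int M, int N, int K, float tol = 1e-3f)
{
    for (int y = 0; y < M; ++y) {
        for (int x = 0; x < N; ++x) {
            float cr = 0.0f, ci = 0.0f;
            for (int z = 0; z < K; ++z) {
                float ar = a[2 * (y * K + z)], ai = a[2 * (y * K + z) + 1];
                float br = b[2 * (z * N + x)], bi = b[2 * (z * N + x) + 1];
                cr += ar * br - ai * bi;
                ci += ai * br + ar * bi;
            }
            int i = y * N + x;
            if (std::fabs(c[2 * i] - cr) > tol || std::fabs(c[2 * i + 1] - ci) > tol)
                return false;
        }
    }
    return true;
}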
Handwritten PTX for Complex Matrix Multiplication
.entry ptx_mul_matrix_complex (
.param .u64 c_addr,
.param .u64 a_addr,
.param .u64 b_addr,
.param .u32 M,
.param .u32 N,
.param .u32 K)
{
.shared .f32 s_b[2048];
.reg .pred p;
.reg .u64 $c_addr, $a_addr, $b_addr, d_addr;
.reg .b32 $M, $N, $K;
.reg .b32 ii, i, j, y, x;
.reg .f32 a1, a2, a_r, a_i, b_r, b_i, s_f_r, s_f_i;
.reg .u32 $ctaid_x, $ntid_x, $tid_x;
LABEL_ptx_mul_matrix_complex:
//------------- load kernel parameters
ld.param.u64 $c_addr, [c_addr];
ld.param.u64 $a_addr, [a_addr];
ld.param.u64 $b_addr, [b_addr];
ld.param.s32 $M, [M];
ld.param.s32 $N, [N];
ld.param.s32 $K, [K];
//------------- global thread index: ii = ctaid.x * ntid.x + tid.x
mov.u32 $ctaid_x, %ctaid.x;
mov.u32 $ntid_x, %ntid.x;
mov.u32 $tid_x, %tid.x;
mad.lo.u32 ii, $ctaid_x, $ntid_x, $tid_x;
//------------- exit threads that fall outside the M x N output
mul.lo.u32 j, $M, $N;
setp.hs.u32 p, ii, j;
@p exit;
//--- y = output row, x = output column
div.u32 y, ii, $N;
rem.u32 x, ii, $N;
//--- stage one complex element of b into shared memory at s_b[2 * x]
shl.b32 j, x, 1;
mad.lo.u32 i, $tid_x, $N, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $b_addr, d_addr, 8, $b_addr;
ld.global.v2.f32 {b_r, b_i}, [$b_addr];
st.shared.v2.f32 s_b[j], {b_r, b_i};
//--- make the staged values visible to the whole block
bar.sync 0;
//--- point $a_addr at a[y * K + x] (8 bytes per complex element)
mad.lo.u32 i, y, $K, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $a_addr, d_addr, 8, $a_addr;
//--- zero the accumulators and the loop counter
mov.f32 s_f_r, 0.0;
mov.f32 s_f_i, 0.0;
mov.u32 i, 0;
Label_while_ptx_mul_matrix_complex:
shl.b32 j, i, 1;
ld.global.v2.f32 {a_r, a_i}, [$a_addr];
ld.shared.v2.f32 {b_r, b_i}, s_b[j];
//--- complex multiply-accumulate: (s_f_r, s_f_i) += a * b
mul.rn.f32 a2, a_r, b_r;
mul.rn.f32 a1, a_i, b_i;
sub.f32 a1, a2, a1;
add.f32 s_f_r, a1, s_f_r;
mul.rn.f32 a2, a_i, b_r;
mul.rn.f32 a1, a_r, b_i;
add.f32 a1, a2, a1;
add.f32 s_f_i, a1, s_f_i;
//--- advance to the next complex element of a
add.u64 $a_addr, $a_addr, 8;
//---
add.u32 i, i, 1;
setp.lo.u32 p, i, $K;
@p bra Label_while_ptx_mul_matrix_complex;
cvt.u64.u32 d_addr, ii;
mad.lo.u64 d_addr, d_addr, 8, $c_addr;
st.global.v2.f32 [d_addr], {s_f_r,s_f_i};
exit;
}
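Before wiring handwritten PTX into an application, it is worth assembling it offline to catch syntax errors and inspect resource usage. One way to do this (file name assumed) is:

ptxas -arch=sm_75 -v mul_matrix.ptx -o mul_matrix.cubin

The -v flag reports the registers and shared memory used by each entry point, which is useful when reasoning about occupancy.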
Why PTX Outperforms CUDA (Especially for Complex Matrices)
The performance gain for complex matrices is even higher, for several reasons:
- Complex Arithmetic Efficiency: Handwritten PTX spells out the exact multiply/add sequence for each complex product, so no redundant intermediate operations are introduced.
- Memory Access Patterns: PTX handles the interleaved real and imaginary parts directly, loading and storing each complex number as a 64-bit pair (ld.global.v2.f32, st.global.v2.f32) and keeping accesses coalesced.
- Instruction Utilization: The CUDA compiler does not always choose vectorized loads or the exact instruction variants a kernel needs, whereas handwritten PTX selects them explicitly (a float2-based alternative is sketched after this list).
- Fine-Grained Control: PTX gives the developer control over register usage, shared-memory layout, and the exact memory access pattern. In contrast, the CUDA compiler applies general heuristics that may not suit a specific workload.
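For comparison, the interleaved accesses that the PTX expresses directly with ld.global.v2.f32 can sometimes be obtained from the compiler by operating on float2 values, as in the sketch below. This kernel is our illustration, not part of the benchmark, and whether nvcc actually emits vectorized loads for it depends on alignment and compiler version, which is exactly the kind of uncertainty handwritten PTX removes.

// Sketch (not from the benchmark): a complex kernel written with float2 so that
// each complex number is read as one 64-bit value, mirroring the
// ld.global.v2.f32 / st.global.v2.f32 pattern used in the handwritten PTX.
__global__ void mul_matrix_complex_f2(float2* c, const float2* a, const float2* b,
                                      int M, int N, int K)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= (M * N)) { return; }

    int y = i / N;
    int x = i % N;

    float2 acc = make_float2(0.0f, 0.0f);
    for (int z = 0; z < K; ++z)
    {
        float2 av = a[y * K + z];             // one 64-bit load per complex element
        float2 bv = b[z * N + x];
        acc.x += av.x * bv.x - av.y * bv.y;   // real part
        acc.y += av.y * bv.x + av.x * bv.y;   // imaginary part
    }
    c[i] = acc;
}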
Limitations of Handwritten PTX
While PTX offers substantial performance improvements, it comes with challenges:
- Complexity: Writing and debugging PTX requires expertise in GPU architecture.
- Portability: Hand-tuned PTX targets a specific architecture (sm_75 here); newer GPUs may require retuning or a rewrite to stay optimal.
- Development Time: Writing PTX is significantly more time-consuming than using CUDA.
- Expertise Required: Extracting these gains demands detailed knowledge of the GPU memory hierarchy and instruction set. We at CodeArtworks specialize in optimizing CUDA applications and porting kernels to PTX; engaging professional help like ours ensures that your GPU code achieves maximum performance without sacrificing maintainability or portability.
Conclusion
Handwritten PTX code is a powerful tool for developers aiming to maximize GPU performance. Our experiment with matrix multiplication demonstrates that PTX can deliver significant speedups over CUDA kernels, particularly for complex-valued matrices. However, the trade-offs in complexity and portability must be carefully considered. For performance-critical applications, mastering PTX can unlock the full potential of NVIDIA GPUs.