Unlocking GPU Performance: How Handwritten PTX Code Enhances CUDA Kernels
Introduction
When it comes to GPU programming, CUDA gives developers an efficient way to harness the power of NVIDIA GPUs. CUDA's abstraction simplifies writing parallel code, but for developers chasing maximum performance, the kernels produced by the CUDA compiler sometimes fall short. Handwritten PTX (Parallel Thread Execution) code, NVIDIA's low-level virtual assembly language, can be the key to closing that gap. In this article, we demonstrate how PTX can speed up matrix multiplication and explore why these optimizations are often beyond the reach of the CUDA compiler.
Benchmarking CUDA Kernels vs. PTX
For our benchmark, we compared the performance of two implementations of 1024x1024 matrix multiplication:
- CUDA kernels written in high-level C++.
- Manually optimized PTX code.
The benchmarks were conducted for both real and complex matrices on an NVIDIA GeForce RTX 2080 Ti GPU. The kernels we compared are listed below.
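As a point of reference, the sketch below shows the kind of timing harness such a comparison typically uses, based on CUDA events. The block size, iteration count, and overall structure here are our assumptions for illustration, not the original benchmark code.

// Timing sketch (assumptions noted above): average kernel time via CUDA events.
#include <cuda_runtime.h>

__global__ void mul_matrix_real(float* c, float* a, float* b, int M, int N, int K); // listing below

float time_mul_matrix_real(float* d_c, float* d_a, float* d_b, int M, int N, int K, int iters)
{
    const int threads = 256;                               // assumed block size
    const int blocks  = (M * N + threads - 1) / threads;   // one thread per output element

    // Warm-up launch so one-time startup costs are not timed.
    mul_matrix_real<<<blocks, threads>>>(d_c, d_a, d_b, M, N, K);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int it = 0; it < iters; ++it) {
        mul_matrix_real<<<blocks, threads>>>(d_c, d_a, d_b, M, N, K);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);                // elapsed time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;                                     // average time per launch
}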
Real Matrix Multiplication
CUDA Kernel for Real Matrix Multiplication
__global__ void mul_matrix_real(float* c, float* a, float* b, int M, int N, int K)
{
    float cc;
    int i, x, y, z;

    // One thread computes one element of the M x N result matrix c = a * b.
    i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= (M * N)) { return; }

    y = i / N;      // row index in c
    x = i % N;      // column index in c

    cc = 0.0f;
    for (z = 0; z < K; ++z)
    {
        cc += a[z + y * K] * b[z * N + x];   // row y of a times column x of b
    }
    c[i] = cc;
}
Handwritten PTX for Real Matrix Multiplication
.version 7.5
.target sm_75, texmode_independent
.address_size 64
.entry ptx_mul_matrix_real (
.param .u64 c_addr,
.param .u64 a_addr,
.param .u64 b_addr,
.param .u32 M,
.param .u32 N,
.param .u32 K)
{
.shared .f32 s_b[1024];
.reg .pred p;
.reg .b64 $c_addr, $a_addr, $b_addr, d_addr;
.reg .b32 $M, $N, $K;
.reg .b32 ii, i, j, y, x;
.reg .f32 a, b, s;
.reg .u32 $ctaid_x, $ntid_x, $tid_x;
LABEL_ptx_mul_matrix_real:
// load kernel parameters
ld.param.u64 $c_addr, [c_addr];
ld.param.u64 $a_addr, [a_addr];
ld.param.u64 $b_addr, [b_addr];
ld.param.s32 $M, [M];
ld.param.s32 $N, [N];
ld.param.s32 $K, [K];
// global thread index: ii = ctaid.x * ntid.x + tid.x
mov.u32 $ctaid_x, %ctaid.x;
mov.u32 $ntid_x, %ntid.x;
mov.u32 $tid_x, %tid.x;
mad.lo.u32 ii, $ctaid_x, $ntid_x, $tid_x;
// exit threads that fall outside the M x N output
mul.lo.u32 j, $M, $N;
setp.hs.u32 p, ii, j;
@p exit;
// y = output row, x = output column
div.u32 y, ii, $N;
rem.u32 x, ii, $N;
// stage b[tid.x * N + x] into shared memory at s_b[x]
mad.lo.u32 i, $tid_x, $N, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $b_addr, d_addr, 4, $b_addr;
ld.global.f32 b, [$b_addr];
st.shared.f32 s_b[x], b;
// make the staged values visible to the whole block
bar.sync 0;
// point $a_addr at a[y * K + x]; zero the accumulator and loop counter
mad.lo.u32 i, y, $K, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $a_addr, d_addr, 4, $a_addr;
mov.f32 s, 0.0;
mov.u32 i, 0;
// inner loop: s += a[...] * s_b[i] for i = 0 .. K-1
Label_while_ptx_mul_matrix_real:
ld.global.f32 a, [$a_addr];
ld.shared.f32 b, s_b[i];
mad.rn.f32 s, a, b, s;
add.u64 $a_addr, $a_addr, 4;
add.u32 i, i, 1;
setp.lo.u32 p, i, $K;
@p bra Label_while_ptx_mul_matrix_real;
// write the accumulated value to c[ii]
cvt.u64.u32 d_addr, ii;
mad.lo.u64 d_addr, d_addr, 4, $c_addr;
st.global.f32 [d_addr], s;
exit;
}
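Unlike a __global__ function, a handwritten .entry is not launched with the <<<...>>> syntax; it has to be loaded and launched through the CUDA Driver API. The sketch below shows one way to do that. The file name, the omitted error handling, and the block size of 1024 (chosen to match N and the 1024-element shared tile) are our assumptions, not part of the original benchmark code.

// Driver API sketch (assumptions noted above): load the PTX text and launch its entry point.
#include <cuda.h>

void launch_ptx_mul_matrix_real(CUdeviceptr d_c, CUdeviceptr d_a, CUdeviceptr d_b,
                                unsigned int M, unsigned int N, unsigned int K)
{
    // Assumes cuInit(0) has been called and a CUDA context is current.
    CUmodule   module;
    CUfunction kernel;
    cuModuleLoad(&module, "mul_matrix_real.ptx");              // assumed file name
    cuModuleGetFunction(&kernel, module, "ptx_mul_matrix_real");

    void* args[] = { &d_c, &d_a, &d_b, &M, &N, &K };            // matches the .param list

    const unsigned int threads = 1024;                          // assumed block size (= N)
    const unsigned int blocks  = (M * N + threads - 1) / threads;
    cuLaunchKernel(kernel,
                   blocks, 1, 1,                                // grid dimensions
                   threads, 1, 1,                               // block dimensions
                   0, nullptr,                                  // dynamic shared memory, stream
                   args, nullptr);
    cuCtxSynchronize();
    cuModuleUnload(module);
}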
Complex Matrix Multiplication
CUDA Kernel for Complex Matrix Multiplication
__global__ void mul_matrix_complex(float* c, float* a, float* b, int M, int N, int K)
{
    float a_r, a_i, b_r, b_i, c_r, c_i;
    int i, x, y, z, o_a, o_b;

    // One thread computes one complex element of the M x N result matrix.
    // Real and imaginary parts are interleaved: element k occupies [2k] and [2k + 1].
    i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= (M * N)) { return; }

    y = i / N;      // row index in c
    x = i % N;      // column index in c

    c_r = 0.0f;
    c_i = 0.0f;
    for (z = 0; z < K; ++z)
    {
        o_a = 2 * (z + y * K);     // offset of a[y][z]
        o_b = 2 * (z * N + x);     // offset of b[z][x]
        a_r = a[o_a];
        a_i = a[o_a + 1];
        b_r = b[o_b];
        b_i = b[o_b + 1];
        c_r += a_r * b_r - a_i * b_i;
        c_i += a_i * b_r + a_r * b_i;
    }
    c[i * 2] = c_r;
    c[i * 2 + 1] = c_i;
}
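With the real and imaginary parts interleaved (element i occupies c[2*i] and c[2*i + 1]), layout mistakes are easy to make. The host-side sketch below is one way to validate the GPU output against a naive CPU reference; it is illustrative code with an arbitrary tolerance, not part of the benchmark.

// CPU reference check (illustrative helper): compares the GPU result for an
// interleaved complex matrix product against a straightforward CPU loop.
#include <cmath>
#include <vector>

bool check_complex_result(const std::vector<float>& a,   // M x K, interleaved re/im
                          const std::vector<float>& b,   // K x N, interleaved re/im
                          const std::vector<float>& c,   // M x N, interleaved re/im (from GPU)
                          int M, int N, int K, float tol = 1e-3f)
{
    for (int y = 0; y < M; ++y) {
        for (int x = 0; x < N; ++x) {
            float cr = 0.0f, ci = 0.0f;
            for (int z = 0; z < K; ++z) {
                float ar = a[2 * (y * K + z)], ai = a[2 * (y * K + z) + 1];
                float br = b[2 * (z * N + x)], bi = b[2 * (z * N + x) + 1];
                cr += ar * br - ai * bi;
                ci += ai * br + ar * bi;
            }
            int i = y * N + x;
            if (std::fabs(c[2 * i] - cr) > tol || std::fabs(c[2 * i + 1] - ci) > tol)
                return false;
        }
    }
    return true;
}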
Handwritten PTX for Complex Matrix Multiplication
.entry ptx_mul_matrix_complex (
.param .u64 c_addr,
.param .u64 a_addr,
.param .u64 b_addr,
.param .u32 M,
.param .u32 N,
.param .u32 K)
{
.shared .f32 s_b[2048];
.reg .pred p;
.reg .u64 $c_addr, $a_addr, $b_addr, d_addr;
.reg .b32 $M, $N, $K;
.reg .b32 ii, i, j, y, x;
.reg .f32 a1, a2, a_r, a_i, b_r, b_i, s_f_r, s_f_i;
.reg .u32 $ctaid_x, $ntid_x, $tid_x;
LABEL_ptx_mul_matrix_complex:
//------------- load kernel parameters
ld.param.u64 $c_addr, [c_addr];
ld.param.u64 $a_addr, [a_addr];
ld.param.u64 $b_addr, [b_addr];
ld.param.s32 $M, [M];
ld.param.s32 $N, [N];
ld.param.s32 $K, [K];
//------------- global thread index: ii = ctaid.x * ntid.x + tid.x
mov.u32 $ctaid_x, %ctaid.x;
mov.u32 $ntid_x, %ntid.x;
mov.u32 $tid_x, %tid.x;
mad.lo.u32 ii, $ctaid_x, $ntid_x, $tid_x;
//------------- exit threads that fall outside the M x N output
mul.lo.u32 j, $M, $N;
setp.hs.u32 p, ii, j;
@p exit;
//--- y = output row, x = output column
div.u32 y, ii, $N;
rem.u32 x, ii, $N;
//--- stage one complex element of b into shared memory at s_b[2 * x]
shl.b32 j, x, 1;
mad.lo.u32 i, $tid_x, $N, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $b_addr, d_addr, 8, $b_addr;
ld.global.v2.f32 {b_r, b_i}, [$b_addr];
st.shared.v2.f32 s_b[j], {b_r, b_i};
//--- make the staged values visible to the whole block
bar.sync 0;
//--- point $a_addr at a[y * K + x] (8 bytes per complex element)
mad.lo.u32 i, y, $K, x;
cvt.u64.u32 d_addr, i;
mad.lo.u64 $a_addr, d_addr, 8, $a_addr;
//--- zero the accumulators and the loop counter
mov.f32 s_f_r, 0.0;
mov.f32 s_f_i, 0.0;
mov.u32 i, 0;
Label_while_ptx_mul_matrix_complex:
shl.b32 j, i, 1;
ld.global.v2.f32 {a_r, a_i}, [$a_addr];
ld.shared.v2.f32 {b_r, b_i}, s_b[j];
//--- complex multiply-accumulate: (s_f_r, s_f_i) += a * b
mul.rn.f32 a2, a_r, b_r;
mul.rn.f32 a1, a_i, b_i;
sub.f32 a1, a2, a1;
add.f32 s_f_r, a1, s_f_r;
mul.rn.f32 a2, a_i, b_r;
mul.rn.f32 a1, a_r, b_i;
add.f32 a1, a2, a1;
add.f32 s_f_i, a1, s_f_i;
//--- advance to the next complex element of a
add.u64 $a_addr, $a_addr, 8;
//---
add.u32 i, i, 1;
setp.lo.u32 p, i, $K;
@p bra Label_while_ptx_mul_matrix_complex;
cvt.u64.u32 d_addr, ii;
mad.lo.u64 d_addr, d_addr, 8, $c_addr;
st.global.v2.f32 [d_addr], {s_f_r,s_f_i};
exit;
}
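Before wiring handwritten PTX into an application, it is worth assembling it offline to catch syntax errors and inspect resource usage. One way to do this (file name assumed) is:

ptxas -arch=sm_75 -v mul_matrix.ptx -o mul_matrix.cubin

The -v flag reports the registers and shared memory used by each entry point, which is useful when reasoning about occupancy.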
Why PTX Outperforms CUDA (Especially for Complex Matrices)
The performance gain for complex matrices is even higher, for several reasons:
- Complex Arithmetic Efficiency: Handwritten PTX spells out the exact multiply/add sequence for each complex product, so no redundant intermediate operations are introduced.
- Memory Access Patterns: PTX handles the interleaved real and imaginary parts directly, loading and storing each complex number as a 64-bit pair (ld.global.v2.f32, st.global.v2.f32) and keeping accesses coalesced.
- Instruction Utilization: The CUDA compiler does not always choose vectorized loads or the exact instruction variants a kernel needs, whereas handwritten PTX selects them explicitly (a float2-based alternative is sketched after this list).
- Fine-Grained Control: PTX gives the developer control over register usage, shared-memory layout, and the exact memory access pattern. In contrast, the CUDA compiler applies general heuristics that may not suit a specific workload.
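For comparison, the interleaved accesses that the PTX expresses directly with ld.global.v2.f32 can sometimes be obtained from the compiler by operating on float2 values, as in the sketch below. This kernel is our illustration, not part of the benchmark, and whether nvcc actually emits vectorized loads for it depends on alignment and compiler version, which is exactly the kind of uncertainty handwritten PTX removes.

// Sketch (not from the benchmark): a complex kernel written with float2 so that
// each complex number is read as one 64-bit value, mirroring the
// ld.global.v2.f32 / st.global.v2.f32 pattern used in the handwritten PTX.
__global__ void mul_matrix_complex_f2(float2* c, const float2* a, const float2* b,
                                      int M, int N, int K)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= (M * N)) { return; }

    int y = i / N;
    int x = i % N;

    float2 acc = make_float2(0.0f, 0.0f);
    for (int z = 0; z < K; ++z)
    {
        float2 av = a[y * K + z];             // one 64-bit load per complex element
        float2 bv = b[z * N + x];
        acc.x += av.x * bv.x - av.y * bv.y;   // real part
        acc.y += av.y * bv.x + av.x * bv.y;   // imaginary part
    }
    c[i] = acc;
}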
Limitations of Handwritten PTX
While PTX offers substantial performance improvements, it comes with challenges:
- Complexity: Writing and debugging PTX requires expertise in GPU architecture.
- Portability: Hand-tuned PTX targets a specific architecture (sm_75 here); newer GPUs may require retuning or a rewrite to stay optimal.
- Development Time: Writing PTX is significantly more time-consuming than using CUDA.
- Expertise Required: Extracting these gains demands detailed knowledge of the GPU memory hierarchy and instruction set. We at CodeArtworks specialize in optimizing CUDA applications and porting kernels to PTX; engaging professional help like ours ensures that your GPU code achieves maximum performance without sacrificing maintainability or portability.
Conclusion
Handwritten PTX code is a powerful tool for developers aiming to maximize GPU performance. Our experiment with matrix multiplication demonstrates that PTX can deliver significant speedups over CUDA kernels, particularly for complex-valued matrices. However, the trade-offs in complexity and portability must be carefully considered. For performance-critical applications, mastering PTX can unlock the full potential of NVIDIA GPUs.