My Cuda Note 2: Build a kernel in PyTorch

3 minute read

Published:

You can also find the official example on GitHub.

Last modified: 2025-02-27

I. How to build a kernel

First, create a .cu file and use CUDA to build a kernel:

// Element-wise addition kernel: output[i] = a[i] + b[i].
// One thread handles one element; threads whose global index falls past
// the end of the arrays exit immediately.
__global__ void test_kernel_cuda(
    int numel,
    const float* a,
    const float* b,
    float* output
) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= numel) {
        return;
    }
    output[idx] = a[idx] + b[idx];
}

numel: the number of elements in each array; a, b: input arrays; output: output array.

Then, use a function to wrap it, taking PyTorch as an example:

// Host-side wrapper: validates the inputs, allocates the output tensor, and
// launches test_kernel_cuda to compute output = a + b element-wise.
//
// Raises (via TORCH_CHECK) when either input is not a CUDA tensor, when the
// inputs differ in dtype or shape, when the dtype is not float32, or when
// the tensor is too large for the kernel's int indexing.
at::Tensor test_kernel_cuda_wrapper(
    const at::Tensor& a,
    const at::Tensor& b
) {
    TORCH_CHECK(a.device().is_cuda(), "input tensor a must be a CUDA tensor");
    TORCH_CHECK(b.device().is_cuda(), "input tensor b must be a CUDA tensor");
    TORCH_CHECK(a.dtype() == b.dtype(), "input tensor a and b must have the same dtype");
    TORCH_CHECK(a.sizes() == b.sizes(), "input tensor a and b must have the same shape");
    // data_ptr<float>() below is only valid for float32 tensors; fail with a
    // clear message up front instead of erroring inside the launch.
    TORCH_CHECK(a.dtype() == at::kFloat, "input tensors must be float32");

    // The kernel indexes the buffers linearly (output[i] = a[i] + b[i]),
    // which is only correct for contiguous memory.
    at::Tensor a_contig = a.contiguous();
    at::Tensor b_contig = b.contiguous();

    const int64_t numel = a_contig.numel();
    // The kernel's element count and thread index are plain int; refuse
    // inputs that int cannot address rather than silently truncating.
    TORCH_CHECK(numel < (int64_t(1) << 31),
                "tensor is too large for the kernel's int indexing");

    auto output = at::empty_like(a_contig);
    // A <<<0, threads>>> launch is an invalid configuration, and there is
    // nothing to compute for an empty tensor anyway.
    if (numel == 0) {
        return output;
    }

    constexpr int threads = 256;
    const int blocks = static_cast<int>((numel + threads - 1) / threads);
    // NOTE(review): this launches on the default stream; for full PyTorch
    // stream semantics pass at::cuda::getCurrentCUDAStream() as the fourth
    // launch-config argument — confirm against the project's stream usage.
    test_kernel_cuda<<<blocks, threads>>>(
        static_cast<int>(numel),
        a_contig.data_ptr<float>(),
        b_contig.data_ptr<float>(),
        output.data_ptr<float>()
    );
    return output;
}

After that, use TORCH_LIBRARY_IMPL to register the implementation:

// Register test_kernel_cuda_wrapper as the CUDA-backend implementation of
// the cuda_test::test_kernel operator; the dispatcher routes calls made
// with CUDA tensors here.
TORCH_LIBRARY_IMPL(cuda_test, CUDA, m) {
    m.impl("test_kernel", &test_kernel_cuda_wrapper);
}

Create a .cpp to register the module:

// CPython entry point for the compiled extension module "_C".
// The module defines no methods or state of its own; its job is to make
// the extension importable from Python, which loads the shared object
// containing the operator registrations.
extern "C" {
    PyObject* PyInit__C(void)
    {
        static struct PyModuleDef module_def = {
            PyModuleDef_HEAD_INIT,
            "_C",   /* name of module */
            NULL,   /* module documentation, may be NULL */
            -1,     /* size of per-interpreter state of the module,
                     or -1 if the module keeps state in global variables. */
            NULL,   /* methods */
        };
        return PyModule_Create(&module_def);
    }
}

Then register this library to pytorch:

// Declare the schema of the cuda_test::test_kernel operator.  Backend
// implementations (e.g. the CUDA one) are attached separately with
// TORCH_LIBRARY_IMPL.
TORCH_LIBRARY(cuda_test, m) {
    m.def("test_kernel(Tensor a, Tensor b) -> Tensor");
}

Create __init__.py and ops.py as follows:

# __init__.py
import torch
from pathlib import Path
from . import _C, ops

# ops.py
import torch
from torch import Tensor

__all__ = ["test_kernel"]

def test_kernel(a: Tensor, b: Tensor) -> Tensor:
    """Element-wise a + b via the registered cuda_test::test_kernel op.

    Dispatches through torch.ops so the call goes through the PyTorch
    dispatcher.  Inputs must be same-shape, same-dtype CUDA tensors
    (validated by the C++ wrapper).
    """
    return torch.ops.cuda_test.test_kernel.default(a, b)

Create a setup.py:

import glob
import os
import re

import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import (
    BuildExtension,
    CppExtension,
    CUDAExtension,
    CUDA_HOME,
)

library_name = "cuda_test"

if torch.__version__ >= "2.6.0":
    py_limited_api = True
else:
    py_limited_api = False


def get_extensions():
    """Build the list of setuptools extension modules for this package.

    Environment variables:
        DEBUG=1     compile with -O0 -g instead of -O3.
        USE_CUDA=0  force a CPU-only (CppExtension) build even when a CUDA
                    toolkit and device are present.

    Returns:
        A single-element list containing the ``cuda_test._C`` extension.
    """
    debug_mode = os.getenv("DEBUG", "0") == "1"
    use_cuda = os.getenv("USE_CUDA", "1") == "1"
    if debug_mode:
        print("Compiling in debug mode")

    # Only build the CUDA variant when both a device and a toolkit exist.
    use_cuda = use_cuda and torch.cuda.is_available() and CUDA_HOME is not None
    extension = CUDAExtension if use_cuda else CppExtension

    extra_link_args = []
    extra_compile_args = {
        "cxx": [
            "-O3" if not debug_mode else "-O0",
            "-fdiagnostics-color=always",
            "-DPy_LIMITED_API=0x03090000",  # min CPython version 3.9
        ],
        "nvcc": [
            "-O3" if not debug_mode else "-O0",
        ],
    }
    if debug_mode:
        extra_compile_args["cxx"].append("-g")
        extra_compile_args["nvcc"].append("-g")
        extra_link_args.extend(["-O0", "-g"])

    # BUG FIX: os.path.dirname(os.path.curdir) is dirname("."), which is
    # always "" — the source glob then only worked when setup.py happened
    # to be run from the project root.  Anchor at this file's directory.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "cuda_test", "csrc")
    # sorted() makes the compilation order (and thus the build) reproducible.
    sources = sorted(glob.glob(os.path.join(extensions_dir, "*.cpp")))

    cuda_sources = sorted(glob.glob(os.path.join(extensions_dir, "*.cu")))
    print(f"cuda_sources: {cuda_sources}")
    if use_cuda:
        sources += cuda_sources
    ext_modules = [
        extension(
            f"{library_name}._C",
            sources,
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args,
            py_limited_api=py_limited_api,
        )
    ]

    return ext_modules

# Standard setuptools entry point.  BuildExtension drives the C++/CUDA
# compilation of the extension modules; the bdist_wheel py_limited_api
# option tags the wheel as abi3 ("cp39": usable on CPython >= 3.9) when
# the installed torch supports it.
setup(
    name=library_name,
    version="0.0.1",
    packages=find_packages(),
    ext_modules=get_extensions(),
    install_requires=["torch"],
    cmdclass={"build_ext": BuildExtension},
    options={"bdist_wheel": {"py_limited_api": "cp39"}} if py_limited_api else {},
)

Finally, install this package:

python setup.py install

Note:

Remember: since the source directory name cuda_test is the same as the installed package's, you may need to leave this directory before running tests, or Python will import the local directory instead of the installed package.

Copyright:

This work is licensed under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International