API Reference
This document provides comprehensive API documentation for all public interfaces in HPC-AI-Optimization-Lab.
Table of Contents
- Common Library
- Module 01: Elementwise
- Module 02: Reduction
- Module 03: GEMM
- Module 04: Convolution
- Module 05: Attention
- Module 06: Quantization
- Module 07: CUDA 13 Features
- Python API
Common Library
Tensor<T>
RAII wrapper for GPU memory management.
Header: common/tensor.cuh
#include "common/tensor.cuh"
namespace hpc {
template <CudaNumeric T>
class Tensor {
public:
// Constructor
explicit Tensor(size_t size);
// Destructor (automatic cleanup)
~Tensor();
// Move semantics (no copy)
Tensor(Tensor&& other) noexcept;
Tensor& operator=(Tensor&& other) noexcept;
// Accessors
T* data() noexcept;
const T* data() const noexcept;
size_t size() const noexcept;
size_t bytes() const noexcept;
bool empty() const noexcept;
// Host ↔ Device transfers
void copy_from_host(const T* host_data);
void copy_to_host(T* host_data) const;
void copy_from_host(const std::vector<T>& host_vec);
std::vector<T> to_host() const;
// Async transfers
void copy_from_host_async(const T* host_data, cudaStream_t stream);
void copy_to_host_async(T* host_data, cudaStream_t stream) const;
// Operations
void zero();
};
} // namespace hpc
Example:
#include "common/tensor.cuh"
int main() {
// Create tensor with 1024 floats
hpc::Tensor<float> data(1024);
// Initialize with host data
std::vector<float> host_data(1024, 1.0f);
data.copy_from_host(host_data);
// Use in kernel
my_kernel<<<1, 256>>>(data.data(), data.size());
cudaDeviceSynchronize();
// Retrieve results
auto result = data.to_host();
return 0;
} // Automatic cleanup
CudaTimer
GPU-side performance timing.
Header: common/timer.cuh
#include "common/timer.cuh"
namespace hpc {
class CudaTimer {
public:
CudaTimer();
~CudaTimer();
// Non-copyable, movable
CudaTimer(CudaTimer&&) noexcept;
CudaTimer& operator=(CudaTimer&&) noexcept;
// Timing operations
void start(cudaStream_t stream = nullptr);
void stop(cudaStream_t stream = nullptr);
// Results
float elapsed_ms() const;
};
} // namespace hpc
Example:
hpc::CudaTimer timer;
hpc::Tensor<float> data(1024 * 1024);
timer.start();
data.zero();
cudaDeviceSynchronize();
timer.stop();
std::cout << "Time: " << timer.elapsed_ms() << " ms\n";
CUDA_CHECK Macro
Error handling macro for CUDA operations.
Header: common/cuda_check.cuh
#include "common/cuda_check.cuh"
// Check return value
CUDA_CHECK(cudaMalloc(&ptr, size));
// Check last kernel launch
kernel<<<grid, block>>>(args);
CUDA_CHECK_LAST();
Module 01: Elementwise
ReLU
Rectified Linear Unit activation function.
Header: elementwise/relu.cuh
#include "elementwise/relu.cuh"
namespace hpc::elementwise {
enum class OptLevel {
Naive, // Basic per-element processing
Vectorized, // float4 load/store
GridStride // Grid stride loop (recommended)
};
template <typename T, OptLevel Level = OptLevel::GridStride>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void relu(const T* input, T* output, size_t n,
cudaStream_t stream = nullptr);
} // namespace hpc::elementwise
Example:
#include "elementwise/relu.cuh"
#include "common/tensor.cuh"
// Using default optimization (GridStride)
hpc::Tensor<float> input(1024);
hpc::Tensor<float> output(1024);
hpc::elementwise::relu<float>(input.data(), output.data(), 1024);
// Using specific optimization level
hpc::elementwise::relu<float, hpc::elementwise::OptLevel::Vectorized>(
input.data(), output.data(), 1024);
Sigmoid
Sigmoid activation function.
Header: elementwise/sigmoid.cuh
#include "elementwise/sigmoid.cuh"
namespace hpc::elementwise {
template <typename T, OptLevel Level = OptLevel::GridStride>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void sigmoid(const T* input, T* output, size_t n,
cudaStream_t stream = nullptr);
} // namespace hpc::elementwise
Vector Add
Elementwise vector addition.
Header: elementwise/vector_add.cuh
#include "elementwise/vector_add.cuh"
namespace hpc::elementwise {
template <typename T, OptLevel Level = OptLevel::GridStride>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void vector_add(const T* a, const T* b, T* c, size_t n,
cudaStream_t stream = nullptr);
} // namespace hpc::elementwise
Transpose
Matrix transpose with optimization options.
Header: elementwise/transpose.cuh
#include "elementwise/transpose.cuh"
namespace hpc::elementwise {
enum class TransposeOpt {
Naive, // Direct read-row write-col
SharedMemory, // Use shared memory
SharedMemPadded // Shared memory with padding (recommended)
};
template <typename T, TransposeOpt Opt = TransposeOpt::SharedMemPadded>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void transpose(const T* input, T* output, int rows, int cols,
cudaStream_t stream = nullptr);
} // namespace hpc::elementwise
Module 02: Reduction
Softmax
Numerically stable softmax with online algorithm.
Header: reduction/softmax.cuh
#include "reduction/softmax.cuh"
namespace hpc::reduction {
enum class SoftmaxOpt {
Naive, // Two-pass with global atomics
WarpShuffle, // Warp-level reduction
OnlineSoftmax, // Single-pass (recommended)
Fused // Fused with L2 cache persistence
};
template <typename T, SoftmaxOpt Opt = SoftmaxOpt::OnlineSoftmax>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void softmax(const T* input, T* output, int batch, int seq_len,
cudaStream_t stream = nullptr);
} // namespace hpc::reduction
Example:
#include "reduction/softmax.cuh"
int batch = 32, seq_len = 128;
hpc::Tensor<float> input(batch * seq_len);
hpc::Tensor<float> output(batch * seq_len);
hpc::reduction::softmax<float>(
input.data(), output.data(), batch, seq_len);
LayerNorm
Layer normalization.
Header: reduction/layernorm.cuh
#include "reduction/layernorm.cuh"
namespace hpc::reduction {
template <typename T>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void layer_norm(const T* input, const T* gamma, const T* beta,
T* output, int batch, int hidden_size,
float eps = 1e-5f, cudaStream_t stream = nullptr);
} // namespace hpc::reduction
RMSNorm
Root Mean Square Layer Normalization.
Header: reduction/rmsnorm.cuh
#include "reduction/rmsnorm.cuh"
namespace hpc::reduction {
template <typename T>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void rms_norm(const T* input, const T* gamma, T* output,
int batch, int hidden_size,
float eps = 1e-5f, cudaStream_t stream = nullptr);
} // namespace hpc::reduction
Module 03: GEMM
General Matrix Multiply
7-step optimization progression.
Header: gemm/gemm.cuh
#include "gemm/gemm.cuh"
namespace hpc::gemm {
enum class GemmOpt {
Naive, // Step 1: Global memory only
SharedMemTiling, // Step 2: Shared memory tiling
DoubleBuffer, // Step 3: Double buffering
RegisterTiling, // Step 4: Register tiling
TensorCoreWMMA, // Step 5: WMMA API ✅
TensorCoreMMA, // Step 6: MMA PTX (delegates to Step 5) 🚧
SoftwarePipeline // Step 7: Software pipelining (planned) 🚧
};
template <typename T, GemmOpt Opt = GemmOpt::SharedMemTiling>
requires std::is_same_v<T, float> ||
std::is_same_v<T, __half> ||
std::is_same_v<T, int8_t>
void gemm(const T* A, const T* B, T* C,
int M, int N, int K,
float alpha = 1.0f, float beta = 0.0f,
cudaStream_t stream = nullptr);
// CUTLASS baseline (float only, experimental)
template <typename T>
void gemm_cutlass(const T* A, const T* B, T* C,
int M, int N, int K,
float alpha = 1.0f, float beta = 0.0f,
cudaStream_t stream = nullptr);
} // namespace hpc::gemm
Type Support Matrix:
| Type | Steps 1-4 | Step 5 (WMMA) | Step 6 (MMA) | Step 7 (Pipeline) | CUTLASS |
|---|---|---|---|---|---|
| float | ✅ | ✅ | ✅† | 🚧 | ✅ |
| __half | ✅ | ✅ | ✅† | 🚧 | 🚧 |
| int8_t | ✅‡ | ✅ | ✅† | 🚧 | 🚧 |
† Step 6 currently delegates to Step 5 for stability
‡ INT8 GEMM currently only has SharedMemTiling optimization fully implemented; other optimization levels delegate to SharedMemTiling
Example:
#include "gemm/gemm.cuh"
int M = 1024, N = 1024, K = 1024;
hpc::Tensor<float> A(M * K);
hpc::Tensor<float> B(K * N);
hpc::Tensor<float> C(M * N);
// Basic shared memory tiling
hpc::gemm::gemm<float, hpc::gemm::GemmOpt::SharedMemTiling>(
A.data(), B.data(), C.data(), M, N, K);
// Tensor Core (requires __half)
hpc::Tensor<__half> A_h(M * K);
hpc::Tensor<__half> B_h(K * N);
hpc::Tensor<__half> C_h(M * N);
hpc::gemm::gemm<__half, hpc::gemm::GemmOpt::TensorCoreWMMA>(
A_h.data(), B_h.data(), C_h.data(), M, N, K);
// CUTLASS baseline (float only)
hpc::gemm::gemm_cutlass<float>(
A.data(), B.data(), C.data(), M, N, K);
Module 04: Convolution
Implicit GEMM Convolution
Production-ready convolution implementation.
Header: convolution/conv_implicit_gemm.cuh
#include "convolution/conv_implicit_gemm.cuh"
namespace hpc::convolution {
struct ConvParams {
int batch;
int in_channels;
int out_channels;
int in_height;
int in_width;
int kernel_h;
int kernel_w;
int stride_h;
int stride_w;
int pad_h;
int pad_w;
int dilation_h;
int dilation_w;
};
template <typename T>
void conv2d_implicit_gemm(const T* input, const T* weight, T* output,
const ConvParams& params,
cudaStream_t stream = nullptr);
} // namespace hpc::convolution
Winograd Convolution
Optimized 3×3 convolution (experimental).
Header: convolution/conv_winograd.cuh
#include "convolution/conv_winograd.cuh"
namespace hpc::convolution {
struct WinogradConfig {
int tile_size = 4;
bool use_winograd = true;
};
void conv2d_winograd(const float* input, const float* weight, float* output,
const ConvParams& params,
const WinogradConfig& config = {},
cudaStream_t stream = nullptr);
} // namespace hpc::convolution
Module 05: Attention
FlashAttention Forward
IO-aware attention mechanism.
Header: attention/flash_attention.cuh
#include "attention/flash_attention.cuh"
namespace hpc::attention {
struct FlashAttnConfig {
int batch_size;
int num_heads;
int seq_len;
int head_dim; // Currently supports 64
float scale; // Typically 1/sqrt(head_dim)
bool causal; // Apply causal mask
};
template <typename T>
requires std::is_same_v<T, float> || std::is_same_v<T, __half>
void flash_attention_forward(const T* Q, const T* K, const T* V,
T* O,
const FlashAttnConfig& config,
cudaStream_t stream = nullptr);
} // namespace hpc::attention
Example:
#include "attention/flash_attention.cuh"
int batch = 2, heads = 8, seq = 512, dim = 64;
int total = batch * heads * seq * dim;
hpc::Tensor<float> Q(total), K(total), V(total), O(total);
hpc::attention::FlashAttnConfig config{
.batch_size = batch,
.num_heads = heads,
.seq_len = seq,
.head_dim = dim,
.scale = 1.0f / std::sqrt((float)dim),
.causal = true
};
hpc::attention::flash_attention_forward<float>(
Q.data(), K.data(), V.data(), O.data(), config);
RoPE (Rotary Positional Embedding)
Header: attention/rope.cuh
#include "attention/rope.cuh"
namespace hpc::attention {
template <typename T>
void apply_rope(T* query, T* key,
int batch, int num_heads, int seq_len, int head_dim,
const float* cos_cache, const float* sin_cache,
cudaStream_t stream = nullptr);
} // namespace hpc::attention
TopK
For MoE routing.
Header: attention/topk.cuh
#include "attention/topk.cuh"
namespace hpc::attention {
template <typename T>
void topk(const T* input, T* output, int* indices,
int batch, int n, int k,
cudaStream_t stream = nullptr);
} // namespace hpc::attention
Module 06: Quantization
INT8 Quantization
Per-row symmetric quantization.
Header: quantization/int8_quant.cuh
#include "quantization/int8_quant.cuh"
namespace hpc::quantization {
void quantize_int8(const float* input, int8_t* output, float* scale,
int rows, int cols, cudaStream_t stream = nullptr);
void dequantize_int8(const int8_t* input, const float* scale,
float* output, int rows, int cols,
cudaStream_t stream = nullptr);
} // namespace hpc::quantization
Module 07: CUDA 13 Features
TMA Copy
Tensor Memory Accelerator utilities.
Header: cuda13/tma.cuh
#include "cuda13/tma.cuh"
namespace hpc::cuda13 {
struct TMAConfig {
int cluster_width = 1;
int cluster_height = 1;
int pipeline_depth = 2;
bool use_tma = true;
};
template <typename T, int NUM_CHANNELS = 8>
void tma_copy_2d(const T* src, T* dst,
int rows, int cols,
const TMAConfig& config,
cudaStream_t stream = nullptr);
} // namespace hpc::cuda13
Thread Block Clusters
Header: cuda13/cluster.cuh
#include "cuda13/cluster.cuh"
namespace hpc::cuda13 {
struct ClusterConfig {
dim3 cluster_dims;
dim3 grid_dims;
dim3 block_dims;
bool use_cluster = true;
};
template <typename T>
void cluster_reduce(const T* input, T* output, size_t n,
const ClusterConfig& config,
cudaStream_t stream = nullptr);
} // namespace hpc::cuda13
FP8 GEMM
Header: cuda13/fp8_gemm.cuh
#include "cuda13/fp8_gemm.cuh"
namespace hpc::cuda13 {
enum class FP8Format { e4m3, e5m2 };
struct FP8GEMMConfig {
int tile_m = 16;
int tile_n = 16;
int tile_k = 16;
FP8Format format_a = FP8Format::e4m3;
FP8Format format_b = FP8Format::e4m3;
float scale_a = 1.0f;
float scale_b = 1.0f;
bool use_fp8 = true;
};
void fp8_gemm(const __half* A, const __half* B, __half* C,
int M, int N, int K,
const FP8GEMMConfig& config,
cudaStream_t stream = nullptr);
} // namespace hpc::cuda13
Python API
Module Structure
import hpc_ai_opt
# Available submodules
hpc_ai_opt.elementwise # ReLU, Sigmoid, VectorAdd
hpc_ai_opt.reduction # Softmax, LayerNorm, RMSNorm
hpc_ai_opt.gemm # Matrix multiplication
Usage Pattern
import torch
import hpc_ai_opt
# Create tensors
x = torch.randn(1024, 1024, device="cuda", dtype=torch.float32)
y = torch.empty_like(x)
# Call CUDA kernel
hpc_ai_opt.elementwise.relu(x, y)
# Results are in-place in y
Available Functions
| Submodule | Function | Signature |
|---|---|---|
| elementwise | relu | (input, output) |
| elementwise | sigmoid | (input, output) |
| reduction | softmax | (input, output, batch, seq_len) |
| gemm | sgemm | (A, B, C, M, N, K) |
Type Constraints
All template functions use C++20 concepts:
// CudaNumeric concept
template <typename T>
concept CudaNumeric = std::is_arithmetic_v<T> ||
std::is_same_v<T, __half> ||
std::is_same_v<T, __nv_bfloat16>;
// Usage in function signatures
template <CudaNumeric T>
void function(const T* input, T* output, size_t n);
Error Handling
All functions throw std::invalid_argument for:
- Null pointer arguments
- Invalid dimensions (negative or zero where positive required)
- Unsupported parameter combinations
try {
hpc::gemm::gemm<float>(nullptr, B, C, M, N, K);
} catch (const std::invalid_argument& e) {
std::cerr << "Error: " << e.what() << std::endl;
}
CUDA errors are reported via CUDA_CHECK_LAST() which throws std::runtime_error with CUDA error details.