Skip to content

API Reference

Overview

Hetero-Paged-Infer provides a Rust API for integrating the inference engine into your applications. This guide covers the core types, traits, and usage patterns.

Core Types

InferenceEngine

The main entry point for inference operations.

use hetero_infer::{EngineConfig, GenerationParams, InferenceEngine};

// Build an engine straight from the default configuration
let mut engine = InferenceEngine::new(EngineConfig::default())?;

// Queue a prompt together with its generation parameters
let request_id = engine.submit_request(
    "Hello, world!",
    GenerationParams {
        max_tokens: 100,
        temperature: 1.0,
        top_p: 0.9,
    },
)?;

// Run inference and print the output text of every returned result
for finished in engine.run() {
    println!("Output: {}", finished.output_text);
}

EngineConfig

Configuration for the inference engine.

/// Configuration for the inference engine.
pub struct EngineConfig {
    /// Tokens per physical block (default: 16).
    pub block_size: u32,
    /// Total physical blocks (default: 1024).
    pub max_num_blocks: u32,
    /// Max sequences per batch (default: 32).
    pub max_batch_size: u32,
    /// Max concurrent sequences (default: 256).
    pub max_num_seqs: u32,
    /// Max sequence length (default: 2048).
    pub max_model_len: u32,
    /// Max tokens per batch (default: 4096).
    pub max_total_tokens: u32,
    /// Memory pressure threshold (default: 0.9).
    pub memory_threshold: f32,
}

Usage:

// Start from the built-in defaults
let config = EngineConfig::default();

// Override selected fields; every field not listed keeps its default value
let config = EngineConfig {
    max_batch_size: 64,
    max_num_blocks: 2048,
    block_size: 32,
    ..EngineConfig::default()
};

// Check the configuration; `?` propagates a validation failure to the caller
config.validate()?;

// Alternatively, read the configuration from a file
let config = EngineConfig::from_file("config.json")?;

GenerationParams

Parameters controlling text generation.

/// Parameters controlling text generation.
pub struct GenerationParams {
    /// Maximum tokens to generate.
    pub max_tokens: u32,
    /// Sampling temperature (0.0 - 2.0).
    pub temperature: f32,
    /// Nucleus sampling threshold (0.0 - 1.0).
    pub top_p: f32,
}

Defaults:

impl Default for GenerationParams {
    fn default() -> Self {
        Self {
            max_tokens: 100,
            temperature: 1.0,
            top_p: 0.9,
        }
    }
}

Request

Represents an inference request.

/// Represents an inference request.
pub struct Request {
    /// Identifier of the request.
    pub id: u64,
    /// Input token IDs (presumably the tokenized prompt; confirm against the engine).
    pub input_tokens: Vec<u32>,
    /// Output token IDs (presumably the tokens generated so far; confirm).
    pub output_tokens: Vec<u32>,
    // The next three fields share names and types with `GenerationParams`
    // (presumably copied from it at submission; confirm).
    pub max_tokens: u32,
    pub temperature: f32,
    pub top_p: f32,
    /// State of the request, one of the `RequestState` variants.
    pub state: RequestState,
    /// NOTE(review): presumably the instant the request was created; confirm.
    pub created_at: Instant,
}

/// States a request can be in; this is the type of `Request::state`.
///
/// NOTE(review): the variant names suggest a Pending -> Prefill -> Decode ->
/// Completed progression, but the allowed transitions are not shown here;
/// confirm against the scheduler.
pub enum RequestState {
    Pending,
    Prefill,
    Decode,
    Completed,
    /// Carries a `String` payload (presumably a failure description; confirm).
    Failed(String),
}

Sequence

An active request together with its KV cache allocation.

/// Active request with KV Cache allocation.
pub struct Sequence {
    /// Identifier of the sequence.
    pub seq_id: u64,
    /// The request this sequence wraps.
    pub request: Request,
    /// Logical blocks belonging to this sequence
    /// (presumably its view of the KV cache; confirm).
    pub logical_blocks: Vec<LogicalBlock>,
    /// NOTE(review): presumably the number of tokens already processed; confirm.
    pub num_computed_tokens: u32,
    /// NOTE(review): presumably the number of tokens generated so far; confirm.
    pub num_generated_tokens: u32,
}

Trait Interfaces

TokenizerTrait

/// Tokenizer interface: converts between text and token IDs and reports the
/// vocabulary size and the special token IDs.
///
/// Implementors must be `Send + Sync`.
pub trait TokenizerTrait: Send + Sync {
    /// Encode text to token IDs
    fn encode(&self, text: &str) -> Vec<u32>;

    /// Decode token IDs to text
    fn decode(&self, tokens: &[u32]) -> String;

    /// Vocabulary size
    fn vocab_size(&self) -> u32;

    /// Special token ID: BOS (presumably beginning-of-sequence)
    fn bos_token_id(&self) -> u32;
    /// Special token ID: EOS (presumably end-of-sequence)
    fn eos_token_id(&self) -> u32;
    /// Special token ID: pad (presumably padding)
    fn pad_token_id(&self) -> u32;
}

SimpleTokenizer (built-in implementation):

use hetero_infer::SimpleTokenizer;

// Encode a string to token IDs with the built-in tokenizer, then decode them back to text
let tokenizer = SimpleTokenizer::new();
let token_ids = tokenizer.encode("Hello");
let decoded = tokenizer.decode(&token_ids);

SchedulerTrait

/// Scheduler interface: accepts requests, forms batches for execution,
/// applies execution results, and hands back completed requests.
///
/// Implementors must be `Send + Sync`.
pub trait SchedulerTrait: Send + Sync {
    /// Add a new request to the pending queue
    ///
    /// Returns a `u64` on success (presumably the request ID; confirm) or a
    /// `SchedulerError` on failure.
    fn add_request(&mut self, request: Request) -> Result<u64, SchedulerError>;

    /// Schedule the next batch for execution
    fn schedule(&mut self) -> SchedulerOutput;

    /// Update sequence states after GPU execution
    fn update_sequences(&mut self, outputs: &ExecutionOutput);

    /// Get completed requests
    ///
    /// NOTE(review): takes `&mut self`, so this presumably drains the
    /// completed set; confirm.
    fn get_completed(&mut self) -> Vec<Request>;

    /// Check if there's pending work
    fn has_pending_work(&self) -> bool;
}

KVCacheManagerTrait

/// KV cache manager interface: allocates and releases blocks per sequence,
/// exposes block tables for execution, and reports memory statistics.
///
/// Implementors must be `Send + Sync`.
pub trait KVCacheManagerTrait: Send + Sync {
    /// Allocate blocks for a new sequence
    ///
    /// Fails with a `MemoryError` when the allocation cannot be made.
    fn allocate_sequence(&mut self, seq_id: u64, num_tokens: u32) -> Result<(), MemoryError>;

    /// Allocate an additional block for a growing sequence
    ///
    /// Returns a reference to the new physical block, or a `MemoryError`.
    fn allocate_block(&mut self, seq_id: u64) -> Result<PhysicalBlockRef, MemoryError>;

    /// Release all blocks for a sequence
    fn free_sequence(&mut self, seq_id: u64);

    /// Get block table for GPU execution
    ///
    /// NOTE(review): `None` presumably means `seq_id` is unknown; confirm.
    fn get_block_table(&self, seq_id: u64) -> Option<Vec<u32>>;

    /// Get memory statistics
    fn get_memory_stats(&self) -> MemoryStats;

    /// Check if n blocks can be allocated
    fn can_allocate(&self, num_blocks: u32) -> bool;
}

/// Memory statistics, as returned by `KVCacheManagerTrait::get_memory_stats`.
pub struct MemoryStats {
    /// Total number of blocks.
    pub total_blocks: u32,
    /// Number of blocks currently in use.
    pub used_blocks: u32,
    /// Number of free blocks.
    pub free_blocks: u32,
    /// Number of sequences (presumably those currently holding blocks; confirm).
    pub num_sequences: u32,
}

GPUExecutorTrait

/// GPU executor interface: runs batches directly or through a previously
/// captured CUDA Graph.
///
/// Implementors must be `Send + Sync`.
pub trait GPUExecutorTrait: Send + Sync {
    /// Execute a batch of sequences
    fn execute(&mut self, batch: &ExecutionBatch) -> ExecutionOutput;

    /// Capture CUDA Graph for decode phase
    ///
    /// NOTE(review): presumably the graph is specific to `batch_size`; confirm.
    fn capture_decode_graph(&mut self, batch_size: u32);

    /// Execute using captured graph
    fn execute_graph(&mut self, batch: &ExecutionBatch) -> ExecutionOutput;
}

/// Input to `GPUExecutorTrait::execute` and `GPUExecutorTrait::execute_graph`.
///
/// NOTE(review): the layout of these vectors (flattened per token vs. one
/// entry per sequence) is not shown here; confirm before relying on it.
pub struct ExecutionBatch {
    /// Input token IDs.
    pub input_tokens: Vec<u32>,
    /// Positions (presumably of the input tokens within their sequences; confirm).
    pub positions: Vec<u32>,
    /// Sequence lengths.
    pub seq_lens: Vec<u32>,
    /// Block tables, one inner `Vec<u32>` each (presumably one per sequence; confirm).
    pub block_tables: Vec<Vec<u32>>,
    /// Prefill flags (presumably one per sequence; confirm).
    pub is_prefill: Vec<bool>,
}

/// Result of `GPUExecutorTrait::execute` and `GPUExecutorTrait::execute_graph`.
pub struct ExecutionOutput {
    /// Next token IDs (presumably one per sequence in the batch; confirm).
    pub next_tokens: Vec<u32>,
    /// Optional logits; `None` when they are not provided.
    pub logits: Option<Vec<f32>>,
}

Error Handling

The API uses a structured error type:

use hetero_infer::EngineError;

// Handle success first, then dispatch on the specific error variant
match result {
    Ok(output) => println!("Success: {}", output),
    Err(err) => match err {
        EngineError::Config(e) => eprintln!("Config error: {}", e),
        EngineError::Memory(e) => eprintln!("Memory error: {}", e),
        EngineError::Validation(e) => eprintln!("Validation error: {}", e),
        EngineError::Execution(e) => eprintln!("Execution error: {}", e),
        EngineError::Scheduler(e) => eprintln!("Scheduler error: {}", e),
    },
}

Error Types

| Error | Description | Typical Cause |
|-------|-------------|---------------|
| ConfigError | Invalid configuration | Negative values, zero block_size |
| MemoryError | Memory allocation failure | Out of blocks, GPU OOM |
| ValidationError | Invalid request parameters | Invalid temperature, top_p |
| ExecutionError | GPU execution failure | CUDA error, timeout |
| SchedulerError | Scheduling failure | Invalid state transition |

Usage Examples

Basic Inference

use hetero_infer::{EngineConfig, GenerationParams, InferenceEngine};

/// Minimal end-to-end example: create an engine, submit one prompt, run
/// inference, and print every returned result.
///
/// Any error from engine creation or request submission is propagated to the
/// caller via `?`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build the engine straight from the default configuration
    let mut engine = InferenceEngine::new(EngineConfig::default())?;

    // Queue one prompt with explicit sampling settings
    let request_id = engine.submit_request(
        "Hello, world!",
        GenerationParams {
            max_tokens: 50,
            temperature: 0.8,
            top_p: 0.95,
        },
    )?;
    println!("Request {} submitted", request_id);

    // Run inference, then report each returned result
    for finished in engine.run() {
        println!("Request {} completed:", finished.request_id);
        println!("  Output: {}", finished.output_text);
        println!("  Tokens generated: {}", finished.generated_tokens);
    }

    Ok(())
}

Step-by-Step Execution

For more control over the inference loop:

// Queue two requests; the parameters are reused, so the first call gets a clone
let first_id = engine.submit_request("First request", params.clone())?;
let second_id = engine.submit_request("Second request", params)?;

// Advance the engine one step at a time until no pending work remains
loop {
    if !engine.has_pending_work() {
        break;
    }

    // Each step returns the requests that completed during that step
    let finished_now = engine.step();
    finished_now
        .iter()
        .for_each(|done| println!("Request {} completed", done.request_id));
}

Custom Tokenizer

use hetero_infer::TokenizerTrait;

struct MyTokenizer {
    vocab: HashMap<String, u32>,
}

impl TokenizerTrait for MyTokenizer {
    fn encode(&self, text: &str) -> Vec<u32> {
        // Custom encoding logic
        vec![]
    }

    fn decode(&self, tokens: &[u32]) -> String {
        // Custom decoding logic
        String::new()
    }

    fn vocab_size(&self) -> u32 {
        self.vocab.len() as u32
    }

    fn bos_token_id(&self) -> u32 { 0 }
    fn eos_token_id(&self) -> u32 { 1 }
    fn pad_token_id(&self) -> u32 { 2 }
}

Memory Monitoring

// Get memory statistics
let stats = engine.get_memory_stats();

// Compute the usage percentage in f64 (u32 -> f64 is lossless) and report 0%
// for an empty pool instead of dividing by zero and printing NaN.
let usage_percent = if stats.total_blocks == 0 {
    0.0
} else {
    f64::from(stats.used_blocks) / f64::from(stats.total_blocks) * 100.0
};

// One decimal place keeps the output readable (e.g. "33.3%")
println!(
    "Memory usage: {}/{} blocks ({:.1}%)",
    stats.used_blocks, stats.total_blocks, usage_percent
);

Type Exports

Main exports from lib.rs:

pub use crate::config::EngineConfig;
pub use crate::engine::{InferenceEngine, EngineMetrics, CompletedRequest};
pub use crate::error::EngineError;
pub use crate::types::{Request, Sequence, GenerationParams, RequestState};
pub use crate::kv_cache::{KVCacheManager, KVCacheManagerTrait, MemoryStats};
pub use crate::scheduler::{Scheduler, SchedulerTrait, SchedulerOutput};
pub use crate::tokenizer::{SimpleTokenizer, TokenizerTrait};
pub use crate::gpu_executor::{GPUExecutorTrait, ExecutionBatch, ExecutionOutput};

For configuration details, see CONFIGURATION.md.