Core Types¶
InferenceEngine¶
The main orchestrator for inference operations. Each major component (tokenizer, scheduler, KV cache manager, GPU executor) is held behind a trait object, so implementations can be swapped independently.
pub struct InferenceEngine {
config: EngineConfig,
tokenizer: Box<dyn TokenizerTrait>,
scheduler: Box<dyn SchedulerTrait>,
kv_cache_manager: Box<dyn KVCacheManagerTrait>,
gpu_executor: Box<dyn GPUExecutorTrait>,
pub metrics: EngineMetrics,
}
Methods¶
new()¶
Create a new inference engine with the given configuration.
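The constructor's signature is not reproduced in this reference. The sketch below is an assumption, inferred from the workflow example at the end of this page, where InferenceEngine::new(config)? is called inside a function returning Result<(), EngineError>:
// Assumed signature, consistent with the workflow example below.
pub fn new(config: EngineConfig) -> Result<InferenceEngine, EngineError>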
submit_request()¶
pub fn submit_request(
&mut self,
text: &str,
params: GenerationParams
) -> Result<u64, EngineError>
Submit a text generation request. On success, returns the request ID, which also appears as request_id on the corresponding CompletedRequest.
let params = GenerationParams {
max_tokens: 100,
temperature: 0.8,
top_p: 0.95,
};
let request_id = engine.submit_request("Hello", params)?;
run()¶
Run the main inference loop until all requests complete.
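No return type is listed for run(). The workflow example below iterates over its result and reads CompletedRequest fields from each item, so a plausible (assumed) signature is:
// Assumed signature; the workflow example treats the return value as an
// iterable of CompletedRequest.
pub fn run(&mut self) -> Vec<CompletedRequest>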
step()¶
Execute a single scheduling and inference step.
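step()'s signature is likewise not documented here. As a sketch only, assuming each step reports the requests that finished during it and surfaces errors the same way submit_request() does:
// Assumed signature, not confirmed by this reference.
pub fn step(&mut self) -> Result<Vec<CompletedRequest>, EngineError>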
EngineConfig¶
Configuration for the inference engine.
pub struct EngineConfig {
pub block_size: u32,
pub max_num_blocks: u32,
pub max_batch_size: u32,
pub max_num_seqs: u32,
pub max_model_len: u32,
pub max_total_tokens: u32,
pub memory_threshold: f32,
}
| Field | Default | Description |
|---|---|---|
| block_size | 16 | Tokens per physical block |
| max_num_blocks | 1024 | Total physical blocks |
| max_batch_size | 32 | Max sequences per batch |
| max_num_seqs | 256 | Max concurrent sequences |
| max_model_len | 2048 | Max context length |
| max_total_tokens | 4096 | Max tokens per batch |
| memory_threshold | 0.9 | Memory pressure threshold |
impl Default for EngineConfig {
fn default() -> Self {
Self {
block_size: 16,
max_num_blocks: 1024,
max_batch_size: 32,
max_num_seqs: 256,
max_model_len: 2048,
max_total_tokens: 4096,
memory_threshold: 0.9,
}
}
}
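block_size and max_num_blocks together fix the total KV cache capacity in tokens. With the defaults above that is 16 × 1024 = 16,384 tokens, shared across all active sequences:
// Total KV cache capacity, in tokens, under the default configuration.
let config = EngineConfig::default();
let cache_capacity_tokens = config.block_size * config.max_num_blocks;
assert_eq!(cache_capacity_tokens, 16_384); // 16 * 1024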
GenerationParams¶
Parameters for text generation.
| Field | Default | Range | Description |
|---|---|---|---|
| max_tokens | 100 | 1+ | Maximum tokens to generate |
| temperature | 1.0 | 0.0-2.0 | Sampling temperature |
| top_p | 0.9 | 0.0-1.0 | Nucleus sampling threshold |
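The struct definition for GenerationParams is not shown on this page. A sketch consistent with the table above and with the matching fields on Request (max_tokens: u32, temperature: f32, top_p: f32) would be:
// Assumed definition: field types mirror Request, defaults follow the table
// above, and Clone is derived because the workflow example calls .clone().
#[derive(Clone)]
pub struct GenerationParams {
pub max_tokens: u32,
pub temperature: f32,
pub top_p: f32,
}
impl Default for GenerationParams {
fn default() -> Self {
Self {
max_tokens: 100,
temperature: 1.0,
top_p: 0.9,
}
}
}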
Request¶
Represents an inference request.
pub struct Request {
pub id: u64,
pub input_tokens: Vec<u32>,
pub output_tokens: Vec<u32>,
pub max_tokens: u32,
pub temperature: f32,
pub top_p: f32,
pub state: RequestState,
pub created_at: Instant,
}
pub enum RequestState {
Pending,
Prefill,
Decode,
Completed,
Failed(String),
}
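A caller can branch on the state to track a request through its lifecycle; the descriptions below follow the usual prefill/decode split and are only illustrative:
// Illustrative only: maps each state to a short human-readable description.
fn describe(request: &Request) -> String {
match &request.state {
RequestState::Pending => "waiting to be scheduled".to_string(),
RequestState::Prefill => "processing the prompt".to_string(),
RequestState::Decode => "generating new tokens".to_string(),
RequestState::Completed => "finished".to_string(),
RequestState::Failed(err) => format!("failed: {err}"),
}
}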
Sequence¶
An active request together with its logical KV cache block allocation and progress counters.
pub struct Sequence {
pub seq_id: u64,
pub request: Request,
pub logical_blocks: Vec<LogicalBlock>,
pub num_computed_tokens: u32,
pub num_generated_tokens: u32,
}
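Assuming each logical block, like a physical block, holds block_size tokens, the number of blocks a sequence occupies grows with its token count. A sketch of that ceiling-division arithmetic with the default block_size of 16:
// Number of logical blocks needed for a given token count (ceiling division).
fn blocks_needed(total_tokens: u32, block_size: u32) -> u32 {
(total_tokens + block_size - 1) / block_size
}
// A sequence with 100 prompt tokens and 20 generated tokens needs 8 blocks.
assert_eq!(blocks_needed(120, 16), 8);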
CompletedRequest¶
The result of a completed inference request.
pub struct CompletedRequest {
pub request_id: u64,
pub input_text: String,
pub output_text: String,
pub input_tokens: usize, // number of prompt tokens
pub generated_tokens: usize, // number of generated tokens
pub duration: Duration,
}
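Per-request throughput can be derived from these fields, for example:
// Tokens generated per second for one completed request.
fn request_throughput(result: &CompletedRequest) -> f64 {
result.generated_tokens as f64 / result.duration.as_secs_f64()
}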
EngineMetrics¶
Runtime performance metrics.
pub struct EngineMetrics {
pub requests_processed: u64,
pub tokens_generated: u64,
pub avg_latency_ms: f64,
pub throughput_tok_per_sec: f64,
}
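A figure not stored directly, the average number of tokens generated per request, follows from the two counters:
// Average generated tokens per processed request (0.0 if nothing ran yet).
fn avg_tokens_per_request(m: &EngineMetrics) -> f64 {
if m.requests_processed == 0 {
0.0
} else {
m.tokens_generated as f64 / m.requests_processed as f64
}
}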
Usage Examples¶
Complete Workflow¶
use hetero_infer::*;
fn main() -> Result<(), EngineError> {
// Configure
let config = EngineConfig {
max_batch_size: 64,
max_num_blocks: 2048,
..Default::default()
};
// Create engine
let mut engine = InferenceEngine::new(config)?;
// Submit multiple requests
let params = GenerationParams {
max_tokens: 100,
temperature: 0.8,
..Default::default()
};
engine.submit_request("First prompt", params.clone())?;
engine.submit_request("Second prompt", params.clone())?;
engine.submit_request("Third prompt", params)?;
// Run inference
let completed = engine.run();
// Process results
for result in completed {
println!("Request {}: {}",
result.request_id,
result.output_text
);
}
// Check metrics
let metrics = &engine.metrics;
println!("Throughput: {:.2} tok/s",
metrics.throughput_tok_per_sec
);
Ok(())
}
Next: Trait Interfaces