Skip to content

ModelConfig

Configuration structures for model, generation, KV cache, and quantization settings.

ModelConfig

Defines model hyperparameters.

cpp
namespace tinyllm {

/// Hyperparameters describing a model's architecture.
///
/// Every field has an in-class default, so a default-constructed ModelConfig
/// is fully populated and individual fields can be overridden afterwards.
struct ModelConfig {
    // --- Vocabulary and dimensions ---
    size_t vocab_size{32000};        ///< Size of the vocabulary.
    size_t hidden_dim{4096};         ///< Hidden layer dimension.
    size_t intermediate_dim{11008};  ///< FFN intermediate dimension.
    size_t head_dim{128};            ///< Dimension per attention head.

    // --- Layer configuration ---
    size_t num_layers{32};    ///< Number of transformer layers.
    size_t num_heads{32};     ///< Number of query heads.
    size_t num_kv_heads{32};  ///< Number of key/value heads (for grouped-query attention).

    // --- Sequence configuration ---
    size_t max_seq_len{2048};  ///< Maximum sequence length.

    // --- Positional-encoding and normalization parameters ---
    float rope_theta{10000.0f};  ///< RoPE frequency base.
    float rms_norm_eps{1e-5f};   ///< RMS normalization epsilon.

    // --- Validation ---
    // Only declared here; the definitions (and therefore the exact rules that
    // are checked) are not visible in this snippet.

    /// Reports whether this configuration passes validation.
    bool is_valid() const;

    /// Presumably describes why validation failed — confirm against the
    /// implementation, including what is returned for a valid configuration.
    std::string validation_error() const;
};

}  // namespace tinyllm

Fields

| Field | Type | Default | Description |
|---|---|---|---|
| `vocab_size` | `size_t` | `32000` | Size of the vocabulary |
| `hidden_dim` | `size_t` | `4096` | Hidden layer dimension |
| `intermediate_dim` | `size_t` | `11008` | FFN intermediate dimension |
| `head_dim` | `size_t` | `128` | Dimension per attention head |
| `num_layers` | `size_t` | `32` | Number of transformer layers |
| `num_heads` | `size_t` | `32` | Number of query heads |
| `num_kv_heads` | `size_t` | `32` | Number of key/value heads |
| `max_seq_len` | `size_t` | `2048` | Maximum sequence length |
| `rope_theta` | `float` | `10000.0` | RoPE frequency base |
| `rms_norm_eps` | `float` | `1e-5` | RMS normalization epsilon |

Example

cpp
ModelConfig config;
config.vocab_size = 32000;
config.hidden_dim = 4096;
config.num_layers = 32;

if (!config.is_valid()) {
    std::cerr << config.validation_error() << std::endl;
}

GenerationConfig

Controls text generation behavior.

cpp
namespace tinyllm {

/// Settings that control text generation.
///
/// Every field has an in-class default, so a default-constructed
/// GenerationConfig is fully populated.
struct GenerationConfig {
    // --- Token limits ---
    size_t max_tokens{100};  ///< Maximum tokens to generate.
    size_t min_tokens{0};    ///< Minimum tokens before stopping.

    // --- Sampling parameters ---
    float temperature{1.0f};  ///< Sampling temperature.
    float top_p{1.0f};        ///< Nucleus sampling threshold.
    size_t top_k{0};          ///< Top-k sampling; 0 = disabled.

    // --- Repetition control ---
    float repetition_penalty{1.0f};  ///< Penalty for repeated tokens.
    size_t repetition_window{20};    ///< Window for the repetition penalty.

    // --- Reproducibility ---
    size_t seed{0};  ///< Random seed; 0 = random seed.

    // --- Stop sequences ---
    std::vector<std::string> stop_sequences{};  ///< Sequences that stop generation (empty by default).

    // --- Special token IDs ---
    // TokenId is declared elsewhere in the project; it is not visible here.
    TokenId eos_token_id{2};  ///< End-of-sequence token.
    TokenId pad_token_id{0};  ///< Padding token.
};

}  // namespace tinyllm

Fields

| Field | Type | Default | Description |
|---|---|---|---|
| `max_tokens` | `size_t` | `100` | Maximum tokens to generate |
| `min_tokens` | `size_t` | `0` | Minimum tokens before stopping |
| `temperature` | `float` | `1.0` | Sampling temperature |
| `top_p` | `float` | `1.0` | Nucleus sampling threshold |
| `top_k` | `size_t` | `0` | Top-k sampling (0 = off) |
| `repetition_penalty` | `float` | `1.0` | Penalty for repeated tokens |
| `repetition_window` | `size_t` | `20` | Window for repetition penalty |
| `seed` | `size_t` | `0` | Random seed (0 = random) |
| `stop_sequences` | `vector<string>` | `{}` | Sequences that stop generation |
| `eos_token_id` | `TokenId` | `2` | End-of-sequence token |
| `pad_token_id` | `TokenId` | `0` | Padding token |

Sampling Strategies

cpp
GenerationConfig config;

// Greedy decoding (deterministic)
config.temperature = 0.0f;

// Standard sampling
config.temperature = 0.7f;

// Top-K sampling
config.top_k = 50;
config.temperature = 0.9f;

// Nucleus (Top-P) sampling
config.top_p = 0.95f;
config.temperature = 0.9f;

// Combined (recommended)
config.top_k = 50;
config.top_p = 0.95f;
config.temperature = 0.7f;

KVCacheConfig

Configures the key-value cache.

cpp
namespace tinyllm {

/// Settings for the key-value cache.
///
/// Every field has an in-class default, so a default-constructed
/// KVCacheConfig is fully populated.
struct KVCacheConfig {
    size_t max_batch_size{1};     ///< Maximum concurrent sequences.
    size_t max_seq_len{2048};     ///< Maximum cached sequence length.
    bool enable_swapping{false};  ///< Enable CPU memory swapping.
    /// GPU memory threshold for swap (presumably a fraction of capacity in
    /// [0, 1] — confirm against the implementation).
    float swap_threshold{0.9f};
};

}  // namespace tinyllm

Fields

| Field | Type | Default | Description |
|---|---|---|---|
| `max_batch_size` | `size_t` | `1` | Maximum concurrent sequences |
| `max_seq_len` | `size_t` | `2048` | Maximum cached sequence length |
| `enable_swapping` | `bool` | `false` | Enable CPU memory swapping |
| `swap_threshold` | `float` | `0.9` | GPU memory threshold for swap |

QuantizationConfig

Configures quantization settings.

cpp
namespace tinyllm {

/// Quantization formats selectable through QuantizationConfig::type.
/// Enumerators keep their declaration order (INT8 = 0, INT4 = 1, FP8 = 2).
enum class QuantizationType { INT8, INT4, FP8 };

/// Settings for quantization.
///
/// Every field has an in-class default, so a default-constructed
/// QuantizationConfig is fully populated.
struct QuantizationConfig {
    /// Quantization format to use; defaults to INT8.
    QuantizationType type{QuantizationType::INT8};

    /// Presumably the number of values that share one set of quantization
    /// parameters — TODO confirm against the implementation.
    size_t group_size{128};

    /// Presumably selects symmetric (as opposed to asymmetric) quantization —
    /// TODO confirm against the implementation.
    bool symmetrical{true};
};

}  // namespace tinyllm

See Also

Released under the MIT License.