Spaces:

enCoder
/

tiny-vllm

Running

File size: 1,739 Bytes

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class EngineConfig:
    """Engine-wide settings: model selection, paged KV cache, scheduler, events.

    All fields have defaults, so ``EngineConfig()`` is a valid configuration.
    Size-like fields are range-checked in ``__post_init__``.
    """

    # Model
    model: str = "Qwen/Qwen2.5-0.5B-Instruct"
    dtype: str = "float32"  # "float32" on CPU; "float16"/"bfloat16" on GPU
    device: str = "cpu"     # "cpu" or "cuda"
    trust_remote_code: bool = False

    # Paged KV cache
    block_size: int = 16          # tokens per physical block
    num_blocks: int = 512         # total physical blocks in the pool
    enable_prefix_caching: bool = True

    # Scheduler
    max_num_seqs: int = 16                # max sequences in a batch
    max_num_batched_tokens: int = 512     # total tokens processed per step
    max_model_len: int = 2048             # upper bound on prompt + generated tokens

    # Logging / events
    emit_events: bool = True              # produce engine events for the UI
    event_buffer: int = 256
    record_path: Optional[str] = None     # JSONL file to append every event to
                                          # (powers the static GH-Pages replay)

    def __post_init__(self) -> None:
        """Reject configurations whose sizes cannot describe a usable engine.

        Raises:
            ValueError: if ``max_num_batched_tokens`` is smaller than
                ``block_size``, or if ``block_size``, ``num_blocks``,
                ``max_num_seqs`` or ``max_model_len`` is not positive.
        """
        # Kept first so every input the original check rejected still gets
        # exactly the same message.
        if self.max_num_batched_tokens < self.block_size:
            raise ValueError(
                "max_num_batched_tokens must be >= block_size "
                f"({self.max_num_batched_tokens} < {self.block_size})"
            )

        # The comparison above passes for zero/negative pairs (e.g. 0 >= 0),
        # so the sizes are range-checked individually. Once block_size is
        # known to be positive, max_num_batched_tokens is positive as well.
        for name in ("block_size", "num_blocks", "max_num_seqs", "max_model_len"):
            value = getattr(self, name)
            if value <= 0:
                raise ValueError(f"{name} must be positive (got {value})")


@dataclass
class SamplingParams:
    """Per-request decoding options.

    Every field is optional; the defaults describe plain sampling at
    temperature 1.0 with neither top-p nor top-k filtering applied.
    """

    # Generation length
    max_tokens: int = 64

    # Distribution shaping
    temperature: float = 1.0
    top_p: float = 1.0
    top_k: int = -1  # a value of -1 switches top-k filtering off

    # Stopping and reproducibility
    stop_token_ids: list[int] = field(default_factory=list)  # fresh list per instance
    seed: Optional[int] = None
    ignore_eos: bool = False

    @property
    def is_greedy(self) -> bool:
        """Whether the temperature is zero or negative."""
        zero = 0.0
        return zero >= self.temperature