Spaces:

enCoder
/

tiny-vllm

Running

Add GitHub Pages demo and recording functionality

39fa862 12 days ago

1.74 kB

	from __future__ import annotations

	from dataclasses import dataclass, field
	from typing import Optional


	@dataclass
	class EngineConfig:
	    """Engine settings: model choice, paged KV cache sizing, scheduler limits, events.

	    All sizing fields are validated on construction (see ``__post_init__``).

	    Raises:
	        ValueError: if ``block_size``, ``num_blocks``, ``max_num_seqs``,
	            ``max_num_batched_tokens`` or ``max_model_len`` is not positive,
	            or if ``max_num_batched_tokens`` is smaller than ``block_size``.
	    """

	    # Model
	    model: str = "Qwen/Qwen2.5-0.5B-Instruct"
	    dtype: str = "float32"  # "float32" on CPU; "float16"/"bfloat16" on GPU
	    device: str = "cpu"  # "cpu" or "cuda"
	    trust_remote_code: bool = False

	    # Paged KV cache
	    block_size: int = 16  # tokens per physical block
	    num_blocks: int = 512  # total physical blocks in the pool
	    enable_prefix_caching: bool = True

	    # Scheduler
	    max_num_seqs: int = 16  # max sequences in a batch
	    max_num_batched_tokens: int = 512  # total tokens processed per step
	    max_model_len: int = 2048  # upper bound on prompt + generated tokens

	    # Logging / events
	    emit_events: bool = True  # produce engine events for the UI
	    event_buffer: int = 256  # NOTE(review): presumably max buffered events — confirm
	    record_path: Optional[str] = None  # JSONL file to append every event to
	    # (powers the static GH-Pages replay)

	    def __post_init__(self) -> None:
	        """Reject configurations whose sizes cannot describe a usable engine.

	        Raises:
	            ValueError: on a non-positive sizing field, or when a single step's
	                token budget is smaller than one block.
	        """
	        # The ordering check below is meaningless for zero/negative sizes
	        # (e.g. block_size=-16 trivially satisfies it), so check signs first.
	        positive_fields = (
	            "block_size",
	            "num_blocks",
	            "max_num_seqs",
	            "max_num_batched_tokens",
	            "max_model_len",
	        )
	        for name in positive_fields:
	            value = getattr(self, name)
	            if value <= 0:
	                raise ValueError(f"{name} must be a positive integer (got {value})")

	        if self.max_num_batched_tokens < self.block_size:
	            raise ValueError(
	                "max_num_batched_tokens must be >= block_size "
	                f"({self.max_num_batched_tokens} < {self.block_size})"
	            )


	@dataclass
	class SamplingParams:
	    """Per-request sampling settings.

	    ``is_greedy`` reports whether ``temperature`` is zero or negative.
	    """

	    max_tokens: int = 64
	    temperature: float = 1.0
	    top_p: float = 1.0
	    top_k: int = -1  # a value of -1 turns top-k off
	    stop_token_ids: list[int] = field(default_factory=list)  # fresh list per instance
	    seed: Optional[int] = None
	    ignore_eos: bool = False

	    @property
	    def is_greedy(self) -> bool:
	        """Return True when ``temperature`` is not above zero."""
	        return 0.0 >= self.temperature