Upload model/config.py with huggingface_hub

82036d2 verified 16 days ago

6.39 kB

	# model/config.py
	# PyCraft-1 model configuration
	# All architectural hyperparameters live here.
	# Other files import this — never hardcode numbers elsewhere.

	from dataclasses import dataclass, field
	from typing import Optional
	import math


	class _ConfigError(ValueError, AssertionError):
	    """Raised when a PyCraftConfig holds inconsistent hyperparameters.

	    Derives from ValueError (the conventional type for bad argument values)
	    and from AssertionError, which the earlier assert-based checks raised, so
	    existing ``except AssertionError`` handlers keep working. Unlike ``assert``
	    statements, these checks are not stripped when Python runs with ``-O``.
	    """


	@dataclass
	class PyCraftConfig:
	    """Architectural hyperparameters for the PyCraft-1 model.

	    Every field has a default, so ``PyCraftConfig()`` is a valid config.
	    Head-count consistency is checked on construction and again whenever a
	    derived property is read, because instances are mutable.

	    Raises:
	        ValueError: (also an ``AssertionError``) if ``n_heads`` or
	            ``n_kv_heads`` is not positive, if ``n_heads`` is not divisible
	            by ``n_kv_heads``, or if ``d_model`` is not divisible by
	            ``n_heads``.
	    """

	    # ------------------------------------------------------------------ #
	    # Vocabulary & sequence
	    # ------------------------------------------------------------------ #
	    vocab_size: int = 32000  # BPE tokenizer vocab (trained in Phase 2)
	    max_seq_len: int = 2048  # context window

	    # ------------------------------------------------------------------ #
	    # Model dimensions
	    # ------------------------------------------------------------------ #
	    d_model: int = 512  # embedding / hidden dimension
	    n_layers: int = 8  # number of transformer blocks
	    n_heads: int = 8  # number of query heads
	    # number of key/value heads (GQA; 8 query heads / 2 KV heads = 4:1)
	    n_kv_heads: int = 2

	    # SwiGLU FFN intermediate dim.
	    # Common sizing: 4 * d_model * 2/3, then rounded to a multiple of 64.
	    # 512 * 4 * 2/3 = 1365.3; 1408 (= 22 * 64) is the next multiple of 64
	    # above that. (The nearest multiple would be 1344.)
	    d_ff: int = 1408

	    # ------------------------------------------------------------------ #
	    # Attention settings
	    # ------------------------------------------------------------------ #
	    use_qk_norm: bool = True  # QK-Norm (OLMo 2 / Qwen 3 technique)
	    rope_theta: float = 10000.0  # RoPE base frequency
	    attn_dropout: float = 0.0  # keep 0.0 during pretraining

	    # ------------------------------------------------------------------ #
	    # Training knobs
	    # ------------------------------------------------------------------ #
	    dropout: float = 0.0  # residual dropout (0 for pretraining)
	    # Tie input embedding <-> output projection. Off by default.
	    # NOTE(review): the original rationale was "enough params at 120M", but
	    # param_count_approx gives ~55.3M for these defaults - confirm the target.
	    weight_tying: bool = False

	    # ------------------------------------------------------------------ #
	    # FIM (Fill-in-the-Middle) special token IDs
	    # Left as None here; they are meant to be filled in after tokenizer
	    # training, once the real token IDs are known.
	    # ------------------------------------------------------------------ #
	    fim_prefix_id: Optional[int] = None
	    fim_suffix_id: Optional[int] = None
	    fim_middle_id: Optional[int] = None
	    fim_pad_id: Optional[int] = None

	    # ------------------------------------------------------------------ #
	    # Validation helpers
	    # ------------------------------------------------------------------ #
	    def _check_head_counts(self) -> None:
	        """Reject non-positive head counts before they are used as divisors."""
	        if self.n_heads <= 0:
	            raise _ConfigError(
	                f"n_heads must be a positive integer. Got {self.n_heads}."
	            )
	        if self.n_kv_heads <= 0:
	            raise _ConfigError(
	                f"n_kv_heads must be a positive integer. Got {self.n_kv_heads}."
	            )

	    def _check_gqa(self) -> None:
	        """Ensure the query heads split evenly across the KV heads."""
	        self._check_head_counts()
	        if self.n_heads % self.n_kv_heads != 0:
	            raise _ConfigError(
	                f"n_heads ({self.n_heads}) must be divisible by "
	                f"n_kv_heads ({self.n_kv_heads})"
	            )

	    def _check_head_dim(self) -> None:
	        """Ensure d_model splits evenly across the query heads."""
	        self._check_head_counts()
	        if self.d_model % self.n_heads != 0:
	            raise _ConfigError(
	                f"d_model ({self.d_model}) must be divisible by "
	                f"n_heads ({self.n_heads})"
	            )

	    # ------------------------------------------------------------------ #
	    # Derived / computed properties
	    # ------------------------------------------------------------------ #
	    @property
	    def head_dim(self) -> int:
	        """Dimension of each attention head (``d_model // n_heads``)."""
	        self._check_head_dim()
	        return self.d_model // self.n_heads

	    @property
	    def n_heads_per_kv(self) -> int:
	        """How many Q heads share each KV head (``n_heads // n_kv_heads``)."""
	        self._check_gqa()
	        return self.n_heads // self.n_kv_heads

	    @property
	    def param_count_approx(self) -> int:
	        """Rough parameter count for sanity checking.

	        Sums the token embedding, per-layer attention projections (Wq, Wk,
	        Wv, Wo), per-layer SwiGLU matrices (gate, up, down), two norm weight
	        vectors per block, and the output projection. The output projection
	        is counted only when ``weight_tying`` is False; when tied it shares
	        the embedding matrix and adds no parameters.
	        """
	        embed = self.vocab_size * self.d_model
	        kv_width = self.n_kv_heads * self.head_dim  # width of K and V projections
	        attn_per_layer = (
	            self.d_model * self.d_model  # Wq
	            + 2 * self.d_model * kv_width  # Wk, Wv
	            + self.d_model * self.d_model  # Wo
	        )
	        ffn_per_layer = 3 * self.d_model * self.d_ff  # SwiGLU: gate, up, down
	        norms_per_layer = 2 * self.d_model  # RMSNorm per block x2
	        # A tied output projection reuses the embedding weights.
	        lm_head = 0 if self.weight_tying else self.vocab_size * self.d_model
	        per_layer = attn_per_layer + ffn_per_layer + norms_per_layer
	        return embed + self.n_layers * per_layer + lm_head

	    def __post_init__(self):
	        """Validate head-related constraints as soon as the config is built."""
	        # Validate GQA constraint
	        self._check_gqa()
	        # Validate head dimension
	        self._check_head_dim()


	# ------------------------------------------------------------------ #
	# Convenience presets
	# ------------------------------------------------------------------ #

	def get_config_120m() -> PyCraftConfig:
	    """
	    Build the main PyCraft-1 preset (nominally "120M").

	    Described as sized to train on a single RTX 3050 4GB laptop GPU.
	    Differs from the PyCraftConfig defaults in two places: max_seq_len is
	    1024 (default 2048) and dropout is 0.1 (default 0.0).

	    NOTE(review): PyCraftConfig.param_count_approx evaluates to roughly
	    55.3M for these values, not 120M - confirm the intended model size.
	    """
	    preset = dict(
	        vocab_size=32000,
	        max_seq_len=1024,
	        d_model=512,
	        n_layers=8,
	        n_heads=8,
	        n_kv_heads=2,
	        d_ff=1408,
	        use_qk_norm=True,
	        rope_theta=10000.0,
	        dropout=0.1,
	    )
	    return PyCraftConfig(**preset)


	def get_config_tiny() -> PyCraftConfig:
	    """
	    Build the PyCraft-tiny preset (nominally ~15M parameters).

	    Meant for rapid iteration and smoke-testing the training loop; run this
	    before committing to a full training run.

	    NOTE(review): PyCraftConfig.param_count_approx evaluates to roughly
	    19.3M for these values, not ~15M - confirm the intended model size.
	    """
	    preset = dict(
	        vocab_size=32000,
	        max_seq_len=512,
	        d_model=256,
	        n_layers=4,
	        n_heads=4,
	        n_kv_heads=2,
	        d_ff=704,
	        use_qk_norm=True,
	        rope_theta=10000.0,
	    )
	    return PyCraftConfig(**preset)


	# ------------------------------------------------------------------ #
	# Quick self-test
	# ------------------------------------------------------------------ #
	if __name__ == "__main__":
	    # Print a summary of the main preset, one line per hyperparameter.
	    main_cfg = get_config_120m()
	    main_params_m = main_cfg.param_count_approx / 1e6
	    summary = [
	        "PyCraft-1 config loaded.",
	        f" d_model : {main_cfg.d_model}",
	        f" n_layers : {main_cfg.n_layers}",
	        f" n_heads : {main_cfg.n_heads} (Q)",
	        f" n_kv_heads : {main_cfg.n_kv_heads} (KV, GQA {main_cfg.n_heads_per_kv}:1)",
	        f" head_dim : {main_cfg.head_dim}",
	        f" d_ff : {main_cfg.d_ff} (SwiGLU)",
	        f" QK-Norm : {main_cfg.use_qk_norm}",
	        f" Approx params: {main_params_m:.1f}M",
	    ]
	    for line in summary:
	        print(line)

	    # Then the smoke-test preset: only its approximate size is reported.
	    tiny_cfg = get_config_tiny()
	    tiny_params_m = tiny_cfg.param_count_approx / 1e6
	    print("\nPyCraft-tiny config loaded.")
	    print(f" Approx params: {tiny_params_m:.1f}M")