# Provenance: uploaded via huggingface_hub by JulianKrgd (commit 4e7bc2c, verified).
"""
Julian Model Configuration.
250M parameter GPT-style decoder-only transformer.
"""
from dataclasses import dataclass
from typing import Optional
@dataclass
class JulianConfig:
    """
    Configuration for Julian 250M model.

    Architecture: GPT-style decoder-only transformer
    Parameters: ~250M
    Optimized for: 5B tokens (Chinchilla optimal)
    """

    # --- Model dimensions ---
    vocab_size: int = 24000        # SentencePiece vocab size
    max_seq_len: int = 2048        # Context length
    d_model: int = 1024            # Hidden dimension
    n_layers: int = 14             # Transformer layers
    n_heads: int = 16              # Attention heads (must divide d_model)
    d_ff: int = 4096               # FFN intermediate (4x d_model)

    # --- Regularization ---
    dropout: float = 0.1
    attention_dropout: float = 0.1

    # --- Architecture choices ---
    use_bias: bool = False         # No bias on linear layers (like LLaMA)
    rope_theta: float = 10000.0    # RoPE base frequency
    rms_norm_eps: float = 1e-6     # RMSNorm epsilon

    # --- Initialization ---
    initializer_range: float = 0.02

    # --- Special tokens ---
    pad_token_id: int = 0
    bos_token_id: int = 2
    eos_token_id: int = 3

    def __post_init__(self) -> None:
        """Validate the configuration after dataclass init.

        Raises:
            ValueError: if ``d_model`` is not divisible by ``n_heads``.
        """
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would silently allow an invalid head split.
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by "
                f"n_heads ({self.n_heads})"
            )

    @property
    def head_dim(self) -> int:
        """Per-head attention dimension (d_model // n_heads)."""
        return self.d_model // self.n_heads

    def estimate_params(self) -> int:
        """Estimate total parameters for this configuration.

        Assumes tied input/output embeddings, a SwiGLU-style FFN
        (three projections), RMSNorm (weight only, no bias), and
        bias-free linear layers — matching the fields above.
        """
        # Embeddings (shared input/output)
        embed_params = self.vocab_size * self.d_model
        # Attention: Q, K, V, O projections
        attn_params = 4 * self.d_model * self.d_model
        # FFN: up, gate, down projections (SwiGLU style)
        ffn_params = 3 * self.d_model * self.d_ff
        # Two norms per layer (pre-attention, pre-FFN)
        norm_params = 2 * self.d_model
        layer_params = attn_params + ffn_params + norm_params
        total_layer_params = self.n_layers * layer_params
        # Final norm after the last layer
        final_norm = self.d_model
        return embed_params + total_layer_params + final_norm
# Preset configurations
# Baseline preset: all defaults from JulianConfig (d_model=1024, n_layers=14).
JULIAN_250M = JulianConfig()
# Smaller preset; width/depth follow the classic 768x12x12 shape.
# NOTE(review): "~125M" is implied by the name — verify with estimate_params().
JULIAN_125M = JulianConfig(
d_model=768,
n_layers=12,
n_heads=12,
d_ff=3072,
)
# Smallest preset (640-wide, 10 heads -> head_dim 64).
# max_seq_len=2048 is passed explicitly but matches the default.
JULIAN_100M = JulianConfig(
d_model=640,
n_layers=12,
n_heads=10,
d_ff=2560,
max_seq_len=2048,
)
# Largest preset (1280-wide, 24 layers, 20 heads).
JULIAN_500M = JulianConfig(
d_model=1280,
n_layers=24,
n_heads=20,
d_ff=5120,
)
if __name__ == "__main__":
    # Quick sanity check: print the 250M preset and its parameter estimate.
    cfg = JULIAN_250M
    total = cfg.estimate_params()
    print("Julian 250M Configuration:")
    for field_name in ("d_model", "n_layers", "n_heads", "d_ff", "vocab_size"):
        print(f" {field_name}: {getattr(cfg, field_name)}")
    print(f" Estimated params: {total:,} ({total/1e6:.1f}M)")