| """ |
| Julian Model Configuration. |
| 250M parameter GPT-style decoder-only transformer. |
| """ |
|
|
| from dataclasses import dataclass |
| from typing import Optional |
|
|
|
|
@dataclass
class JulianConfig:
    """
    Configuration for Julian 250M model.

    Architecture: GPT-style decoder-only transformer
    Parameters: ~250M
    Optimized for: 5B tokens (Chinchilla optimal)
    """

    # --- Model dimensions ---
    vocab_size: int = 24000      # tokenizer vocabulary size
    max_seq_len: int = 2048      # maximum context length
    d_model: int = 1024          # hidden / embedding width
    n_layers: int = 14           # number of transformer blocks
    n_heads: int = 16            # attention heads (head_dim = d_model // n_heads)
    d_ff: int = 4096             # feed-forward inner width

    # --- Regularization ---
    dropout: float = 0.1             # residual/embedding dropout probability
    attention_dropout: float = 0.1   # dropout on attention weights

    # --- Architectural switches ---
    use_bias: bool = False       # whether linear layers carry bias terms
    rope_theta: float = 10000.0  # RoPE base frequency
    rms_norm_eps: float = 1e-6   # RMSNorm epsilon

    # --- Initialization ---
    initializer_range: float = 0.02  # stddev for weight init

    # --- Special token ids ---
    pad_token_id: int = 0
    bos_token_id: int = 2
    eos_token_id: int = 3

    @property
    def head_dim(self) -> int:
        """Per-head width of the attention projections."""
        return self.d_model // self.n_heads

    def estimate_params(self) -> int:
        """Estimate the total parameter count.

        Accounting (all bias-free, matching ``use_bias=False``):
        - token embeddings: ``vocab_size * d_model`` — no separate output
          head is counted, which presumably means tied input/output
          embeddings (TODO confirm against the model implementation);
        - per layer: four square attention projections (Q, K, V, O),
          three ``d_model x d_ff`` FFN matrices (the factor 3 suggests a
          gated FFN such as SwiGLU — confirm against the model code),
          and two norms;
        - one final norm. RoPE contributes no parameters.

        Returns:
            Estimated number of trainable parameters.
        """
        embed_params = self.vocab_size * self.d_model

        # Q, K, V and output projections, each d_model x d_model.
        attn_params = 4 * self.d_model * self.d_model
        # Three FFN matrices of d_model x d_ff each.
        ffn_params = 3 * self.d_model * self.d_ff
        # Two norm layers per block, one scale vector each.
        norm_params = 2 * self.d_model

        layer_params = attn_params + ffn_params + norm_params
        total_layer_params = self.n_layers * layer_params

        # Final norm before the output projection.
        final_norm = self.d_model

        return embed_params + total_layer_params + final_norm

    def __post_init__(self) -> None:
        """Validate the configuration.

        Raises:
            ValueError: if ``d_model`` is not divisible by ``n_heads``.
        """
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would silently skip this validation.
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by "
                f"n_heads ({self.n_heads})"
            )
|
|
|
|
| |
# Default preset; estimate_params() gives ~259M (head_dim = 64).
JULIAN_250M = JulianConfig()


# GPT-2-small-shaped preset; estimate_params() gives ~132M.
JULIAN_125M = JulianConfig(
    d_model=768,
    n_layers=12,
    n_heads=12,
    d_ff=3072,
)


# Smallest preset; estimate_params() gives ~94M (head_dim = 64).
JULIAN_100M = JulianConfig(
    d_model=640,
    n_layers=12,
    n_heads=10,
    d_ff=2560,
    max_seq_len=2048,
)


# NOTE(review): estimate_params() gives ~660M for this config, not ~500M —
# confirm whether the name or the dimensions are intended.
JULIAN_500M = JulianConfig(
    d_model=1280,
    n_layers=24,
    n_heads=20,
    d_ff=5120,
)
|
|
|
|
if __name__ == "__main__":
    # Smoke check: print a summary of the default (250M) configuration.
    config = JULIAN_250M
    params = config.estimate_params()
    # Plain string — was an f-string with no placeholders (ruff F541).
    print("Julian 250M Configuration:")
    print(f"  d_model: {config.d_model}")
    print(f"  n_layers: {config.n_layers}")
    print(f"  n_heads: {config.n_heads}")
    print(f"  d_ff: {config.d_ff}")
    print(f"  vocab_size: {config.vocab_size}")
    print(f"  Estimated params: {params:,} ({params/1e6:.1f}M)")
|
|