JulianKrgd committed
Commit 4e7bc2c · verified · 1 Parent(s): 4d1770a

Upload src/model/config.py with huggingface_hub
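The upload step itself is not shown on this page; below is a minimal sketch of what it presumably looked like, using huggingface_hub's HfApi.upload_file. The repo_id is a placeholder, not taken from this commit, and authentication is assumed to come from a prior `huggingface-cli login`.

# Hypothetical upload sketch; repo_id is a placeholder.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token saved by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="src/model/config.py",  # local file to upload
    path_in_repo="src/model/config.py",     # destination path inside the repo
    repo_id="JulianKrgd/julian",            # placeholder repo id
    commit_message="Upload src/model/config.py with huggingface_hub",
)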

Files changed (1)
  1. src/model/config.py +109 -0
src/model/config.py ADDED
@@ -0,0 +1,109 @@
+ """
+ Julian Model Configuration.
+ 250M parameter GPT-style decoder-only transformer.
+ """
+
+ from dataclasses import dataclass
+ from typing import Optional
+
+
+ @dataclass
+ class JulianConfig:
+     """
+     Configuration for Julian 250M model.
+
+     Architecture: GPT-style decoder-only transformer
+     Parameters: ~250M
+     Optimized for: 5B tokens (Chinchilla optimal)
+     """
+
+     # Model dimensions
+     vocab_size: int = 24000  # SentencePiece vocab
+     max_seq_len: int = 2048  # Context length
+     d_model: int = 1024  # Hidden dimension
+     n_layers: int = 14  # Transformer layers
+     n_heads: int = 16  # Attention heads
+     d_ff: int = 4096  # FFN intermediate (4x d_model)
+
+     # Regularization
+     dropout: float = 0.1
+     attention_dropout: float = 0.1
+
+     # Architecture choices
+     use_bias: bool = False  # No bias (like LLaMA)
+     rope_theta: float = 10000.0  # RoPE base frequency
+     rms_norm_eps: float = 1e-6  # RMSNorm epsilon
+
+     # Initialization
+     initializer_range: float = 0.02
+
+     # Special tokens
+     pad_token_id: int = 0
+     bos_token_id: int = 2
+     eos_token_id: int = 3
+
+     @property
+     def head_dim(self) -> int:
+         return self.d_model // self.n_heads
+
+     def estimate_params(self) -> int:
+         """Estimate total parameters."""
+         # Embeddings (shared input/output)
+         embed_params = self.vocab_size * self.d_model
+
+         # Per transformer layer
+         # Attention: Q, K, V, O projections
+         attn_params = 4 * self.d_model * self.d_model
+         # FFN: up, gate, down projections (SwiGLU style)
+         ffn_params = 3 * self.d_model * self.d_ff
+         # Layer norms
+         norm_params = 2 * self.d_model
+
+         layer_params = attn_params + ffn_params + norm_params
+         total_layer_params = self.n_layers * layer_params
+
+         # Final norm
+         final_norm = self.d_model
+
+         return embed_params + total_layer_params + final_norm
+
+     def __post_init__(self):
+         assert self.d_model % self.n_heads == 0, "d_model must be divisible by n_heads"
+
+
+ # Preset configurations
+ JULIAN_250M = JulianConfig()
+
+ JULIAN_125M = JulianConfig(
+     d_model=768,
+     n_layers=12,
+     n_heads=12,
+     d_ff=3072,
+ )
+
+ JULIAN_100M = JulianConfig(
+     d_model=640,
+     n_layers=12,
+     n_heads=10,
+     d_ff=2560,
+     max_seq_len=2048,
+ )
+
+ JULIAN_500M = JulianConfig(
+     d_model=1280,
+     n_layers=24,
+     n_heads=20,
+     d_ff=5120,
+ )
+
+
+ if __name__ == "__main__":
+     config = JULIAN_250M
+     params = config.estimate_params()
+     print("Julian 250M Configuration:")
+     print(f"  d_model: {config.d_model}")
+     print(f"  n_layers: {config.n_layers}")
+     print(f"  n_heads: {config.n_heads}")
+     print(f"  d_ff: {config.d_ff}")
+     print(f"  vocab_size: {config.vocab_size}")
+     print(f"  Estimated params: {params:,} ({params/1e6:.1f}M)")