Kirim1 committed on
Commit
0d6275e
·
verified ·
1 Parent(s): 808e8db

Create training_config.json

Browse files
Files changed (1) hide show
  1. training_config.json +193 -0
training_config.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_config": {
3
+ "base_model": "Kirim-ai/Kirim-V1-base",
4
+ "target_model": "Kirim-ai/Kirim-1-Math",
5
+ "parameters": "30B",
6
+ "architecture": "KirimForCausalLM",
7
+ "expansion_method": "width_and_depth"
8
+ },
9
+
10
+ "training_stages": {
11
+ "stage_1_expansion": {
12
+ "description": "Expand from 13B to 30B",
13
+ "duration_days": 15,
14
+ "hardware": "512x H100 80GB",
15
+ "method": "progressive_expansion",
16
+ "hidden_size": {
17
+ "from": 4096,
18
+ "to": 5120
19
+ },
20
+ "num_layers": {
21
+ "from": 32,
22
+ "to": 48
23
+ }
24
+ },
25
+
26
+ "stage_2_math_pretraining": {
27
+ "description": "Mathematical corpus pre-training",
28
+ "duration_days": 30,
29
+ "hardware": "512x H100 80GB",
30
+ "data": {
31
+ "total_tokens": "500B",
32
+ "sources": [
33
+ "mathematical_proofs",
34
+ "olympiad_problems",
35
+ "arxiv_math_papers",
36
+ "textbooks",
37
+ "math_stackexchange"
38
+ ],
39
+ "distribution": {
40
+ "proofs": 0.25,
41
+ "problems": 0.30,
42
+ "papers": 0.20,
43
+ "textbooks": 0.15,
44
+ "qa": 0.10
45
+ }
46
+ },
47
+ "hyperparameters": {
48
+ "batch_size": 2048,
49
+ "learning_rate": 1.5e-4,
50
+ "warmup_steps": 2000,
51
+ "weight_decay": 0.1,
52
+ "gradient_clipping": 1.0,
53
+ "optimizer": "AdamW",
54
+ "scheduler": "cosine"
55
+ }
56
+ },
57
+
58
+ "stage_3_instruction_tuning": {
59
+ "description": "Mathematical instruction following",
60
+ "duration_days": 5,
61
+ "hardware": "128x H100 80GB",
62
+ "data": {
63
+ "total_examples": 200000,
64
+ "categories": {
65
+ "algebra": 40000,
66
+ "calculus": 35000,
67
+ "geometry": 30000,
68
+ "number_theory": 25000,
69
+ "probability": 20000,
70
+ "linear_algebra": 20000,
71
+ "discrete_math": 15000,
72
+ "topology": 10000,
73
+ "other": 5000
74
+ }
75
+ },
76
+ "hyperparameters": {
77
+ "batch_size": 128,
78
+ "learning_rate": 2e-5,
79
+ "num_epochs": 3,
80
+ "warmup_ratio": 0.1,
81
+ "weight_decay": 0.01
82
+ }
83
+ },
84
+
85
+ "stage_4_tool_calling": {
86
+ "description": "Tool calling capability training",
87
+ "duration_days": 3,
88
+ "hardware": "64x H100 80GB",
89
+ "data": {
90
+ "total_examples": 50000,
91
+ "tool_types": {
92
+ "calculator": 15000,
93
+ "symbolic_solver": 12000,
94
+ "code_executor": 10000,
95
+ "derivative": 5000,
96
+ "integrate": 5000,
97
+ "other_tools": 3000
98
+ }
99
+ },
100
+ "hyperparameters": {
101
+ "batch_size": 64,
102
+ "learning_rate": 1e-5,
103
+ "num_epochs": 2,
104
+ "gradient_accumulation_steps": 4
105
+ }
106
+ },
107
+
108
+ "stage_5_reinforcement_learning": {
109
+ "description": "RL for solution correctness",
110
+ "duration_days": 7,
111
+ "hardware": "256x H100 80GB",
112
+ "method": "PPO",
113
+ "reward_model": {
114
+ "type": "outcome_based",
115
+ "verification_methods": [
116
+ "symbolic_verification",
117
+ "numerical_check",
118
+ "unit_tests"
119
+ ]
120
+ },
121
+ "hyperparameters": {
122
+ "ppo_epochs": 4,
123
+ "batch_size": 512,
124
+ "learning_rate": 1e-6,
125
+ "clip_range": 0.2,
126
+ "value_loss_coef": 0.5,
127
+ "entropy_coef": 0.01
128
+ }
129
+ }
130
+ },
131
+
132
+ "total_training": {
133
+ "duration_days": 60,
134
+ "gpu_hours": 615936,
135
+ "estimated_cost_usd": 450000,
136
+ "total_tokens_processed": "1.5T",
137
+ "checkpoint_frequency": "every_1000_steps"
138
+ },
139
+
140
+ "optimization": {
141
+ "precision": "BF16",
142
+ "gradient_checkpointing": true,
143
+ "flash_attention": true,
144
+ "zero_optimization": {
145
+ "stage": 3,
146
+ "offload_optimizer": true,
147
+ "offload_param": false
148
+ },
149
+ "tensor_parallelism": 8,
150
+ "pipeline_parallelism": 4,
151
+ "data_parallelism": 16
152
+ },
153
+
154
+ "evaluation": {
155
+ "frequency": "every_500_steps",
156
+ "benchmarks": [
157
+ "GSM8K",
158
+ "MATH",
159
+ "MMLU-Math",
160
+ "Minerva",
161
+ "AMC10",
162
+ "AMC12",
163
+ "AIME"
164
+ ],
165
+ "tool_calling_tests": true,
166
+ "unit_test_coverage": 0.95
167
+ },
168
+
169
+ "data_sources": {
170
+ "mathematical_corpus": {
171
+ "proofs": {
172
+ "size": "125B tokens",
173
+ "sources": ["ProofWiki", "Lean", "Coq", "Isabelle"]
174
+ },
175
+ "olympiad_problems": {
176
+ "size": "150B tokens",
177
+ "sources": ["IMO", "USAMO", "AMC", "AIME", "Putnam"]
178
+ },
179
+ "arxiv_papers": {
180
+ "size": "100B tokens",
181
+ "categories": ["math.AC", "math.AG", "math.NT", "math.CO"]
182
+ },
183
+ "textbooks": {
184
+ "size": "75B tokens",
185
+ "levels": ["undergraduate", "graduate", "reference"]
186
+ },
187
+ "qa_platforms": {
188
+ "size": "50B tokens",
189
+ "sources": ["Math StackExchange", "MathOverflow"]
190
+ }
191
+ }
192
+ }
193
+ }