RonanMcGovern committed (verified)
Commit: cef32ba
Parent(s): 253da6e

Upload via push_to_hf.py

report/latest/base-model-evaluation.md ADDED
@@ -0,0 +1,28 @@
+ ## Base model evaluation
+ timestamp: 2025-11-29 14:05:35
+
+ - Model: base_model (step 21400)
+ - CORE metric: 0.2065
+ - hellaswag_zeroshot: 0.2604
+ - jeopardy: 0.1190
+ - bigbench_qa_wikidata: 0.5148
+ - arc_easy: 0.5230
+ - arc_challenge: 0.1206
+ - copa: 0.4200
+ - commonsense_qa: 0.0448
+ - piqa: 0.3765
+ - openbook_qa: 0.1227
+ - lambada_openai: 0.3722
+ - hellaswag: 0.2647
+ - winograd: 0.2674
+ - winogrande: 0.0687
+ - bigbench_dyck_languages: 0.1200
+ - agi_eval_lsat_ar: 0.1087
+ - bigbench_cs_algorithms: 0.3667
+ - bigbench_operators: 0.1619
+ - bigbench_repeat_copy_logic: 0.0000
+ - squad: 0.2326
+ - coqa: 0.2043
+ - boolq: -0.3085
+ - bigbench_language_identification: 0.1815
+
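A note on reading these scores: results of this kind are usually reported as chance-centred accuracies, rescaled so that random guessing maps to 0 and a perfect score to 1, which is how a task such as boolq can come out negative. A minimal sketch of that convention (the exact centring used by the eval harness is an assumption here):

```python
# Sketch of chance-centred scoring, assuming the convention
#   centred = (accuracy - chance) / (1 - chance)
# so random guessing maps to 0.0 and a perfect score to 1.0.

def centred_score(accuracy: float, chance: float) -> float:
    return (accuracy - chance) / (1.0 - chance)

def raw_accuracy(centred: float, chance: float) -> float:
    # Inverse mapping: read a centred score back as a raw accuracy.
    return centred * (1.0 - chance) + chance

# Example: a centred boolq score of -0.3085 with a 50% chance baseline
# corresponds to roughly 34.6% raw accuracy, i.e. below random guessing.
print(raw_accuracy(-0.3085, chance=0.5))  # ~0.3458
```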
report/latest/base-model-loss.md ADDED
@@ -0,0 +1,13 @@
+ ## Base model loss
+ timestamp: 2025-11-29 14:02:20
+
+ - train bpb: 0.8148
+ - val bpb: 0.8122
+ - sample 0: <|bos|>The capital of France is Paris. It is the largest city in France and the second largest city in Europe
+ - sample 1: <|bos|>The chemical symbol of gold is Au. It is a soft, malleable, ductile, and malleable metal. It
+ - sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Monday. If tomorrow is Monday, then tomorrow will be Tuesday. If tomorrow is
+ - sample 3: <|bos|>The opposite of hot is cold. The opposite of cold is hot. The opposite of hot is cold.
+ - sample 4: <|bos|>The planets of the solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune,
+ - sample 5: <|bos|>My favorite color is blue. I love blue because it is a color that is associated with happiness and
+ - sample 6: <|bos|>If 5*x + 3 = 13, then x is 3. If 5*x + 3 = 13, then
+
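Bits per byte (bpb) reports the language-modelling loss in a tokenizer-independent unit: the cross-entropy is converted from nats to bits and normalised by raw UTF-8 bytes rather than tokens. A minimal sketch of the conversion; the 2.75 nats/token and 4.86 bytes/token figures below are illustrative only (the latter taken from the fwe-val row of the tokenizer evaluation):

```python
import math

def bits_per_byte(mean_loss_nats_per_token: float, bytes_per_token: float) -> float:
    # Convert mean cross-entropy per token (in nats) to bits per byte of text.
    return mean_loss_nats_per_token / (math.log(2) * bytes_per_token)

# Illustrative numbers: ~2.75 nats/token at ~4.86 bytes/token gives
# roughly 0.816 bpb, the same ballpark as the 0.8122 reported above.
print(bits_per_byte(2.75, 4.86))  # ~0.816
```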
report/latest/base-model-training.md ADDED
@@ -0,0 +1,45 @@
+ ## Base model training
+ timestamp: 2025-11-29 14:01:35
+
+ - run: d20
+ - device_type:
+ - depth: 20
+ - max_seq_len: 2048
+ - num_iterations: -1
+ - target_flops: -1.0000
+ - target_param_data_ratio: 20
+ - device_batch_size: 32
+ - total_batch_size: 524,288
+ - embedding_lr: 0.2000
+ - unembedding_lr: 0.0040
+ - weight_decay: 0.0000
+ - matrix_lr: 0.0200
+ - grad_clip: 1.0000
+ - warmup_ratio: 0.0000
+ - warmdown_ratio: 0.2000
+ - final_lr_frac: 0.0000
+ - resume_from_step: -1
+ - eval_every: 250
+ - eval_tokens: 10,485,760
+ - core_metric_every: 2000
+ - core_metric_max_per_task: 500
+ - sample_every: 2000
+ - save_every: -1
+ - model_tag:
+ - Number of parameters: 560,988,160
+ - Number of FLOPs per token: 3.491758e+09
+ - Calculated number of iterations: 21,400
+ - Number of training tokens: 11,219,763,200
+ - Tokens : Params ratio: 20.0000
+ - DDP world size: 8
+ - warmup_ratio: 0.0000
+ - warmdown_ratio: 0.2000
+ - final_lr_frac: 0.0000
+ - Minimum validation bpb: 0.8120
+ - Final validation bpb: 0.8120
+ - CORE metric estimate: 0.2220
+ - MFU %: 48.17%
+ - Total training flops: 3.917670e+19
+ - Total training time: 186.38m
+ - Peak memory usage: 75422.52MiB
+
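The derived quantities in this block follow directly from the config: with num_iterations set to -1, the token budget comes from the Chinchilla-style target_param_data_ratio and the iteration count from the total batch size. A quick re-derivation of the reported numbers (a sketch; the training script computes these itself):

```python
# Re-derive the training budget from the config values reported above.
num_params = 560_988_160
target_param_data_ratio = 20
total_batch_size = 524_288            # tokens per optimizer step
flops_per_token = 3.491758e9

train_tokens = num_params * target_param_data_ratio   # 11,219,763,200
num_iterations = train_tokens // total_batch_size      # 21,400
total_flops = flops_per_token * train_tokens            # ~3.9177e+19

print(f"{train_tokens=:,}")
print(f"{num_iterations=:,}")
print(f"total_flops={total_flops:.6e}")
```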
report/latest/chat-evaluation-mid.md CHANGED
@@ -1,5 +1,5 @@
  ## Chat evaluation mid
- timestamp: 2025-11-28 20:35:29
+ timestamp: 2025-11-29 14:21:52
 
  - source: mid
  - task_name: None
@@ -13,11 +13,11 @@ timestamp: 2025-11-28 20:35:29
  - step: None
  - max_problems: None
  - device_type:
- - ARC-Easy: 0.4516
- - ARC-Challenge: 0.3217
- - MMLU: 0.3326
- - GSM8K: 0.0349
- - HumanEval: 0.0000
- - SpellingBee: 0.9805
- - ChatCORE metric: 0.2483
+ - ARC-Easy: 0.4322
+ - ARC-Challenge: 0.3336
+ - MMLU: 0.3363
+ - GSM8K: 0.0326
+ - HumanEval: 0.0976
+ - SpellingBee: 0.9766
+ - ChatCORE metric: 0.2627
 
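The ChatCORE number aggregates the task scores above in a chance-centred way, analogous to CORE for the base model. Re-deriving the new value as a sanity check (the per-task chance baselines are assumptions: 0.25 for the 4-way multiple-choice tasks, 0.0 for the generative ones):

```python
# Sanity check: ChatCORE as the mean of chance-centred task scores,
# centred = (score - baseline) / (1 - baseline). Baselines are assumptions.
scores = {
    "ARC-Easy":      (0.4322, 0.25),
    "ARC-Challenge": (0.3336, 0.25),
    "MMLU":          (0.3363, 0.25),
    "GSM8K":         (0.0326, 0.0),
    "HumanEval":     (0.0976, 0.0),
    "SpellingBee":   (0.9766, 0.0),
}
chat_core = sum((s - b) / (1 - b) for s, b in scores.values()) / len(scores)
print(f"{chat_core:.4f}")  # 0.2627, matching the reported ChatCORE metric
```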
report/latest/chat-evaluation-sft.md CHANGED
@@ -1,5 +1,5 @@
  ## Chat evaluation sft
- timestamp: 2025-11-28 20:46:42
+ timestamp: 2025-11-29 14:29:57
 
  - source: sft
  - task_name: None
@@ -13,11 +13,11 @@ timestamp: 2025-11-28 20:46:42
  - step: None
  - max_problems: None
  - device_type:
- - ARC-Easy: 0.4790
- - ARC-Challenge: 0.3362
- - MMLU: 0.3319
- - GSM8K: 0.0637
- - HumanEval: 0.0061
+ - ARC-Easy: 0.4630
+ - ARC-Challenge: 0.3234
+ - MMLU: 0.3222
+ - GSM8K: 0.0508
+ - HumanEval: 0.1220
  - SpellingBee: 0.9883
- - ChatCORE metric: 0.2646
+ - ChatCORE metric: 0.2732
 
report/latest/chat-sft.md CHANGED
@@ -1,7 +1,7 @@
  ## Chat SFT
- timestamp: 2025-11-28 20:38:13
+ timestamp: 2025-11-29 14:24:18
 
- - run: dummy
+ - run: d20
  - source: mid
  - device_type:
  - dtype: bfloat16
@@ -20,6 +20,6 @@ timestamp: 2025-11-28 20:38:13
  - eval_metrics_max_problems: 1024
  - Training rows: 22,439
  - Number of iterations: 701
- - Training loss: 0.5186
- - Validation loss: 1.0106
+ - Training loss: 0.5388
+ - Validation loss: 1.0110
 
report/latest/header.md CHANGED
@@ -1,36 +1,36 @@
  # nanochat training report
 
- Generated: 2025-11-28 15:10:26
+ Generated: 2025-11-29 10:42:07
 
  ## Environment
 
  ### Git Information
  - Branch: master
- - Commit: 4a87a0d (dirty)
- - Message: Merge pull request #299 from samjabrahams/rotary_embedding_head_dim_comment_clea
+ - Commit: 90d6352 (dirty)
+ - Message: Add Runpod one-click template; push/pull to/from HF; add hf_transfer
 
  ### Hardware
  - Platform: Linux
- - CPUs: 80 cores (160 logical)
- - Memory: 1511.8 GB
+ - CPUs: 128 cores (256 logical)
+ - Memory: 1511.5 GB
  - GPUs: 8x NVIDIA H100 80GB HBM3
- - GPU Memory: 633.7 GB total
+ - GPU Memory: 633.5 GB total
  - CUDA Version: 12.8
  - Hourly Rate: $24.00/hour
 
  ### Software
- - Python: 3.10.18
+ - Python: 3.10.12
  - PyTorch: 2.8.0+cu128
 
 
  ### Bloat
- - Characters: 405,381
- - Lines: 9,847
- - Files: 48
- - Tokens (approx): 101,345
- - Dependencies (uv.lock lines): 2,218
+ - Characters: 423,422
+ - Lines: 10,322
+ - Files: 51
+ - Tokens (approx): 105,855
+ - Dependencies (uv.lock lines): 2,252
 
- Run started: 2025-11-28 15:10:29
+ Run started: 2025-11-29 10:42:10
 
  ---
 
report/latest/midtraining.md CHANGED
@@ -1,7 +1,7 @@
  ## Midtraining
- timestamp: 2025-11-28 20:25:25
+ timestamp: 2025-11-29 14:15:20
 
- - run: dummy
+ - run: d20
  - device_type:
  - dtype: bfloat16
  - num_iterations: -1
@@ -18,5 +18,5 @@ timestamp: 2025-11-28 20:25:25
  - dry_run: 0
  - Number of iterations: 809
  - DDP world size: 8
- - Minimum validation bpb: 0.3953
+ - Minimum validation bpb: 0.3954
 
report/latest/report.md CHANGED
@@ -1,43 +1,174 @@
  # nanochat training report
 
- Generated: 2025-11-28 15:10:26
+ Generated: 2025-11-29 10:42:07
 
  ## Environment
 
  ### Git Information
  - Branch: master
- - Commit: 4a87a0d (dirty)
- - Message: Merge pull request #299 from samjabrahams/rotary_embedding_head_dim_comment_clea
+ - Commit: 90d6352 (dirty)
+ - Message: Add Runpod one-click template; push/pull to/from HF; add hf_transfer
 
  ### Hardware
  - Platform: Linux
- - CPUs: 80 cores (160 logical)
- - Memory: 1511.8 GB
+ - CPUs: 128 cores (256 logical)
+ - Memory: 1511.5 GB
  - GPUs: 8x NVIDIA H100 80GB HBM3
- - GPU Memory: 633.7 GB total
+ - GPU Memory: 633.5 GB total
  - CUDA Version: 12.8
  - Hourly Rate: $24.00/hour
 
  ### Software
- - Python: 3.10.18
+ - Python: 3.10.12
  - PyTorch: 2.8.0+cu128
 
 
  ### Bloat
- - Characters: 405,381
- - Lines: 9,847
- - Files: 48
- - Tokens (approx): 101,345
- - Dependencies (uv.lock lines): 2,218
+ - Characters: 423,422
+ - Lines: 10,322
+ - Files: 51
+ - Tokens (approx): 105,855
+ - Dependencies (uv.lock lines): 2,252
 
- Run started: 2025-11-28 15:10:29
+ Run started: 2025-11-29 10:42:10
 
  ---
 
+ ## Tokenizer training
+ timestamp: 2025-11-29 10:43:39
+
+ - max_chars: 2,000,000,000
+ - doc_cap: 10,000
+ - vocab_size: 65,536
+ - train_time: 62.1400
+ - num_special_tokens: 9
+ - token_bytes_min: 1
+ - token_bytes_max: 32
+ - token_bytes_mean: 6.9197
+ - token_bytes_std: 2.8748
+
+
+ ## Tokenizer evaluation
+ timestamp: 2025-11-29 10:43:47
+
+ ### Comparison with GPT-2
+
+ | Text Type | Bytes | GPT-2 Tokens | GPT-2 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+ |-----------|-------|--------------|--------------|-------------|------------|-----------------|
+ | news | 1819 | 404 | 4.50 | 375 | 4.85 | +7.2% |
+ | korean | 893 | 745 | 1.20 | 712 | 1.25 | +4.4% |
+ | code | 1259 | 576 | 2.19 | 492 | 2.56 | +14.6% |
+ | math | 1834 | 936 | 1.96 | 966 | 1.90 | -3.2% |
+ | science | 1112 | 260 | 4.28 | 228 | 4.88 | +12.3% |
+ | fwe-train | 4208518 | 900364 | 4.67 | 856883 | 4.91 | +4.8% |
+ | fwe-val | 4908443 | 1059062 | 4.63 | 1010352 | 4.86 | +4.6% |
+
+ ### Comparison with GPT-4
+
+ | Text Type | Bytes | GPT-4 Tokens | GPT-4 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+ |-----------|-------|--------------|--------------|-------------|------------|-----------------|
+ | news | 1819 | 387 | 4.70 | 375 | 4.85 | +3.1% |
+ | korean | 893 | 364 | 2.45 | 712 | 1.25 | -95.6% |
+ | code | 1259 | 309 | 4.07 | 492 | 2.56 | -59.2% |
+ | math | 1834 | 832 | 2.20 | 966 | 1.90 | -16.1% |
+ | science | 1112 | 249 | 4.47 | 228 | 4.88 | +8.4% |
+ | fwe-train | 4208518 | 874799 | 4.81 | 856883 | 4.91 | +2.0% |
+ | fwe-val | 4908443 | 1029691 | 4.77 | 1010352 | 4.86 | +1.9% |
+
+
+ ## Base model training
+ timestamp: 2025-11-29 14:01:35
+
+ - run: d20
+ - device_type:
+ - depth: 20
+ - max_seq_len: 2048
+ - num_iterations: -1
+ - target_flops: -1.0000
+ - target_param_data_ratio: 20
+ - device_batch_size: 32
+ - total_batch_size: 524,288
+ - embedding_lr: 0.2000
+ - unembedding_lr: 0.0040
+ - weight_decay: 0.0000
+ - matrix_lr: 0.0200
+ - grad_clip: 1.0000
+ - warmup_ratio: 0.0000
+ - warmdown_ratio: 0.2000
+ - final_lr_frac: 0.0000
+ - resume_from_step: -1
+ - eval_every: 250
+ - eval_tokens: 10,485,760
+ - core_metric_every: 2000
+ - core_metric_max_per_task: 500
+ - sample_every: 2000
+ - save_every: -1
+ - model_tag:
+ - Number of parameters: 560,988,160
+ - Number of FLOPs per token: 3.491758e+09
+ - Calculated number of iterations: 21,400
+ - Number of training tokens: 11,219,763,200
+ - Tokens : Params ratio: 20.0000
+ - DDP world size: 8
+ - warmup_ratio: 0.0000
+ - warmdown_ratio: 0.2000
+ - final_lr_frac: 0.0000
+ - Minimum validation bpb: 0.8120
+ - Final validation bpb: 0.8120
+ - CORE metric estimate: 0.2220
+ - MFU %: 48.17%
+ - Total training flops: 3.917670e+19
+ - Total training time: 186.38m
+ - Peak memory usage: 75422.52MiB
+
+
+ ## Base model loss
+ timestamp: 2025-11-29 14:02:20
+
+ - train bpb: 0.8148
+ - val bpb: 0.8122
+ - sample 0: <|bos|>The capital of France is Paris. It is the largest city in France and the second largest city in Europe
+ - sample 1: <|bos|>The chemical symbol of gold is Au. It is a soft, malleable, ductile, and malleable metal. It
+ - sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Monday. If tomorrow is Monday, then tomorrow will be Tuesday. If tomorrow is
+ - sample 3: <|bos|>The opposite of hot is cold. The opposite of cold is hot. The opposite of hot is cold.
+ - sample 4: <|bos|>The planets of the solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune,
+ - sample 5: <|bos|>My favorite color is blue. I love blue because it is a color that is associated with happiness and
+ - sample 6: <|bos|>If 5*x + 3 = 13, then x is 3. If 5*x + 3 = 13, then
+
+
+ ## Base model evaluation
+ timestamp: 2025-11-29 14:05:35
+
+ - Model: base_model (step 21400)
+ - CORE metric: 0.2065
+ - hellaswag_zeroshot: 0.2604
+ - jeopardy: 0.1190
+ - bigbench_qa_wikidata: 0.5148
+ - arc_easy: 0.5230
+ - arc_challenge: 0.1206
+ - copa: 0.4200
+ - commonsense_qa: 0.0448
+ - piqa: 0.3765
+ - openbook_qa: 0.1227
+ - lambada_openai: 0.3722
+ - hellaswag: 0.2647
+ - winograd: 0.2674
+ - winogrande: 0.0687
+ - bigbench_dyck_languages: 0.1200
+ - agi_eval_lsat_ar: 0.1087
+ - bigbench_cs_algorithms: 0.3667
+ - bigbench_operators: 0.1619
+ - bigbench_repeat_copy_logic: 0.0000
+ - squad: 0.2326
+ - coqa: 0.2043
+ - boolq: -0.3085
+ - bigbench_language_identification: 0.1815
+
+
  ## Midtraining
- timestamp: 2025-11-28 20:25:25
+ timestamp: 2025-11-29 14:15:20
 
- - run: dummy
+ - run: d20
  - device_type:
  - dtype: bfloat16
  - num_iterations: -1
@@ -54,11 +185,11 @@ timestamp: 2025-11-28 20:25:25
  - dry_run: 0
  - Number of iterations: 809
  - DDP world size: 8
- - Minimum validation bpb: 0.3953
+ - Minimum validation bpb: 0.3954
 
 
  ## Chat evaluation mid
- timestamp: 2025-11-28 20:35:29
+ timestamp: 2025-11-29 14:21:52
 
  - source: mid
  - task_name: None
@@ -72,19 +203,19 @@ timestamp: 2025-11-28 20:35:29
  - step: None
  - max_problems: None
  - device_type:
- - ARC-Easy: 0.4516
- - ARC-Challenge: 0.3217
- - MMLU: 0.3326
- - GSM8K: 0.0349
- - HumanEval: 0.0000
- - SpellingBee: 0.9805
- - ChatCORE metric: 0.2483
+ - ARC-Easy: 0.4322
+ - ARC-Challenge: 0.3336
+ - MMLU: 0.3363
+ - GSM8K: 0.0326
+ - HumanEval: 0.0976
+ - SpellingBee: 0.9766
+ - ChatCORE metric: 0.2627
 
 
  ## Chat SFT
- timestamp: 2025-11-28 20:38:13
+ timestamp: 2025-11-29 14:24:18
 
- - run: dummy
+ - run: d20
  - source: mid
  - device_type:
  - dtype: bfloat16
@@ -103,12 +234,12 @@ timestamp: 2025-11-28 20:38:13
  - eval_metrics_max_problems: 1024
  - Training rows: 22,439
  - Number of iterations: 701
- - Training loss: 0.5186
- - Validation loss: 1.0106
+ - Training loss: 0.5388
+ - Validation loss: 1.0110
 
 
  ## Chat evaluation sft
- timestamp: 2025-11-28 20:46:42
+ timestamp: 2025-11-29 14:29:57
 
  - source: sft
  - task_name: None
@@ -122,30 +253,31 @@ timestamp: 2025-11-28 20:46:42
  - step: None
  - max_problems: None
  - device_type:
- - ARC-Easy: 0.4790
- - ARC-Challenge: 0.3362
- - MMLU: 0.3319
- - GSM8K: 0.0637
- - HumanEval: 0.0061
+ - ARC-Easy: 0.4630
+ - ARC-Challenge: 0.3234
+ - MMLU: 0.3222
+ - GSM8K: 0.0508
+ - HumanEval: 0.1220
  - SpellingBee: 0.9883
- - ChatCORE metric: 0.2646
+ - ChatCORE metric: 0.2732
 
 
  ## Summary
 
- - Characters: 405,381
- - Lines: 9,847
- - Files: 48
- - Tokens (approx): 101,345
- - Dependencies (uv.lock lines): 2,218
+ - Characters: 423,422
+ - Lines: 10,322
+ - Files: 51
+ - Tokens (approx): 105,855
+ - Dependencies (uv.lock lines): 2,252
 
  | Metric | BASE | MID | SFT | RL |
  |-----------------|----------|----------|----------|----------|
- | ARC-Challenge | - | 0.3217 | 0.3362 | - |
- | ARC-Easy | - | 0.4516 | 0.4790 | - |
- | GSM8K | - | 0.0349 | 0.0637 | - |
- | HumanEval | - | 0.0000 | 0.0061 | - |
- | MMLU | - | 0.3326 | 0.3319 | - |
- | ChatCORE | - | 0.2483 | 0.2646 | - |
-
- Total wall clock time: 5h36m
+ | CORE | 0.2065 | - | - | - |
+ | ARC-Challenge | - | 0.3336 | 0.3234 | - |
+ | ARC-Easy | - | 0.4322 | 0.4630 | - |
+ | GSM8K | - | 0.0326 | 0.0508 | - |
+ | HumanEval | - | 0.0976 | 0.1220 | - |
+ | MMLU | - | 0.3363 | 0.3222 | - |
+ | ChatCORE | - | 0.2627 | 0.2732 | - |
+
+ Total wall clock time: 3h47m
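At the $24.00/hour rate listed in the environment section, the wall clock above maps directly to a dollar cost for the whole run; a back-of-the-envelope check:

```python
# Back-of-the-envelope cost of the run at the quoted hourly rate.
hours = 3 + 47 / 60       # "3h47m" total wall clock time
hourly_rate = 24.00       # $/hour for the 8xH100 node
print(f"~${hours * hourly_rate:.2f}")  # ~$90.80
```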
report/latest/tokenizer-evaluation.md ADDED
@@ -0,0 +1,27 @@
+ ## Tokenizer evaluation
+ timestamp: 2025-11-29 10:43:47
+
+ ### Comparison with GPT-2
+
+ | Text Type | Bytes | GPT-2 Tokens | GPT-2 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+ |-----------|-------|--------------|--------------|-------------|------------|-----------------|
+ | news | 1819 | 404 | 4.50 | 375 | 4.85 | +7.2% |
+ | korean | 893 | 745 | 1.20 | 712 | 1.25 | +4.4% |
+ | code | 1259 | 576 | 2.19 | 492 | 2.56 | +14.6% |
+ | math | 1834 | 936 | 1.96 | 966 | 1.90 | -3.2% |
+ | science | 1112 | 260 | 4.28 | 228 | 4.88 | +12.3% |
+ | fwe-train | 4208518 | 900364 | 4.67 | 856883 | 4.91 | +4.8% |
+ | fwe-val | 4908443 | 1059062 | 4.63 | 1010352 | 4.86 | +4.6% |
+
+ ### Comparison with GPT-4
+
+ | Text Type | Bytes | GPT-4 Tokens | GPT-4 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+ |-----------|-------|--------------|--------------|-------------|------------|-----------------|
+ | news | 1819 | 387 | 4.70 | 375 | 4.85 | +3.1% |
+ | korean | 893 | 364 | 2.45 | 712 | 1.25 | -95.6% |
+ | code | 1259 | 309 | 4.07 | 492 | 2.56 | -59.2% |
+ | math | 1834 | 832 | 2.20 | 966 | 1.90 | -16.1% |
+ | science | 1112 | 249 | 4.47 | 228 | 4.88 | +8.4% |
+ | fwe-train | 4208518 | 874799 | 4.81 | 856883 | 4.91 | +2.0% |
+ | fwe-val | 4908443 | 1029691 | 4.77 | 1010352 | 4.86 | +1.9% |
+
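In these tables the Ratio columns are bytes per token (higher means better compression) and the relative diff is computed on token counts against the reference tokenizer, so positive values favour our tokenizer. A small sketch reproducing the news row of the GPT-2 comparison (the column semantics are as read from the table, which is an assumption on my part):

```python
# Reproduce one row of the GPT-2 comparison, assuming:
#   ratio         = bytes / tokens                      (bytes per token)
#   relative diff = (ref_tokens - our_tokens) / ref_tokens
raw_bytes, gpt2_tokens, our_tokens = 1819, 404, 375   # the "news" row

gpt2_ratio = raw_bytes / gpt2_tokens      # 4.50
our_ratio = raw_bytes / our_tokens        # 4.85
rel_diff = (gpt2_tokens - our_tokens) / gpt2_tokens

print(f"{gpt2_ratio:.2f} {our_ratio:.2f} {rel_diff:+.1%}")  # 4.50 4.85 +7.2%
```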
report/latest/tokenizer-training.md ADDED
@@ -0,0 +1,13 @@
+ ## Tokenizer training
+ timestamp: 2025-11-29 10:43:39
+
+ - max_chars: 2,000,000,000
+ - doc_cap: 10,000
+ - vocab_size: 65,536
+ - train_time: 62.1400
+ - num_special_tokens: 9
+ - token_bytes_min: 1
+ - token_bytes_max: 32
+ - token_bytes_mean: 6.9197
+ - token_bytes_std: 2.8748
+
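The token_bytes_* entries summarise the byte lengths of the learned vocabulary items. A generic sketch of how such statistics could be computed from a trained vocabulary (representing the vocab as a list of byte strings is a hypothetical stand-in, not nanochat's actual API):

```python
import statistics

def token_byte_stats(vocab: list[bytes]) -> dict[str, float]:
    # Summarise token lengths in bytes for a trained (e.g. BPE) vocabulary.
    lengths = [len(tok) for tok in vocab]
    return {
        "token_bytes_min": min(lengths),
        "token_bytes_max": max(lengths),
        "token_bytes_mean": statistics.mean(lengths),
        "token_bytes_std": statistics.pstdev(lengths),
    }

# Toy example; the report above was computed over all 65,536 vocab entries.
print(token_byte_stats([b"a", b" the", b" tokenizer"]))
```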