Upload via push_to_hf.py

Browse files

Files changed (11) hide show

report/latest/base-model-evaluation.md +28 -0
report/latest/base-model-loss.md +13 -0
report/latest/base-model-training.md +45 -0
report/latest/chat-evaluation-mid.md +8 -8
report/latest/chat-evaluation-sft.md +7 -7
report/latest/chat-sft.md +4 -4
report/latest/header.md +13 -13
report/latest/midtraining.md +3 -3
report/latest/report.md +180 -48
report/latest/tokenizer-evaluation.md +27 -0
report/latest/tokenizer-training.md +13 -0

report/latest/base-model-evaluation.md ADDED Viewed

	@@ -0,0 +1,28 @@

+## Base model evaluation
+timestamp: 2025-11-29 14:05:35
+- Model: base_model (step 21400)
+- CORE metric: 0.2065
+- hellaswag_zeroshot: 0.2604
+- jeopardy: 0.1190
+- bigbench_qa_wikidata: 0.5148
+- arc_easy: 0.5230
+- arc_challenge: 0.1206
+- copa: 0.4200
+- commonsense_qa: 0.0448
+- piqa: 0.3765
+- openbook_qa: 0.1227
+- lambada_openai: 0.3722
+- hellaswag: 0.2647
+- winograd: 0.2674
+- winogrande: 0.0687
+- bigbench_dyck_languages: 0.1200
+- agi_eval_lsat_ar: 0.1087
+- bigbench_cs_algorithms: 0.3667
+- bigbench_operators: 0.1619
+- bigbench_repeat_copy_logic: 0.0000
+- squad: 0.2326
+- coqa: 0.2043
+- boolq: -0.3085
+- bigbench_language_identification: 0.1815

report/latest/base-model-loss.md ADDED Viewed

	@@ -0,0 +1,13 @@

+## Base model loss
+timestamp: 2025-11-29 14:02:20
+- train bpb: 0.8148
+- val bpb: 0.8122
+- sample 0: <|bos|>The capital of France is Paris. It is the largest city in France and the second largest city in Europe
+- sample 1: <|bos|>The chemical symbol of gold is Au. It is a soft, malleable, ductile, and malleable metal. It
+- sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Monday. If tomorrow is Monday, then tomorrow will be Tuesday. If tomorrow is
+- sample 3: <|bos|>The opposite of hot is cold. The opposite of cold is hot. The opposite of hot is cold.
+- sample 4: <|bos|>The planets of the solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune,
+- sample 5: <|bos|>My favorite color is blue. I love blue because it is a color that is associated with happiness and
+- sample 6: <|bos|>If 5*x + 3 = 13, then x is 3. If 5*x + 3 = 13, then

report/latest/base-model-training.md ADDED Viewed

	@@ -0,0 +1,45 @@

+## Base model training
+timestamp: 2025-11-29 14:01:35
+- run: d20
+- device_type:
+- depth: 20
+- max_seq_len: 2048
+- num_iterations: -1
+- target_flops: -1.0000
+- target_param_data_ratio: 20
+- device_batch_size: 32
+- total_batch_size: 524,288
+- embedding_lr: 0.2000
+- unembedding_lr: 0.0040
+- weight_decay: 0.0000
+- matrix_lr: 0.0200
+- grad_clip: 1.0000
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- resume_from_step: -1
+- eval_every: 250
+- eval_tokens: 10,485,760
+- core_metric_every: 2000
+- core_metric_max_per_task: 500
+- sample_every: 2000
+- save_every: -1
+- model_tag:
+- Number of parameters: 560,988,160
+- Number of FLOPs per token: 3.491758e+09
+- Calculated number of iterations: 21,400
+- Number of training tokens: 11,219,763,200
+- Tokens : Params ratio: 20.0000
+- DDP world size: 8
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- Minimum validation bpb: 0.8120
+- Final validation bpb: 0.8120
+- CORE metric estimate: 0.2220
+- MFU %: 48.17%
+- Total training flops: 3.917670e+19
+- Total training time: 186.38m
+- Peak memory usage: 75422.52MiB

report/latest/chat-evaluation-mid.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ## Chat evaluation mid
-timestamp: 2025-11-28 20:35:29
 - source: mid
 - task_name: None
@@ -13,11 +13,11 @@ timestamp: 2025-11-28 20:35:29
 - step: None
 - max_problems: None
 - device_type:
-- ARC-Easy: 0.4516
-- ARC-Challenge: 0.3217
-- MMLU: 0.3326
-- GSM8K: 0.0349
-- HumanEval: 0.0000
-- SpellingBee: 0.9805
-- ChatCORE metric: 0.2483

 ## Chat evaluation mid
+timestamp: 2025-11-29 14:21:52
 - source: mid
 - task_name: None
 - step: None
 - max_problems: None
 - device_type:
+- ARC-Easy: 0.4322
+- ARC-Challenge: 0.3336
+- MMLU: 0.3363
+- GSM8K: 0.0326
+- HumanEval: 0.0976
+- SpellingBee: 0.9766
+- ChatCORE metric: 0.2627

report/latest/chat-evaluation-sft.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ## Chat evaluation sft
-timestamp: 2025-11-28 20:46:42
 - source: sft
 - task_name: None
@@ -13,11 +13,11 @@ timestamp: 2025-11-28 20:46:42
 - step: None
 - max_problems: None
 - device_type:
-- ARC-Easy: 0.4790
-- ARC-Challenge: 0.3362
-- MMLU: 0.3319
-- GSM8K: 0.0637
-- HumanEval: 0.0061
 - SpellingBee: 0.9883
-- ChatCORE metric: 0.2646

 ## Chat evaluation sft
+timestamp: 2025-11-29 14:29:57
 - source: sft
 - task_name: None
 - step: None
 - max_problems: None
 - device_type:
+- ARC-Easy: 0.4630
+- ARC-Challenge: 0.3234
+- MMLU: 0.3222
+- GSM8K: 0.0508
+- HumanEval: 0.1220
 - SpellingBee: 0.9883
+- ChatCORE metric: 0.2732

report/latest/chat-sft.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ## Chat SFT
-timestamp: 2025-11-28 20:38:13
-- run: dummy
 - source: mid
 - device_type:
 - dtype: bfloat16
@@ -20,6 +20,6 @@ timestamp: 2025-11-28 20:38:13
 - eval_metrics_max_problems: 1024
 - Training rows: 22,439
 - Number of iterations: 701
-- Training loss: 0.5186
-- Validation loss: 1.0106

 ## Chat SFT
+timestamp: 2025-11-29 14:24:18
+- run: d20
 - source: mid
 - device_type:
 - dtype: bfloat16
 - eval_metrics_max_problems: 1024
 - Training rows: 22,439
 - Number of iterations: 701
+- Training loss: 0.5388
+- Validation loss: 1.0110

report/latest/header.md CHANGED Viewed

@@ -1,36 +1,36 @@
 # nanochat training report
-Generated: 2025-11-28 15:10:26
 ## Environment
 ### Git Information
 - Branch: master
-- Commit: 4a87a0d (dirty)
-- Message: Merge pull request #299 from samjabrahams/rotary_embedding_head_dim_comment_clea
 ### Hardware
 - Platform: Linux
-- CPUs: 80 cores (160 logical)
-- Memory: 1511.8 GB
 - GPUs: 8x NVIDIA H100 80GB HBM3
-- GPU Memory: 633.7 GB total
 - CUDA Version: 12.8
 - Hourly Rate: $24.00/hour
 ### Software
-- Python: 3.10.18
 - PyTorch: 2.8.0+cu128
 ### Bloat
-- Characters: 405,381
-- Lines: 9,847
-- Files: 48
-- Tokens (approx): 101,345
-- Dependencies (uv.lock lines): 2,218
-Run started: 2025-11-28 15:10:29
 ---

 # nanochat training report
+Generated: 2025-11-29 10:42:07
 ## Environment
 ### Git Information
 - Branch: master
+- Commit: 90d6352 (dirty)
+- Message: Add Runpod one-click template; push/pull to/from HF; add hf_transfer
 ### Hardware
 - Platform: Linux
+- CPUs: 128 cores (256 logical)
+- Memory: 1511.5 GB
 - GPUs: 8x NVIDIA H100 80GB HBM3
+- GPU Memory: 633.5 GB total
 - CUDA Version: 12.8
 - Hourly Rate: $24.00/hour
 ### Software
+- Python: 3.10.12
 - PyTorch: 2.8.0+cu128
 ### Bloat
+- Characters: 423,422
+- Lines: 10,322
+- Files: 51
+- Tokens (approx): 105,855
+- Dependencies (uv.lock lines): 2,252
+Run started: 2025-11-29 10:42:10
 ---

report/latest/midtraining.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ## Midtraining
-timestamp: 2025-11-28 20:25:25
-- run: dummy
 - device_type:
 - dtype: bfloat16
 - num_iterations: -1
@@ -18,5 +18,5 @@ timestamp: 2025-11-28 20:25:25
 - dry_run: 0
 - Number of iterations: 809
 - DDP world size: 8
-- Minimum validation bpb: 0.3953

 ## Midtraining
+timestamp: 2025-11-29 14:15:20
+- run: d20
 - device_type:
 - dtype: bfloat16
 - num_iterations: -1
 - dry_run: 0
 - Number of iterations: 809
 - DDP world size: 8
+- Minimum validation bpb: 0.3954

report/latest/report.md CHANGED Viewed

@@ -1,43 +1,174 @@
 # nanochat training report
-Generated: 2025-11-28 15:10:26
 ## Environment
 ### Git Information
 - Branch: master
-- Commit: 4a87a0d (dirty)
-- Message: Merge pull request #299 from samjabrahams/rotary_embedding_head_dim_comment_clea
 ### Hardware
 - Platform: Linux
-- CPUs: 80 cores (160 logical)
-- Memory: 1511.8 GB
 - GPUs: 8x NVIDIA H100 80GB HBM3
-- GPU Memory: 633.7 GB total
 - CUDA Version: 12.8
 - Hourly Rate: $24.00/hour
 ### Software
-- Python: 3.10.18
 - PyTorch: 2.8.0+cu128
 ### Bloat
-- Characters: 405,381
-- Lines: 9,847
-- Files: 48
-- Tokens (approx): 101,345
-- Dependencies (uv.lock lines): 2,218
-Run started: 2025-11-28 15:10:29
 ---
 ## Midtraining
-timestamp: 2025-11-28 20:25:25
-- run: dummy
 - device_type:
 - dtype: bfloat16
 - num_iterations: -1
@@ -54,11 +185,11 @@ timestamp: 2025-11-28 20:25:25
 - dry_run: 0
 - Number of iterations: 809
 - DDP world size: 8
-- Minimum validation bpb: 0.3953
 ## Chat evaluation mid
-timestamp: 2025-11-28 20:35:29
 - source: mid
 - task_name: None
@@ -72,19 +203,19 @@ timestamp: 2025-11-28 20:35:29
 - step: None
 - max_problems: None
 - device_type:
-- ARC-Easy: 0.4516
-- ARC-Challenge: 0.3217
-- MMLU: 0.3326
-- GSM8K: 0.0349
-- HumanEval: 0.0000
-- SpellingBee: 0.9805
-- ChatCORE metric: 0.2483
 ## Chat SFT
-timestamp: 2025-11-28 20:38:13
-- run: dummy
 - source: mid
 - device_type:
 - dtype: bfloat16
@@ -103,12 +234,12 @@ timestamp: 2025-11-28 20:38:13
 - eval_metrics_max_problems: 1024
 - Training rows: 22,439
 - Number of iterations: 701
-- Training loss: 0.5186
-- Validation loss: 1.0106
 ## Chat evaluation sft
-timestamp: 2025-11-28 20:46:42
 - source: sft
 - task_name: None
@@ -122,30 +253,31 @@ timestamp: 2025-11-28 20:46:42
 - step: None
 - max_problems: None
 - device_type:
-- ARC-Easy: 0.4790
-- ARC-Challenge: 0.3362
-- MMLU: 0.3319
-- GSM8K: 0.0637
-- HumanEval: 0.0061
 - SpellingBee: 0.9883
-- ChatCORE metric: 0.2646
 ## Summary
-- Characters: 405,381
-- Lines: 9,847
-- Files: 48
-- Tokens (approx): 101,345
-- Dependencies (uv.lock lines): 2,218
 | Metric          | BASE     | MID      | SFT      | RL       |
 |-----------------|----------|----------|----------|----------|
-| ARC-Challenge   | -        | 0.3217   | 0.3362   | -        |
-| ARC-Easy        | -        | 0.4516   | 0.4790   | -        |
-| GSM8K           | -        | 0.0349   | 0.0637   | -        |
-| HumanEval       | -        | 0.0000   | 0.0061   | -        |
-| MMLU            | -        | 0.3326   | 0.3319   | -        |
-| ChatCORE        | -        | 0.2483   | 0.2646   | -        |
-Total wall clock time: 5h36m

 # nanochat training report
+Generated: 2025-11-29 10:42:07
 ## Environment
 ### Git Information
 - Branch: master
+- Commit: 90d6352 (dirty)
+- Message: Add Runpod one-click template; push/pull to/from HF; add hf_transfer
 ### Hardware
 - Platform: Linux
+- CPUs: 128 cores (256 logical)
+- Memory: 1511.5 GB
 - GPUs: 8x NVIDIA H100 80GB HBM3
+- GPU Memory: 633.5 GB total
 - CUDA Version: 12.8
 - Hourly Rate: $24.00/hour
 ### Software
+- Python: 3.10.12
 - PyTorch: 2.8.0+cu128
 ### Bloat
+- Characters: 423,422
+- Lines: 10,322
+- Files: 51
+- Tokens (approx): 105,855
+- Dependencies (uv.lock lines): 2,252
+Run started: 2025-11-29 10:42:10
 ---
+## Tokenizer training
+timestamp: 2025-11-29 10:43:39
+- max_chars: 2,000,000,000
+- doc_cap: 10,000
+- vocab_size: 65,536
+- train_time: 62.1400
+- num_special_tokens: 9
+- token_bytes_min: 1
+- token_bytes_max: 32
+- token_bytes_mean: 6.9197
+- token_bytes_std: 2.8748
+## Tokenizer evaluation
+timestamp: 2025-11-29 10:43:47
+### Comparison with GPT-2
+| Text Type | Bytes | GPT-2 Tokens | GPT-2 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+|-----------|-------|--------------|--------------|-------------|------------|-----------------|
+| news | 1819 | 404 | 4.50 | 375 | 4.85 | +7.2% |
+| korean | 893 | 745 | 1.20 | 712 | 1.25 | +4.4% |
+| code | 1259 | 576 | 2.19 | 492 | 2.56 | +14.6% |
+| math | 1834 | 936 | 1.96 | 966 | 1.90 | -3.2% |
+| science | 1112 | 260 | 4.28 | 228 | 4.88 | +12.3% |
+| fwe-train | 4208518 | 900364 | 4.67 | 856883 | 4.91 | +4.8% |
+| fwe-val | 4908443 | 1059062 | 4.63 | 1010352 | 4.86 | +4.6% |
+### Comparison with GPT-4
+| Text Type | Bytes | GPT-4 Tokens | GPT-4 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+|-----------|-------|--------------|--------------|-------------|------------|-----------------|
+| news | 1819 | 387 | 4.70 | 375 | 4.85 | +3.1% |
+| korean | 893 | 364 | 2.45 | 712 | 1.25 | -95.6% |
+| code | 1259 | 309 | 4.07 | 492 | 2.56 | -59.2% |
+| math | 1834 | 832 | 2.20 | 966 | 1.90 | -16.1% |
+| science | 1112 | 249 | 4.47 | 228 | 4.88 | +8.4% |
+| fwe-train | 4208518 | 874799 | 4.81 | 856883 | 4.91 | +2.0% |
+| fwe-val | 4908443 | 1029691 | 4.77 | 1010352 | 4.86 | +1.9% |
+## Base model training
+timestamp: 2025-11-29 14:01:35
+- run: d20
+- device_type:
+- depth: 20
+- max_seq_len: 2048
+- num_iterations: -1
+- target_flops: -1.0000
+- target_param_data_ratio: 20
+- device_batch_size: 32
+- total_batch_size: 524,288
+- embedding_lr: 0.2000
+- unembedding_lr: 0.0040
+- weight_decay: 0.0000
+- matrix_lr: 0.0200
+- grad_clip: 1.0000
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- resume_from_step: -1
+- eval_every: 250
+- eval_tokens: 10,485,760
+- core_metric_every: 2000
+- core_metric_max_per_task: 500
+- sample_every: 2000
+- save_every: -1
+- model_tag:
+- Number of parameters: 560,988,160
+- Number of FLOPs per token: 3.491758e+09
+- Calculated number of iterations: 21,400
+- Number of training tokens: 11,219,763,200
+- Tokens : Params ratio: 20.0000
+- DDP world size: 8
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- Minimum validation bpb: 0.8120
+- Final validation bpb: 0.8120
+- CORE metric estimate: 0.2220
+- MFU %: 48.17%
+- Total training flops: 3.917670e+19
+- Total training time: 186.38m
+- Peak memory usage: 75422.52MiB
+## Base model loss
+timestamp: 2025-11-29 14:02:20
+- train bpb: 0.8148
+- val bpb: 0.8122
+- sample 0: <|bos|>The capital of France is Paris. It is the largest city in France and the second largest city in Europe
+- sample 1: <|bos|>The chemical symbol of gold is Au. It is a soft, malleable, ductile, and malleable metal. It
+- sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Monday. If tomorrow is Monday, then tomorrow will be Tuesday. If tomorrow is
+- sample 3: <|bos|>The opposite of hot is cold. The opposite of cold is hot. The opposite of hot is cold.
+- sample 4: <|bos|>The planets of the solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune,
+- sample 5: <|bos|>My favorite color is blue. I love blue because it is a color that is associated with happiness and
+- sample 6: <|bos|>If 5*x + 3 = 13, then x is 3. If 5*x + 3 = 13, then
+## Base model evaluation
+timestamp: 2025-11-29 14:05:35
+- Model: base_model (step 21400)
+- CORE metric: 0.2065
+- hellaswag_zeroshot: 0.2604
+- jeopardy: 0.1190
+- bigbench_qa_wikidata: 0.5148
+- arc_easy: 0.5230
+- arc_challenge: 0.1206
+- copa: 0.4200
+- commonsense_qa: 0.0448
+- piqa: 0.3765
+- openbook_qa: 0.1227
+- lambada_openai: 0.3722
+- hellaswag: 0.2647
+- winograd: 0.2674
+- winogrande: 0.0687
+- bigbench_dyck_languages: 0.1200
+- agi_eval_lsat_ar: 0.1087
+- bigbench_cs_algorithms: 0.3667
+- bigbench_operators: 0.1619
+- bigbench_repeat_copy_logic: 0.0000
+- squad: 0.2326
+- coqa: 0.2043
+- boolq: -0.3085
+- bigbench_language_identification: 0.1815
 ## Midtraining
+timestamp: 2025-11-29 14:15:20
+- run: d20
 - device_type:
 - dtype: bfloat16
 - num_iterations: -1
 - dry_run: 0
 - Number of iterations: 809
 - DDP world size: 8
+- Minimum validation bpb: 0.3954
 ## Chat evaluation mid
+timestamp: 2025-11-29 14:21:52
 - source: mid
 - task_name: None
 - step: None
 - max_problems: None
 - device_type:
+- ARC-Easy: 0.4322
+- ARC-Challenge: 0.3336
+- MMLU: 0.3363
+- GSM8K: 0.0326
+- HumanEval: 0.0976
+- SpellingBee: 0.9766
+- ChatCORE metric: 0.2627
 ## Chat SFT
+timestamp: 2025-11-29 14:24:18
+- run: d20
 - source: mid
 - device_type:
 - dtype: bfloat16
 - eval_metrics_max_problems: 1024
 - Training rows: 22,439
 - Number of iterations: 701
+- Training loss: 0.5388
+- Validation loss: 1.0110
 ## Chat evaluation sft
+timestamp: 2025-11-29 14:29:57
 - source: sft
 - task_name: None
 - step: None
 - max_problems: None
 - device_type:
+- ARC-Easy: 0.4630
+- ARC-Challenge: 0.3234
+- MMLU: 0.3222
+- GSM8K: 0.0508
+- HumanEval: 0.1220
 - SpellingBee: 0.9883
+- ChatCORE metric: 0.2732
 ## Summary
+- Characters: 423,422
+- Lines: 10,322
+- Files: 51
+- Tokens (approx): 105,855
+- Dependencies (uv.lock lines): 2,252
 | Metric          | BASE     | MID      | SFT      | RL       |
 |-----------------|----------|----------|----------|----------|
+| CORE            | 0.2065   | -        | -        | -        |
+| ARC-Challenge   | -        | 0.3336   | 0.3234   | -        |
+| ARC-Easy        | -        | 0.4322   | 0.4630   | -        |
+| GSM8K           | -        | 0.0326   | 0.0508   | -        |
+| HumanEval       | -        | 0.0976   | 0.1220   | -        |
+| MMLU            | -        | 0.3363   | 0.3222   | -        |
+| ChatCORE        | -        | 0.2627   | 0.2732   | -        |
+Total wall clock time: 3h47m

report/latest/tokenizer-evaluation.md ADDED Viewed

	@@ -0,0 +1,27 @@

+## Tokenizer evaluation
+timestamp: 2025-11-29 10:43:47
+### Comparison with GPT-2
+| Text Type | Bytes | GPT-2 Tokens | GPT-2 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+|-----------|-------|--------------|--------------|-------------|------------|-----------------|
+| news | 1819 | 404 | 4.50 | 375 | 4.85 | +7.2% |
+| korean | 893 | 745 | 1.20 | 712 | 1.25 | +4.4% |
+| code | 1259 | 576 | 2.19 | 492 | 2.56 | +14.6% |
+| math | 1834 | 936 | 1.96 | 966 | 1.90 | -3.2% |
+| science | 1112 | 260 | 4.28 | 228 | 4.88 | +12.3% |
+| fwe-train | 4208518 | 900364 | 4.67 | 856883 | 4.91 | +4.8% |
+| fwe-val | 4908443 | 1059062 | 4.63 | 1010352 | 4.86 | +4.6% |
+### Comparison with GPT-4
+| Text Type | Bytes | GPT-4 Tokens | GPT-4 Ratio | Ours Tokens | Ours Ratio | Relative Diff % |
+|-----------|-------|--------------|--------------|-------------|------------|-----------------|
+| news | 1819 | 387 | 4.70 | 375 | 4.85 | +3.1% |
+| korean | 893 | 364 | 2.45 | 712 | 1.25 | -95.6% |
+| code | 1259 | 309 | 4.07 | 492 | 2.56 | -59.2% |
+| math | 1834 | 832 | 2.20 | 966 | 1.90 | -16.1% |
+| science | 1112 | 249 | 4.47 | 228 | 4.88 | +8.4% |
+| fwe-train | 4208518 | 874799 | 4.81 | 856883 | 4.91 | +2.0% |
+| fwe-val | 4908443 | 1029691 | 4.77 | 1010352 | 4.86 | +1.9% |

report/latest/tokenizer-training.md ADDED Viewed

	@@ -0,0 +1,13 @@

+## Tokenizer training
+timestamp: 2025-11-29 10:43:39
+- max_chars: 2,000,000,000
+- doc_cap: 10,000
+- vocab_size: 65,536
+- train_time: 62.1400
+- num_special_tokens: 9
+- token_bytes_min: 1
+- token_bytes_max: 32
+- token_bytes_mean: 6.9197
+- token_bytes_std: 2.8748