duoduoyeah commited on
Commit
ba0d57a
·
verified ·
1 Parent(s): e34808b

Add files using upload-large-folder tool

Browse files
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/meta_006000.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 6000,
3
+ "val_bpb": 0,
4
+ "model_config": {
5
+ "sequence_len": 512,
6
+ "vocab_size": 4096,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_kv_head": 4,
10
+ "n_embd": 512
11
+ },
12
+ "user_config": {
13
+ "run": "gpt_d8_next1_r40_v4096_implicit_simple",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 512,
17
+ "target_shift": 1,
18
+ "mask_token": 0,
19
+ "num_iterations": -1,
20
+ "target_flops": -1.0,
21
+ "target_param_data_ratio": 40,
22
+ "device_batch_size": 64,
23
+ "total_batch_size": 131072,
24
+ "embedding_lr": 0.2,
25
+ "unembedding_lr": 0.004,
26
+ "weight_decay": 0.0,
27
+ "matrix_lr": 0.02,
28
+ "grad_clip": 1.0,
29
+ "warmup_ratio": 0.0,
30
+ "warmdown_ratio": 0.2,
31
+ "final_lr_frac": 0.0,
32
+ "resume_from_step": -1,
33
+ "eval_every": -1,
34
+ "eval_tokens": 10485760,
35
+ "core_metric_every": -1,
36
+ "core_metric_max_per_task": 500,
37
+ "sample_every": -1,
38
+ "save_every": 6000,
39
+ "model_tag": ""
40
+ },
41
+ "device_batch_size": 64,
42
+ "max_seq_len": 512,
43
+ "dataloader_state_dict": {
44
+ "pq_idx": 5,
45
+ "rg_idx": 68
46
+ },
47
+ "loop_state": {
48
+ "min_val_bpb": Infinity,
49
+ "smooth_train_loss": 1.5400441557553806,
50
+ "total_training_time": 1316.3449952602386
51
+ }
52
+ }
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/meta_008960.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 8960,
3
+ "val_bpb": 0,
4
+ "model_config": {
5
+ "sequence_len": 512,
6
+ "vocab_size": 4096,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_kv_head": 4,
10
+ "n_embd": 512
11
+ },
12
+ "user_config": {
13
+ "run": "gpt_d8_next1_r40_v4096_implicit_simple",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 512,
17
+ "target_shift": 1,
18
+ "mask_token": 0,
19
+ "num_iterations": -1,
20
+ "target_flops": -1.0,
21
+ "target_param_data_ratio": 40,
22
+ "device_batch_size": 64,
23
+ "total_batch_size": 131072,
24
+ "embedding_lr": 0.2,
25
+ "unembedding_lr": 0.004,
26
+ "weight_decay": 0.0,
27
+ "matrix_lr": 0.02,
28
+ "grad_clip": 1.0,
29
+ "warmup_ratio": 0.0,
30
+ "warmdown_ratio": 0.2,
31
+ "final_lr_frac": 0.0,
32
+ "resume_from_step": -1,
33
+ "eval_every": -1,
34
+ "eval_tokens": 10485760,
35
+ "core_metric_every": -1,
36
+ "core_metric_max_per_task": 500,
37
+ "sample_every": -1,
38
+ "save_every": 6000,
39
+ "model_tag": ""
40
+ },
41
+ "device_batch_size": 64,
42
+ "max_seq_len": 512,
43
+ "dataloader_state_dict": {
44
+ "pq_idx": 3,
45
+ "rg_idx": 184
46
+ },
47
+ "loop_state": {
48
+ "min_val_bpb": Infinity,
49
+ "smooth_train_loss": 1.4397647744770112,
50
+ "total_training_time": 1966.5658521652222
51
+ }
52
+ }
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/model_006000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acbd40ad8b1fa9a93c5f68550718ed913e21b6e8652bd1d89c4949a2f2d263b4
3
+ size 113266917
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/model_008960.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34eece22aae8df67eda517924313258bcbd3a92649c2979b9e396522c7bab0cd
3
+ size 113266917
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/optim_006000_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5afa5239797045d3d8aaf4469ec23e9e76eb0415360ae404fd44ab2361460e48
3
+ size 125847061
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/optim_008960_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:276cd6db689a7bef2a995cec480cd1257e89cdfd9200293334ba712a9e189c34
3
+ size 125847061
gpt_d8_next1_r40_v4096_implicit_simple/report/base-model-training.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base model training
2
+ timestamp: 2026-01-11 05:48:46
3
+
4
+ - run: gpt_d8_next1_r40_v4096_implicit_simple
5
+ - device_type:
6
+ - depth: 8
7
+ - max_seq_len: 512
8
+ - target_shift: 1
9
+ - mask_token: 0
10
+ - num_iterations: -1
11
+ - target_flops: -1.0000
12
+ - target_param_data_ratio: 40
13
+ - device_batch_size: 64
14
+ - total_batch_size: 131,072
15
+ - embedding_lr: 0.2000
16
+ - unembedding_lr: 0.0040
17
+ - weight_decay: 0.0000
18
+ - matrix_lr: 0.0200
19
+ - grad_clip: 1.0000
20
+ - warmup_ratio: 0.0000
21
+ - warmdown_ratio: 0.2000
22
+ - final_lr_frac: 0.0000
23
+ - resume_from_step: -1
24
+ - eval_every: -1
25
+ - eval_tokens: 10,485,760
26
+ - core_metric_every: -1
27
+ - core_metric_max_per_task: 500
28
+ - sample_every: -1
29
+ - save_every: 6000
30
+ - model_tag:
31
+ - Number of parameters: 29,360,128
32
+ - Number of FLOPs per token: 1.887437e+08
33
+ - Calculated number of iterations: 8960
34
+ - Number of training tokens: 1,174,405,120
35
+ - Tokens : Params ratio: 40.0000
36
+ - DDP world size: 1
37
+ - warmup_ratio: 0.0000
38
+ - warmdown_ratio: 0.2000
39
+ - final_lr_frac: 0.0000
40
+ - Minimum validation bpb: inf
41
+ - Final validation bpb: 0
42
+ - CORE metric estimate: None
43
+ - MFU %: 11.46%
44
+ - Total training flops: 2.216615e+17
45
+ - Total training time: 32.78m
46
+ - Peak memory usage: 5353.53MiB
47
+
gpt_d8_next1_r40_v4096_implicit_simple/report/header.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # nanochat training report
2
+
3
+ Generated: 2026-01-11 05:15:02
4
+
5
+ ## Environment
6
+
7
+ ### Git Information
8
+ - Branch: tokenizer
9
+ - Commit: 06bfb76 (clean)
10
+ - Message: base train debug
11
+
12
+ ### Hardware
13
+ - Platform: Linux
14
+ - CPUs: 6 cores (12 logical)
15
+ - Memory: 167.1 GB
16
+ - GPUs: 1x NVIDIA A100-SXM4-80GB
17
+ - GPU Memory: 79.3 GB total
18
+ - CUDA Version: 12.8
19
+ - Hourly Rate: $1.79/hour
20
+
21
+ ### Software
22
+ - Python: 3.12.12
23
+ - PyTorch: 2.9.1+cu128
24
+
25
+
26
+ ### Bloat
27
+ - Characters: 590,099
28
+ - Lines: 14,582
29
+ - Files: 87
30
+ - Tokens (approx): 147,524
31
+ - Dependencies (uv.lock lines): 2,749
32
+
33
+ Run started: 2026-01-11 05:15:02
34
+
35
+ ---
36
+
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00000.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc68646376a7cb8fa0a5a180e650c9c4de1c01e64de377c0286eec0cad8448d3
3
+ size 81779008
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddff7f98eb9a0df4f578c43c5fa034c4559ab6787b2cd3cea84fca88704662be
3
+ size 81663463
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00002.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0e7454be37151c327d3fd2dcc1fdd3138c4268d28fadbca69003a922a04667e
3
+ size 81869169
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00003.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63631e880c1b7910c5d7c94b835b558c1004cd26dcf5ccf9606d84b224590cfe
3
+ size 81870659
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00004.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c43d84ea2a95d7ceefa291e5bd48caf579d94ce5fb448096046b3b9223da7fbe
3
+ size 81978716
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00005.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f7206f1ef464c42db1358c47a3f07f8e288c122ea52018b0338f1923be36889
3
+ size 81684523
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00006.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f19e9e98a2167d244ac0d2d60c0b48c9d51449be27e3e58ad0ee587f09bc63e
3
+ size 81848408
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00007.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71ca94f3715924ed744c180862a1a152e51440ad32aa359b34a8856686870b20
3
+ size 81846031
gpt_d8_next1_r40_v4096_implicit_simple/tokenizer/token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f468b0626e587a9948f73588629d679198a6f38fe834875017b222ec2443d7
3
+ size 17961
gpt_d8_next1_r40_v4096_implicit_simple/tokenizer/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:893aa318d37e34b8852086f8fcdffcd9d45242706dc6f06dc5d315499aa5a633
3
+ size 45915