duoduoyeah committed on
Commit
ffcf3c7
·
verified ·
1 Parent(s): d13f6e4

Add files using upload-large-folder tool

Browse files
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/meta_006000.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 6000,
3
+ "val_bpb": 0,
4
+ "model_config": {
5
+ "sequence_len": 512,
6
+ "vocab_size": 8192,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_kv_head": 4,
10
+ "n_embd": 512
11
+ },
12
+ "user_config": {
13
+ "run": "gpt_d8_next2_r40_v8192_implicit_simple",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 512,
17
+ "target_shift": 2,
18
+ "mask_token": 0,
19
+ "num_iterations": -1,
20
+ "target_flops": -1.0,
21
+ "target_param_data_ratio": 40,
22
+ "device_batch_size": 128,
23
+ "total_batch_size": 131072,
24
+ "embedding_lr": 0.2,
25
+ "unembedding_lr": 0.004,
26
+ "weight_decay": 0.0,
27
+ "matrix_lr": 0.02,
28
+ "grad_clip": 1.0,
29
+ "warmup_ratio": 0.0,
30
+ "warmdown_ratio": 0.2,
31
+ "final_lr_frac": 0.0,
32
+ "resume_from_step": -1,
33
+ "eval_every": -1,
34
+ "eval_tokens": 10485760,
35
+ "core_metric_every": -1,
36
+ "core_metric_max_per_task": 500,
37
+ "sample_every": -1,
38
+ "save_every": 6000,
39
+ "model_tag": ""
40
+ },
41
+ "device_batch_size": 128,
42
+ "max_seq_len": 512,
43
+ "dataloader_state_dict": {
44
+ "pq_idx": 5,
45
+ "rg_idx": 188
46
+ },
47
+ "loop_state": {
48
+ "min_val_bpb": Infinity,
49
+ "smooth_train_loss": 2.593795311860216,
50
+ "total_training_time": 1335.3098776340485
51
+ }
52
+ }
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/meta_010240.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 10240,
3
+ "val_bpb": 0,
4
+ "model_config": {
5
+ "sequence_len": 512,
6
+ "vocab_size": 8192,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_kv_head": 4,
10
+ "n_embd": 512
11
+ },
12
+ "user_config": {
13
+ "run": "gpt_d8_next2_r40_v8192_implicit_simple",
14
+ "device_type": "",
15
+ "depth": 8,
16
+ "max_seq_len": 512,
17
+ "target_shift": 2,
18
+ "mask_token": 0,
19
+ "num_iterations": -1,
20
+ "target_flops": -1.0,
21
+ "target_param_data_ratio": 40,
22
+ "device_batch_size": 128,
23
+ "total_batch_size": 131072,
24
+ "embedding_lr": 0.2,
25
+ "unembedding_lr": 0.004,
26
+ "weight_decay": 0.0,
27
+ "matrix_lr": 0.02,
28
+ "grad_clip": 1.0,
29
+ "warmup_ratio": 0.0,
30
+ "warmdown_ratio": 0.2,
31
+ "final_lr_frac": 0.0,
32
+ "resume_from_step": -1,
33
+ "eval_every": -1,
34
+ "eval_tokens": 10485760,
35
+ "core_metric_every": -1,
36
+ "core_metric_max_per_task": 500,
37
+ "sample_every": -1,
38
+ "save_every": 6000,
39
+ "model_tag": ""
40
+ },
41
+ "device_batch_size": 128,
42
+ "max_seq_len": 512,
43
+ "dataloader_state_dict": {
44
+ "pq_idx": 7,
45
+ "rg_idx": 161
46
+ },
47
+ "loop_state": {
48
+ "min_val_bpb": Infinity,
49
+ "smooth_train_loss": 2.4645395893493127,
50
+ "total_training_time": 2282.8761949539185
51
+ }
52
+ }
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/model_006000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c59e2181cfee16bdca90829a12562fcec528ec2c3ff7b8e737b797f6f43df58
3
+ size 125849829
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/model_010240.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42846a6d475655f9c55bf052dc97d8a0fca5407c3c02d026deaafbd30f3f95cd
3
+ size 125849829
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/optim_006000_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e8199147bf8f456be34e0524be53fe9c57db993a28507c161c8d9966d74c673
3
+ size 151012885
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/optim_010240_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb16b726fbef28faa0d6f4f62e88020dd2e892aeb01be36058c17f6abba530fc
3
+ size 151012885
gpt_d8_next2_r40_v8192_implicit_simple/report/base-model-training.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base model training
2
+ timestamp: 2026-01-11 07:03:28
3
+
4
+ - run: gpt_d8_next2_r40_v8192_implicit_simple
5
+ - device_type:
6
+ - depth: 8
7
+ - max_seq_len: 512
8
+ - target_shift: 2
9
+ - mask_token: 0
10
+ - num_iterations: -1
11
+ - target_flops: -1.0000
12
+ - target_param_data_ratio: 40
13
+ - device_batch_size: 128
14
+ - total_batch_size: 131,072
15
+ - embedding_lr: 0.2000
16
+ - unembedding_lr: 0.0040
17
+ - weight_decay: 0.0000
18
+ - matrix_lr: 0.0200
19
+ - grad_clip: 1.0000
20
+ - warmup_ratio: 0.0000
21
+ - warmdown_ratio: 0.2000
22
+ - final_lr_frac: 0.0000
23
+ - resume_from_step: -1
24
+ - eval_every: -1
25
+ - eval_tokens: 10,485,760
26
+ - core_metric_every: -1
27
+ - core_metric_max_per_task: 500
28
+ - sample_every: -1
29
+ - save_every: 6000
30
+ - model_tag:
31
+ - Number of parameters: 33,554,432
32
+ - Number of FLOPs per token: 2.013266e+08
33
+ - Calculated number of iterations: 10,240
34
+ - Number of training tokens: 1,342,177,280
35
+ - Tokens : Params ratio: 40.0000
36
+ - DDP world size: 1
37
+ - warmup_ratio: 0.0000
38
+ - warmdown_ratio: 0.2000
39
+ - final_lr_frac: 0.0000
40
+ - Minimum validation bpb: inf
41
+ - Final validation bpb: 0
42
+ - CORE metric estimate: None
43
+ - MFU %: 12.02%
44
+ - Total training flops: 2.702160e+17
45
+ - Total training time: 38.05m
46
+ - Peak memory usage: 10864.29MiB
47
+
gpt_d8_next2_r40_v8192_implicit_simple/report/header.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # nanochat training report
2
+
3
+ Generated: 2026-01-11 06:24:28
4
+
5
+ ## Environment
6
+
7
+ ### Git Information
8
+ - Branch: tokenizer
9
+ - Commit: 06bfb76 (clean)
10
+ - Message: base train debug
11
+
12
+ ### Hardware
13
+ - Platform: Linux
14
+ - CPUs: 6 cores (12 logical)
15
+ - Memory: 167.1 GB
16
+ - GPUs: 1x NVIDIA A100-SXM4-80GB
17
+ - GPU Memory: 79.3 GB total
18
+ - CUDA Version: 12.8
19
+ - Hourly Rate: $1.79/hour
20
+
21
+ ### Software
22
+ - Python: 3.12.12
23
+ - PyTorch: 2.9.1+cu128
24
+
25
+
26
+ ### Bloat
27
+ - Characters: 590,099
28
+ - Lines: 14,582
29
+ - Files: 87
30
+ - Tokens (approx): 147,524
31
+ - Dependencies (uv.lock lines): 2,749
32
+
33
+ Run started: 2026-01-11 06:24:28
34
+
35
+ ---
36
+
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00000.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc68646376a7cb8fa0a5a180e650c9c4de1c01e64de377c0286eec0cad8448d3
3
+ size 81779008
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddff7f98eb9a0df4f578c43c5fa034c4559ab6787b2cd3cea84fca88704662be
3
+ size 81663463
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00002.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0e7454be37151c327d3fd2dcc1fdd3138c4268d28fadbca69003a922a04667e
3
+ size 81869169
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00003.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63631e880c1b7910c5d7c94b835b558c1004cd26dcf5ccf9606d84b224590cfe
3
+ size 81870659
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00004.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c43d84ea2a95d7ceefa291e5bd48caf579d94ce5fb448096046b3b9223da7fbe
3
+ size 81978716
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00005.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f7206f1ef464c42db1358c47a3f07f8e288c122ea52018b0338f1923be36889
3
+ size 81684523
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00006.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f19e9e98a2167d244ac0d2d60c0b48c9d51449be27e3e58ad0ee587f09bc63e
3
+ size 81848408
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00007.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71ca94f3715924ed744c180862a1a152e51440ad32aa359b34a8856686870b20
3
+ size 81846031
gpt_d8_next2_r40_v8192_implicit_simple/tokenizer/token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3df121641225414b3f0ce0fcc4c77ed8ec7f34067b1a0bba50f2b798d449a38e
3
+ size 34345
gpt_d8_next2_r40_v8192_implicit_simple/tokenizer/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5423e385104a5dfeccaf17344cbb9bb2d046a3ebc73b22249c40221fd09a53ca
3
+ size 97311