duoduoyeah committed on
Commit
3f740ef
·
verified ·
1 Parent(s): dc10b32

add gpt_d8_next3_r40

Browse files
gpt_d8_next3_r40/base_checkpoints/d8/meta_008960.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 8960,
3
+ "model_config": {
4
+ "sequence_len": 512,
5
+ "vocab_size": 4096,
6
+ "n_layer": 8,
7
+ "n_head": 4,
8
+ "n_kv_head": 4,
9
+ "n_embd": 512,
10
+ "target_shift": 3
11
+ },
12
+ "user_config": {
13
+ "run": "gpt_d8_next3_r40",
14
+ "device_type": "",
15
+ "model_architecture": "Karpathy_gpt2",
16
+ "model_type": "next_token_ar",
17
+ "target_shift": 3,
18
+ "depth": 8,
19
+ "max_seq_len": 512,
20
+ "block_size": 8,
21
+ "prefix_pure_tokens": 1,
22
+ "is_causal": true,
23
+ "noise_total_steps": 16,
24
+ "pdlm_stage": "stage2",
25
+ "bd3lm_compute_matched": true,
26
+ "mtp_loss_beta": 0.8,
27
+ "n_future_tokens": 4,
28
+ "mtp_loss_weight": 1.0,
29
+ "soft_p_within": 1.0,
30
+ "noise_count": 64,
31
+ "loss_weight_mode": "manual",
32
+ "loss_weight_warmup_steps": 10,
33
+ "stage1_target_mode": "pure",
34
+ "debug": false,
35
+ "num_iterations": -1,
36
+ "target_flops": -1.0,
37
+ "target_param_data_ratio": 40,
38
+ "device_batch_size": 64,
39
+ "total_batch_size": 131072,
40
+ "embedding_lr": 0.2,
41
+ "unembedding_lr": 0.004,
42
+ "weight_decay": 0.0,
43
+ "matrix_lr": 0.02,
44
+ "grad_clip": 1.0,
45
+ "warmup_ratio": 0.0,
46
+ "warmdown_ratio": 0.2,
47
+ "final_lr_frac": 0.0,
48
+ "resume_from_step": -1,
49
+ "eval_every": 2500,
50
+ "eval_num_batches": 20,
51
+ "eval_num_batches_final": 100,
52
+ "save_every": -1,
53
+ "gradient_track_every": 0,
54
+ "gradient_block_size": 4,
55
+ "model_tag": ""
56
+ },
57
+ "device_batch_size": 64,
58
+ "max_seq_len": 512,
59
+ "dataloader_state_dict": {
60
+ "pq_idx": 9,
61
+ "rg_idx": 183,
62
+ "epoch": 1
63
+ },
64
+ "loop_state": {
65
+ "smooth_train_loss": 3.365408381108792,
66
+ "total_training_time": 1785.0645592212677,
67
+ "total_effective_tokens": 1174405120
68
+ }
69
+ }
gpt_d8_next3_r40/base_checkpoints/d8/model_008960.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847c537ed8eaab4f14c4267184de459eaadfd6295c59d38568bc6d3559bbe1e0
3
+ size 113266917
gpt_d8_next3_r40/base_checkpoints/d8/optim_008960_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3c4a52bdb68943c4883082fe182d5d9b0f63e0ddea5f7ceded3e50f56fd25e8
3
+ size 125847061
gpt_d8_next3_r40/report/base-model-training.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base model training
2
+ timestamp: 2026-03-02 00:27:50
3
+
4
+ - run: gpt_d8_next3_r40
5
+ - device_type:
6
+ - model_architecture: Karpathy_gpt2
7
+ - model_type: next_token_ar
8
+ - target_shift: 3
9
+ - depth: 8
10
+ - max_seq_len: 512
11
+ - block_size: 8
12
+ - prefix_pure_tokens: 1
13
+ - is_causal: True
14
+ - noise_total_steps: 16
15
+ - pdlm_stage: stage2
16
+ - bd3lm_compute_matched: True
17
+ - mtp_loss_beta: 0.8000
18
+ - n_future_tokens: 4
19
+ - mtp_loss_weight: 1.0000
20
+ - soft_p_within: 1.0000
21
+ - noise_count: 64
22
+ - loss_weight_mode: manual
23
+ - loss_weight_warmup_steps: 10
24
+ - stage1_target_mode: pure
25
+ - debug: False
26
+ - num_iterations: -1
27
+ - target_flops: -1.0000
28
+ - target_param_data_ratio: 40
29
+ - device_batch_size: 64
30
+ - total_batch_size: 131,072
31
+ - embedding_lr: 0.2000
32
+ - unembedding_lr: 0.0040
33
+ - weight_decay: 0.0000
34
+ - matrix_lr: 0.0200
35
+ - grad_clip: 1.0000
36
+ - warmup_ratio: 0.0000
37
+ - warmdown_ratio: 0.2000
38
+ - final_lr_frac: 0.0000
39
+ - resume_from_step: -1
40
+ - eval_every: 2500
41
+ - eval_num_batches: 20
42
+ - eval_num_batches_final: 100
43
+ - save_every: -1
44
+ - gradient_track_every: 0
45
+ - gradient_block_size: 4
46
+ - model_tag:
47
+ - Number of parameters: 29,360,128
48
+ - Number of FLOPs per token: 1.887437e+08
49
+ - Calculated number of iterations: 8960
50
+ - Number of training tokens: 1,174,405,120
51
+ - Tokens : Params ratio: 40.0000
52
+ - DDP world size: 1
53
+ - warmup_ratio: 0.0000
54
+ - warmdown_ratio: 0.2000
55
+ - final_lr_frac: 0.0000
56
+ - MFU %: 12.61%
57
+ - Total training flops: 2.216615e+17
58
+ - Total training time: 29.75m
59
+ - Peak memory usage: 5354.51MiB
60
+ - Total effective tokens: 1,174,405,120
61
+ - Actual effective ratio: 1.0000
62
+
gpt_d8_next3_r40/tokenizer/token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f468b0626e587a9948f73588629d679198a6f38fe834875017b222ec2443d7
3
+ size 17961
gpt_d8_next3_r40/tokenizer/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:893aa318d37e34b8852086f8fcdffcd9d45242706dc6f06dc5d315499aa5a633
3
+ size 45915