duoduoyeah committed on
Commit
67a474b
·
verified ·
1 Parent(s): d67c720

Add files using upload-large-folder tool

Browse files
bd3lm_d8_b4_ts2_r40/base_checkpoints/d8/meta_008960.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 8960,
3
+ "model_config": {
4
+ "sequence_len": 512,
5
+ "pure_vocab_size": 4096,
6
+ "all_vocab_size": 4097,
7
+ "n_layer": 8,
8
+ "n_head": 4,
9
+ "n_kv_head": 4,
10
+ "n_embd": 512,
11
+ "prefix_pure_tokens": 1,
12
+ "mask_token_id": 4096,
13
+ "is_causal": false,
14
+ "bucket_size": 4,
15
+ "model_name": "bd3lm_d8_b4_ts2_r40",
16
+ "target_shift": 2
17
+ },
18
+ "user_config": {
19
+ "run": "bd3lm_d8_b4_ts2_r40",
20
+ "device_type": "",
21
+ "model_architecture": "Karpathy_gpt2",
22
+ "model_type": "bd3lm",
23
+ "target_shift": 2,
24
+ "depth": 8,
25
+ "max_seq_len": 512,
26
+ "block_size": 4,
27
+ "prefix_pure_tokens": 1,
28
+ "is_causal": false,
29
+ "noise_total_steps": 16,
30
+ "bd3lm_compute_matched": true,
31
+ "debug": false,
32
+ "num_iterations": -1,
33
+ "target_flops": -1.0,
34
+ "target_param_data_ratio": 40,
35
+ "device_batch_size": 128,
36
+ "total_batch_size": 131072,
37
+ "embedding_lr": 0.2,
38
+ "unembedding_lr": 0.004,
39
+ "weight_decay": 0.0,
40
+ "matrix_lr": 0.02,
41
+ "grad_clip": 1.0,
42
+ "warmup_ratio": 0.0,
43
+ "warmdown_ratio": 0.2,
44
+ "final_lr_frac": 0.0,
45
+ "resume_from_step": -1,
46
+ "eval_every": 2500,
47
+ "eval_num_batches": 20,
48
+ "eval_num_batches_final": 100,
49
+ "save_every": -1,
50
+ "model_tag": ""
51
+ },
52
+ "device_batch_size": 128,
53
+ "max_seq_len": 512,
54
+ "dataloader_state_dict": {
55
+ "pq_idx": 9,
56
+ "rg_idx": 183,
57
+ "epoch": 1
58
+ },
59
+ "loop_state": {
60
+ "smooth_train_loss": 2.180243093118573,
61
+ "total_training_time": 5075.819852352142,
62
+ "total_effective_tokens": 292458624
63
+ }
64
+ }
bd3lm_d8_b4_ts2_r40/base_checkpoints/d8/model_008960.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:468ad76ba1ecf696b2c7f915d2d72e1b0265d3d034c2ad31bbabb1004ae25ca2
3
+ size 113267941
bd3lm_d8_b4_ts2_r40/base_checkpoints/d8/optim_008960_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f69213febca1125560ad292c3babe2a5dcd04b7feebcd96e109ebf0d2957311
3
+ size 125849109
bd3lm_d8_b4_ts2_r40/report/base-model-training.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base model training
2
+ timestamp: 2026-01-21 07:00:20
3
+
4
+ - run: bd3lm_d8_b4_ts2_r40
5
+ - device_type:
6
+ - model_architecture: Karpathy_gpt2
7
+ - model_type: bd3lm
8
+ - target_shift: 2
9
+ - depth: 8
10
+ - max_seq_len: 512
11
+ - block_size: 4
12
+ - prefix_pure_tokens: 1
13
+ - is_causal: False
14
+ - noise_total_steps: 16
15
+ - bd3lm_compute_matched: True
16
+ - debug: False
17
+ - num_iterations: -1
18
+ - target_flops: -1.0000
19
+ - target_param_data_ratio: 40
20
+ - device_batch_size: 128
21
+ - total_batch_size: 131,072
22
+ - embedding_lr: 0.2000
23
+ - unembedding_lr: 0.0040
24
+ - weight_decay: 0.0000
25
+ - matrix_lr: 0.0200
26
+ - grad_clip: 1.0000
27
+ - warmup_ratio: 0.0000
28
+ - warmdown_ratio: 0.2000
29
+ - final_lr_frac: 0.0000
30
+ - resume_from_step: -1
31
+ - eval_every: 2500
32
+ - eval_num_batches: 20
33
+ - eval_num_batches_final: 100
34
+ - save_every: -1
35
+ - model_tag:
36
+ - Number of parameters: 29,360,640
37
+ - Number of FLOPs per token: 1.887437e+08
38
+ - Calculated number of iterations: 8960
39
+ - Number of training tokens: 1,174,405,120
40
+ - Tokens : Params ratio: 39.9993
41
+ - DDP world size: 1
42
+ - warmup_ratio: 0.0000
43
+ - warmdown_ratio: 0.2000
44
+ - final_lr_frac: 0.0000
45
+ - MFU %: 4.41%
46
+ - Total training flops: 2.216615e+17
47
+ - Total training time: 84.60m
48
+ - Peak memory usage: 21211.15MiB
49
+ - Total effective tokens: 292,458,624
50
+ - Actual effective ratio: 0.2490
51
+
bd3lm_d8_b4_ts2_r40/report/header.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # nanochat training report
2
+
3
+ Generated: 2026-01-21 05:33:05
4
+
5
+ ## Environment
6
+
7
+ ### Git Information
8
+ - Branch: bd3lm
9
+ - Commit: 2b786fb (clean)
10
+ - Message: update train bd3lm
11
+
12
+ ### Hardware
13
+ - Platform: Linux
14
+ - CPUs: 6 cores (12 logical)
15
+ - Memory: 167.1 GB
16
+ - GPUs: 1x NVIDIA A100-SXM4-80GB
17
+ - GPU Memory: 79.3 GB total
18
+ - CUDA Version: 12.6
19
+ - Hourly Rate: $1.79/hour
20
+
21
+ ### Software
22
+ - Python: 3.12.12
23
+ - PyTorch: 2.9.0+cu126
24
+
25
+
26
+ ### Bloat
27
+ - Characters: 0
28
+ - Lines: 1
29
+ - Files: 0
30
+ - Tokens (approx): 0
31
+ - Dependencies (uv.lock lines): 2,749
32
+
33
+ Run started: 2026-01-21 05:33:05
34
+
35
+ ---
36
+
bd3lm_d8_b4_ts2_r40/tokenizer/token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c9fc16eea9e2ae748a805af58bc1421b6f3bf428bb13182d855422ac9861ac1
3
+ size 17961
bd3lm_d8_b4_ts2_r40/tokenizer/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28812600bbe6a7417775a1b7f79577659b515e3a86380b72ebe7c481b58bbd5e
3
+ size 45939