duoduoyeah committed on
Commit
eaf9595
·
verified ·
1 Parent(s): 4adc0ca

Add files using upload-large-folder tool

Browse files
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/base_checkpoints/d8/meta_005000.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 5000,
3
+ "val_bpb": 0,
4
+ "model_config": {
5
+ "sequence_len": 1024,
6
+ "pure_vocab_size": 4096,
7
+ "all_vocab_size": 4917,
8
+ "n_layer": 8,
9
+ "n_head": 4,
10
+ "n_kv_head": 4,
11
+ "n_embd": 512,
12
+ "prefix_pure_tokens": 1,
13
+ "mask_token_id": 4096,
14
+ "is_causal": false
15
+ },
16
+ "user_config": {
17
+ "run": "pdlm_d8_bs4_pr1_r25_non_ca_samenoisy",
18
+ "device_type": "",
19
+ "depth": 8,
20
+ "max_seq_len": 1024,
21
+ "block_size": 4,
22
+ "prefix_pure_tokens": 1,
23
+ "is_causal": false,
24
+ "noise_total_steps": 0,
25
+ "debug": false,
26
+ "num_iterations": -1,
27
+ "target_flops": -1.0,
28
+ "target_param_data_ratio": 25,
29
+ "device_batch_size": 64,
30
+ "total_batch_size": 131072,
31
+ "embedding_lr": 0.2,
32
+ "unembedding_lr": 0.004,
33
+ "weight_decay": 0.0,
34
+ "matrix_lr": 0.02,
35
+ "grad_clip": 1.0,
36
+ "warmup_ratio": 0.0,
37
+ "warmdown_ratio": 0.2,
38
+ "final_lr_frac": 0.0,
39
+ "resume_from_step": -1,
40
+ "eval_every": -1,
41
+ "eval_tokens": 10485760,
42
+ "core_metric_every": -1,
43
+ "core_metric_max_per_task": 500,
44
+ "sample_every": 2000,
45
+ "save_every": 5000,
46
+ "model_tag": ""
47
+ },
48
+ "device_batch_size": 64,
49
+ "max_seq_len": 1024,
50
+ "dataloader_state_dict": {
51
+ "pq_idx": 10,
52
+ "rg_idx": 123
53
+ },
54
+ "loop_state": {
55
+ "min_val_bpb": Infinity,
56
+ "smooth_train_loss": 1.559914570357708,
57
+ "total_training_time": 3959.402800321579
58
+ }
59
+ }
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/base_checkpoints/d8/meta_005680.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 5680,
3
+ "val_bpb": 0,
4
+ "model_config": {
5
+ "sequence_len": 1024,
6
+ "pure_vocab_size": 4096,
7
+ "all_vocab_size": 4917,
8
+ "n_layer": 8,
9
+ "n_head": 4,
10
+ "n_kv_head": 4,
11
+ "n_embd": 512,
12
+ "prefix_pure_tokens": 1,
13
+ "mask_token_id": 4096,
14
+ "is_causal": false
15
+ },
16
+ "user_config": {
17
+ "run": "pdlm_d8_bs4_pr1_r25_non_ca_samenoisy",
18
+ "device_type": "",
19
+ "depth": 8,
20
+ "max_seq_len": 1024,
21
+ "block_size": 4,
22
+ "prefix_pure_tokens": 1,
23
+ "is_causal": false,
24
+ "noise_total_steps": 0,
25
+ "debug": false,
26
+ "num_iterations": -1,
27
+ "target_flops": -1.0,
28
+ "target_param_data_ratio": 25,
29
+ "device_batch_size": 64,
30
+ "total_batch_size": 131072,
31
+ "embedding_lr": 0.2,
32
+ "unembedding_lr": 0.004,
33
+ "weight_decay": 0.0,
34
+ "matrix_lr": 0.02,
35
+ "grad_clip": 1.0,
36
+ "warmup_ratio": 0.0,
37
+ "warmdown_ratio": 0.2,
38
+ "final_lr_frac": 0.0,
39
+ "resume_from_step": -1,
40
+ "eval_every": -1,
41
+ "eval_tokens": 10485760,
42
+ "core_metric_every": -1,
43
+ "core_metric_max_per_task": 500,
44
+ "sample_every": 2000,
45
+ "save_every": 5000,
46
+ "model_tag": ""
47
+ },
48
+ "device_batch_size": 64,
49
+ "max_seq_len": 1024,
50
+ "dataloader_state_dict": {
51
+ "pq_idx": 12,
52
+ "rg_idx": 23
53
+ },
54
+ "loop_state": {
55
+ "min_val_bpb": Infinity,
56
+ "smooth_train_loss": 1.551141215111763,
57
+ "total_training_time": 4498.662185430527
58
+ }
59
+ }
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/base_checkpoints/d8/model_005000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f98991e9922212df95c7d67b24056c53b23c096f9b74511c6eef0be5c99099d9
3
+ size 114107621
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/base_checkpoints/d8/model_005680.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8935720be3ab2a76ccceb3007a26e5cd2560704f97c231de5ce28abd153584
3
+ size 114107621
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/base_checkpoints/d8/optim_005000_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b0a35a2c9dc2b3e89876f395c40cd0472ff50870e717ced247c7e8ad988a644
3
+ size 127528469
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/base_checkpoints/d8/optim_005680_rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c07223f1f8653e8b6a0c604d97b2a48c8a23e8b72fde48a1365d9e78e0c8fb57
3
+ size 127528469
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/report/base-model-training.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Base model training
2
+ timestamp: 2026-01-02 21:00:50
3
+
4
+ - run: pdlm_d8_bs4_pr1_r25_non_ca_samenoisy
5
+ - device_type:
6
+ - depth: 8
7
+ - max_seq_len: 1024
8
+ - block_size: 4
9
+ - prefix_pure_tokens: 1
10
+ - is_causal: False
11
+ - noise_total_steps: 0
12
+ - debug: False
13
+ - num_iterations: -1
14
+ - target_flops: -1.0000
15
+ - target_param_data_ratio: 25
16
+ - device_batch_size: 64
17
+ - total_batch_size: 131,072
18
+ - embedding_lr: 0.2000
19
+ - unembedding_lr: 0.0040
20
+ - weight_decay: 0.0000
21
+ - matrix_lr: 0.0200
22
+ - grad_clip: 1.0000
23
+ - warmup_ratio: 0.0000
24
+ - warmdown_ratio: 0.2000
25
+ - final_lr_frac: 0.0000
26
+ - resume_from_step: -1
27
+ - eval_every: -1
28
+ - eval_tokens: 10,485,760
29
+ - core_metric_every: -1
30
+ - core_metric_max_per_task: 500
31
+ - sample_every: 2000
32
+ - save_every: 5000
33
+ - model_tag:
34
+ - Number of parameters: 29,780,480
35
+ - Number of FLOPs per token: 2.139095e+08
36
+ - Calculated number of iterations: 5680
37
+ - Number of training tokens: 744,488,960
38
+ - Tokens : Params ratio: 24.9992
39
+ - DDP world size: 1
40
+ - warmup_ratio: 0.0000
41
+ - warmdown_ratio: 0.2000
42
+ - final_lr_frac: 0.0000
43
+ - Minimum validation bpb: inf
44
+ - Final validation bpb: 0
45
+ - CORE metric estimate: None
46
+ - MFU %: 3.58%
47
+ - Total training flops: 1.592533e+17
48
+ - Total training time: 74.98m
49
+ - Peak memory usage: 21217.98MiB
50
+
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/report/header.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # nanochat training report
2
+
3
+ Generated: 2026-01-02 19:43:57
4
+
5
+ ## Environment
6
+
7
+ ### Git Information
8
+ - Branch: tokenizer
9
+ - Commit: 72f06c8 (clean)
10
+ - Message: add response to the dump file
11
+
12
+ ### Hardware
13
+ - Platform: Linux
14
+ - CPUs: 6 cores (12 logical)
15
+ - Memory: 167.1 GB
16
+ - GPUs: 1x NVIDIA A100-SXM4-80GB
17
+ - GPU Memory: 79.3 GB total
18
+ - CUDA Version: 12.6
19
+ - Hourly Rate: $1.79/hour
20
+
21
+ ### Software
22
+ - Python: 3.12.12
23
+ - PyTorch: 2.9.0+cu126
24
+
25
+
26
+ ### Bloat
27
+ - Characters: 543,851
28
+ - Lines: 13,317
29
+ - Files: 72
30
+ - Tokens (approx): 135,962
31
+ - Dependencies (uv.lock lines): 2,218
32
+
33
+ Run started: 2026-01-02 19:43:58
34
+
35
+ ---
36
+
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/tokenizer/token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8b99c0d5a1b87b87118e840f69510440302023cd514b241614fb562373d7ce
3
+ size 17961
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/tokenizer/token_maps.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e018730860c8e77e8cbba4b57ac9c7ba6798b5926dce925743959b932a099964
3
+ size 1850237
pdlm_d8_bs4_pr1_r25_non_ca_samenoisy/tokenizer/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f874c4250ec76e2e8c4f97e91c55cfdf74d9f8eedaae14cd22db36bb718ee19
3
+ size 61662