duoduoyeah committed 5 days ago

Commit

ba0d57a

verified ·

1 Parent(s): e34808b

Add files using upload-large-folder tool

Browse files

Files changed (18) hide show

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/meta_006000.json +52 -0
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/meta_008960.json +52 -0
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/model_006000.pt +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/model_008960.pt +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/optim_006000_rank0.pt +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/optim_008960_rank0.pt +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/report/base-model-training.md +47 -0
gpt_d8_next1_r40_v4096_implicit_simple/report/header.md +36 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00000.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00001.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00002.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00003.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00004.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00005.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00006.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00007.parquet +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/tokenizer/token_bytes.pt +3 -0
gpt_d8_next1_r40_v4096_implicit_simple/tokenizer/tokenizer.pkl +3 -0

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/meta_006000.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "step": 6000,
+  "val_bpb": 0,
+  "model_config": {
+    "sequence_len": 512,
+    "vocab_size": 4096,
+    "n_layer": 8,
+    "n_head": 4,
+    "n_kv_head": 4,
+    "n_embd": 512
+  },
+  "user_config": {
+    "run": "gpt_d8_next1_r40_v4096_implicit_simple",
+    "device_type": "",
+    "depth": 8,
+    "max_seq_len": 512,
+    "target_shift": 1,
+    "mask_token": 0,
+    "num_iterations": -1,
+    "target_flops": -1.0,
+    "target_param_data_ratio": 40,
+    "device_batch_size": 64,
+    "total_batch_size": 131072,
+    "embedding_lr": 0.2,
+    "unembedding_lr": 0.004,
+    "weight_decay": 0.0,
+    "matrix_lr": 0.02,
+    "grad_clip": 1.0,
+    "warmup_ratio": 0.0,
+    "warmdown_ratio": 0.2,
+    "final_lr_frac": 0.0,
+    "resume_from_step": -1,
+    "eval_every": -1,
+    "eval_tokens": 10485760,
+    "core_metric_every": -1,
+    "core_metric_max_per_task": 500,
+    "sample_every": -1,
+    "save_every": 6000,
+    "model_tag": ""
+  },
+  "device_batch_size": 64,
+  "max_seq_len": 512,
+  "dataloader_state_dict": {
+    "pq_idx": 5,
+    "rg_idx": 68
+  },
+  "loop_state": {
+    "min_val_bpb": Infinity,
+    "smooth_train_loss": 1.5400441557553806,
+    "total_training_time": 1316.3449952602386
+  }
+}

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/meta_008960.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "step": 8960,
+  "val_bpb": 0,
+  "model_config": {
+    "sequence_len": 512,
+    "vocab_size": 4096,
+    "n_layer": 8,
+    "n_head": 4,
+    "n_kv_head": 4,
+    "n_embd": 512
+  },
+  "user_config": {
+    "run": "gpt_d8_next1_r40_v4096_implicit_simple",
+    "device_type": "",
+    "depth": 8,
+    "max_seq_len": 512,
+    "target_shift": 1,
+    "mask_token": 0,
+    "num_iterations": -1,
+    "target_flops": -1.0,
+    "target_param_data_ratio": 40,
+    "device_batch_size": 64,
+    "total_batch_size": 131072,
+    "embedding_lr": 0.2,
+    "unembedding_lr": 0.004,
+    "weight_decay": 0.0,
+    "matrix_lr": 0.02,
+    "grad_clip": 1.0,
+    "warmup_ratio": 0.0,
+    "warmdown_ratio": 0.2,
+    "final_lr_frac": 0.0,
+    "resume_from_step": -1,
+    "eval_every": -1,
+    "eval_tokens": 10485760,
+    "core_metric_every": -1,
+    "core_metric_max_per_task": 500,
+    "sample_every": -1,
+    "save_every": 6000,
+    "model_tag": ""
+  },
+  "device_batch_size": 64,
+  "max_seq_len": 512,
+  "dataloader_state_dict": {
+    "pq_idx": 3,
+    "rg_idx": 184
+  },
+  "loop_state": {
+    "min_val_bpb": Infinity,
+    "smooth_train_loss": 1.4397647744770112,
+    "total_training_time": 1966.5658521652222
+  }
+}

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/model_006000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:acbd40ad8b1fa9a93c5f68550718ed913e21b6e8652bd1d89c4949a2f2d263b4
+size 113266917

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/model_008960.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34eece22aae8df67eda517924313258bcbd3a92649c2979b9e396522c7bab0cd
+size 113266917

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/optim_006000_rank0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5afa5239797045d3d8aaf4469ec23e9e76eb0415360ae404fd44ab2361460e48
+size 125847061

gpt_d8_next1_r40_v4096_implicit_simple/base_checkpoints/d8/optim_008960_rank0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:276cd6db689a7bef2a995cec480cd1257e89cdfd9200293334ba712a9e189c34
+size 125847061

gpt_d8_next1_r40_v4096_implicit_simple/report/base-model-training.md ADDED Viewed

	@@ -0,0 +1,47 @@

+## Base model training
+timestamp: 2026-01-11 05:48:46
+- run: gpt_d8_next1_r40_v4096_implicit_simple
+- device_type:
+- depth: 8
+- max_seq_len: 512
+- target_shift: 1
+- mask_token: 0
+- num_iterations: -1
+- target_flops: -1.0000
+- target_param_data_ratio: 40
+- device_batch_size: 64
+- total_batch_size: 131,072
+- embedding_lr: 0.2000
+- unembedding_lr: 0.0040
+- weight_decay: 0.0000
+- matrix_lr: 0.0200
+- grad_clip: 1.0000
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- resume_from_step: -1
+- eval_every: -1
+- eval_tokens: 10,485,760
+- core_metric_every: -1
+- core_metric_max_per_task: 500
+- sample_every: -1
+- save_every: 6000
+- model_tag:
+- Number of parameters: 29,360,128
+- Number of FLOPs per token: 1.887437e+08
+- Calculated number of iterations: 8960
+- Number of training tokens: 1,174,405,120
+- Tokens : Params ratio: 40.0000
+- DDP world size: 1
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- Minimum validation bpb: inf
+- Final validation bpb: 0
+- CORE metric estimate: None
+- MFU %: 11.46%
+- Total training flops: 2.216615e+17
+- Total training time: 32.78m
+- Peak memory usage: 5353.53MiB

gpt_d8_next1_r40_v4096_implicit_simple/report/header.md ADDED Viewed

	@@ -0,0 +1,36 @@

+# nanochat training report
+Generated: 2026-01-11 05:15:02
+## Environment
+### Git Information
+- Branch: tokenizer
+- Commit: 06bfb76 (clean)
+- Message: base train debug
+### Hardware
+- Platform: Linux
+- CPUs: 6 cores (12 logical)
+- Memory: 167.1 GB
+- GPUs: 1x NVIDIA A100-SXM4-80GB
+- GPU Memory: 79.3 GB total
+- CUDA Version: 12.8
+- Hourly Rate: $1.79/hour
+### Software
+- Python: 3.12.12
+- PyTorch: 2.9.1+cu128
+### Bloat
+- Characters: 590,099
+- Lines: 14,582
+- Files: 87
+- Tokens (approx): 147,524
+- Dependencies (uv.lock lines): 2,749
+Run started: 2026-01-11 05:15:02
+---

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00000.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc68646376a7cb8fa0a5a180e650c9c4de1c01e64de377c0286eec0cad8448d3
+size 81779008

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00001.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddff7f98eb9a0df4f578c43c5fa034c4559ab6787b2cd3cea84fca88704662be
+size 81663463

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00002.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0e7454be37151c327d3fd2dcc1fdd3138c4268d28fadbca69003a922a04667e
+size 81869169

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00003.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63631e880c1b7910c5d7c94b835b558c1004cd26dcf5ccf9606d84b224590cfe
+size 81870659

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00004.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c43d84ea2a95d7ceefa291e5bd48caf579d94ce5fb448096046b3b9223da7fbe
+size 81978716

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00005.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f7206f1ef464c42db1358c47a3f07f8e288c122ea52018b0338f1923be36889
+size 81684523

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00006.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f19e9e98a2167d244ac0d2d60c0b48c9d51449be27e3e58ad0ee587f09bc63e
+size 81848408

gpt_d8_next1_r40_v4096_implicit_simple/simple_story_data/shard_00007.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71ca94f3715924ed744c180862a1a152e51440ad32aa359b34a8856686870b20
+size 81846031

gpt_d8_next1_r40_v4096_implicit_simple/tokenizer/token_bytes.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17f468b0626e587a9948f73588629d679198a6f38fe834875017b222ec2443d7
+size 17961

gpt_d8_next1_r40_v4096_implicit_simple/tokenizer/tokenizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:893aa318d37e34b8852086f8fcdffcd9d45242706dc6f06dc5d315499aa5a633
+size 45915