duoduoyeah committed 3 days ago

Commit

ffcf3c7

verified ·

1 Parent(s): d13f6e4

Add files using upload-large-folder tool

Browse files

Files changed (18) hide show

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/meta_006000.json +52 -0
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/meta_010240.json +52 -0
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/model_006000.pt +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/model_010240.pt +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/optim_006000_rank0.pt +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/optim_010240_rank0.pt +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/report/base-model-training.md +47 -0
gpt_d8_next2_r40_v8192_implicit_simple/report/header.md +36 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00000.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00001.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00002.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00003.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00004.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00005.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00006.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00007.parquet +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/tokenizer/token_bytes.pt +3 -0
gpt_d8_next2_r40_v8192_implicit_simple/tokenizer/tokenizer.pkl +3 -0

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/meta_006000.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "step": 6000,
+  "val_bpb": 0,
+  "model_config": {
+    "sequence_len": 512,
+    "vocab_size": 8192,
+    "n_layer": 8,
+    "n_head": 4,
+    "n_kv_head": 4,
+    "n_embd": 512
+  },
+  "user_config": {
+    "run": "gpt_d8_next2_r40_v8192_implicit_simple",
+    "device_type": "",
+    "depth": 8,
+    "max_seq_len": 512,
+    "target_shift": 2,
+    "mask_token": 0,
+    "num_iterations": -1,
+    "target_flops": -1.0,
+    "target_param_data_ratio": 40,
+    "device_batch_size": 128,
+    "total_batch_size": 131072,
+    "embedding_lr": 0.2,
+    "unembedding_lr": 0.004,
+    "weight_decay": 0.0,
+    "matrix_lr": 0.02,
+    "grad_clip": 1.0,
+    "warmup_ratio": 0.0,
+    "warmdown_ratio": 0.2,
+    "final_lr_frac": 0.0,
+    "resume_from_step": -1,
+    "eval_every": -1,
+    "eval_tokens": 10485760,
+    "core_metric_every": -1,
+    "core_metric_max_per_task": 500,
+    "sample_every": -1,
+    "save_every": 6000,
+    "model_tag": ""
+  },
+  "device_batch_size": 128,
+  "max_seq_len": 512,
+  "dataloader_state_dict": {
+    "pq_idx": 5,
+    "rg_idx": 188
+  },
+  "loop_state": {
+    "min_val_bpb": Infinity,
+    "smooth_train_loss": 2.593795311860216,
+    "total_training_time": 1335.3098776340485
+  }
+}

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/meta_010240.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "step": 10240,
+  "val_bpb": 0,
+  "model_config": {
+    "sequence_len": 512,
+    "vocab_size": 8192,
+    "n_layer": 8,
+    "n_head": 4,
+    "n_kv_head": 4,
+    "n_embd": 512
+  },
+  "user_config": {
+    "run": "gpt_d8_next2_r40_v8192_implicit_simple",
+    "device_type": "",
+    "depth": 8,
+    "max_seq_len": 512,
+    "target_shift": 2,
+    "mask_token": 0,
+    "num_iterations": -1,
+    "target_flops": -1.0,
+    "target_param_data_ratio": 40,
+    "device_batch_size": 128,
+    "total_batch_size": 131072,
+    "embedding_lr": 0.2,
+    "unembedding_lr": 0.004,
+    "weight_decay": 0.0,
+    "matrix_lr": 0.02,
+    "grad_clip": 1.0,
+    "warmup_ratio": 0.0,
+    "warmdown_ratio": 0.2,
+    "final_lr_frac": 0.0,
+    "resume_from_step": -1,
+    "eval_every": -1,
+    "eval_tokens": 10485760,
+    "core_metric_every": -1,
+    "core_metric_max_per_task": 500,
+    "sample_every": -1,
+    "save_every": 6000,
+    "model_tag": ""
+  },
+  "device_batch_size": 128,
+  "max_seq_len": 512,
+  "dataloader_state_dict": {
+    "pq_idx": 7,
+    "rg_idx": 161
+  },
+  "loop_state": {
+    "min_val_bpb": Infinity,
+    "smooth_train_loss": 2.4645395893493127,
+    "total_training_time": 2282.8761949539185
+  }
+}

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/model_006000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c59e2181cfee16bdca90829a12562fcec528ec2c3ff7b8e737b797f6f43df58
+size 125849829

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/model_010240.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42846a6d475655f9c55bf052dc97d8a0fca5407c3c02d026deaafbd30f3f95cd
+size 125849829

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/optim_006000_rank0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e8199147bf8f456be34e0524be53fe9c57db993a28507c161c8d9966d74c673
+size 151012885

gpt_d8_next2_r40_v8192_implicit_simple/base_checkpoints/d8/optim_010240_rank0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb16b726fbef28faa0d6f4f62e88020dd2e892aeb01be36058c17f6abba530fc
+size 151012885

gpt_d8_next2_r40_v8192_implicit_simple/report/base-model-training.md ADDED Viewed

	@@ -0,0 +1,47 @@

+## Base model training
+timestamp: 2026-01-11 07:03:28
+- run: gpt_d8_next2_r40_v8192_implicit_simple
+- device_type:
+- depth: 8
+- max_seq_len: 512
+- target_shift: 2
+- mask_token: 0
+- num_iterations: -1
+- target_flops: -1.0000
+- target_param_data_ratio: 40
+- device_batch_size: 128
+- total_batch_size: 131,072
+- embedding_lr: 0.2000
+- unembedding_lr: 0.0040
+- weight_decay: 0.0000
+- matrix_lr: 0.0200
+- grad_clip: 1.0000
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- resume_from_step: -1
+- eval_every: -1
+- eval_tokens: 10,485,760
+- core_metric_every: -1
+- core_metric_max_per_task: 500
+- sample_every: -1
+- save_every: 6000
+- model_tag:
+- Number of parameters: 33,554,432
+- Number of FLOPs per token: 2.013266e+08
+- Calculated number of iterations: 10,240
+- Number of training tokens: 1,342,177,280
+- Tokens : Params ratio: 40.0000
+- DDP world size: 1
+- warmup_ratio: 0.0000
+- warmdown_ratio: 0.2000
+- final_lr_frac: 0.0000
+- Minimum validation bpb: inf
+- Final validation bpb: 0
+- CORE metric estimate: None
+- MFU %: 12.02%
+- Total training flops: 2.702160e+17
+- Total training time: 38.05m
+- Peak memory usage: 10864.29MiB

gpt_d8_next2_r40_v8192_implicit_simple/report/header.md ADDED Viewed

	@@ -0,0 +1,36 @@

+# nanochat training report
+Generated: 2026-01-11 06:24:28
+## Environment
+### Git Information
+- Branch: tokenizer
+- Commit: 06bfb76 (clean)
+- Message: base train debug
+### Hardware
+- Platform: Linux
+- CPUs: 6 cores (12 logical)
+- Memory: 167.1 GB
+- GPUs: 1x NVIDIA A100-SXM4-80GB
+- GPU Memory: 79.3 GB total
+- CUDA Version: 12.8
+- Hourly Rate: $1.79/hour
+### Software
+- Python: 3.12.12
+- PyTorch: 2.9.1+cu128
+### Bloat
+- Characters: 590,099
+- Lines: 14,582
+- Files: 87
+- Tokens (approx): 147,524
+- Dependencies (uv.lock lines): 2,749
+Run started: 2026-01-11 06:24:28
+---

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00000.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc68646376a7cb8fa0a5a180e650c9c4de1c01e64de377c0286eec0cad8448d3
+size 81779008

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00001.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddff7f98eb9a0df4f578c43c5fa034c4559ab6787b2cd3cea84fca88704662be
+size 81663463

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00002.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0e7454be37151c327d3fd2dcc1fdd3138c4268d28fadbca69003a922a04667e
+size 81869169

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00003.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63631e880c1b7910c5d7c94b835b558c1004cd26dcf5ccf9606d84b224590cfe
+size 81870659

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00004.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c43d84ea2a95d7ceefa291e5bd48caf579d94ce5fb448096046b3b9223da7fbe
+size 81978716

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00005.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f7206f1ef464c42db1358c47a3f07f8e288c122ea52018b0338f1923be36889
+size 81684523

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00006.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f19e9e98a2167d244ac0d2d60c0b48c9d51449be27e3e58ad0ee587f09bc63e
+size 81848408

gpt_d8_next2_r40_v8192_implicit_simple/simple_story_data/shard_00007.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71ca94f3715924ed744c180862a1a152e51440ad32aa359b34a8856686870b20
+size 81846031

gpt_d8_next2_r40_v8192_implicit_simple/tokenizer/token_bytes.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3df121641225414b3f0ce0fcc4c77ed8ec7f34067b1a0bba50f2b798d449a38e
+size 34345

gpt_d8_next2_r40_v8192_implicit_simple/tokenizer/tokenizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5423e385104a5dfeccaf17344cbb9bb2d046a3ebc73b22249c40221fd09a53ca
+size 97311