diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ba1033c91c5e3d4633cda9162971171f3baefd07 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250915_224933-8ifme58a/run-8ifme58a.wandb filter=lfs diff=lfs merge=lfs -text
diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json
new file mode 100644
index 0000000000000000000000000000000000000000..e823432cebd74c2a751426d8c3452b54b851f743
--- /dev/null
+++ b/checkpoints/metadata_000000032768.json
@@ -0,0 +1 @@
+{"step": 1, "tokens_seen": 32768, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.868239402770996}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json
new file mode 100644
index 0000000000000000000000000000000000000000..c884bc92c13a5062c3ffe4f336f897dca77144ae
--- /dev/null
+++ b/checkpoints/metadata_000000327680.json
@@ -0,0 +1 @@
+{"step": 10, "tokens_seen": 327680, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.86287446604396}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b09017df956e1f4c94e6533cdcf61cb8a58437
--- /dev/null
+++ b/checkpoints/metadata_000000360448.json
@@ -0,0 +1 @@
+{"step": 11, "tokens_seen": 360448, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.860849172158668}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json
new file mode 100644
index 0000000000000000000000000000000000000000..f95cbca648eaa23ef03ce2b5b8c0ba0bed873771
--- /dev/null
+++ b/checkpoints/metadata_000000425984.json
@@ -0,0 +1 @@
+{"step": 13, "tokens_seen": 425984, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.855003761761374}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json
new file mode 100644
index 0000000000000000000000000000000000000000..06bba4d8b8ed93cb435482e87acf9784105a178b
--- /dev/null
+++ b/checkpoints/metadata_000000458752.json
@@ -0,0 +1 @@
+{"step": 14, "tokens_seen": 458752, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.851786966560143}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9ffb4c049f6b21451cdd1cab29cbb078d00af40
--- /dev/null
+++ b/checkpoints/metadata_000000491520.json
@@ -0,0 +1 @@
+{"step": 15, "tokens_seen": 491520, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.847815453095714}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4797a1f90683984f551bd513d4ef272f5bbafa8
--- /dev/null
+++ b/checkpoints/metadata_000000557056.json
@@ -0,0 +1 @@
+{"step": 17, "tokens_seen": 557056, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.838863768585243}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json
new file mode 100644
index 0000000000000000000000000000000000000000..fa19bc472debfcb845daf61e81f5511862779edc
--- /dev/null
+++ b/checkpoints/metadata_000000622592.json
@@ -0,0 +1 @@
+{"step": 19, "tokens_seen": 622592, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.82865379795969}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f91759bab29eb2f5a78735106c46013be36d350
--- /dev/null
+++ b/checkpoints/metadata_000000688128.json
@@ -0,0 +1 @@
+{"step": 21, "tokens_seen": 688128, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.81652450790489}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json
new file mode 100644
index 0000000000000000000000000000000000000000..655259242c4a22e4fa23d8fe20ce793584ffef83
--- /dev/null
+++ b/checkpoints/metadata_000000753664.json
@@ -0,0 +1 @@
+{"step": 23, "tokens_seen": 753664, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.803400143292523}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab5c63125415d2d3eda9b5286eac7a65f256fe01
--- /dev/null
+++ b/checkpoints/metadata_000000819200.json
@@ -0,0 +1 @@
+{"step": 25, "tokens_seen": 819200, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.789504261580376}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json
new file mode 100644
index 0000000000000000000000000000000000000000..70271940556b40cc82f68c1f314feed5ec2a8ce6
--- /dev/null
+++ b/checkpoints/metadata_000000917504.json
@@ -0,0 +1 @@
+{"step": 28, "tokens_seen": 917504, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.767103813584455}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json
new file mode 100644
index 0000000000000000000000000000000000000000..70bb6c29d5c959c32a595acf5e8ab88dec4a2624
--- /dev/null
+++ b/checkpoints/metadata_000000983040.json
@@ -0,0 +1 @@
+{"step": 30, "tokens_seen": 983040, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.751470975617709}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json
new file mode 100644
index 0000000000000000000000000000000000000000..94034f23a5a6b096d6323209b5c77640830606e9
--- /dev/null
+++ b/checkpoints/metadata_000001114112.json
@@ -0,0 +1 @@
+{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.718526433361209}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json
new file mode 100644
index 0000000000000000000000000000000000000000..01cfd15b5004e7b4f119ac047e382dc2070176bd
--- /dev/null
+++ b/checkpoints/metadata_000001212416.json
@@ -0,0 +1 @@
+{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.690941976049386}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json
new file mode 100644
index 0000000000000000000000000000000000000000..742d47d7591b12469516dfeb9d7028d3269f2bc9
--- /dev/null
+++ b/checkpoints/metadata_000001343488.json
@@ -0,0 +1 @@
+{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.653392703937909}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json
new file mode 100644
index 0000000000000000000000000000000000000000..06d87357cc1db26e37a9f213da758a581ea5f217
--- /dev/null
+++ b/checkpoints/metadata_000001474560.json
@@ -0,0 +1 @@
+{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.612077045224545}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d94c0fb9d57c70e0876c51a50cb69793640411d
--- /dev/null
+++ b/checkpoints/metadata_000001605632.json
@@ -0,0 +1 @@
+{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.567273835750603}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d2de27fce7383db44e8cc76ac050721e7ccafc1
--- /dev/null
+++ b/checkpoints/metadata_000001769472.json
@@ -0,0 +1 @@
+{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.506361314834873}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json
new file mode 100644
index 0000000000000000000000000000000000000000..18476de076a58ecc55bb18ae8ae0c05a4e9a56e5
--- /dev/null
+++ b/checkpoints/metadata_000001966080.json
@@ -0,0 +1 @@
+{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.425973733834365}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json
new file mode 100644
index 0000000000000000000000000000000000000000..e58f10df7537a4a398343655979d48b6ebd2776c
--- /dev/null
+++ b/checkpoints/metadata_000002162688.json
@@ -0,0 +1 @@
+{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.33862359508608}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b737c7fefddf4372abb26f1f40b41ddddcf31b
--- /dev/null
+++ b/checkpoints/metadata_000002359296.json
@@ -0,0 +1 @@
+{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.241863871109027}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json
new file mode 100644
index 0000000000000000000000000000000000000000..c61b2c97915f20ab6c0286031b20eab2b54588a2
--- /dev/null
+++ b/checkpoints/metadata_000002621440.json
@@ -0,0 +1 @@
+{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.102892860456649}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json
new file mode 100644
index 0000000000000000000000000000000000000000..b573d77f65b935c61536cb4460ea19837419e0cd
--- /dev/null
+++ b/checkpoints/metadata_000002883584.json
@@ -0,0 +1 @@
+{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.952368899713482}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json
new file mode 100644
index 0000000000000000000000000000000000000000..c952274d39233757eb598a2355e44e1aa189754e
--- /dev/null
+++ b/checkpoints/metadata_000003178496.json
@@ -0,0 +1 @@
+{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.771469820217888}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f97446f2be71fba8dac54d951b26c8cf6d6788a
--- /dev/null
+++ b/checkpoints/metadata_000003473408.json
@@ -0,0 +1 @@
+{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.586473450890022}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json
new file mode 100644
index 0000000000000000000000000000000000000000..5fa43cabb61e3d8f03be5207dfbcebf0130cea50
--- /dev/null
+++ b/checkpoints/metadata_000003833856.json
@@ -0,0 +1 @@
+{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.363055311448852}
\ No newline at end of file
diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json
new file mode 100644
index 0000000000000000000000000000000000000000..21083d2d92daac684bcc04ddab81f04c636df8a4
--- /dev/null
+++ b/checkpoints/metadata_000004227072.json
@@ -0,0 +1 @@
+{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.123362399046647}
\ No newline at end of file
diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c4ce9ed4508e76313e2baf9b77332fecc385fd6
--- /dev/null
+++ b/checkpoints/metadata_000004653056.json
@@ -0,0 +1 @@
+{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.878306496041152}
\ No newline at end of file
diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c4b1392802209dbff3b96149b477e4c4dcebbe9
--- /dev/null
+++ b/checkpoints/metadata_000005111808.json
@@ -0,0 +1 @@
+{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.628132063043422}
\ No newline at end of file
diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json
new file mode 100644
index 0000000000000000000000000000000000000000..79fe3b0e48f3b48581e211a9fb05397c7a285585
--- /dev/null
+++ b/checkpoints/metadata_000005603328.json
@@ -0,0 +1 @@
+{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.386896464716122}
\ No newline at end of file
diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b7d6b84d371cc56b2eef5e15c9c153f378d1a0
--- /dev/null
+++ b/checkpoints/metadata_000006193152.json
@@ -0,0 +1 @@
+{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.111954654120947}
\ No newline at end of file
diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json
new file mode 100644
index 0000000000000000000000000000000000000000..64c5ccdd01c67eb4d7b47b675ee684b2807d8b99
--- /dev/null
+++ b/checkpoints/metadata_000006782976.json
@@ -0,0 +1 @@
+{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.871622025437105}
\ No newline at end of file
diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json
new file mode 100644
index 0000000000000000000000000000000000000000..710a03c9b8729982cb8d22912b06644d4ff3a880
--- /dev/null
+++ b/checkpoints/metadata_000007471104.json
@@ -0,0 +1 @@
+{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.622474502719788}
\ No newline at end of file
diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json
new file mode 100644
index 0000000000000000000000000000000000000000..406a6419d0a02781787d18014e7f4d7c32a80c13
--- /dev/null
+++ b/checkpoints/metadata_000008224768.json
@@ -0,0 +1 @@
+{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.378238665020556}
\ No newline at end of file
diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a91bd6ba6e6bb5558cb98dd601fb610fbab8ff0
--- /dev/null
+++ b/checkpoints/metadata_000009043968.json
@@ -0,0 +1 @@
+{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.162658799131882}
\ No newline at end of file
diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json
new file mode 100644
index 0000000000000000000000000000000000000000..535092a8df18bfd9280cca0685e680a3c4395f00
--- /dev/null
+++ b/checkpoints/metadata_000009961472.json
@@ -0,0 +1 @@
+{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.955142179675744}
\ No newline at end of file
diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5eccecc7223b2634c634aa2f3bf642e3268819e
--- /dev/null
+++ b/checkpoints/metadata_000010944512.json
@@ -0,0 +1 @@
+{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.77638006383328}
\ No newline at end of file
diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json
new file mode 100644
index 0000000000000000000000000000000000000000..80ee8ab8b4fc249c38b4ba3f38f2bbd2f1e98095
--- /dev/null
+++ b/checkpoints/metadata_000012058624.json
@@ -0,0 +1 @@
+{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.612526241307541}
\ No newline at end of file
diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json
new file mode 100644
index 0000000000000000000000000000000000000000..eaf0ad650d033a7bd429933be32d5e2eb01ba545
--- /dev/null
+++ b/checkpoints/metadata_000013271040.json
@@ -0,0 +1 @@
+{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.459114666590705}
\ No newline at end of file
diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json
new file mode 100644
index 0000000000000000000000000000000000000000..1beda0aee22a847efb2ece4a9fa0f927318624c3
--- /dev/null
+++ b/checkpoints/metadata_000014581760.json
@@ -0,0 +1 @@
+{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.336574145804579}
\ No newline at end of file
diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json
new file mode 100644
index 0000000000000000000000000000000000000000..617a610c90468dc031fbb83508c144d0f87c77e8
--- /dev/null
+++ b/checkpoints/metadata_000016056320.json
@@ -0,0 +1 @@
+{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.228235339046665}
\ No newline at end of file
diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json
new file mode 100644
index 0000000000000000000000000000000000000000..198cc1f9bae23794cb6b6edc42d0ef6ae5da1163
--- /dev/null
+++ b/checkpoints/metadata_000016384000.json
@@ -0,0 +1 @@
+{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.213594791045388}
\ No newline at end of file
diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json
new file mode 100644
index 0000000000000000000000000000000000000000..d373208c2130e7bbce34d420e9a82f8d9c2bf80e
--- /dev/null
+++ b/checkpoints/metadata_000017661952.json
@@ -0,0 +1 @@
+{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.140286869443522}
\ No newline at end of file
diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a53a43a38be8b3a5c4db5a9bad2055a1aa077b5
--- /dev/null
+++ b/checkpoints/metadata_000019431424.json
@@ -0,0 +1 @@
+{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.0473806001260115}
\ No newline at end of file
diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6b29874aa924d778ec3c673934033bf9cedaaa1
--- /dev/null
+++ b/checkpoints/metadata_000021364736.json
@@ -0,0 +1 @@
+{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.957438122870984}
\ No newline at end of file
diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a791e15097d08209212af1e96a91b7880dd3e4e
--- /dev/null
+++ b/checkpoints/metadata_000023494656.json
@@ -0,0 +1 @@
+{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.8766060924943435}
\ No newline at end of file
diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json
new file mode 100644
index 0000000000000000000000000000000000000000..081d777225d5dda020cae3f71330c4525cdc32bf
--- /dev/null
+++ b/checkpoints/metadata_000025853952.json
@@ -0,0 +1 @@
+{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.80125696269241}
\ No newline at end of file
diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json
new file mode 100644
index 0000000000000000000000000000000000000000..2de022df5f2c64b9e5e351dd3ab00eee641b5e12
--- /dev/null
+++ b/checkpoints/metadata_000028442624.json
@@ -0,0 +1 @@
+{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.721048508033244}
\ No newline at end of file
diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json
new file mode 100644
index 0000000000000000000000000000000000000000..224dfff8c27c2eab58523eac822419b1bc6928d2
--- /dev/null
+++ b/checkpoints/metadata_000031293440.json
@@ -0,0 +1 @@
+{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.636232083658478}
\ No newline at end of file
diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json
new file mode 100644
index 0000000000000000000000000000000000000000..80d7d0e060f7ea411c310ce429be85321eeecede
--- /dev/null
+++ b/checkpoints/metadata_000032768000.json
@@ -0,0 +1 @@
+{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.5762510995092445}
\ No newline at end of file
diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json
new file mode 100644
index 0000000000000000000000000000000000000000..733be73f1094522ff388d97b4eeeeffece40fcb5
--- /dev/null
+++ b/checkpoints/metadata_000034439168.json
@@ -0,0 +1 @@
+{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.524304788150752}
\ No newline at end of file
diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json
new file mode 100644
index 0000000000000000000000000000000000000000..7229a4c6089fbfe70af36011eea2ab288b2ffa17
--- /dev/null
+++ b/checkpoints/metadata_000037879808.json
@@ -0,0 +1 @@
+{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.400836040966362}
\ No newline at end of file
diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8bc685acc415c7c64dab650491e52a03a890452
--- /dev/null
+++ b/checkpoints/metadata_000041648128.json
@@ -0,0 +1 @@
+{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.300884781054707}
\ No newline at end of file
diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json
new file mode 100644
index 0000000000000000000000000000000000000000..1baadb648d9bb1881c5df6ff7cbe9f47f335bc3f
--- /dev/null
+++ b/checkpoints/metadata_000045842432.json
@@ -0,0 +1 @@
+{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.198663083015544}
\ No newline at end of file
diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json
new file mode 100644
index 0000000000000000000000000000000000000000..dfd50f4ebf907e7616f8af104e8ac14e86bc43ec
--- /dev/null
+++ b/checkpoints/metadata_000049152000.json
@@ -0,0 +1 @@
+{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.131232018757109}
\ No newline at end of file
diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf3a81ed5ac03a490c84d250838d0b5f7692fc88
--- /dev/null
+++ b/checkpoints/metadata_000050397184.json
@@ -0,0 +1 @@
+{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.10805858550132}
\ No newline at end of file
diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json
new file mode 100644
index 0000000000000000000000000000000000000000..47fe93fb438233c9c9a0371e37fbc2725c85715f
--- /dev/null
+++ b/checkpoints/metadata_000055443456.json
@@ -0,0 +1 @@
+{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.0346353737327245}
\ No newline at end of file
diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json
new file mode 100644
index 0000000000000000000000000000000000000000..513e7bff5c34cfe2e36c1c9e179f0a3e74443977
--- /dev/null
+++ b/checkpoints/metadata_000061014016.json
@@ -0,0 +1 @@
+{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.92742670405853}
\ No newline at end of file
diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3ee127c19c28ba4dbbe00c560dc673c9970f718
--- /dev/null
+++ b/checkpoints/metadata_000065536000.json
@@ -0,0 +1 @@
+{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.833024285536252}
\ No newline at end of file
diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json
new file mode 100644
index 0000000000000000000000000000000000000000..84f0bf6c1584cd879e3dbecc36f8189904c22505
--- /dev/null
+++ b/checkpoints/metadata_000067108864.json
@@ -0,0 +1 @@
+{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.79637444664496}
\ No newline at end of file
diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json
new file mode 100644
index 0000000000000000000000000000000000000000..e49886eb86c18d82bce0343709c02ad30f94b280
--- /dev/null
+++ b/checkpoints/metadata_000073826304.json
@@ -0,0 +1 @@
+{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.65944140237159}
\ No newline at end of file
diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json
new file mode 100644
index 0000000000000000000000000000000000000000..69b24ef6d07a193941e1bff2f2fcc693913ffef7
--- /dev/null
+++ b/checkpoints/metadata_000081199104.json
@@ -0,0 +1 @@
+{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.560279875402151}
\ No newline at end of file
diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json
new file mode 100644
index 0000000000000000000000000000000000000000..202759bcf328b1b0707d35120d271612d01c69ff
--- /dev/null
+++ b/checkpoints/metadata_000081920000.json
@@ -0,0 +1 @@
+{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.548369499176888}
\ No newline at end of file
diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json
new file mode 100644
index 0000000000000000000000000000000000000000..de4311521b35c59562622e1b57fef9a6a9942a86
--- /dev/null
+++ b/checkpoints/metadata_000089325568.json
@@ -0,0 +1 @@
+{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.471046873033792}
\ No newline at end of file
diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json
new file mode 100644
index 0000000000000000000000000000000000000000..edc5d9bce43654092cc42e426b3b3ec538d8ad5a
--- /dev/null
+++ b/checkpoints/metadata_000098271232.json
@@ -0,0 +1 @@
+{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.391406051206394}
\ No newline at end of file
diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json
new file mode 100644
index 0000000000000000000000000000000000000000..e0b9dba33fb5365a304d93dd82755a2af744f18b
--- /dev/null
+++ b/checkpoints/metadata_000098304000.json
@@ -0,0 +1 @@
+{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.39022718108587}
\ No newline at end of file
diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff8df0bc7c0dad25dfc081a9e68066525ebf1e3
--- /dev/null
+++ b/checkpoints/metadata_000108068864.json
@@ -0,0 +1 @@
+{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.315138558058488}
\ No newline at end of file
diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec9a35fb91ef12d6c2e03a662a5cfb0b527e5bec
--- /dev/null
+++ b/checkpoints/metadata_000114688000.json
@@ -0,0 +1 @@
+{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.276147792756605}
\ No newline at end of file
diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json
new file mode 100644
index 0000000000000000000000000000000000000000..92016022108c92ca5d1cf9c5153ad61b28871299
--- /dev/null
+++ b/checkpoints/metadata_000118882304.json
@@ -0,0 +1 @@
+{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.242676949767395}
\ No newline at end of file
diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json
new file mode 100644
index 0000000000000000000000000000000000000000..e302871b5495bd297faa074eaf89ddf44195ccbf
--- /dev/null
+++ b/checkpoints/metadata_000130777088.json
@@ -0,0 +1 @@
+{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.223668290631027}
\ No newline at end of file
diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json
new file mode 100644
index 0000000000000000000000000000000000000000..7911f4947fe4e3232d6a8698e21442154a840128
--- /dev/null
+++ b/checkpoints/metadata_000131072000.json
@@ -0,0 +1 @@
+{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.21741597038181}
\ No newline at end of file
diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json
new file mode 100644
index 0000000000000000000000000000000000000000..56387e1a765bdc0f07cd01619b90f8a0f5fd2d1c
--- /dev/null
+++ b/checkpoints/metadata_000143851520.json
@@ -0,0 +1 @@
+{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.166576048768662}
\ No newline at end of file
diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json
new file mode 100644
index 0000000000000000000000000000000000000000..59f3f7826880d30cac07c1e811b246fba46296f6
--- /dev/null
+++ b/checkpoints/metadata_000147456000.json
@@ -0,0 +1 @@
+{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.150285915576986}
\ No newline at end of file
diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json
new file mode 100644
index 0000000000000000000000000000000000000000..8bf8357d12f756e933fb080fc87d1ba72e5d53f2
--- /dev/null
+++ b/checkpoints/metadata_000158269440.json
@@ -0,0 +1 @@
+{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.127049286140794}
\ No newline at end of file
diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json
new file mode 100644
index 0000000000000000000000000000000000000000..a474302bdbcfdec6f39495b34c7da4079cbd1669
--- /dev/null
+++ b/checkpoints/metadata_000163840000.json
@@ -0,0 +1 @@
+{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.1352184322363374}
\ No newline at end of file
diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json
new file mode 100644
index 0000000000000000000000000000000000000000..3929d7161b3330d96f6303f68828353d77da9714
--- /dev/null
+++ b/checkpoints/metadata_000174096384.json
@@ -0,0 +1 @@
+{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.089648539560446}
\ No newline at end of file
diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json
new file mode 100644
index 0000000000000000000000000000000000000000..d177577eba1d45b0e934ccf7f139b07c281d11dc
--- /dev/null
+++ b/checkpoints/metadata_000180224000.json
@@ -0,0 +1 @@
+{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.093543784488962}
\ No newline at end of file
diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json
new file mode 100644
index 0000000000000000000000000000000000000000..9fa807364e46594f379d7af3362626f6d74095e4
--- /dev/null
+++ b/checkpoints/metadata_000191496192.json
@@ -0,0 +1 @@
+{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.0670730832317945}
\ No newline at end of file
diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c99ba89b75c21fae254abfe342b38024820164e
--- /dev/null
+++ b/checkpoints/metadata_000196608000.json
@@ -0,0 +1 @@
+{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.048539307487607}
\ No newline at end of file
diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt
new file mode 100644
index 0000000000000000000000000000000000000000..03511d91e914d121706074672ed3717e4b31658a
--- /dev/null
+++ b/checkpoints/model_weights_000000032768.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78ce67efbdfc3c9279831326e38cbb39268eb87b8627e4d8e3239dbba09c7ac9
+size 225208789
diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt
new file mode 100644
index 0000000000000000000000000000000000000000..018dcdc2ad4e7b1186df7179b088370f040f2505
--- /dev/null
+++ b/checkpoints/model_weights_000000327680.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0d1547f4d794b9c3e9a3b27c7bee23438512c7212729532c2c55e687b15e9b5
+size 225208789
diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a89a8698e80ca4325167f501a543cea993900de8
--- /dev/null
+++ b/checkpoints/model_weights_000000360448.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5bb2f326069c83b76db2c3df3f21a075ae3a8375097af08c6f300189378b20a
+size 225208789
diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4cdd140e9d54e93ca60c4213a12e51f6b69056e2
--- /dev/null
+++ b/checkpoints/model_weights_000000425984.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd196f321053e220b58ed5f25c07eaaaa6a390aa526c4ce05b23bec622956b9a
+size 225208789
diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt
new file mode 100644
index 0000000000000000000000000000000000000000..70721911b22f2c9f813de099121f3d2a7b877c2c
--- /dev/null
+++ b/checkpoints/model_weights_000000458752.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:276b134c6db99d0fbcfa6680e7a6cb2f3850fbd46f3b5265839ae15b21924a5e
+size 225208789
diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt
new file mode 100644
index 0000000000000000000000000000000000000000..519f6b0d11e89a75dfb072ac5cc24ced1ab71f53
--- /dev/null
+++ b/checkpoints/model_weights_000000491520.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d40881882d4429168cb8f3bdd055bb6d7b8639b24b1ccb40870bf8eccfa4e9ca
+size 225208789
diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7e12a6865ea9082d1a9b4b9ae9aa0efdc6940816
--- /dev/null
+++ b/checkpoints/model_weights_000000557056.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0879054c19f75bf5710c8a767a1ea621d04536757d9cdebbe8be0a54a18cce6f
+size 225208789
diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a52f091c58e9dfd7a0785619f27bb7ae42917ac
--- /dev/null
+++ b/checkpoints/model_weights_000000622592.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4431b188f676d1b1cac470b905a048a91904f54dd4e78b0ae1061562a77cf115
+size 225208789
diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2edb38c4b68e71218f93a2521ba2c6951c1f2993
--- /dev/null
+++ b/checkpoints/model_weights_000000688128.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e18889c65fbd58b824bbd60e948a5ad0131aad0c1422178c5ff6c149eda5de1
+size 225208789
diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f6ad676b9f1bf675c854622a7b9bc8b80fc16021
--- /dev/null
+++ b/checkpoints/model_weights_000000753664.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c17669e457d6c4044635dab89884222d3b8896afa6f3b45a38d6f08530535a9
+size 225208789
diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6046c08215d7f4784671a06b759748c368d7bc41
--- /dev/null
+++ b/checkpoints/model_weights_000000819200.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bca20cee00c4b76689290d3eb6741c52f3dd5b8589bc083cf2bb0b8289e231fe
+size 225208789
diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8526a74ae62bf65ce3df40bde3f2639300e8efd5
--- /dev/null
+++ b/checkpoints/model_weights_000000917504.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71b7fc779f60b8c4dd08927063e25c6c5e0e2392a11955e57eab96643b3723a1
+size 225208789
diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a9f1d2a2dc5e7b2a400589a5e523755b9176a448
--- /dev/null
+++ b/checkpoints/model_weights_000000983040.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c9f1076e2dbf347eaa9ffc44962f40910edd5115fd1aba368e1251732104c40
+size 225208789
diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0336e18774419d17968d164091ae676baa079007
--- /dev/null
+++ b/checkpoints/model_weights_000001114112.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0fddbe629e9ac2195c5197c149e0f5a2890de88a0da2c44c2adf1ee0524c134
+size 225208789
diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt
new file mode 100644
index 0000000000000000000000000000000000000000..43996a2bd71e7c74eca1f6b19800d28494a74aba
--- /dev/null
+++ b/checkpoints/model_weights_000001212416.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf649663341b538bce68fb26c9b859cfd5d89fbe7ebe1871fe124ddf890323bf
+size 225208789
diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2c1fde4d0e63d45e5954a8349ebfc305bb61cc26
--- /dev/null
+++ b/checkpoints/model_weights_000001343488.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dce061161236392ec7a9b30a53ef60368c36421b9d7fd79f1863bf7ab0ced620
+size 225208789
diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0f9262b0f113195920ccb40a03a7f7804a78f60a
--- /dev/null
+++ b/checkpoints/model_weights_000001474560.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7201561a220b5332e247236579eaaba979daf915111f49dd4375a2c22f1330d9
+size 225208789
diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt
new file mode 100644
index 0000000000000000000000000000000000000000..11a04d281a03865652caaf78a21d2b69843ac6a5
--- /dev/null
+++ b/checkpoints/model_weights_000001605632.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ae13b18f93dcdcfa429840dd60884081453af8908f7e8a25f5de565e104eccd
+size 225208789
diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt
new file mode 100644
index 0000000000000000000000000000000000000000..621dfb27bca2da55279ab70c79f83d0c17284360
--- /dev/null
+++ b/checkpoints/model_weights_000001769472.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d92da2c1dfd126f7e1280603480fa6e21871f8408e0df087732419cb4ba0ddd8
+size 225208789
diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d3d266c5053e796e40ce9c6b597d1c5a582fef74
--- /dev/null
+++ b/checkpoints/model_weights_000001966080.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:453aa97314f5c89b77882b903242e37806e19f3af1d056f40b443260af24d5b4
+size 225208789
diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cf50d4b100bd4914df8209ebc1c8ab7091f3ae1f
--- /dev/null
+++ b/checkpoints/model_weights_000002162688.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bb67eb6590d2c09981f81667f28036942529a995bca2669bf7c1ebc00262064
+size 225208789
diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt
new file mode 100644
index 0000000000000000000000000000000000000000..004740592b80d6709afa6e4f7c94be9eda09ba32
--- /dev/null
+++ b/checkpoints/model_weights_000002359296.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f75c3c9a909ad9f3b9bd8f703d9c1268130e32534596ab76c64fa016a3dee90b
+size 225208789
diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5ee97c29f2ed1d583e8a2bf4877dac71d4a7cf94
--- /dev/null
+++ b/checkpoints/model_weights_000002621440.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b85f7345bbffe856eba7ca954b6a6f47db7c61359e9dd21fdaae5b4cf39e16f
+size 225208789
diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b52de3655fd2a4dcc7c362dafe85262037805e7d
--- /dev/null
+++ b/checkpoints/model_weights_000002883584.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41d61d58ec9e95a83ca1fa333559e66316267291114d2ed30e4067d37819c217
+size 225208789
diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cdea2467b0da9b6666a28972fc59f9764b783067
--- /dev/null
+++ b/checkpoints/model_weights_000003178496.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d744219eb57a243c2dac707af37826e31bc23335117f12ae105b2d4846624fe
+size 225208789
diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt
new file mode 100644
index 0000000000000000000000000000000000000000..76811614a190a552b76ab472ba7c343310707749
--- /dev/null
+++ b/checkpoints/model_weights_000003473408.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41912278f817b94f81c50789865169282f5fdae30da83efe31670f5a99f79df6
+size 225208789
diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a3a5afba04836553760f20085422b52384b98688
--- /dev/null
+++ b/checkpoints/model_weights_000003833856.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f45ee9371f94c103ad30f3f37f0f245178b165c616d59ceac3aa54b062a0b05
+size 225208789
diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt
new file mode 100644
index 0000000000000000000000000000000000000000..43675aad1170d5d38f1dc2cf8d08530d1e270229
--- /dev/null
+++ b/checkpoints/model_weights_000004227072.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebe5c512e7132a42a44c3d75ff3cf10307ddabad087051f5c9019af68f875e76
+size 225208789
diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt
new file mode 100644
index 0000000000000000000000000000000000000000..df1a5b521a793b7e5a1ca66e620c80c84d25d94c
--- /dev/null
+++ b/checkpoints/model_weights_000004653056.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa553ff873958abc6e3836798a195dcef0569c54a94eeb31352cc029bec445ee
+size 225208789
diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bfda0a55bdc5147d0bfe461a30c443f4002c584a
--- /dev/null
+++ b/checkpoints/model_weights_000005111808.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a94a5f2dc262b4d8de1ad2b9a20261f1df509a17fd8b518c867b5c16ff15db8
+size 225208789
diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt
new file mode 100644
index 0000000000000000000000000000000000000000..200fa8a01b6428f1a64b0d9f93e55397e6e290ae
--- /dev/null
+++ b/checkpoints/model_weights_000005603328.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5618bf536fa59d6cc6d930dfb91a524f22f9e2a1b56c62061645aeaa575af62d
+size 225208789
diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt
new file mode 100644
index 0000000000000000000000000000000000000000..27166a4e002276d4472365df582f1f6c857789b3
--- /dev/null
+++ b/checkpoints/model_weights_000006193152.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb4f260d90b72bffd1f1d87c6b2a705a7826f1d6045b7bf0c46a03a64c1f43ec
+size 225208789
diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt
new file mode 100644
index 0000000000000000000000000000000000000000..faf11b7898ebebe8b94543a022f5a6ded24e5179
--- /dev/null
+++ b/checkpoints/model_weights_000006782976.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fdcd99b1de8148c92748d74a22be95506bbec0c87b7ae4983b6bf61eecfb403
+size 225208789
diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt
new file mode 100644
index 0000000000000000000000000000000000000000..83c4c5807e1a64ff016cbcaf95f674ac3d50df2d
--- /dev/null
+++ b/checkpoints/model_weights_000007471104.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6521c95a46a039473c88a04b1a945cc9434a141719cf996a7b76caffb351542
+size 225208789
diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a9996645d357a8c25f8fe51e6104e0b9f9dff900
--- /dev/null
+++ b/checkpoints/model_weights_000008224768.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de5ca8b070c194a672dfc3fbb9acfe2d65859bda83f5bf31e46e18437f9e5aa5
+size 225208789
diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b041f58f5cc375c20e1b38f763cb78e82a74efef
--- /dev/null
+++ b/checkpoints/model_weights_000009043968.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e6ddb19282048b4ae2910f94d1251ca3fdb8c891118ef30fd75cd9490e37cff
+size 225208789
diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dd61ae8ac93ac934299c27ee78e878d6b66daf27
--- /dev/null
+++ b/checkpoints/model_weights_000009961472.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4147ddaa169b6ce4553e685febd939ecb34f1ada5685bc47d0cb3682644d09e
+size 225208789
diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt
new file mode 100644
index 0000000000000000000000000000000000000000..094f3666464277da6310ca689f9a5d886a8a7a63
--- /dev/null
+++ b/checkpoints/model_weights_000010944512.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:def3173817420ada9d84bbb12bfe9058c53ae5cd4e1aefc6a348c268fd211aad
+size 225208789
diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7a5d9aa49faf61e8a4bcf56988c09660e9c300b6
--- /dev/null
+++ b/checkpoints/model_weights_000012058624.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fe0beb00a18f19e92f0b41f05bd831145127c6af47cedff069dfcc49527c5b4
+size 225208789
diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b155ecd667837743a3fa54324d68f9846f879ad2
--- /dev/null
+++ b/checkpoints/model_weights_000013271040.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2122da26813d8e86b96aa1e3c62814c73d9add93a51cde66ba935aa936be48ce
+size 225208789
diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt
new file mode 100644
index 0000000000000000000000000000000000000000..129b64c1fa9b84ef11068db0b7b1fa4557a2465f
--- /dev/null
+++ b/checkpoints/model_weights_000014581760.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d9ec953d875e051812837ec53eab22dd90f9ffe4c63209bce9ba93254d02051
+size 225208789
diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a40129e07e6ea9d1396c5423b10519b0212eb4c3
--- /dev/null
+++ b/checkpoints/model_weights_000016056320.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:499dfcb83eae9a078f31bc9c19ce70a82901357a578e94965e6dbc6e2360e389
+size 225208789
diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..379ff44621156391420e4b007a2c31576b71b0d1
--- /dev/null
+++ b/checkpoints/model_weights_000016384000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e61c29d7ac829ddb0a00f7d9083e6ca18edb7a7d426fa751fb0132159446374a
+size 225208789
diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt
new file mode 100644
index 0000000000000000000000000000000000000000..52787036089ac6605aa40e9f5b72f78a1af4b128
--- /dev/null
+++ b/checkpoints/model_weights_000017661952.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c04b96f667562f55a007eb4c5657f57c8cda4db0b6bbaa7f3354793f9f1ba4b
+size 225208789
diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dd7a01de4275487b926ef1fa15ae6c76f8734687
--- /dev/null
+++ b/checkpoints/model_weights_000019431424.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb3064908bec541ed3d45d3ea463d450ed12786ed220b8a576f614b025ee3252
+size 225208789
diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt
new file mode 100644
index 0000000000000000000000000000000000000000..00492ddaba9125af08c091b881b39dd7aac180da
--- /dev/null
+++ b/checkpoints/model_weights_000021364736.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8a17e585ea4e411b4c6de03d5cc2ed94e9a92544954f5a9face20e7a074988c
+size 225208789
diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a879e6744d8726a4ffb77bbd02140edcc0df459f
--- /dev/null
+++ b/checkpoints/model_weights_000023494656.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a9f95b5bcd0c498e4e83fab93d62a8971be42f4da1e28ef2b0fc7fa6d4e10d6
+size 225208789
diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt
new file mode 100644
index 0000000000000000000000000000000000000000..339e19bc057927f01810cb5a3e0a9cd9a6960f6c
--- /dev/null
+++ b/checkpoints/model_weights_000025853952.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e47d455aebed0bd9575c3ec6aec76c058074ba7b83d7f97c4bae82ea02299af1
+size 225208789
diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ca50b43ce81453d6a5f06a10c2c3f9e34c7c53aa
--- /dev/null
+++ b/checkpoints/model_weights_000028442624.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:538d974f09db96dfe8f52d491c84123f90f6d564a98554e265232094f30e0e28
+size 225208789
diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9d613d90a062298e2fd27487596217f81ba7be54
--- /dev/null
+++ b/checkpoints/model_weights_000031293440.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa27ba715697004e37e0eb8469a5d306134378c2739f245ba5f16122d7fa2336
+size 225208789
diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b6353c00aed73935412cacacd24b5c42ee615529
--- /dev/null
+++ b/checkpoints/model_weights_000032768000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cf0983ba64245185207cd3a57a4a8a79d0a4419a9db0018d99b5fe069ede4cd
+size 225208789
diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt
new file mode 100644
index 0000000000000000000000000000000000000000..71e98deba0732c845e07b5bc31ee947c536f9d52
--- /dev/null
+++ b/checkpoints/model_weights_000034439168.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f76bf6c814205b64871b32e733bee254bf1c7a43248f78a5d96d682c7a1959f
+size 225208789
diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt
new file mode 100644
index 0000000000000000000000000000000000000000..57d3bb7c6108e3c1e3ca9618ab4a7d4f40546079
--- /dev/null
+++ b/checkpoints/model_weights_000037879808.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1538170d8c73461332a4dc4b1884f29719932cac377b97b0257da66df1137442
+size 225208789
diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a2c3ef73ecf9979abadb9df2524747584b472b35
--- /dev/null
+++ b/checkpoints/model_weights_000041648128.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f424b38192151bc80aa9a98691bb85c591795d8234002bd201420e6797220582
+size 225208789
diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9059e87ea98c411ce30f7957ec3b65eebb418885
--- /dev/null
+++ b/checkpoints/model_weights_000045842432.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37c3909e49c5d36c26330f3d0b4dbe87a42ffdb7ec7ee4bc8c0905706739939a
+size 225208789
diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..41a2830ec3433949e77ace81b4bedee9908a584d
--- /dev/null
+++ b/checkpoints/model_weights_000049152000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29308ad63ef76a3225b63b5270926deeb1ec2364e5d0b49225ce3db0d81227d8
+size 225208789
diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt
new file mode 100644
index 0000000000000000000000000000000000000000..16f7a55f84bba75df6743abce8c7415764335df8
--- /dev/null
+++ b/checkpoints/model_weights_000050397184.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a4828f17e6c5688d760197e0205d6b3b8c9a6798073a3db22a827815345e885
+size 225208789
diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b4fe872996113862bf1930bee34019c65c740f33
--- /dev/null
+++ b/checkpoints/model_weights_000055443456.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0d23dab5eae8e841ac88c9b731b3ac8049838622886b9e9adc67d5e880d5b21
+size 225208789
diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt
new file mode 100644
index 0000000000000000000000000000000000000000..208f241345c10d23f0b2c601ebe2a9bcd8ba62c6
--- /dev/null
+++ b/checkpoints/model_weights_000061014016.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e951be836d87de50d11e8776f3888a9cc0827886e078b6184080b6d1bb092dd0
+size 225208789
diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f5af4afcd565feb662f146ed8ffdf6cefb34a4c6
--- /dev/null
+++ b/checkpoints/model_weights_000065536000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50a7d8ccdfde3bbbe1df9c6973b806646794dede30ade1fa2d31af8264536c75
+size 225208789
diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f8df4fe0a444e7880bcaacfb882b748a2d1b5252
--- /dev/null
+++ b/checkpoints/model_weights_000067108864.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d122a641af1acedef541149f66f8fb8e22721b0230efc5ac5a5d0091d5dc902c
+size 225208789
diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt
new file mode 100644
index 0000000000000000000000000000000000000000..153004baa6d25838e51b80bc86961c70a2cf86fb
--- /dev/null
+++ b/checkpoints/model_weights_000073826304.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea340f5f38658804f9a831891ed2cf84b800732ad73bdd71d6572833f4fcbebd
+size 225208789
diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt
new file mode 100644
index 0000000000000000000000000000000000000000..01d7b449d94516b2efff4cdeb3ebe2e063be7bd4
--- /dev/null
+++ b/checkpoints/model_weights_000081199104.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81d20e20c5f8080f038742757e5eb7d6355f346e193e47af0bbd7bc74eab7344
+size 225208789
diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f05306adce54166d7302eaebd49c89a8ed2cfe66
--- /dev/null
+++ b/checkpoints/model_weights_000081920000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6161864d96d021679f87e5050505c12760684b94c1273c8c6735dda78f732d93
+size 225208789
diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt
new file mode 100644
index 0000000000000000000000000000000000000000..10e6c066dc246e378449ec3d5410f5e0a62d5619
--- /dev/null
+++ b/checkpoints/model_weights_000089325568.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b7071f4bdd2b61ec28b210267078b70131865bcde8afdbbd3bda6ee8923d120
+size 225208789
diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt
new file mode 100644
index 0000000000000000000000000000000000000000..741fc0055e246055236d641646fd76e09d96d436
--- /dev/null
+++ b/checkpoints/model_weights_000098271232.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89b5b25222787b4a5a82bfe35a21e5f6cbbdacbcd2cb21bbd2205733639d9f2a
+size 225208789
diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb1aaa8be66a3f2ec6269a2d03c118e57d03424
--- /dev/null
+++ b/checkpoints/model_weights_000098304000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a4acd26c9ea48d98eb36f126814819e717d92c760543585a979eb7c98215b23
+size 225208789
diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b49aec10069accfa1de775c7151a83d55a19dd66
--- /dev/null
+++ b/checkpoints/model_weights_000108068864.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:643697f63c757e11d81dd559bcdf4f6c318c77c78e8191573984ae47123cf02d
+size 225208789
diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d75c0eb861c419ee7e96721ca5ddd6016a6f0685
--- /dev/null
+++ b/checkpoints/model_weights_000114688000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1df4a40cb5f0b4d3c315701d13e17ddc6299d7cf0e0fc2bd5c14d010fc46da04
+size 225208789
diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7379a942e86b3835321815404ec4b6b7e9807c18
--- /dev/null
+++ b/checkpoints/model_weights_000118882304.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d96ccfca68ef5d7fca3f4c63595b62381544e6eba8ec4548d27da2477ea45fcc
+size 225208789
diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt
new file mode 100644
index 0000000000000000000000000000000000000000..79cf10f7572c5f61406d3966dc4438dfc4498ebb
--- /dev/null
+++ b/checkpoints/model_weights_000130777088.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac666e705bb115c53b66af1e2a6cd0cc8e23dab35ff9ebd99a1711da66d35d25
+size 225208789
diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..29c85b905fe54cd763f59fec684c672a8df6bc55
--- /dev/null
+++ b/checkpoints/model_weights_000131072000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bebee9dc80249bd95d54884b36eedcfff20d068a96123347646928b888190d45
+size 225208789
diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8894bd843ddc84521d17de45160ff687e1bce838
--- /dev/null
+++ b/checkpoints/model_weights_000143851520.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f213291334c421b6e2b00db8f76837911d5c129cc65e5cd3c707e10cb01cde0
+size 225208789
diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..81c358ceee639c14692024632dd1e5246e7edfa9
--- /dev/null
+++ b/checkpoints/model_weights_000147456000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4182935c9e2281e10141ac3417aa2596f4de46bc04b89bb8932845918c5b4dd2
+size 225208789
diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3d96076f2bccb35b8147a9c5b6f078b57f68ca5e
--- /dev/null
+++ b/checkpoints/model_weights_000158269440.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019b6230f6b5e9ad9896af7535431df79ffb308660b3b3b7444b0a157b88da68
+size 225208789
diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2510ca73e828a0938a47fc54d4fb18bb23d207a7
--- /dev/null
+++ b/checkpoints/model_weights_000163840000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5472d945cc53600407cf6f40350dc44d94effcae253d33ec1ef290717999ccf
+size 225208789
diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt
new file mode 100644
index 0000000000000000000000000000000000000000..49f4569da772846e5ddbac8d78a46914b8fb87e5
--- /dev/null
+++ b/checkpoints/model_weights_000174096384.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14956f15309f17541bc568344d8fdab8540316170732cc369d34543ad7477527
+size 225208789
diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a91f928c75733f0cdfba375fe12ed28fa1b1c1cb
--- /dev/null
+++ b/checkpoints/model_weights_000180224000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a707f6648eea2f7da1b42674f86ea63cafeb10cd41e2afc381bd982cc79144be
+size 225208789
diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8f51eca32b9ccbc86389a19dc40cc1087d0445f4
--- /dev/null
+++ b/checkpoints/model_weights_000191496192.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:745f26c7f1955ac0e13be5fcbb3f046a28c5b744e29e12b73ddba2ea11c216a2
+size 225208789
diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5159a523a74c6866a44ea3ff9a77f37bb78440aa
--- /dev/null
+++ b/checkpoints/model_weights_000196608000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f881c7b001ab863b0ad96e3eb4d20c3b352545c103760122894ea670d328872b
+size 225208789
diff --git a/config.toml b/config.toml
new file mode 100644
index 0000000000000000000000000000000000000000..d79b6f7e934d872d882504e93be661bbbcf3a834
--- /dev/null
+++ b/config.toml
@@ -0,0 +1,32 @@
+model_name = "c4_code-200m-duplicate"
+n_layers = 2
+d_model = 512
+d_mlp = 2048
+d_head = 64
+n_heads = 8
+attn_only = false
+layer_norm_eps = 1e-05
+init_range = 0.02
+n_ctx = 1024
+d_vocab = 48262
+dataset_name = "eoinf/c4_code-200m"
+tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits"
+seed = 10
+device = "cuda"
+use_bfloat16_matmul = false
+batch_size_per_device = 32
+n_devices = 1
+batches_per_step = 1
+max_tokens = 200000000
+lr_hidden = 0.002
+lr_vector = 0.001
+lr_schedule = "constant_with_warmup"
+warmup_tokens = 30000000
+weight_decay = 0.05
+grad_norm_clip = 1.0
+train_loss_moving_average_beta = 0.99
+log_interval = 25
+save_checkpoints = true
+checkpoint_interval = 500
+checkpoint_interval_ratio = 1.10
+save_log_checkpoints = true
diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2fcdbf9a09d11969f8787f48b6a3427f4f1ef610
--- /dev/null
+++ b/latest_checkpoint.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7636bfd91084c9ca52593fc4300877275728e5161a31aaa1428d8f14c0bc688b
+size 225208311
diff --git a/latest_metadata.json b/latest_metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c99ba89b75c21fae254abfe342b38024820164e
--- /dev/null
+++ b/latest_metadata.json
@@ -0,0 +1 @@
+{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "c4_code-200m-duplicate", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4_code-200m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.048539307487607}
\ No newline at end of file
diff --git a/latest_optimizer.pt b/latest_optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2206aa857500618d3dbf06d532787af602595378
--- /dev/null
+++ b/latest_optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c770a5917e2e1a1b463b14ccb9dd6b6f7b1fff029d754e53b0efdbb7c46afdd2
+size 450422547
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5e78fbd811f1b0ca25e3c932a4c6ee3c0fa5e675
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Usage: ./run.sh [restart]  -- pass "restart" to force a fresh run from config.toml
+if [ "$1" = "restart" ]; then
+    echo "Force restart: Running normal training ..."
+    python -c "
+import os
+from toy_models.models.trainer import train_transformer_from_config
+current_dir = os.getcwd()
+train_transformer_from_config('config.toml', current_dir)
+"
+else
+    # Resume from latest_checkpoint.pt when it exists, otherwise train from scratch
+    python -c "
+import os
+from pathlib import Path
+from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint
+current_dir = os.getcwd()
+# Only latest_checkpoint.pt in the working directory is checked (not checkpoints/)
+latest_checkpoint = Path('latest_checkpoint.pt')
+if latest_checkpoint.exists():
+    print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...')
+    restart_from_checkpoint(current_dir)
+else:
+    print('Starting training from beginning ...')
+    train_transformer_from_config('config.toml', current_dir)
+"
+fi
\ No newline at end of file
diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..f748583a60cb94c8c22f2c4d998551c3a655704f
--- /dev/null
+++ b/wandb/debug-internal.log
@@ -0,0 +1,12 @@
+{"time":"2025-09-15T22:49:33.470091482Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
+{"time":"2025-09-15T22:49:33.680162962Z","level":"INFO","msg":"stream: created new stream","id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.680205928Z","level":"INFO","msg":"stream: started","id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.680242811Z","level":"INFO","msg":"writer: started","stream_id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.68030656Z","level":"INFO","msg":"handler: started","stream_id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.680252446Z","level":"INFO","msg":"sender: started","stream_id":"8ifme58a"}
+{"time":"2025-09-15T23:59:30.672107041Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.000502052}],"total_operations":1}}
+{"time":"2025-09-15T23:59:30.991214003Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-09-15T23:59:31.113023987Z","level":"INFO","msg":"stream: closing","id":"8ifme58a"}
+{"time":"2025-09-15T23:59:31.113070418Z","level":"INFO","msg":"handler: closed","stream_id":"8ifme58a"}
+{"time":"2025-09-15T23:59:31.113133266Z","level":"INFO","msg":"sender: closed","stream_id":"8ifme58a"}
+{"time":"2025-09-15T23:59:31.113153008Z","level":"INFO","msg":"stream: closed","id":"8ifme58a"}
diff --git a/wandb/debug.log b/wandb/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..a0dc08998071c1aef96a5d232c69b08fe35aa435
--- /dev/null
+++ b/wandb/debug.log
@@ -0,0 +1,26 @@
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Configure stats pid to 292
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/c4_code-200m-duplicate/wandb/settings
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/c4_code-200m-duplicate/wandb/run-20250915_224933-8ifme58a/logs/debug.log
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/c4_code-200m-duplicate/wandb/run-20250915_224933-8ifme58a/logs/debug-internal.log
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:init():813] calling init triggers
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
+config: {'model_name': 'c4_code-200m-duplicate', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/c4_code-200m', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}}
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:init():854] starting backend
+2025-09-15 22:49:33,457 INFO    MainThread:292 [wandb_init.py:init():857] sending inform_init request
+2025-09-15 22:49:33,466 INFO    MainThread:292 [wandb_init.py:init():865] backend started and connected
+2025-09-15 22:49:33,467 INFO    MainThread:292 [wandb_init.py:init():936] updated telemetry
+2025-09-15 22:49:33,475 INFO    MainThread:292 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout
+2025-09-15 22:49:33,882 INFO    MainThread:292 [wandb_init.py:init():1011] starting run threads in backend
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_console_start():2506] atexit reg
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_redirect():2354] redirect: wrap_raw
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_redirect():2423] Wrapping output streams.
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_redirect():2446] Redirects installed.
+2025-09-15 22:49:34,275 INFO    MainThread:292 [wandb_init.py:init():1049] run started, returning control to user process
+2025-09-15 23:59:30,667 INFO    MainThread:292 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/8ifme58a
+2025-09-15 23:59:30,670 INFO    MainThread:292 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0
+2025-09-15 23:59:30,671 INFO    MainThread:292 [wandb_run.py:_restore():2453] restore
+2025-09-15 23:59:30,671 INFO    MainThread:292 [wandb_run.py:_restore():2459] restore done
+2025-09-15 23:59:31,112 INFO    MainThread:292 [wandb_run.py:_footer_sync_info():3867] logging synced files
diff --git a/wandb/run-20250915_224933-8ifme58a/files/config.yaml b/wandb/run-20250915_224933-8ifme58a/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..830dd8b53d9d1e250d9d9e3a0878bc9c318a01c3
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/files/config.yaml
@@ -0,0 +1,134 @@
+_wandb:
+    value:
+        cli_version: 0.21.4
+        e:
+            06et8bi69wc4jspxlpqys3j0r2vjrawb:
+                cpu_count: 8
+                cpu_count_logical: 8
+                cudaVersion: "12.4"
+                disk:
+                    /:
+                        total: "262240792576"
+                        used: "125140271104"
+                email: tzfof8@gmail.com
+                executable: /notebooks/toy_models/.toy_models_env/bin/python
+                git:
+                    commit: 12726eab9fc560dea9bf6aaf8aecd690c95aed21
+                    remote: https://github.com/jgroh3/toy_models.git
+                gpu: NVIDIA RTX A6000
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ampere
+                      cudaCores: 10752
+                      memoryTotal: "51527024640"
+                      name: NVIDIA RTX A6000
+                      uuid: GPU-6bec9865-f1b0-5db9-8f6d-d3fd9b73eecf
+                host: njgrtoyynl
+                memory:
+                    total: "47332843520"
+                os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35
+                program: <python with no main file>
+                python: CPython 3.11.7
+                root: /notebooks/toy_models/model_training/c4_code-200m-duplicate
+                startedAt: "2025-09-15T22:49:33.194685Z"
+                writerId: 06et8bi69wc4jspxlpqys3j0r2vjrawb
+        m: []
+        python_version: 3.11.7
+        t:
+            "1":
+                - 1
+                - 11
+                - 49
+                - 51
+                - 71
+            "2":
+                - 1
+                - 11
+                - 49
+                - 51
+                - 71
+            "3":
+                - 2
+                - 13
+                - 15
+                - 16
+                - 61
+            "4": 3.11.7
+            "5": 0.21.4
+            "6": 4.56.1
+            "12": 0.21.4
+            "13": linux-x86_64
+attn_only:
+    value: false
+batch_size:
+    value: 32
+batch_size_per_device:
+    value: 32
+batches_per_step:
+    value: 1
+checkpoint_interval:
+    value: 500
+checkpoint_interval_ratio:
+    value: 1.1
+d_head:
+    value: 64
+d_mlp:
+    value: 2048
+d_model:
+    value: 512
+d_vocab:
+    value: 48262
+dataset_name:
+    value: eoinf/c4_code-200m
+device:
+    value: cuda
+grad_norm_clip:
+    value: 1
+init_range:
+    value: 0.02
+layer_norm_eps:
+    value: 1e-05
+log_interval:
+    value: 25
+lr_hidden:
+    value: 0.002
+lr_schedule:
+    value: constant_with_warmup
+lr_vector:
+    value: 0.001
+max_steps:
+    value: 6103
+max_tokens:
+    value: 200000000
+model_name:
+    value: c4_code-200m-duplicate
+n_ctx:
+    value: 1024
+n_devices:
+    value: 1
+n_heads:
+    value: 8
+n_layers:
+    value: 2
+save_checkpoints:
+    value: true
+save_log_checkpoints:
+    value: true
+seed:
+    value: 10
+tokenizer_name:
+    value: NeelNanda/gpt-neox-tokenizer-digits
+tokens_per_step:
+    value: 32768
+train_loss_moving_average_beta:
+    value: 0.99
+use_bfloat16_matmul:
+    value: false
+use_wandb:
+    value: true
+warmup_steps:
+    value: 915
+warmup_tokens:
+    value: 30000000
+weight_decay:
+    value: 0.05
diff --git a/wandb/run-20250915_224933-8ifme58a/files/output.log b/wandb/run-20250915_224933-8ifme58a/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..33930c40953a25046b6d0233598dc9a2f01d4792
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/files/output.log
@@ -0,0 +1,252 @@
+Training on cuda
+Model: 2L, 512d, 8h
+Max steps: 6,103, Max tokens: 200,000,000
+Warmup steps: 915, Warmup tokens: 30,000,000
+Batch size per device: 32
+Context length: 1024
+Learning rates - Hidden: 0.002, Vector: 0.001
+                                                                                                                                                                                                       
+Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.7895 | Learning Rate: 0.000055 | Progress: 0.00410
+Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.5558 | Learning Rate: 0.000109 | Progress: 0.00819
+Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 10.1913 | Learning Rate: 0.000164 | Progress: 0.01229
+Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.7113 | Learning Rate: 0.000219 | Progress: 0.01638
+Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 9.2015 | Learning Rate: 0.000273 | Progress: 0.02048
+Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.7328 | Learning Rate: 0.000328 | Progress: 0.02458
+Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 8.3222 | Learning Rate: 0.000383 | Progress: 0.02867
+Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.9618 | Learning Rate: 0.000437 | Progress: 0.03277
+Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.6558 | Learning Rate: 0.000492 | Progress: 0.03686
+Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 7.3896 | Learning Rate: 0.000546 | Progress: 0.04096
+Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 7.1689 | Learning Rate: 0.000601 | Progress: 0.04506
+Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.9817 | Learning Rate: 0.000656 | Progress: 0.04915
+Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.8292 | Learning Rate: 0.000710 | Progress: 0.05325
+Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.6892 | Learning Rate: 0.000765 | Progress: 0.05734
+Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 6.5775 | Learning Rate: 0.000820 | Progress: 0.06144
+Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 6.4811 | Learning Rate: 0.000874 | Progress: 0.06554
+Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 6.3982 | Learning Rate: 0.000929 | Progress: 0.06963
+Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 6.3248 | Learning Rate: 0.000984 | Progress: 0.07373
+Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 6.2612 | Learning Rate: 0.001038 | Progress: 0.07782
+Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 6.2136 | Learning Rate: 0.001093 | Progress: 0.08192
+Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 6.1616 | Learning Rate: 0.001148 | Progress: 0.08602
+Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 6.1161 | Learning Rate: 0.001202 | Progress: 0.09011
+Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 6.0719 | Learning Rate: 0.001257 | Progress: 0.09421
+Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 6.0335 | Learning Rate: 0.001311 | Progress: 0.09830
+Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.9917 | Learning Rate: 0.001366 | Progress: 0.10240
+Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.9610 | Learning Rate: 0.001421 | Progress: 0.10650
+Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.9277 | Learning Rate: 0.001475 | Progress: 0.11059
+Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.8957 | Learning Rate: 0.001530 | Progress: 0.11469
+Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.8645 | Learning Rate: 0.001585 | Progress: 0.11878
+Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.8371 | Learning Rate: 0.001639 | Progress: 0.12288
+Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 5.8143 | Learning Rate: 0.001694 | Progress: 0.12698
+Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 5.7904 | Learning Rate: 0.001749 | Progress: 0.13107
+Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 5.7622 | Learning Rate: 0.001803 | Progress: 0.13517
+Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 5.7361 | Learning Rate: 0.001858 | Progress: 0.13926
+Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 5.7132 | Learning Rate: 0.001913 | Progress: 0.14336
+Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 5.6928 | Learning Rate: 0.001967 | Progress: 0.14746
+Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 5.6693 | Learning Rate: 0.002000 | Progress: 0.15155
+Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 5.6423 | Learning Rate: 0.002000 | Progress: 0.15565
+Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 5.6040 | Learning Rate: 0.002000 | Progress: 0.15974
+Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 5.5763 | Learning Rate: 0.002000 | Progress: 0.16384
+Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 5.5502 | Learning Rate: 0.002000 | Progress: 0.16794
+Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 5.5266 | Learning Rate: 0.002000 | Progress: 0.17203
+Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 5.4915 | Learning Rate: 0.002000 | Progress: 0.17613
+Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 5.4664 | Learning Rate: 0.002000 | Progress: 0.18022
+Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 5.4356 | Learning Rate: 0.002000 | Progress: 0.18432
+Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 5.4053 | Learning Rate: 0.002000 | Progress: 0.18842
+Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 5.3819 | Learning Rate: 0.002000 | Progress: 0.19251
+Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 5.3555 | Learning Rate: 0.002000 | Progress: 0.19661
+Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 5.3308 | Learning Rate: 0.002000 | Progress: 0.20070
+Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 5.3168 | Learning Rate: 0.002000 | Progress: 0.20480
+Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 5.2968 | Learning Rate: 0.002000 | Progress: 0.20890
+Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 5.2740 | Learning Rate: 0.002000 | Progress: 0.21299
+Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 5.2592 | Learning Rate: 0.002000 | Progress: 0.21709
+Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 5.2403 | Learning Rate: 0.002000 | Progress: 0.22118
+Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 5.2239 | Learning Rate: 0.002000 | Progress: 0.22528
+Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 5.1982 | Learning Rate: 0.002000 | Progress: 0.22938
+Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 5.1789 | Learning Rate: 0.002000 | Progress: 0.23347
+Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 5.1691 | Learning Rate: 0.002000 | Progress: 0.23757
+Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 5.1489 | Learning Rate: 0.002000 | Progress: 0.24166
+Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 5.1312 | Learning Rate: 0.002000 | Progress: 0.24576
+Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 5.1129 | Learning Rate: 0.002000 | Progress: 0.24986
+Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 5.0994 | Learning Rate: 0.002000 | Progress: 0.25395
+Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 5.0977 | Learning Rate: 0.002000 | Progress: 0.25805
+Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 5.0810 | Learning Rate: 0.002000 | Progress: 0.26214
+Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 5.0633 | Learning Rate: 0.002000 | Progress: 0.26624
+Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 5.0582 | Learning Rate: 0.002000 | Progress: 0.27034
+Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 5.0450 | Learning Rate: 0.002000 | Progress: 0.27443
+Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 5.0327 | Learning Rate: 0.002000 | Progress: 0.27853
+Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 5.0160 | Learning Rate: 0.002000 | Progress: 0.28262
+Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 5.0110 | Learning Rate: 0.002000 | Progress: 0.28672
+Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 4.9911 | Learning Rate: 0.002000 | Progress: 0.29082
+Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 4.9731 | Learning Rate: 0.002000 | Progress: 0.29491
+Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 4.9521 | Learning Rate: 0.002000 | Progress: 0.29901
+Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 4.9372 | Learning Rate: 0.002000 | Progress: 0.30310
+Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 4.9163 | Learning Rate: 0.002000 | Progress: 0.30720
+Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 4.8942 | Learning Rate: 0.002000 | Progress: 0.31130
+Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 4.8781 | Learning Rate: 0.002000 | Progress: 0.31539
+Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 4.8693 | Learning Rate: 0.002000 | Progress: 0.31949
+Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 4.8496 | Learning Rate: 0.002000 | Progress: 0.32358
+Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 4.8330 | Learning Rate: 0.002000 | Progress: 0.32768
+Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 4.8163 | Learning Rate: 0.002000 | Progress: 0.33178
+Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 4.7965 | Learning Rate: 0.002000 | Progress: 0.33587
+Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 4.7694 | Learning Rate: 0.002000 | Progress: 0.33997
+Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 4.7515 | Learning Rate: 0.002000 | Progress: 0.34406
+Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 4.7349 | Learning Rate: 0.002000 | Progress: 0.34816
+Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 4.7293 | Learning Rate: 0.002000 | Progress: 0.35226
+Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 4.7089 | Learning Rate: 0.002000 | Progress: 0.35635
+Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 4.7032 | Learning Rate: 0.002000 | Progress: 0.36045
+Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 4.6812 | Learning Rate: 0.002000 | Progress: 0.36454
+Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 4.6641 | Learning Rate: 0.002000 | Progress: 0.36864
+Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 4.6563 | Learning Rate: 0.002000 | Progress: 0.37274
+Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 4.6402 | Learning Rate: 0.002000 | Progress: 0.37683
+Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 4.6354 | Learning Rate: 0.002000 | Progress: 0.38093
+Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 4.6230 | Learning Rate: 0.002000 | Progress: 0.38502
+Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 4.6178 | Learning Rate: 0.002000 | Progress: 0.38912
+Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 4.6020 | Learning Rate: 0.002000 | Progress: 0.39322
+Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 4.5801 | Learning Rate: 0.002000 | Progress: 0.39731
+Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 4.5774 | Learning Rate: 0.002000 | Progress: 0.40141
+Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 4.5609 | Learning Rate: 0.002000 | Progress: 0.40550
+Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 4.5484 | Learning Rate: 0.002000 | Progress: 0.40960
+Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 4.5461 | Learning Rate: 0.002000 | Progress: 0.41370
+Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 4.5251 | Learning Rate: 0.002000 | Progress: 0.41779
+Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 4.5113 | Learning Rate: 0.002000 | Progress: 0.42189
+Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 4.5008 | Learning Rate: 0.002000 | Progress: 0.42598
+Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 4.5015 | Learning Rate: 0.002000 | Progress: 0.43008
+Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 4.4908 | Learning Rate: 0.002000 | Progress: 0.43418
+Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 4.4890 | Learning Rate: 0.002000 | Progress: 0.43827
+Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 4.4816 | Learning Rate: 0.002000 | Progress: 0.44237
+Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 4.4716 | Learning Rate: 0.002000 | Progress: 0.44646
+Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 4.4590 | Learning Rate: 0.002000 | Progress: 0.45056
+Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 4.4562 | Learning Rate: 0.002000 | Progress: 0.45466
+Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 4.4517 | Learning Rate: 0.002000 | Progress: 0.45875
+Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 4.4441 | Learning Rate: 0.002000 | Progress: 0.46285
+Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 4.4387 | Learning Rate: 0.002000 | Progress: 0.46694
+Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 4.4304 | Learning Rate: 0.002000 | Progress: 0.47104
+Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 4.4382 | Learning Rate: 0.002000 | Progress: 0.47514
+Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 4.4234 | Learning Rate: 0.002000 | Progress: 0.47923
+Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 4.4093 | Learning Rate: 0.002000 | Progress: 0.48333
+Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 4.4015 | Learning Rate: 0.002000 | Progress: 0.48742
+Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 4.3902 | Learning Rate: 0.002000 | Progress: 0.49152
+Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 4.3836 | Learning Rate: 0.002000 | Progress: 0.49562
+Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 4.3782 | Learning Rate: 0.002000 | Progress: 0.49971
+Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 4.3647 | Learning Rate: 0.002000 | Progress: 0.50381
+Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 4.3722 | Learning Rate: 0.002000 | Progress: 0.50790
+Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 4.3757 | Learning Rate: 0.002000 | Progress: 0.51200
+Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 4.3735 | Learning Rate: 0.002000 | Progress: 0.51610
+Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 4.3554 | Learning Rate: 0.002000 | Progress: 0.52019
+Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 4.3471 | Learning Rate: 0.002000 | Progress: 0.52429
+Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 4.3332 | Learning Rate: 0.002000 | Progress: 0.52838
+Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 4.3291 | Learning Rate: 0.002000 | Progress: 0.53248
+Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 4.3207 | Learning Rate: 0.002000 | Progress: 0.53658
+Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 4.3147 | Learning Rate: 0.002000 | Progress: 0.54067
+Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 4.3112 | Learning Rate: 0.002000 | Progress: 0.54477
+Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 4.3054 | Learning Rate: 0.002000 | Progress: 0.54886
+Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 4.2959 | Learning Rate: 0.002000 | Progress: 0.55296
+Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 4.2946 | Learning Rate: 0.002000 | Progress: 0.55706
+Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 4.2881 | Learning Rate: 0.002000 | Progress: 0.56115
+Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 4.2853 | Learning Rate: 0.002000 | Progress: 0.56525
+Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 4.2763 | Learning Rate: 0.002000 | Progress: 0.56934
+Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 4.2761 | Learning Rate: 0.002000 | Progress: 0.57344
+Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 4.2715 | Learning Rate: 0.002000 | Progress: 0.57754
+Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 4.2740 | Learning Rate: 0.002000 | Progress: 0.58163
+Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 4.2577 | Learning Rate: 0.002000 | Progress: 0.58573
+Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 4.2548 | Learning Rate: 0.002000 | Progress: 0.58982
+Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 4.2399 | Learning Rate: 0.002000 | Progress: 0.59392
+Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 4.2425 | Learning Rate: 0.002000 | Progress: 0.59802
+Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 4.2433 | Learning Rate: 0.002000 | Progress: 0.60211
+Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 4.2502 | Learning Rate: 0.002000 | Progress: 0.60621
+Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 4.2387 | Learning Rate: 0.002000 | Progress: 0.61030
+Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 4.2413 | Learning Rate: 0.002000 | Progress: 0.61440
+Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 4.2434 | Learning Rate: 0.002000 | Progress: 0.61850
+Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 4.2362 | Learning Rate: 0.002000 | Progress: 0.62259
+Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 4.2369 | Learning Rate: 0.002000 | Progress: 0.62669
+Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 4.2243 | Learning Rate: 0.002000 | Progress: 0.63078
+Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 4.2264 | Learning Rate: 0.002000 | Progress: 0.63488
+Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 4.2274 | Learning Rate: 0.002000 | Progress: 0.63898
+Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 4.2247 | Learning Rate: 0.002000 | Progress: 0.64307
+Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 4.2173 | Learning Rate: 0.002000 | Progress: 0.64717
+Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 4.2262 | Learning Rate: 0.002000 | Progress: 0.65126
+Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 4.2174 | Learning Rate: 0.002000 | Progress: 0.65536
+Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 4.2262 | Learning Rate: 0.002000 | Progress: 0.65946
+Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 4.2262 | Learning Rate: 0.002000 | Progress: 0.66355
+Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 4.2232 | Learning Rate: 0.002000 | Progress: 0.66765
+Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 4.2176 | Learning Rate: 0.002000 | Progress: 0.67174
+Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 4.2115 | Learning Rate: 0.002000 | Progress: 0.67584
+Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 4.2057 | Learning Rate: 0.002000 | Progress: 0.67994
+Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 4.2023 | Learning Rate: 0.002000 | Progress: 0.68403
+Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 4.1974 | Learning Rate: 0.002000 | Progress: 0.68813
+Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 4.1993 | Learning Rate: 0.002000 | Progress: 0.69222
+Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 4.1928 | Learning Rate: 0.002000 | Progress: 0.69632
+Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 4.1832 | Learning Rate: 0.002000 | Progress: 0.70042
+Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 4.1866 | Learning Rate: 0.002000 | Progress: 0.70451
+Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 4.1791 | Learning Rate: 0.002000 | Progress: 0.70861
+Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 4.1746 | Learning Rate: 0.002000 | Progress: 0.71270
+Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 4.1720 | Learning Rate: 0.002000 | Progress: 0.71680
+Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 4.1575 | Learning Rate: 0.002000 | Progress: 0.72090
+Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 4.1598 | Learning Rate: 0.002000 | Progress: 0.72499
+Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 4.1606 | Learning Rate: 0.002000 | Progress: 0.72909
+Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 4.1493 | Learning Rate: 0.002000 | Progress: 0.73318
+Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 4.1503 | Learning Rate: 0.002000 | Progress: 0.73728
+Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 4.1483 | Learning Rate: 0.002000 | Progress: 0.74138
+Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 4.1514 | Learning Rate: 0.002000 | Progress: 0.74547
+Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 4.1442 | Learning Rate: 0.002000 | Progress: 0.74957
+Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 4.1549 | Learning Rate: 0.002000 | Progress: 0.75366
+Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 4.1514 | Learning Rate: 0.002000 | Progress: 0.75776
+Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 4.1625 | Learning Rate: 0.002000 | Progress: 0.76186
+Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 4.1642 | Learning Rate: 0.002000 | Progress: 0.76595
+Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 4.1598 | Learning Rate: 0.002000 | Progress: 0.77005
+Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 4.1494 | Learning Rate: 0.002000 | Progress: 0.77414
+Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 4.1534 | Learning Rate: 0.002000 | Progress: 0.77824
+Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 4.1543 | Learning Rate: 0.002000 | Progress: 0.78234
+Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 4.1389 | Learning Rate: 0.002000 | Progress: 0.78643
+Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 4.1303 | Learning Rate: 0.002000 | Progress: 0.79053
+Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 4.1404 | Learning Rate: 0.002000 | Progress: 0.79462
+Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 4.1499 | Learning Rate: 0.002000 | Progress: 0.79872
+Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 4.1628 | Learning Rate: 0.002000 | Progress: 0.80282
+Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 4.1524 | Learning Rate: 0.002000 | Progress: 0.80691
+Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 4.1490 | Learning Rate: 0.002000 | Progress: 0.81101
+Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 4.1434 | Learning Rate: 0.002000 | Progress: 0.81510
+Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 4.1352 | Learning Rate: 0.002000 | Progress: 0.81920
+Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 4.1331 | Learning Rate: 0.002000 | Progress: 0.82330
+Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 4.1287 | Learning Rate: 0.002000 | Progress: 0.82739
+Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 4.1321 | Learning Rate: 0.002000 | Progress: 0.83149
+Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 4.1372 | Learning Rate: 0.002000 | Progress: 0.83558
+Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 4.1144 | Learning Rate: 0.002000 | Progress: 0.83968
+Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 4.1104 | Learning Rate: 0.002000 | Progress: 0.84378
+Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 4.0976 | Learning Rate: 0.002000 | Progress: 0.84787
+Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 4.0943 | Learning Rate: 0.002000 | Progress: 0.85197
+Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 4.0941 | Learning Rate: 0.002000 | Progress: 0.85606
+Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 4.0946 | Learning Rate: 0.002000 | Progress: 0.86016
+Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 4.0926 | Learning Rate: 0.002000 | Progress: 0.86426
+Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 4.0924 | Learning Rate: 0.002000 | Progress: 0.86835
+Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 4.0977 | Learning Rate: 0.002000 | Progress: 0.87245
+Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 4.1026 | Learning Rate: 0.002000 | Progress: 0.87654
+Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 4.1057 | Learning Rate: 0.002000 | Progress: 0.88064
+Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 4.1031 | Learning Rate: 0.002000 | Progress: 0.88474
+Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 4.1072 | Learning Rate: 0.002000 | Progress: 0.88883
+Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 4.1002 | Learning Rate: 0.002000 | Progress: 0.89293
+Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 4.0991 | Learning Rate: 0.002000 | Progress: 0.89702
+Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 4.0935 | Learning Rate: 0.002000 | Progress: 0.90112
+Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 4.0943 | Learning Rate: 0.002000 | Progress: 0.90522
+Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 4.0944 | Learning Rate: 0.002000 | Progress: 0.90931
+Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 4.0900 | Learning Rate: 0.002000 | Progress: 0.91341
+Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 4.0930 | Learning Rate: 0.002000 | Progress: 0.91750
+Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 4.0783 | Learning Rate: 0.002000 | Progress: 0.92160
+Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 4.0793 | Learning Rate: 0.002000 | Progress: 0.92570
+Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 4.0871 | Learning Rate: 0.002000 | Progress: 0.92979
+Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 4.0832 | Learning Rate: 0.002000 | Progress: 0.93389
+Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 4.0825 | Learning Rate: 0.002000 | Progress: 0.93798
+Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 4.0781 | Learning Rate: 0.002000 | Progress: 0.94208
+Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 4.0704 | Learning Rate: 0.002000 | Progress: 0.94618
+Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 4.0670 | Learning Rate: 0.002000 | Progress: 0.95027
+Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 4.0677 | Learning Rate: 0.002000 | Progress: 0.95437
+Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 4.0652 | Learning Rate: 0.002000 | Progress: 0.95846
+Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 4.0763 | Learning Rate: 0.002000 | Progress: 0.96256
+Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 4.0736 | Learning Rate: 0.002000 | Progress: 0.96666
+Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 4.0803 | Learning Rate: 0.002000 | Progress: 0.97075
+Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 4.0727 | Learning Rate: 0.002000 | Progress: 0.97485
+Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 4.0616 | Learning Rate: 0.002000 | Progress: 0.97894
+Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 4.0485 | Learning Rate: 0.002000 | Progress: 0.98304
+Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 4.0403 | Learning Rate: 0.002000 | Progress: 0.98714
+Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 4.0412 | Learning Rate: 0.002000 | Progress: 0.99123
+Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 4.0379 | Learning Rate: 0.002000 | Progress: 0.99533
+Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 4.0350 | Learning Rate: 0.002000 | Progress: 0.99942
diff --git a/wandb/run-20250915_224933-8ifme58a/files/requirements.txt b/wandb/run-20250915_224933-8ifme58a/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba4ea9984d2e87ab2846009f43584e075c3b3e7b
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/files/requirements.txt
@@ -0,0 +1,215 @@
+fsspec==2025.3.0
+PyYAML==6.0.2
+certifi==2025.8.3
+comm==0.2.3
+widgetsnbextension==4.0.14
+Jinja2==3.1.6
+rich==14.1.0
+circuitsvis==1.43.3
+hf-xet==1.1.9
+param==2.2.1
+httpcore==1.0.9
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+asttokens==3.0.0
+filelock==3.19.1
+types-python-dateutil==2.9.0.20250822
+cycler==0.12.1
+stack-data==0.6.3
+jupyter_server==2.17.0
+aiosignal==1.4.0
+xyzservices==2025.4.0
+lark==1.2.2
+ptyprocess==0.7.0
+xxhash==3.5.0
+mpmath==1.3.0
+seaborn==0.13.2
+wadler_lindig==0.1.7
+nbformat==5.10.4
+panel==1.8.0
+accelerate==1.10.1
+plotly==6.3.0
+narwhals==2.4.0
+huggingface-hub==0.34.4
+sentencepiece==0.2.1
+torchvision==0.23.0
+ipython==9.5.0
+tqdm==4.67.1
+contourpy==1.3.3
+nvidia-nvtx-cu12==12.8.90
+nvidia-cuda-runtime-cu12==12.8.90
+yarl==1.20.1
+charset-normalizer==3.4.3
+jupyter-events==0.12.0
+nbclient==0.10.2
+numpy==1.26.4
+decorator==5.2.1
+networkx==3.5
+smmap==5.0.2
+nbconvert==7.16.6
+pytz==2025.2
+aiohappyeyeballs==2.6.1
+requests==2.32.5
+tinycss2==1.4.0
+defusedxml==0.7.1
+matplotlib-inline==0.1.7
+rpds-py==0.27.1
+wandb==0.21.4
+jedi==0.19.2
+pathspec==0.12.1
+transformer-lens==2.16.1
+sympy==1.14.0
+jupyterlab_pygments==0.3.0
+overrides==7.7.0
+notebook_shim==0.2.4
+jupyter==1.1.1
+protobuf==6.32.1
+better-abc==0.0.3
+jsonpointer==3.0.0
+terminado==0.18.1
+cfgv==3.4.0
+rfc3987-syntax==1.1.0
+annotated-types==0.7.0
+pyarrow==21.0.0
+webencodings==0.5.1
+wcwidth==0.2.13
+jupyterlab_server==2.27.3
+argon2-cffi-bindings==25.1.0
+nvidia-nvjitlink-cu12==12.8.93
+jaxtyping==0.3.2
+Pygments==2.19.2
+torch==2.8.0
+rfc3339-validator==0.1.4
+urllib3==2.5.0
+jupyterlab_widgets==3.0.15
+ipykernel==6.30.1
+nvidia-cudnn-cu12==9.10.2.21
+beautifulsoup4==4.13.5
+babel==2.17.0
+pure_eval==0.2.3
+pyparsing==3.2.3
+nvidia-cublas-cu12==12.8.4.1
+regex==2025.9.1
+pycparser==2.23
+soupsieve==2.8
+pytest-cov==7.0.0
+sniffio==1.3.1
+mypy==1.18.1
+notebook==7.4.5
+packaging==25.0
+h11==0.16.0
+psutil==7.0.0
+pexpect==4.9.0
+gitdb==4.0.12
+rfc3986-validator==0.1.1
+pyzmq==27.1.0
+jupyterlab==4.4.7
+toy_models==0.1.0
+torchaudio==2.8.0
+cffi==2.0.0
+mypy_extensions==1.1.0
+attrs==25.3.0
+transformers==4.56.1
+jupyter_core==5.8.1
+bleach==6.2.0
+fqdn==1.5.1
+async-lru==2.0.5
+nvidia-nccl-cu12==2.27.3
+GitPython==3.1.45
+referencing==0.36.2
+click==8.2.1
+prometheus_client==0.22.1
+bokeh==3.8.0
+httpx==0.28.1
+setuptools==80.9.0
+argon2-cffi==25.1.0
+multidict==6.6.4
+pyviz_comms==3.0.6
+arrow==1.3.0
+beartype==0.14.1
+ipywidgets==8.1.7
+pydantic_core==2.33.2
+markdown-it-py==4.0.0
+pandas==2.3.2
+virtualenv==20.34.0
+python-dotenv==1.1.1
+isoduration==20.11.0
+python-dateutil==2.9.0.post0
+nodeenv==1.9.1
+nvidia-curand-cu12==10.3.9.90
+webcolors==24.11.1
+MarkupSafe==3.0.2
+nvidia-cusolver-cu12==11.7.3.90
+Send2Trash==1.8.3
+coverage==7.10.6
+jupyter_server_terminals==0.5.3
+debugpy==1.8.16
+json5==0.12.1
+linkify-it-py==2.0.3
+importlib_metadata==8.7.0
+nvidia-cufft-cu12==11.3.3.83
+distlib==0.4.0
+typing-inspection==0.4.1
+identify==2.6.14
+nvidia-cufile-cu12==1.13.1.3
+mdurl==0.1.2
+websocket-client==1.8.0
+jsonschema==4.25.1
+python-json-logger==3.3.0
+typing_extensions==4.15.0
+tokenizers==0.22.0
+ipympl==0.9.7
+einops==0.8.1
+jupyter_client==8.6.3
+ipython_pygments_lexers==1.1.1
+h5py==3.14.0
+tabulate==0.9.0
+propcache==0.3.2
+ruff==0.13.0
+tornado==6.5.2
+typeguard==4.4.4
+tomlkit==0.13.2
+pluggy==1.6.0
+pydantic==2.11.7
+zipp==3.23.0
+fancy-einsum==0.0.3
+fastjsonschema==2.21.2
+datasets==4.0.0
+fonttools==4.59.2
+executing==2.2.1
+pillow==11.3.0
+uc-micro-py==1.0.3
+Markdown==3.9
+pre_commit==4.3.0
+aiohttp==3.12.15
+mistune==3.1.4
+tzdata==2025.2
+parso==0.8.5
+triton==3.4.0
+kiwisolver==1.4.9
+idna==3.10
+multiprocess==0.70.16
+dill==0.3.8
+jupyter-lsp==2.3.0
+platformdirs==4.4.0
+sentry-sdk==2.37.1
+prompt_toolkit==3.0.52
+jsonschema-specifications==2025.9.1
+pytest==8.4.2
+mdit-py-plugins==0.5.0
+transformers-stream-generator==0.0.5
+nvidia-cusparselt-cu12==0.7.1
+pandocfilters==1.5.1
+jupyter-console==6.6.3
+anyio==4.10.0
+six==1.17.0
+holoviews==1.21.0
+matplotlib==3.10.6
+colorcet==3.1.0
+uri-template==1.3.0
+nest-asyncio==1.6.0
+nvidia-cusparse-cu12==12.5.8.93
+iniconfig==2.1.0
+traitlets==5.14.3
+safetensors==0.6.2
+frozenlist==1.7.0
diff --git a/wandb/run-20250915_224933-8ifme58a/files/wandb-metadata.json b/wandb/run-20250915_224933-8ifme58a/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c3dd1bf054a924b3c713e6ae233870fc9c9c168
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/files/wandb-metadata.json
@@ -0,0 +1,38 @@
+{
+  "os":  "Linux-5.19.0-45-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.7",
+  "startedAt":  "2025-09-15T22:49:33.194685Z",
+  "program":  "<python with no main file>",
+  "git":  {
+    "remote":  "https://github.com/jgroh3/toy_models.git",
+    "commit":  "12726eab9fc560dea9bf6aaf8aecd690c95aed21"
+  },
+  "email":  "tzfof8@gmail.com",
+  "root":  "/notebooks/toy_models/model_training/c4_code-200m-duplicate",
+  "host":  "njgrtoyynl",
+  "executable":  "/notebooks/toy_models/.toy_models_env/bin/python",
+  "cpu_count":  8,
+  "cpu_count_logical":  8,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "262240792576",
+      "used":  "125140271104"
+    }
+  },
+  "memory":  {
+    "total":  "47332843520"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere",
+      "uuid":  "GPU-6bec9865-f1b0-5db9-8f6d-d3fd9b73eecf"
+    }
+  ],
+  "cudaVersion":  "12.4",
+  "writerId":  "06et8bi69wc4jspxlpqys3j0r2vjrawb"
+}
\ No newline at end of file
diff --git a/wandb/run-20250915_224933-8ifme58a/files/wandb-summary.json b/wandb/run-20250915_224933-8ifme58a/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..4257aa39cf376c8068d8f9a168f98f9c7c1cc2a1
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_step":6100,"_wandb":{"runtime":4196},"tokens_per_second":32768,"progress":0.999424,"_timestamp":1.757980768651727e+09,"tokens_seen":199884800,"_runtime":4196.788694233,"train_loss":4.074721336364746,"train_loss_ewma":4.034968964676767,"learning_rate":0.002,"step":6100}
\ No newline at end of file
diff --git a/wandb/run-20250915_224933-8ifme58a/logs/debug-core.log b/wandb/run-20250915_224933-8ifme58a/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..3771af7a07ee485c2aae02105c32fbecc267817d
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/logs/debug-core.log
@@ -0,0 +1,16 @@
+{"time":"2025-09-15T22:49:33.447555326Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpsou9gpx3/port-292.txt","pid":292,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-09-15T22:49:33.448243521Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":292}
+{"time":"2025-09-15T22:49:33.448204224Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-292-355-1379266032/socket","Net":"unix"}}
+{"time":"2025-09-15T22:49:33.457316542Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2025-09-15T22:49:33.469943705Z","level":"INFO","msg":"handleInformInit: received","streamId":"8ifme58a","id":"1(@)"}
+{"time":"2025-09-15T22:49:33.680212599Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"8ifme58a","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.112986209Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"8ifme58a","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.119052918Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"8ifme58a","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.119079366Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.11909552Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.119109723Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-09-15T23:59:31.119123582Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.119195599Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-292-355-1379266032/socket","Net":"unix"}}
+{"time":"2025-09-15T23:59:31.119222527Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.119242497Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
+{"time":"2025-09-15T23:59:31.119249222Z","level":"INFO","msg":"server is closed"}
diff --git a/wandb/run-20250915_224933-8ifme58a/logs/debug-internal.log b/wandb/run-20250915_224933-8ifme58a/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..f748583a60cb94c8c22f2c4d998551c3a655704f
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/logs/debug-internal.log
@@ -0,0 +1,12 @@
+{"time":"2025-09-15T22:49:33.470091482Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"}
+{"time":"2025-09-15T22:49:33.680162962Z","level":"INFO","msg":"stream: created new stream","id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.680205928Z","level":"INFO","msg":"stream: started","id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.680242811Z","level":"INFO","msg":"writer: started","stream_id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.68030656Z","level":"INFO","msg":"handler: started","stream_id":"8ifme58a"}
+{"time":"2025-09-15T22:49:33.680252446Z","level":"INFO","msg":"sender: started","stream_id":"8ifme58a"}
+{"time":"2025-09-15T23:59:30.672107041Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.000502052}],"total_operations":1}}
+{"time":"2025-09-15T23:59:30.991214003Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-09-15T23:59:31.113023987Z","level":"INFO","msg":"stream: closing","id":"8ifme58a"}
+{"time":"2025-09-15T23:59:31.113070418Z","level":"INFO","msg":"handler: closed","stream_id":"8ifme58a"}
+{"time":"2025-09-15T23:59:31.113133266Z","level":"INFO","msg":"sender: closed","stream_id":"8ifme58a"}
+{"time":"2025-09-15T23:59:31.113153008Z","level":"INFO","msg":"stream: closed","id":"8ifme58a"}
diff --git a/wandb/run-20250915_224933-8ifme58a/logs/debug.log b/wandb/run-20250915_224933-8ifme58a/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..a0dc08998071c1aef96a5d232c69b08fe35aa435
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/logs/debug.log
@@ -0,0 +1,26 @@
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Configure stats pid to 292
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/c4_code-200m-duplicate/wandb/settings
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/c4_code-200m-duplicate/wandb/run-20250915_224933-8ifme58a/logs/debug.log
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/c4_code-200m-duplicate/wandb/run-20250915_224933-8ifme58a/logs/debug-internal.log
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:init():813] calling init triggers
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
+config: {'model_name': 'c4_code-200m-duplicate', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/c4_code-200m', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}}
+2025-09-15 22:49:33,198 INFO    MainThread:292 [wandb_init.py:init():854] starting backend
+2025-09-15 22:49:33,457 INFO    MainThread:292 [wandb_init.py:init():857] sending inform_init request
+2025-09-15 22:49:33,466 INFO    MainThread:292 [wandb_init.py:init():865] backend started and connected
+2025-09-15 22:49:33,467 INFO    MainThread:292 [wandb_init.py:init():936] updated telemetry
+2025-09-15 22:49:33,475 INFO    MainThread:292 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout
+2025-09-15 22:49:33,882 INFO    MainThread:292 [wandb_init.py:init():1011] starting run threads in backend
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_console_start():2506] atexit reg
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_redirect():2354] redirect: wrap_raw
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_redirect():2423] Wrapping output streams.
+2025-09-15 22:49:34,265 INFO    MainThread:292 [wandb_run.py:_redirect():2446] Redirects installed.
+2025-09-15 22:49:34,275 INFO    MainThread:292 [wandb_init.py:init():1049] run started, returning control to user process
+2025-09-15 23:59:30,667 INFO    MainThread:292 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/8ifme58a
+2025-09-15 23:59:30,670 INFO    MainThread:292 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0
+2025-09-15 23:59:30,671 INFO    MainThread:292 [wandb_run.py:_restore():2453] restore
+2025-09-15 23:59:30,671 INFO    MainThread:292 [wandb_run.py:_restore():2459] restore done
+2025-09-15 23:59:31,112 INFO    MainThread:292 [wandb_run.py:_footer_sync_info():3867] logging synced files
diff --git a/wandb/run-20250915_224933-8ifme58a/run-8ifme58a.wandb b/wandb/run-20250915_224933-8ifme58a/run-8ifme58a.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..af589968fcf568e56844bb068dd207c7ee9be100
--- /dev/null
+++ b/wandb/run-20250915_224933-8ifme58a/run-8ifme58a.wandb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e689593d4df226446420ddaae98bdcb64adf6c1e5e41f6cdf8924c857c1434c
+size 4782546