Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- checkpoints/metadata_000000032768.json +1 -0
- checkpoints/metadata_000000327680.json +1 -0
- checkpoints/metadata_000000360448.json +1 -0
- checkpoints/metadata_000000425984.json +1 -0
- checkpoints/metadata_000000458752.json +1 -0
- checkpoints/metadata_000000491520.json +1 -0
- checkpoints/metadata_000000557056.json +1 -0
- checkpoints/metadata_000000622592.json +1 -0
- checkpoints/metadata_000000688128.json +1 -0
- checkpoints/metadata_000000753664.json +1 -0
- checkpoints/metadata_000000819200.json +1 -0
- checkpoints/metadata_000000917504.json +1 -0
- checkpoints/metadata_000000983040.json +1 -0
- checkpoints/metadata_000001114112.json +1 -0
- checkpoints/metadata_000001212416.json +1 -0
- checkpoints/metadata_000001343488.json +1 -0
- checkpoints/metadata_000001474560.json +1 -0
- checkpoints/metadata_000001605632.json +1 -0
- checkpoints/metadata_000001769472.json +1 -0
- checkpoints/metadata_000001966080.json +1 -0
- checkpoints/metadata_000002162688.json +1 -0
- checkpoints/metadata_000002359296.json +1 -0
- checkpoints/metadata_000002621440.json +1 -0
- checkpoints/metadata_000002883584.json +1 -0
- checkpoints/metadata_000003178496.json +1 -0
- checkpoints/metadata_000003473408.json +1 -0
- checkpoints/metadata_000003833856.json +1 -0
- checkpoints/metadata_000004227072.json +1 -0
- checkpoints/metadata_000004653056.json +1 -0
- checkpoints/metadata_000005111808.json +1 -0
- checkpoints/metadata_000005603328.json +1 -0
- checkpoints/metadata_000006193152.json +1 -0
- checkpoints/metadata_000006782976.json +1 -0
- checkpoints/metadata_000007471104.json +1 -0
- checkpoints/metadata_000008224768.json +1 -0
- checkpoints/metadata_000009043968.json +1 -0
- checkpoints/metadata_000009961472.json +1 -0
- checkpoints/metadata_000010944512.json +1 -0
- checkpoints/metadata_000012058624.json +1 -0
- checkpoints/metadata_000013271040.json +1 -0
- checkpoints/metadata_000014581760.json +1 -0
- checkpoints/metadata_000016056320.json +1 -0
- checkpoints/metadata_000016384000.json +1 -0
- checkpoints/metadata_000017661952.json +1 -0
- checkpoints/metadata_000019431424.json +1 -0
- checkpoints/metadata_000021364736.json +1 -0
- checkpoints/metadata_000023494656.json +1 -0
- checkpoints/metadata_000025853952.json +1 -0
- checkpoints/metadata_000028442624.json +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
wandb/run-20251101_050718-x0he2mby/run-x0he2mby.wandb filter=lfs diff=lfs merge=lfs -text
|
checkpoints/metadata_000000032768.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 1, "tokens_seen": 32768, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.504129409790039}
|
checkpoints/metadata_000000327680.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 10, "tokens_seen": 327680, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.495558547202545}
|
checkpoints/metadata_000000360448.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 11, "tokens_seen": 360448, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.492465802382986}
|
checkpoints/metadata_000000425984.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 13, "tokens_seen": 425984, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.48443794637458}
|
checkpoints/metadata_000000458752.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 14, "tokens_seen": 458752, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.479435028092475}
|
checkpoints/metadata_000000491520.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 15, "tokens_seen": 491520, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.473799117447781}
|
checkpoints/metadata_000000557056.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 17, "tokens_seen": 557056, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.46143174252293}
|
checkpoints/metadata_000000622592.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 19, "tokens_seen": 622592, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.447867588954788}
|
checkpoints/metadata_000000688128.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 21, "tokens_seen": 688128, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.432014080788836}
|
checkpoints/metadata_000000753664.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 23, "tokens_seen": 753664, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.415017053033225}
|
checkpoints/metadata_000000819200.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 25, "tokens_seen": 819200, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.395785776391488}
|
checkpoints/metadata_000000917504.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 28, "tokens_seen": 917504, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.367264762902195}
|
checkpoints/metadata_000000983040.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 30, "tokens_seen": 983040, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.347077211671865}
|
checkpoints/metadata_000001114112.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.305477788660623}
|
checkpoints/metadata_000001212416.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.271077942205011}
|
checkpoints/metadata_000001343488.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.221748212046837}
|
checkpoints/metadata_000001474560.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.167200491768405}
|
checkpoints/metadata_000001605632.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.107531033976013}
|
checkpoints/metadata_000001769472.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.026064940279571}
|
checkpoints/metadata_000001966080.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.92032294310719}
|
checkpoints/metadata_000002162688.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.801876731497748}
|
checkpoints/metadata_000002359296.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.675951440395803}
|
checkpoints/metadata_000002621440.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.49608959662518}
|
checkpoints/metadata_000002883584.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.305093544243261}
|
checkpoints/metadata_000003178496.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.088921241499149}
|
checkpoints/metadata_000003473408.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.869608010287346}
|
checkpoints/metadata_000003833856.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.61439148235503}
|
checkpoints/metadata_000004227072.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.35317155220863}
|
checkpoints/metadata_000004653056.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.09067977685243}
|
checkpoints/metadata_000005111808.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.82408076517334}
|
checkpoints/metadata_000005603328.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.5609016385690495}
|
checkpoints/metadata_000006193152.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.27145520178176}
|
checkpoints/metadata_000006782976.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.010424915475919}
|
checkpoints/metadata_000007471104.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.74492772170685}
|
checkpoints/metadata_000008224768.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.488508960587569}
|
checkpoints/metadata_000009043968.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.244596883086281}
|
checkpoints/metadata_000009961472.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.01574416153393}
|
checkpoints/metadata_000010944512.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.799362957823811}
|
checkpoints/metadata_000012058624.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.604772800413414}
|
checkpoints/metadata_000013271040.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.424803652632412}
|
checkpoints/metadata_000014581760.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.266946949866824}
|
checkpoints/metadata_000016056320.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.121108634885812}
|
checkpoints/metadata_000016384000.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.0922184937286685}
|
checkpoints/metadata_000017661952.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.992866171579676}
|
checkpoints/metadata_000019431424.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.878253635465713}
|
checkpoints/metadata_000021364736.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.7473133185118135}
|
checkpoints/metadata_000023494656.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.627843826891225}
|
checkpoints/metadata_000025853952.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.504817329035256}
|
checkpoints/metadata_000028442624.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3874906557707165}
|