diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e53781a1ca4377f246a76cebfea2f653ccdacce2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20251231_041335-rcpwhdwq/run-rcpwhdwq.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..6b035659d59d8d3c0ec75fe75b46d2123d252d08 --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.49576473236084} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..fc7496143767640c6a50515e8e71e2553f76fe88 --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.489271014897858} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..6634ee955c81ba14b2689210e136666e02c08175 --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.486908068924294} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..6b382bb03b43e212cd85e8559d222c3de7d021fc --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.480020528303415} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..5307efa5314d211f25943cbb6fadf9603511e3a3 --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.476222539359492} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..d8a47eb33211b52143a0300299cb9cede31e6ce1 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.471817875077347} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..4f76c3296a9af61b902e0e506e1d1e0befe59f77 --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.461444721377925} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json new file mode 100644 index 0000000000000000000000000000000000000000..5181064d756a568b7dbaffcdd6bcd498fa765fda --- /dev/null +++ b/checkpoints/metadata_000000622592.json @@ -0,0 +1 @@ +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.450074711939168} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..4eff7e68fd0b46fd69cb06e6e6af34af88f116f0 --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.436854510667338} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..29abdbb5dda2986598441ae52a3358c3cb7149fd --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.421326776395324} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..89c4059dff8318fae6963de42c9bb56057bed38d --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.406062962610608} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json new file mode 100644 index 0000000000000000000000000000000000000000..f19469e538e1669c3f6f3cee41e6ed3d4ab5743d --- /dev/null +++ b/checkpoints/metadata_000000917504.json @@ -0,0 +1 @@ +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.380216730427893} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json new file mode 100644 index 0000000000000000000000000000000000000000..654af07484d349018897afa9a537959b507630dd --- /dev/null +++ b/checkpoints/metadata_000000983040.json @@ -0,0 +1 @@ +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.361557481262672} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..1dcce9d287e1851ea6ad29a45ab396ae08bd8fd2 --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.322935347271303} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..07511f1b9f032e3824ca57f71af96e817bb8425c --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.29365823078775} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json new file mode 100644 index 0000000000000000000000000000000000000000..06acc5dc67067b78526a81510cf04ec2cdea5109 --- /dev/null +++ b/checkpoints/metadata_000001343488.json @@ -0,0 +1 @@ +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.25230630071821} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json new file mode 100644 index 0000000000000000000000000000000000000000..3396c6aa61827c965631f919f33355f8b70576b5 --- /dev/null +++ b/checkpoints/metadata_000001474560.json @@ -0,0 +1 @@ +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.205625424863927} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json new file mode 100644 index 0000000000000000000000000000000000000000..2ad2eb6a22b1062a41a44c0010e4f3b2cb220109 --- /dev/null +++ b/checkpoints/metadata_000001605632.json @@ -0,0 +1 @@ +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.153022574240959} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..998f7a9c34746dbc937a6b7098c6d6a4fd02d421 --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.083493927306938} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json new file mode 100644 index 0000000000000000000000000000000000000000..39ce8adb4ac239d686854d5997f86725bdcd80c2 --- /dev/null +++ b/checkpoints/metadata_000001966080.json @@ -0,0 +1 @@ +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.98845465531522} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json new file mode 100644 index 0000000000000000000000000000000000000000..cf9e491d2f3e13231b4bca39fa05b7c0cdae85ac --- /dev/null +++ b/checkpoints/metadata_000002162688.json @@ -0,0 +1 @@ +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.884245887293245} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json new file mode 100644 index 0000000000000000000000000000000000000000..035e271d0d12751aadfb39ac9af6479e0512a9ea --- /dev/null +++ b/checkpoints/metadata_000002359296.json @@ -0,0 +1 @@ +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.768536555912432} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..81cd2b425285afece52761557c571702a7f7c3fa --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.606525903235314} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json new file mode 100644 index 0000000000000000000000000000000000000000..6ce95cfef35ed6ada9cd2f105765ef39e640500b --- /dev/null +++ b/checkpoints/metadata_000002883584.json @@ -0,0 +1 @@ +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.43076275962358} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json new file mode 100644 index 0000000000000000000000000000000000000000..3d94f368777e0c15595e053d6fc0568d23fb20f6 --- /dev/null +++ b/checkpoints/metadata_000003178496.json @@ -0,0 +1 @@ +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.234859681878145} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json new file mode 100644 index 0000000000000000000000000000000000000000..d67f3f4b6c9cd7db6cc4efb3d0e8436f21ee454b --- /dev/null +++ b/checkpoints/metadata_000003473408.json @@ -0,0 +1 @@ +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.034691563120559} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json new file mode 100644 index 0000000000000000000000000000000000000000..ed92b07c57f87dd5641e66d01305049289c7d139 --- /dev/null +++ b/checkpoints/metadata_000003833856.json @@ -0,0 +1 @@ +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.801142133427442} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json new file mode 100644 index 0000000000000000000000000000000000000000..ccb0b0b55558a9471ab4930150984141853050b3 --- /dev/null +++ b/checkpoints/metadata_000004227072.json @@ -0,0 +1 @@ +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.551099128843015} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3e70fc35e3b880865159f26b29bc0a2f93650a --- /dev/null +++ b/checkpoints/metadata_000004653056.json @@ -0,0 +1 @@ +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.296211401898146} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json new file mode 100644 index 0000000000000000000000000000000000000000..67d4e4fb552f280bde493d93cfd8c5bec4bb9bae --- /dev/null +++ b/checkpoints/metadata_000005111808.json @@ -0,0 +1 @@ +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.048747803701078} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json new file mode 100644 index 0000000000000000000000000000000000000000..be029d327b7434c4e16072f1173bbb6181387640 --- /dev/null +++ b/checkpoints/metadata_000005603328.json @@ -0,0 +1 @@ +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.797746822614157} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json new file mode 100644 index 0000000000000000000000000000000000000000..68acfbd7f28610ac85c95c7e93bc5314084aa0ba --- /dev/null +++ b/checkpoints/metadata_000006193152.json @@ -0,0 +1 @@ +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.519633677901135} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json new file mode 100644 index 0000000000000000000000000000000000000000..8d8157b009247bf65b1ff3de8999d3536fae1c99 --- /dev/null +++ b/checkpoints/metadata_000006782976.json @@ -0,0 +1 @@ +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.275066928377646} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json new file mode 100644 index 0000000000000000000000000000000000000000..9fcd6aaf571c7df7ca832373b4242ed99d11f9d1 --- /dev/null +++ b/checkpoints/metadata_000007471104.json @@ -0,0 +1 @@ +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.012536098590285} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json new file mode 100644 index 0000000000000000000000000000000000000000..cadc3217b816a19aab2b292150dbc3d8e1098efd --- /dev/null +++ b/checkpoints/metadata_000008224768.json @@ -0,0 +1 @@ +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.770420143932772} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json new file mode 100644 index 0000000000000000000000000000000000000000..191327cb91b5cb3553cc6bb415b98d51c7a42cbd --- /dev/null +++ b/checkpoints/metadata_000009043968.json @@ -0,0 +1 @@ +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.538090242940533} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json new file mode 100644 index 0000000000000000000000000000000000000000..19bd1376716c9a84bf1baa2b3afce211de885c94 --- /dev/null +++ b/checkpoints/metadata_000009961472.json @@ -0,0 +1 @@ +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.312653355166862} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json new file mode 100644 index 0000000000000000000000000000000000000000..04ecc66fc44213b8e96504445e4b7c0cdf9c5711 --- /dev/null +++ b/checkpoints/metadata_000010944512.json @@ -0,0 +1 @@ +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.114092575298325} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json new file mode 100644 index 0000000000000000000000000000000000000000..13647eea12d2038e2788460dad7fcf17fd9aff19 --- /dev/null +++ b/checkpoints/metadata_000012058624.json @@ -0,0 +1 @@ +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.9209395087650725} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json new file mode 100644 index 0000000000000000000000000000000000000000..1184cffa04a5e048dfbd47721dd3b3f2838b0ec3 --- /dev/null +++ b/checkpoints/metadata_000013271040.json @@ -0,0 +1 @@ +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.753501363318461} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json new file mode 100644 index 0000000000000000000000000000000000000000..d844a88baefa70cfbe4d149cb29ade8bcdfd9636 --- /dev/null +++ b/checkpoints/metadata_000014581760.json @@ -0,0 +1 @@ +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.596605428542577} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json new file mode 100644 index 0000000000000000000000000000000000000000..e8059277561a5fef8e61601b62a396f20a6ab6a8 --- /dev/null +++ b/checkpoints/metadata_000016056320.json @@ -0,0 +1 @@ +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.45863992015659} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..633177a89c2605282d1bbaba53aa7711ef9a628f --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.437849043420827} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json new file mode 100644 index 0000000000000000000000000000000000000000..caa5ee5f5d31184a32442a14577dbb826e9b306a --- /dev/null +++ b/checkpoints/metadata_000017661952.json @@ -0,0 +1 @@ +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.334396277413484} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json new file mode 100644 index 0000000000000000000000000000000000000000..97162e9bfdeabffbdb4aea04f21e32ea0df02da6 --- /dev/null +++ b/checkpoints/metadata_000019431424.json @@ -0,0 +1 @@ +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.216249834811531} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json new file mode 100644 index 0000000000000000000000000000000000000000..642c651c137f6ce7a1c5c623d7dae96b1edb038a --- /dev/null +++ b/checkpoints/metadata_000021364736.json @@ -0,0 +1 @@ +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.134114651799934} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json new file mode 100644 index 0000000000000000000000000000000000000000..97bd5b4ecd7d4d1e6632bbf937b6731c2d956393 --- /dev/null +++ b/checkpoints/metadata_000023494656.json @@ -0,0 +1 @@ +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.050643151094726} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json new file mode 100644 index 0000000000000000000000000000000000000000..d709f1f2e6cfe1b5482de153bfe86853c9f1ee83 --- /dev/null +++ b/checkpoints/metadata_000025853952.json @@ -0,0 +1 @@ +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.960413834664021} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad826ba29fbfa7a08895cb651c3df1281ffbad2 --- /dev/null +++ b/checkpoints/metadata_000028442624.json @@ -0,0 +1 @@ +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.856463249483292} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json new file mode 100644 index 0000000000000000000000000000000000000000..55782f0a39f1fc5c75cedbb0b747dc594917f86b --- /dev/null +++ b/checkpoints/metadata_000031293440.json @@ -0,0 +1 @@ +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.74281889293806} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..472e19775955f89e73a3a8c6802e336d1b6fe895 --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.682774409589139} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json new file mode 100644 index 0000000000000000000000000000000000000000..6686ce56b70c1a4201cc87513b95e0537c441205 --- /dev/null +++ b/checkpoints/metadata_000034439168.json @@ -0,0 +1 @@ +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.618787638072126} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json new file mode 100644 index 0000000000000000000000000000000000000000..edb9a09754a0a4aedd790135f0b9df69c8631f49 --- /dev/null +++ b/checkpoints/metadata_000037879808.json @@ -0,0 +1 @@ +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.513422889362494} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json new file mode 100644 index 0000000000000000000000000000000000000000..6f2477d4dac313b821b3441311f5880fbf097345 --- /dev/null +++ b/checkpoints/metadata_000041648128.json @@ -0,0 +1 @@ +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.407097927902551} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json new file mode 100644 index 0000000000000000000000000000000000000000..6f52c71945f2f90684e1e0edfe53abfdd6635c71 --- /dev/null +++ b/checkpoints/metadata_000045842432.json @@ -0,0 +1 @@ +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.306885544485731} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e5b32b674690f9a8d60ab177aeaa10f038ca08 --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.258898076778814} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json new file mode 100644 index 0000000000000000000000000000000000000000..b90afe59cf9f643d58a08790101a397b5a2af314 --- /dev/null +++ b/checkpoints/metadata_000050397184.json @@ -0,0 +1 @@ +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.2489897021009035} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json new file mode 100644 index 0000000000000000000000000000000000000000..69deef9fadd0d3d671e234968f251081fb00b7e7 --- /dev/null +++ b/checkpoints/metadata_000055443456.json @@ -0,0 +1 @@ +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.135171840934691} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json new file mode 100644 index 0000000000000000000000000000000000000000..8558609e005579881b7d5342f0e806b0a5ff2868 --- /dev/null +++ b/checkpoints/metadata_000061014016.json @@ -0,0 +1 @@ +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.035979109664432} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..e3be95226583991c763b78d6f2aa436f3d306383 --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.9804432837057444} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json new file mode 100644 index 0000000000000000000000000000000000000000..c2771d708797a0cb2590e14c0dd67782cf9c7031 --- /dev/null +++ b/checkpoints/metadata_000067108864.json @@ -0,0 +1 @@ +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.94973242724698} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json new file mode 100644 index 0000000000000000000000000000000000000000..a96581375650e47427fbe372d4a8e801abab7d21 --- /dev/null +++ b/checkpoints/metadata_000073826304.json @@ -0,0 +1 @@ +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.828770700775608} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json new file mode 100644 index 0000000000000000000000000000000000000000..56311db7988d5082b725e8b19c3c00e8e9531e4f --- /dev/null +++ b/checkpoints/metadata_000081199104.json @@ -0,0 +1 @@ +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.755152782066348} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc1022366dffa6cd3a20e9de52b058f66960280 --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.742841892262455} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json new file mode 100644 index 0000000000000000000000000000000000000000..2756494eaab7ca266fceeacbf2ee7e748b4952c6 --- /dev/null +++ b/checkpoints/metadata_000089325568.json @@ -0,0 +1 @@ +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.678196586589879} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json new file mode 100644 index 0000000000000000000000000000000000000000..738f68c476061c5cfd949021c58b04a743d1a38a --- /dev/null +++ b/checkpoints/metadata_000098271232.json @@ -0,0 +1 @@ +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6260121580105293} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..9aad7469b4ed2d37105464eee185e8b15911162f --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6253684366722756} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json new file mode 100644 index 0000000000000000000000000000000000000000..87c2b17eb8479f7d3f87e1c0eca93c088c46b141 --- /dev/null +++ b/checkpoints/metadata_000108068864.json @@ -0,0 +1 @@ +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5603166235533545} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..b4ec38ae60c9366a2ef6cfd4475ab67f577007ab --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5327589075822057} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json new file mode 100644 index 0000000000000000000000000000000000000000..59ecaf437ce979a4df8edca9b1c49a98f16eb562 --- /dev/null +++ b/checkpoints/metadata_000118882304.json @@ -0,0 +1 @@ +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5365184080068772} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json new file mode 100644 index 0000000000000000000000000000000000000000..bdcb3c0821bf3c86f3c0d19c487342b063364d5f --- /dev/null +++ b/checkpoints/metadata_000130777088.json @@ -0,0 +1 @@ +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4849287678850778} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..3e64de3eea7c38848a9a1779d705cbf0dfb7dae7 --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.491663701579793} \ No newline at end of file diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json new file mode 100644 index 0000000000000000000000000000000000000000..989b30f4e12063f22b4a95c0af39ee8be76a1847 --- /dev/null +++ b/checkpoints/metadata_000143851520.json @@ -0,0 +1 @@ +{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4505928349233104} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json new file mode 100644 index 0000000000000000000000000000000000000000..337aa1b9f40cd55d5207347abd396ce98210a7f3 --- /dev/null +++ b/checkpoints/metadata_000147456000.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.438957314038425} \ No newline at end of file diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json new file mode 100644 index 0000000000000000000000000000000000000000..88062f8abd8fd7ba6d6f9da285d650ff4dd7e32e --- /dev/null +++ b/checkpoints/metadata_000158269440.json @@ -0,0 +1 @@ +{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.415345575914301} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json new file mode 100644 index 0000000000000000000000000000000000000000..fc09340f066d827ae277a62dcdbd28f859371792 --- /dev/null +++ b/checkpoints/metadata_000163840000.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.400300587335751} \ No newline at end of file diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json new file mode 100644 index 0000000000000000000000000000000000000000..cb991f52102019ede1491880e53dbba4863caa5a --- /dev/null +++ b/checkpoints/metadata_000174096384.json @@ -0,0 +1 @@ +{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3922818153145586} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json new file mode 100644 index 0000000000000000000000000000000000000000..72123385664c824a52c40d0ac4d9ce6ebf9b11ad --- /dev/null +++ b/checkpoints/metadata_000180224000.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3694135921767043} \ No newline at end of file diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json new file mode 100644 index 0000000000000000000000000000000000000000..ba995bf3d14e658f9a2470708c29626c27173578 --- /dev/null +++ b/checkpoints/metadata_000191496192.json @@ -0,0 +1 @@ +{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.349445043386691} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json new file mode 100644 index 0000000000000000000000000000000000000000..5bece7db5ddbba6e2445ad87052b956ea33ae1c5 --- /dev/null +++ b/checkpoints/metadata_000196608000.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3381633233461496} \ No newline at end of file diff --git a/checkpoints/metadata_000196706304.json b/checkpoints/metadata_000196706304.json new file mode 100644 index 0000000000000000000000000000000000000000..0cdd8e32be36f5ee02c26af07ab5e4c869675210 --- /dev/null +++ b/checkpoints/metadata_000196706304.json @@ -0,0 +1 @@ +{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.339771910686082} \ No newline at end of file diff --git a/checkpoints/metadata_000197361664.json b/checkpoints/metadata_000197361664.json new file mode 100644 index 0000000000000000000000000000000000000000..2006b497a78e2ba68c21317535bad65bcb718539 --- /dev/null +++ b/checkpoints/metadata_000197361664.json @@ -0,0 +1 @@ +{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3420097908234303} \ No newline at end of file diff --git a/checkpoints/metadata_000198017024.json b/checkpoints/metadata_000198017024.json new file mode 100644 index 0000000000000000000000000000000000000000..97198301f887855f3c4ded59e4408f513f464492 --- /dev/null +++ b/checkpoints/metadata_000198017024.json @@ -0,0 +1 @@ +{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.341477207381869} \ No newline at end of file diff --git a/checkpoints/metadata_000198672384.json b/checkpoints/metadata_000198672384.json new file mode 100644 index 0000000000000000000000000000000000000000..9df0762a28ae2dcded2fb3d18ccbd7341aac0b64 --- /dev/null +++ b/checkpoints/metadata_000198672384.json @@ -0,0 +1 @@ +{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3311880922994432} \ No newline at end of file diff --git a/checkpoints/metadata_000199327744.json b/checkpoints/metadata_000199327744.json new file mode 100644 index 0000000000000000000000000000000000000000..0a36e1cc885b4dab69ee50ef1bfa6f4e96161cb9 --- /dev/null +++ b/checkpoints/metadata_000199327744.json @@ -0,0 +1 @@ +{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.337485003676205} \ No newline at end of file diff --git a/checkpoints/metadata_000199950336.json b/checkpoints/metadata_000199950336.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3e7b61c2ee239a48585839cadca37dc4717c7 --- /dev/null +++ b/checkpoints/metadata_000199950336.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3297589359767783} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..b078b2917bda543b2dcc8cbf674e7870d98b0346 --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db649a536cb0a57b27c8d60c5bb440a77954c7b96613bd240a49eda4fca2fb84 +size 158534613 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..010416a76b4542cc3886482c2c2c791448318bd8 --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0844095b40c6c205f8aa003daee209f548b419c27234ece615934efe6bcfef4d +size 158534613 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..76249d857adc7adfeb5fa3677481aa8af6227e43 --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d967d2b0d751ebc6a0f73602feb3e1b6b2f24b271634e05d1ecf96be2d9fa965 +size 158534613 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..890865e42f96c41efbe1257cd7da283a9c094c1c --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f34f0aea3913321ee6caf842edbb720bbe5b36df16164622209ae459ef7092 +size 158534613 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0b47567c3cdc3d8227667f48a8e3e34856b5968 --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c21c99b64fc04c8f94acc88ce56d3ed63b5fd54a2eb01fa95e56efc8bb70f989 +size 158534613 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebda7807c36f72e87bd89eb17eb5983cf85baf0d --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3741083e542f3cc6e654a24d577c800e9d5afd58a5a7107c8308bfe715a11d3e +size 158534613 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..9abb9f464b93d019dfdefad6920f23e9be0459fb --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec142617905939ead4c01e0b98532f9cb9b489b4d2833136f6e070ca68159e4e +size 158534613 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt new file mode 100644 index 0000000000000000000000000000000000000000..e46d1b3be126fc3f4b836cfbf1e1085ecbc0282e --- /dev/null +++ b/checkpoints/model_weights_000000622592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50623832f57de902334967142148f4b651992e425d5b4c3d812ac01e7e3eeefc +size 158534613 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8f1712161306e3a30bcd95c18b55afab74d319b --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561e0445db9e96dbda2b849551989eafb5a67f0cc563935cdef73b1f752a2be0 +size 158534613 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfb6786cac18f794035f7b7ceb9ec79bad14f275 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d291ec2726b652973ffa88991ca0acb6561c3d5ac54e00eb8b3f3e7715e9ea3 +size 158534613 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..b189b72590371292bbece300e025ca3403932a41 --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ecc4d2c10e7da01d272423ec7fb95e53c146eee0db3e229940c2df3b7704ad +size 158534613 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt new file mode 100644 index 0000000000000000000000000000000000000000..28aed2bedc9e41e4da424abc8d9f9b878ce08b37 --- /dev/null +++ b/checkpoints/model_weights_000000917504.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a66dab961ad4c99e36941aab736f772c0b4de54edcce78a9e060ac9485ea367f +size 158534613 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt new file mode 100644 index 0000000000000000000000000000000000000000..06c14343d335ac72aaf07fd7afec34338f60770d --- /dev/null +++ b/checkpoints/model_weights_000000983040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b16a9d73702701e1b102753d0ba78df1717bd703f9f16cbbeade028a2a9200 +size 158534613 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac7eed249fbf3ee858c989a986b357a00d147a65 --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f28e504c2fac823e8fdbb84796c2a92a93c580694541089949aa142adc309461 +size 158534613 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..c59ec7c5a5aeb7bf42635d4277cbce7f81e5f3cb --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:817eca74ee684f0c5918ac33d4c9da9665523830ab6269407ec1c0f062b58b27 +size 158534613 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cbbaafce81e62f8332403bb5fccc6deff229f27 --- /dev/null +++ b/checkpoints/model_weights_000001343488.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269c5d0f2e16cd9efc8a662a3c9fe68c472e672ec4b9e56f2712482defa32ad8 +size 158534613 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt new file mode 100644 index 0000000000000000000000000000000000000000..55580973bc782a2c368ce1eb24cb2aa44667fffb --- /dev/null +++ b/checkpoints/model_weights_000001474560.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68e73179d5841247af717aea519235c70a3475c30e24dcce98dd6e4237d91977 +size 158534613 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt new file mode 100644 index 0000000000000000000000000000000000000000..565bc73f44c393c10aeff431a4ddb03a70ca37af --- /dev/null +++ b/checkpoints/model_weights_000001605632.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99e5e9c779f7d33c22e482d7d9fba9547ec64fc3617f690c74dd88dc060fd95 +size 158534613 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..84c41cc08e88bf779c6da8afa87dbe0fc65c5bcf --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:187db728f5def7b0b5c48c222c73a2f7fe41d5652c83aa76da53fdaa27f85b46 +size 158534613 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9aa54aaceaf9a5792e917c2602c6dfae61dcc85 --- /dev/null +++ b/checkpoints/model_weights_000001966080.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c18b0d8c568b68416545b87dddba2004cf9669a6fa5c4e7eab53880c36e019 +size 158534613 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt new file mode 100644 index 0000000000000000000000000000000000000000..000ff594c4fc65dc898623f2719ce36aba793323 --- /dev/null +++ b/checkpoints/model_weights_000002162688.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9267ff586fce72851ac663bf8abc59b828d7feb44906d86dd328e14e5d899549 +size 158534613 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt new file mode 100644 index 0000000000000000000000000000000000000000..83a2fde498332b54a1f8ce7a7b53e7f6341848a5 --- /dev/null +++ b/checkpoints/model_weights_000002359296.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769f8b3e0d2e437f532e982aae688d61f5aebd507cf750ca24a535da628dc547 +size 158534613 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..270b3c0929552ff1449c28a3dd3ef43ca01d1a45 --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d21723cf446b72b5d5a6249f0badf79ad0eac71b6c73c5c580867904d2d9a330 +size 158534613 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a49313eb755e1a5fd9e7551806b90a193e4814b --- /dev/null +++ b/checkpoints/model_weights_000002883584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496a3e26367411ec67701fca5a346cc14e5595575f1a5b910213b3d9067e6bc7 +size 158534613 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt new file mode 100644 index 0000000000000000000000000000000000000000..399fb60038e2d84650e5de0de92b3aa9dd2a5bd2 --- /dev/null +++ b/checkpoints/model_weights_000003178496.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e4f979b15e1c487a43e3479120b0b6897cf089481031fcc26647fdf02c6832 +size 158534613 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c500606e4ae39890b7e1ab5cdde1ee99a522348 --- /dev/null +++ b/checkpoints/model_weights_000003473408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12debfd0aa7383d113abcf9a2fc9786b562b24d3bece4ca1a6ca5f21908a4f46 +size 158534613 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce152cc7598c355255cb7c7d6db471893f2c031e --- /dev/null +++ b/checkpoints/model_weights_000003833856.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7d52eeb60363a59f6ead918d830e2bc231731a8212e9105db1887c9712b8ee +size 158534613 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt new file mode 100644 index 0000000000000000000000000000000000000000..498453ab04018b1706dc8fbff6b59a5b199d4a49 --- /dev/null +++ b/checkpoints/model_weights_000004227072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73cb3f09f9dbb599518a251475e3940df131ce083c5bb5a4460a265887050311 +size 158534613 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt new file mode 100644 index 0000000000000000000000000000000000000000..ada14a73c81b38c83ab282065679ad8d5f62465e --- /dev/null +++ b/checkpoints/model_weights_000004653056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bb8af2521aad4149072eded03fc86bdbc542cab375891d8fcf3b9e90814ea59 +size 158534613 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt new file mode 100644 index 0000000000000000000000000000000000000000..a75dacaf42d589f4996dd2198533b38c2ad35c5e --- /dev/null +++ b/checkpoints/model_weights_000005111808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060e9cb521ca36c82cfc29cd5efd3ac7b471d44000b203a375cb3bb0e2e771a4 +size 158534613 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt new file mode 100644 index 0000000000000000000000000000000000000000..e889b6739c0cf9cd545e0dad18f93ca21ee80c5c --- /dev/null +++ b/checkpoints/model_weights_000005603328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df6f3c6ef04dfed4ee3b39b37fd7a5afb72706a4ee07c70be6c6840afd183b04 +size 158534613 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt new file mode 100644 index 0000000000000000000000000000000000000000..62d0e58746787b55b0472c3dbd5ca60c71439d55 --- /dev/null +++ b/checkpoints/model_weights_000006193152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bac97cb4f555d8426d6e1154cfa709b686b7a74b97010bb375f030c76becd32 +size 158534613 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt new file mode 100644 index 0000000000000000000000000000000000000000..aca61e886f1a8d562edc34882217bf2eb20b2218 --- /dev/null +++ b/checkpoints/model_weights_000006782976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:614a41a8d56e5d22450822fd50f314c6b4b9ca4568b2386c82f35351f526e18b +size 158534613 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt new file mode 100644 index 0000000000000000000000000000000000000000..221939a5d96e09cf154de832b332a16a1e0b1fa0 --- /dev/null +++ b/checkpoints/model_weights_000007471104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c4454ecc4e63397dd411aaf330d43d712f79da13c40a82e42a3c18974e31df +size 158534613 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt new file mode 100644 index 0000000000000000000000000000000000000000..6603c6d856eb675ad1a683fa011eab761bfa9060 --- /dev/null +++ b/checkpoints/model_weights_000008224768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b02e6733e448b7ed483d66af657146154431d8a73dc032eccf3af21c30faae9 +size 158534613 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef2b143c542a7ec956d2c9f3fb5fa7f990b08300 --- /dev/null +++ b/checkpoints/model_weights_000009043968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d83065ec41a303d3894cb30694a2cc7495462c3a100490a68e4e647629c4d5c +size 158534613 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ced0d5c1fde28db39f062a70ce34fe9a0c4e56e --- /dev/null +++ b/checkpoints/model_weights_000009961472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb7865aafe03ac3cbcdfbece9d78d032326911608770a5ad6203ad42fbe047b +size 158534613 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8ec92d0efd2f53e9582f7c3d7baf8fd041b3c2a --- /dev/null +++ b/checkpoints/model_weights_000010944512.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501a402165034e5364cde160378c7066bb9a28427664214154bbb569a836e673 +size 158534613 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt new file mode 100644 index 0000000000000000000000000000000000000000..f61e7ec737e6e6775f0248f8c743ca2fdcf05633 --- /dev/null +++ b/checkpoints/model_weights_000012058624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b57efe1de1a51d2eb52c03426ad3a0f99a20ba851ced04bade08abca5881e101 +size 158534613 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea3c6e2a850df87a9343e050ca2a63049b14ec87 --- /dev/null +++ b/checkpoints/model_weights_000013271040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3393c878ac002e71c0780e9612dd904ec9ef99499e8c952dadf752b7051917b +size 158534613 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d71a9b32b765c496bbce1e4fe44db26e59435a4 --- /dev/null +++ b/checkpoints/model_weights_000014581760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9952c463d36cf66248576862ba16edc5803a7984b7d97e1b5354e57e03b42f37 +size 158534613 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d901e91947b7157b24ccdb86e229f593642a3d6 --- /dev/null +++ b/checkpoints/model_weights_000016056320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168d4d86d44eb5a2f0264d78e9df7453066bb9e8b1729245f58ccc92bc0dec20 +size 158534613 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..94014d56855befda98592ee050d5b64987d5f966 --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7686c512c7559aa32ac4828adf46debd7312f4019e8fc19d9cf9e245f95ea2a0 +size 158534613 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4697afb67e80c4f27dfd3b7980c76079a0b15b9 --- /dev/null +++ b/checkpoints/model_weights_000017661952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df980ad2c31fe608b19c23703204d1f7f634f3966e3144b157ad1b645dff541 +size 158534613 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e4d9dc54289b09627a02bc02e3a4707f4ff02ae --- /dev/null +++ b/checkpoints/model_weights_000019431424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e47084ab764f51fd7ea3bb031d2ce76d2fc88dfcf4d5188281bcf962b579a26 +size 158534613 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt new file mode 100644 index 0000000000000000000000000000000000000000..35f505839347afc8fe5f8bc20749bc858f797c96 --- /dev/null +++ b/checkpoints/model_weights_000021364736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1ceeebbfa1024a1161b1b94a18c30305a67a6ee832a9386da65d10d4a02a7f +size 158534613 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt new file mode 100644 index 0000000000000000000000000000000000000000..9affd66d9b6dacf8bb9c130e6a0b0ac994d00d7c --- /dev/null +++ b/checkpoints/model_weights_000023494656.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d3202976fb1efbb3a03ca49f31ec50e1056ede3852154c4d556a645fa5f24e9 +size 158534613 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a820e1aaba4e981956d1eac69240015e4a6708b --- /dev/null +++ b/checkpoints/model_weights_000025853952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e8f169b1cf554ee30eeab8ed239cbab1a6c4f32dc2d932a08b3792d159f0c56 +size 158534613 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt new file mode 100644 index 0000000000000000000000000000000000000000..8eab06b668d179ddb518e59506534bea0d64406d --- /dev/null +++ b/checkpoints/model_weights_000028442624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01feaf06b427faeab36311f365cdfd0044888beb8f40477218d809e58c49b519 +size 158534613 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt new file mode 100644 index 0000000000000000000000000000000000000000..87be18ea25ad983b7af865538d427eaa6d7878e7 --- /dev/null +++ b/checkpoints/model_weights_000031293440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0dcb275a18ddc5bbbfe7faf0bf5a935a6c95667bbd57e615e897e1796c3877e +size 158534613 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0c0bbe2a5b44cc6e82494ca5f2829d50e6c052e --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eedae91cac6e8ad674fdef00da31761c93f74d5791f1adbfd3e92e5126631f7 +size 158534613 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt new file mode 100644 index 0000000000000000000000000000000000000000..d65deab5fb0acd7c8e5072886d8d4835920dd986 --- /dev/null +++ b/checkpoints/model_weights_000034439168.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8771f337c638768c7de24ee19a15ade743784abe9ae02efdf140aabb00bf3de +size 158534613 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b9ba4e04500c23d1369038a07ebc1800f781da2 --- /dev/null +++ b/checkpoints/model_weights_000037879808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62646fe574c282dfa7d420447a78be5fc9f7821c134920d4f8c634c0ebbbd93e +size 158534613 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt new file mode 100644 index 0000000000000000000000000000000000000000..7577d4cdc47c37fb742ab60d917a317a71b2a2fd --- /dev/null +++ b/checkpoints/model_weights_000041648128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb1e3dd43cb10b106e247244c2a24db0ad0edc4828b15ce31b2b84e8066de14 +size 158534613 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f4d00923fefd9d562882a8237c9abee96a3cce7 --- /dev/null +++ b/checkpoints/model_weights_000045842432.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02495f0afc3faa25d52ec2fa21b7e1757e9efbd6094b0e3632e75760de9b325a +size 158534613 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7268678859a0c59a4e8c3f3105f77fdf5e908a6 --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:838bc4db2545546b9bb2761e5715dc94bc8aff490fdfef879f55299fea74ec83 +size 158534613 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd6d214dff08909648c227ee6cb28b07c1d74c7b --- /dev/null +++ b/checkpoints/model_weights_000050397184.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44dec871ecaf6e1fba2931aa6b4eb30a455f5a3159ca9d162a94bbe48fda099 +size 158534613 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63931b5d9d44674821724f14769d86b6bde0710 --- /dev/null +++ b/checkpoints/model_weights_000055443456.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4d8cf6db891bb780b01f8230e27d29bd5cad64edb00a8dc944ac3e70ff0c017 +size 158534613 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt new file mode 100644 index 0000000000000000000000000000000000000000..262b39e03f71201fbf501b3e862b9a7b939fe3b8 --- /dev/null +++ b/checkpoints/model_weights_000061014016.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b30209e2e677c1cc5beeb7a887bc7bf6d52740aa5b9161a28c26636f806e450 +size 158534613 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bc88725a248e0def2a2a095f96ec9762506c5dd --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538a5f470da12c48f9a7202966707dbe1b296397a572f0c9d4a30cd3e1bfc34d +size 158534613 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt new file mode 100644 index 0000000000000000000000000000000000000000..36353d6ba8b4e373090be7c47a95e615f652603f --- /dev/null +++ b/checkpoints/model_weights_000067108864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f458882d0a911f4c43fd65de6df963b7cbe90d1d8b133adda8b6c8c2d660faa +size 158534613 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt new file mode 100644 index 0000000000000000000000000000000000000000..872e54d0400f215d94f9cc3a5ca1ec4c2143da4f --- /dev/null +++ b/checkpoints/model_weights_000073826304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae693df5fd78d08e8ac6d59571d5c012eaa83ccee4345b661b0bdc4406d5b9ff +size 158534613 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bdacd0cee29b1783c33b54f775bc66d926882a1 --- /dev/null +++ b/checkpoints/model_weights_000081199104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8068a1a9c1dc166511d23661e5b8ce237342c1328b0ea3afe807f663313aa0d7 +size 158534613 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..4de23b7a8cb96aa8d603281602384dab8d586af9 --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e6237049bc2dca13d4d2e05862b3ed9fef77137e30c534821f5423af430d234 +size 158534613 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9857d76f465dd2a9e430c17bcef539e0e3960fc --- /dev/null +++ b/checkpoints/model_weights_000089325568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f55d7fe84e387279e3fda2d5d405e010946c98daf8770a3feb8ea5240139974 +size 158534613 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef7cf52006705c3f798a0be08f649bb4efa40476 --- /dev/null +++ b/checkpoints/model_weights_000098271232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c0d9ed2cab5a96cb2bb95a8803a1810296ccfbf7ed1bf2556339ed037e06d61 +size 158534613 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..930e4175cddee8e69550e12fb1a4913119c21164 --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52850b99ecb9d1dd11f8660a968ca22799b97e5f73413c35498c66056be033cf +size 158534613 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2223324f0dee521974af526fe426dcb0c4c5edf --- /dev/null +++ b/checkpoints/model_weights_000108068864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228b68fa1d65001193babd81d5b94bc60cd31e65f69812921ca2706f1c32682d +size 158534613 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2df7d16389adb5f094f869be3a077c9f8143a078 --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0678a1399b05cf7fe5222aab96476e9fef3a361273eceb6cc1daa9ce4b29a395 +size 158534613 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce5e776739b5886d983e9088e337b23ce5adc902 --- /dev/null +++ b/checkpoints/model_weights_000118882304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:948aa7dc97196eedd3e251b2052dbc88e2cf465975c1c3e77e6c23305912b141 +size 158534613 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt new file mode 100644 index 0000000000000000000000000000000000000000..7582d2ff092408e079dc13752f8cd569ea408a4d --- /dev/null +++ b/checkpoints/model_weights_000130777088.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904324a3690fb8682b4544e217a0e1839a9c896568ac5196de8662d41f17f238 +size 158534613 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e0e2f5fedc068a21c91ecf63c27b5b28c85d82f --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efbc82c8e77f670e4bcd6928563aa8da70f3989008288437824a0fd65e0d7fa8 +size 158534613 diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt new file mode 100644 index 0000000000000000000000000000000000000000..06dabc54b997be599e68d65869d88739ae0ddb93 --- /dev/null +++ b/checkpoints/model_weights_000143851520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6867b8007b997de860ba437f9c9edff2419047b94d2cb87851ef7e73f161f44 +size 158534613 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9b1d87dbb767c7515b3134ecf99f646eba89d73 --- /dev/null +++ b/checkpoints/model_weights_000147456000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9997b2a0cf343484ff043d3260f694048592903d49112824274116fb9154c493 +size 158534613 diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ee5aebb1bd30673d743bbd3ee74e6e2346f9f63 --- /dev/null +++ b/checkpoints/model_weights_000158269440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943e1d457cec8bc1bb69fa9f150dde3bd02dcb61d03d495c4fffea7f933bf42e +size 158534613 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f015f418747665933977bdf7c5b4249e20faac2d --- /dev/null +++ b/checkpoints/model_weights_000163840000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9782cb4038a553623a3f550caa0e8457c04dec2cea3b9f1a9cc31bb12fa169b +size 158534613 diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt new file mode 100644 index 0000000000000000000000000000000000000000..0050e2f0e162be10dae763e76a6a28cfa4ab8551 --- /dev/null +++ b/checkpoints/model_weights_000174096384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20ad24c2fed680c4f8a3b3c2e5ffe38087ad71c3baf505c358984f751d6d1090 +size 158534613 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d64b56d1e41093a44f8818e02f2c2d6eb5aed122 --- /dev/null +++ b/checkpoints/model_weights_000180224000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b68c07102ebfba9a8b9444e73d03f4b694052c42be929cb2c3c7c3cb5da26f +size 158534613 diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c59fd943af91c9170966f2e6f1311fd0a259d45 --- /dev/null +++ b/checkpoints/model_weights_000191496192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4194c5137ee6dd82b82e075e617ac85f748dc22adf5759b41e2aa8025413e56f +size 158534613 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca7ffdaaa4b8fcc638ed251b0ec18175bc1f4265 --- /dev/null +++ b/checkpoints/model_weights_000196608000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6b9ab66a6b5c7fb1c8359f9c40f50c2c00c6d97ee5cd4f915c2c1f82881222 +size 158534613 diff --git a/checkpoints/model_weights_000196706304.pt b/checkpoints/model_weights_000196706304.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8c4f7299b1721fb9a6c329dbb54085ac702e91e --- /dev/null +++ b/checkpoints/model_weights_000196706304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:950526dbeaea5af5fa11499aa297611503a5d9bf36300117a99ce0c2d5b907f6 +size 158534613 diff --git a/checkpoints/model_weights_000197361664.pt b/checkpoints/model_weights_000197361664.pt new file mode 100644 index 0000000000000000000000000000000000000000..438cbddb7bd7c0fad90f34e07fbf2790f2257a72 --- /dev/null +++ b/checkpoints/model_weights_000197361664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc35a270b84a5bd430015b0fb6f0a642addbce1e091d8c57ab8aa9cce752bf1a +size 158534613 diff --git a/checkpoints/model_weights_000198017024.pt b/checkpoints/model_weights_000198017024.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d192abd297876849fb41fb5286c4ec83262666a --- /dev/null +++ b/checkpoints/model_weights_000198017024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa1a8cdf8577cfcdc6ee314fc58b5e27cd825cfbe8d431f8dd70c222c138d67 +size 158534613 diff --git a/checkpoints/model_weights_000198672384.pt b/checkpoints/model_weights_000198672384.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1d412c9d1e0cdf3e4e221351e562f887d00a392 --- /dev/null +++ b/checkpoints/model_weights_000198672384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2587d05db846690a7e8bb3172862ccd126efde97da03d2f770237a68b4be2a08 +size 158534613 diff --git a/checkpoints/model_weights_000199327744.pt b/checkpoints/model_weights_000199327744.pt new file mode 100644 index 0000000000000000000000000000000000000000..b59833e0bcdc77e443f85bd3b37f8c4f875b87da --- /dev/null +++ b/checkpoints/model_weights_000199327744.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19d6838c87b2a09a3093633ef59c5204495caf57122d2c6c92b55c78254f1803 +size 158534613 diff --git a/checkpoints/model_weights_000199950336.pt b/checkpoints/model_weights_000199950336.pt new file mode 100644 index 0000000000000000000000000000000000000000..186b9a6df86d3ad783bf949cfca4660a873f2fc8 --- /dev/null +++ b/checkpoints/model_weights_000199950336.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9afe15d4c9da4a293fb6fdd7802a71c358bde61243b6ef19e8676a1477eecc +size 158534613 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..e6eebd69247608cf212130733f277c546003f405 --- /dev/null +++ b/config.toml @@ -0,0 +1,31 @@ +model_name = "pile_llama_replace_17367" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 32000 +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.1 +save_log_checkpoints = true +dataset_name = "eoinf/PL_Replace17367_L2" \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..223638296612cfe54904f15e182c6c8a6b898287 --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ada9c42a49f934ce561c7b996cae9d5597db4359d3665d0975da171e129418 +size 158534135 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3e7b61c2ee239a48585839cadca37dc4717c7 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_replace_17367", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3297589359767783} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f490f0aee52046699085e741189b6ef22a5ef17 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4dc1972fa4da9c026f3955f4ba137819079218998fdba01ebe048de1a32e4a4 +size 317074195 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f267f430e5b2294c4e308dc61c1c813beb7e718 --- /dev/null +++ b/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.models.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..dbca95af5d1baa491685242c0752cab44fb14abc --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-31T04:13:36.706447837Z","level":"INFO","msg":"stream: starting","core version":"0.23.0"} +{"time":"2025-12-31T04:13:36.934918403Z","level":"INFO","msg":"stream: created new stream","id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.935017542Z","level":"INFO","msg":"handler: started","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.93542806Z","level":"INFO","msg":"stream: started","id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.935448923Z","level":"INFO","msg":"writer: started","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.935464456Z","level":"INFO","msg":"sender: started","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.32044204Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-31T05:04:07.451349813Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-31T05:04:07.458273419Z","level":"INFO","msg":"stream: closing","id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.458313318Z","level":"INFO","msg":"handler: closed","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.458379979Z","level":"INFO","msg":"sender: closed","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.45838614Z","level":"INFO","msg":"stream: closed","id":"rcpwhdwq"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..290ba5295b71336c5f7da4f72cf5c24e3c18c9be --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,26 @@ +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Current SDK version is 0.23.0 +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Configure stats pid to 27729 +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2/wandb/settings +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_init.py:setup_run_log_directory():713] Logging user logs to /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2/wandb/run-20251231_041335-rcpwhdwq/logs/debug.log +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_init.py:setup_run_log_directory():714] Logging internal logs to /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2/wandb/run-20251231_041335-rcpwhdwq/logs/debug-internal.log +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_init.py:init():840] calling init triggers +2025-12-31 04:13:35,491 INFO MainThread:27729 [wandb_init.py:init():845] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_replace_17367', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/PL_Replace17367_L2', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-12-31 04:13:35,491 INFO MainThread:27729 [wandb_init.py:init():888] starting backend +2025-12-31 04:13:36,679 INFO MainThread:27729 [wandb_init.py:init():891] sending inform_init request +2025-12-31 04:13:36,704 INFO MainThread:27729 [wandb_init.py:init():899] backend started and connected +2025-12-31 04:13:36,705 INFO MainThread:27729 [wandb_init.py:init():969] updated telemetry +2025-12-31 04:13:36,977 INFO MainThread:27729 [wandb_init.py:init():993] communicating run to backend with 90.0 second timeout +2025-12-31 04:13:37,188 INFO MainThread:27729 [wandb_init.py:init():1040] starting run threads in backend +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_console_start():2504] atexit reg +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_redirect():2352] redirect: wrap_raw +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_redirect():2421] Wrapping output streams. +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_redirect():2444] Redirects installed. +2025-12-31 04:13:38,000 INFO MainThread:27729 [wandb_init.py:init():1080] run started, returning control to user process +2025-12-31 05:04:06,859 INFO MainThread:27729 [wandb_run.py:_finish():2270] finishing run eoin/toy-transformer-replication/rcpwhdwq +2025-12-31 05:04:06,866 INFO MainThread:27729 [wandb_run.py:_atexit_cleanup():2469] got exitcode: 0 +2025-12-31 05:04:06,866 INFO MainThread:27729 [wandb_run.py:_restore():2451] restore +2025-12-31 05:04:06,866 INFO MainThread:27729 [wandb_run.py:_restore():2457] restore done +2025-12-31 05:04:07,456 INFO MainThread:27729 [wandb_run.py:_footer_sync_info():3853] logging synced files diff --git a/wandb/run-20251231_041335-rcpwhdwq/files/config.yaml b/wandb/run-20251231_041335-rcpwhdwq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd6f7851dd9af8bb1711478624fe5f507c025e19 --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/files/config.yaml @@ -0,0 +1,140 @@ +_wandb: + value: + cli_version: 0.23.0 + e: + t15uolpoxil7dt93apt5qafwy48pcz9t: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "152930381824" + email: efarrel4@tcd.ie + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: 4f9f8fa4b099afc1076224d155d6c8a3785cb4d0 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-d804d0d5-d870-1347-486a-9c6017366349 + host: ntz6bbfjhf + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2 + startedAt: "2025-12-31T04:13:35.480922Z" + writerId: t15uolpoxil7dt93apt5qafwy48pcz9t + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.23.0 + "6": 4.57.3 + "12": 0.23.0 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.1 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 32000 +data_seed: + value: 10 +dataset_name: + value: eoinf/PL_Replace17367_L2 +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: pile_llama_replace_17367 +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: "" +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20251231_041335-rcpwhdwq/files/output.log b/wandb/run-20251231_041335-rcpwhdwq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e1e39dac0979c8854adff5a7136d08b09147afc9 --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.4061 | Learning Rate: 0.000055 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.1390 | Learning Rate: 0.000109 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 9.7094 | Learning Rate: 0.000164 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.1650 | Learning Rate: 0.000219 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 8.6323 | Learning Rate: 0.000273 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.1507 | Learning Rate: 0.000328 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 7.7347 | Learning Rate: 0.000383 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.3670 | Learning Rate: 0.000437 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.0462 | Learning Rate: 0.000492 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 6.7814 | Learning Rate: 0.000546 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 6.5482 | Learning Rate: 0.000601 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.3447 | Learning Rate: 0.000656 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.1717 | Learning Rate: 0.000710 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.0207 | Learning Rate: 0.000765 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 5.8894 | Learning Rate: 0.000820 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 5.7760 | Learning Rate: 0.000874 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 5.6691 | Learning Rate: 0.000929 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 5.5801 | Learning Rate: 0.000984 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 5.4966 | Learning Rate: 0.001038 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 5.4378 | Learning Rate: 0.001093 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 5.3694 | Learning Rate: 0.001148 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 5.3047 | Learning Rate: 0.001202 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 5.2459 | Learning Rate: 0.001257 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 5.2043 | Learning Rate: 0.001311 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.1614 | Learning Rate: 0.001366 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.1368 | Learning Rate: 0.001421 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.1008 | Learning Rate: 0.001475 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.0704 | Learning Rate: 0.001530 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.0396 | Learning Rate: 0.001585 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.0003 | Learning Rate: 0.001639 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 4.9766 | Learning Rate: 0.001694 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 4.9497 | Learning Rate: 0.001749 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 4.9111 | Learning Rate: 0.001803 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 4.8829 | Learning Rate: 0.001858 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 4.8473 | Learning Rate: 0.001913 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 4.8146 | Learning Rate: 0.001967 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 4.7772 | Learning Rate: 0.002000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 4.7472 | Learning Rate: 0.002000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 4.7187 | Learning Rate: 0.002000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 4.6828 | Learning Rate: 0.002000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 4.6462 | Learning Rate: 0.002000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 4.6200 | Learning Rate: 0.002000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 4.5935 | Learning Rate: 0.002000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 4.5665 | Learning Rate: 0.002000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 4.5387 | Learning Rate: 0.002000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 4.5196 | Learning Rate: 0.002000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 4.4917 | Learning Rate: 0.002000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 4.4736 | Learning Rate: 0.002000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 4.4533 | Learning Rate: 0.002000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 4.4259 | Learning Rate: 0.002000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 4.4059 | Learning Rate: 0.002000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 4.3857 | Learning Rate: 0.002000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 4.3734 | Learning Rate: 0.002000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 4.3439 | Learning Rate: 0.002000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 4.3231 | Learning Rate: 0.002000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 4.3073 | Learning Rate: 0.002000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 4.2979 | Learning Rate: 0.002000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 4.2882 | Learning Rate: 0.002000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 4.2723 | Learning Rate: 0.002000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 4.2589 | Learning Rate: 0.002000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 4.2554 | Learning Rate: 0.002000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 4.2379 | Learning Rate: 0.002000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 4.2197 | Learning Rate: 0.002000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 4.2049 | Learning Rate: 0.002000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 4.1866 | Learning Rate: 0.002000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 4.1737 | Learning Rate: 0.002000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 4.1542 | Learning Rate: 0.002000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 4.1315 | Learning Rate: 0.002000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 4.1219 | Learning Rate: 0.002000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 4.1012 | Learning Rate: 0.002000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 4.0825 | Learning Rate: 0.002000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 4.0805 | Learning Rate: 0.002000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 4.0604 | Learning Rate: 0.002000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 4.0432 | Learning Rate: 0.002000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 4.0303 | Learning Rate: 0.002000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 4.0192 | Learning Rate: 0.002000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 4.0168 | Learning Rate: 0.002000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 4.0065 | Learning Rate: 0.002000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 3.9941 | Learning Rate: 0.002000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 3.9804 | Learning Rate: 0.002000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 3.9638 | Learning Rate: 0.002000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 3.9497 | Learning Rate: 0.002000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 3.9219 | Learning Rate: 0.002000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 3.8983 | Learning Rate: 0.002000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 3.8879 | Learning Rate: 0.002000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 3.8784 | Learning Rate: 0.002000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 3.8679 | Learning Rate: 0.002000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 3.8603 | Learning Rate: 0.002000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 3.8418 | Learning Rate: 0.002000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 3.8300 | Learning Rate: 0.002000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 3.8225 | Learning Rate: 0.002000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 3.8138 | Learning Rate: 0.002000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 3.8059 | Learning Rate: 0.002000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 3.8000 | Learning Rate: 0.002000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 3.7888 | Learning Rate: 0.002000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 3.7760 | Learning Rate: 0.002000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 3.7691 | Learning Rate: 0.002000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 3.7668 | Learning Rate: 0.002000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 3.7544 | Learning Rate: 0.002000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 3.7428 | Learning Rate: 0.002000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 3.7332 | Learning Rate: 0.002000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 3.7252 | Learning Rate: 0.002000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 3.7095 | Learning Rate: 0.002000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 3.7004 | Learning Rate: 0.002000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 3.6994 | Learning Rate: 0.002000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 3.6936 | Learning Rate: 0.002000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 3.6911 | Learning Rate: 0.002000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 3.6789 | Learning Rate: 0.002000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 3.6779 | Learning Rate: 0.002000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 3.6668 | Learning Rate: 0.002000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 3.6614 | Learning Rate: 0.002000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 3.6547 | Learning Rate: 0.002000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 3.6564 | Learning Rate: 0.002000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 3.6531 | Learning Rate: 0.002000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 3.6408 | Learning Rate: 0.002000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 3.6366 | Learning Rate: 0.002000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 3.6305 | Learning Rate: 0.002000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 3.6273 | Learning Rate: 0.002000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 3.6257 | Learning Rate: 0.002000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 3.6254 | Learning Rate: 0.002000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 3.6228 | Learning Rate: 0.002000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 3.6105 | Learning Rate: 0.002000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 3.6060 | Learning Rate: 0.002000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 3.6027 | Learning Rate: 0.002000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 3.6005 | Learning Rate: 0.002000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 3.5936 | Learning Rate: 0.002000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 3.5922 | Learning Rate: 0.002000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 3.5764 | Learning Rate: 0.002000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 3.5673 | Learning Rate: 0.002000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 3.5692 | Learning Rate: 0.002000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 3.5735 | Learning Rate: 0.002000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 3.5586 | Learning Rate: 0.002000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 3.5602 | Learning Rate: 0.002000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 3.5609 | Learning Rate: 0.002000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 3.5442 | Learning Rate: 0.002000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 3.5420 | Learning Rate: 0.002000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 3.5382 | Learning Rate: 0.002000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 3.5387 | Learning Rate: 0.002000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 3.5438 | Learning Rate: 0.002000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 3.5328 | Learning Rate: 0.002000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 3.5349 | Learning Rate: 0.002000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 3.5293 | Learning Rate: 0.002000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 3.5397 | Learning Rate: 0.002000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 3.5321 | Learning Rate: 0.002000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 3.5342 | Learning Rate: 0.002000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 3.5308 | Learning Rate: 0.002000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 3.5310 | Learning Rate: 0.002000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 3.5265 | Learning Rate: 0.002000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 3.5224 | Learning Rate: 0.002000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 3.5152 | Learning Rate: 0.002000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 3.5086 | Learning Rate: 0.002000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 3.5014 | Learning Rate: 0.002000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 3.5034 | Learning Rate: 0.002000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 3.5083 | Learning Rate: 0.002000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 3.4982 | Learning Rate: 0.002000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 3.5043 | Learning Rate: 0.002000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 3.4995 | Learning Rate: 0.002000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 3.4976 | Learning Rate: 0.002000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 3.4855 | Learning Rate: 0.002000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 3.4917 | Learning Rate: 0.002000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 3.4852 | Learning Rate: 0.002000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 3.4671 | Learning Rate: 0.002000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 3.4624 | Learning Rate: 0.002000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 3.4597 | Learning Rate: 0.002000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 3.4633 | Learning Rate: 0.002000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 3.4656 | Learning Rate: 0.002000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 3.4603 | Learning Rate: 0.002000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 3.4700 | Learning Rate: 0.002000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 3.4605 | Learning Rate: 0.002000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 3.4632 | Learning Rate: 0.002000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 3.4635 | Learning Rate: 0.002000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 3.4559 | Learning Rate: 0.002000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 3.4551 | Learning Rate: 0.002000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 3.4500 | Learning Rate: 0.002000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 3.4490 | Learning Rate: 0.002000 | Progress: 0.71680 +Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 3.4534 | Learning Rate: 0.002000 | Progress: 0.72090 +Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 3.4600 | Learning Rate: 0.002000 | Progress: 0.72499 +Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 3.4570 | Learning Rate: 0.002000 | Progress: 0.72909 +Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 3.4552 | Learning Rate: 0.002000 | Progress: 0.73318 +Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 3.4390 | Learning Rate: 0.002000 | Progress: 0.73728 +Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 3.4231 | Learning Rate: 0.002000 | Progress: 0.74138 +Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 3.4239 | Learning Rate: 0.002000 | Progress: 0.74547 +Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 3.4207 | Learning Rate: 0.002000 | Progress: 0.74957 +Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 3.4268 | Learning Rate: 0.002000 | Progress: 0.75366 +Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 3.4199 | Learning Rate: 0.002000 | Progress: 0.75776 +Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 3.4099 | Learning Rate: 0.002000 | Progress: 0.76186 +Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 3.4032 | Learning Rate: 0.002000 | Progress: 0.76595 +Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 3.4101 | Learning Rate: 0.002000 | Progress: 0.77005 +Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 3.4147 | Learning Rate: 0.002000 | Progress: 0.77414 +Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 3.4144 | Learning Rate: 0.002000 | Progress: 0.77824 +Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 3.4161 | Learning Rate: 0.002000 | Progress: 0.78234 +Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 3.4180 | Learning Rate: 0.002000 | Progress: 0.78643 +Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 3.4145 | Learning Rate: 0.002000 | Progress: 0.79053 +Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 3.4145 | Learning Rate: 0.002000 | Progress: 0.79462 +Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 3.4144 | Learning Rate: 0.002000 | Progress: 0.79872 +Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 3.4159 | Learning Rate: 0.002000 | Progress: 0.80282 +Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 3.4036 | Learning Rate: 0.002000 | Progress: 0.80691 +Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 3.4061 | Learning Rate: 0.002000 | Progress: 0.81101 +Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 3.4076 | Learning Rate: 0.002000 | Progress: 0.81510 +Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 3.4003 | Learning Rate: 0.002000 | Progress: 0.81920 +Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 3.4058 | Learning Rate: 0.002000 | Progress: 0.82330 +Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 3.4012 | Learning Rate: 0.002000 | Progress: 0.82739 +Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 3.4007 | Learning Rate: 0.002000 | Progress: 0.83149 +Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 3.3959 | Learning Rate: 0.002000 | Progress: 0.83558 +Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 3.3893 | Learning Rate: 0.002000 | Progress: 0.83968 +Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 3.3900 | Learning Rate: 0.002000 | Progress: 0.84378 +Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 3.3895 | Learning Rate: 0.002000 | Progress: 0.84787 +Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 3.3911 | Learning Rate: 0.002000 | Progress: 0.85197 +Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 3.3914 | Learning Rate: 0.002000 | Progress: 0.85606 +Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 3.3955 | Learning Rate: 0.002000 | Progress: 0.86016 +Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 3.3941 | Learning Rate: 0.002000 | Progress: 0.86426 +Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 3.3914 | Learning Rate: 0.002000 | Progress: 0.86835 +Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 3.3933 | Learning Rate: 0.002000 | Progress: 0.87245 +Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 3.3897 | Learning Rate: 0.002000 | Progress: 0.87654 +Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 3.3912 | Learning Rate: 0.002000 | Progress: 0.88064 +Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 3.3792 | Learning Rate: 0.002000 | Progress: 0.88474 +Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 3.3773 | Learning Rate: 0.002000 | Progress: 0.88883 +Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 3.3713 | Learning Rate: 0.002000 | Progress: 0.89293 +Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 3.3718 | Learning Rate: 0.002000 | Progress: 0.89702 +Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 3.3694 | Learning Rate: 0.002000 | Progress: 0.90112 +Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 3.3679 | Learning Rate: 0.002000 | Progress: 0.90522 +Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 3.3653 | Learning Rate: 0.002000 | Progress: 0.90931 +Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 3.3675 | Learning Rate: 0.002000 | Progress: 0.91341 +Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 3.3695 | Learning Rate: 0.002000 | Progress: 0.91750 +Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 3.3628 | Learning Rate: 0.002000 | Progress: 0.92160 +Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 3.3585 | Learning Rate: 0.002000 | Progress: 0.92570 +Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 3.3567 | Learning Rate: 0.002000 | Progress: 0.92979 +Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 3.3571 | Learning Rate: 0.002000 | Progress: 0.93389 +Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 3.3574 | Learning Rate: 0.002000 | Progress: 0.93798 +Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 3.3626 | Learning Rate: 0.002000 | Progress: 0.94208 +Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 3.3529 | Learning Rate: 0.002000 | Progress: 0.94618 +Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 3.3479 | Learning Rate: 0.002000 | Progress: 0.95027 +Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 3.3473 | Learning Rate: 0.002000 | Progress: 0.95437 +Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 3.3493 | Learning Rate: 0.002000 | Progress: 0.95846 +Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 3.3445 | Learning Rate: 0.002000 | Progress: 0.96256 +Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 3.3409 | Learning Rate: 0.002000 | Progress: 0.96666 +Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 3.3432 | Learning Rate: 0.002000 | Progress: 0.97075 +Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 3.3464 | Learning Rate: 0.002000 | Progress: 0.97485 +Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 3.3457 | Learning Rate: 0.002000 | Progress: 0.97894 +Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 3.3382 | Learning Rate: 0.002000 | Progress: 0.98304 +Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 3.3435 | Learning Rate: 0.002000 | Progress: 0.98714 +Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 3.3362 | Learning Rate: 0.002000 | Progress: 0.99123 +Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 3.3348 | Learning Rate: 0.002000 | Progress: 0.99533 +Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 3.3306 | Learning Rate: 0.002000 | Progress: 0.99942 diff --git a/wandb/run-20251231_041335-rcpwhdwq/files/requirements.txt b/wandb/run-20251231_041335-rcpwhdwq/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c34564dd28dff4e5b3cc658bec9331c931574195 --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/files/requirements.txt @@ -0,0 +1,222 @@ +comm==0.2.3 +pandas==2.3.3 +Jinja2==3.1.6 +circuitsvis==1.43.3 +httpcore==1.0.9 +charset-normalizer==3.4.4 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +ipympl==0.9.8 +bleach==6.3.0 +pyparsing==3.2.5 +bokeh==3.8.1 +cycler==0.12.1 +fonttools==4.61.0 +nvidia-nccl-cu12==2.27.5 +stack-data==0.6.3 +jupyter_server==2.17.0 +aiosignal==1.4.0 +nvidia-nvshmem-cu12==3.3.20 +websocket-client==1.9.0 +lark==1.3.1 +platformdirs==4.5.0 +ptyprocess==0.7.0 +pydantic_core==2.41.5 +mpmath==1.3.0 +seaborn==0.13.2 +wadler_lindig==0.1.7 +jupyterlab==4.5.0 +nbformat==5.10.4 +joblib==1.5.2 +fsspec==2025.10.0 +sentencepiece==0.2.1 +tqdm==4.67.1 +contourpy==1.3.3 +virtualenv==20.35.4 +coverage==7.12.0 +referencing==0.37.0 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +click==8.3.1 +tokenizers==0.22.1 +mypy==1.19.0 +jupyter-events==0.12.0 +filelock==3.20.0 +ruff==0.14.7 +accelerate==1.12.0 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +threadpoolctl==3.6.0 +identify==2.6.15 +smmap==5.0.2 +xxhash==3.6.0 +nbconvert==7.16.6 +protobuf==6.33.1 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +requests==2.32.5 +tinycss2==1.4.0 +ipywidgets==8.1.8 +defusedxml==0.7.1 +dill==0.4.0 +asttokens==3.0.1 +jedi==0.19.2 +pillow==12.0.0 +pathspec==0.12.1 +transformer-lens==2.16.1 +sympy==1.14.0 +arrow==1.4.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +jupyter==1.1.1 +wandb==0.23.0 +Markdown==3.10 +beautifulsoup4==4.14.3 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +rfc3987-syntax==1.1.0 +safetensors==0.7.0 +h5py==3.15.1 +annotated-types==0.7.0 +webencodings==0.5.1 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +Pygments==2.19.2 +widgetsnbextension==4.0.15 +rfc3339-validator==0.1.4 +wcwidth==0.2.14 +urllib3==2.5.0 +certifi==2025.11.12 +typing-inspection==0.4.2 +nvidia-cudnn-cu12==9.10.2.21 +babel==2.17.0 +pure_eval==0.2.3 +nvidia-cublas-cu12==12.8.4.1 +pycparser==2.23 +transformers==4.57.3 +narwhals==2.13.0 +jupyter_core==5.9.1 +soupsieve==2.8 +pytest-cov==7.0.0 +yarl==1.22.0 +packaging==25.0 +ipykernel==7.1.0 +h11==0.16.0 +pexpect==4.9.0 +zstandard==0.25.0 +gitdb==4.0.12 +triton==3.5.1 +rfc3986-validator==0.1.1 +pyzmq==27.1.0 +toy_models==0.1.0 +cffi==2.0.0 +mypy_extensions==1.1.0 +matplotlib-inline==0.2.1 +statsmodels==0.14.6 +hf-xet==1.2.0 +python-dotenv==1.2.1 +fqdn==1.5.1 +async-lru==2.0.5 +GitPython==3.1.45 +pyarrow==22.0.0 +debugpy==1.8.17 +jaxtyping==0.3.3 +rpds-py==0.30.0 +pre_commit==4.5.0 +httpx==0.28.1 +pytest==9.0.1 +setuptools==80.9.0 +huggingface-hub==0.36.0 +argon2-cffi==25.1.0 +patsy==1.0.2 +plotly==6.5.0 +anyio==4.12.0 +pyviz_comms==3.0.6 +networkx==3.6 +scikit-learn==1.8.0 +beartype==0.14.1 +markdown-it-py==4.0.0 +librt==0.6.3 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +idna==3.11 +nodeenv==1.9.1 +nvidia-curand-cu12==10.3.9.90 +torch==2.9.1 +nvidia-cusolver-cu12==11.7.3.90 +Send2Trash==1.8.3 +webcolors==25.10.0 +multiprocess==0.70.18 +holoviews==1.22.0 +MarkupSafe==3.0.3 +jupyter_server_terminals==0.5.3 +attrs==25.4.0 +notebook==7.5.0 +json5==0.12.1 +psutil==7.1.3 +pydantic==2.12.5 +linkify-it-py==2.0.3 +torchaudio==2.9.1 +importlib_metadata==8.7.0 +nvidia-cufft-cu12==11.3.3.83 +distlib==0.4.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +jsonschema==4.25.1 +typing_extensions==4.15.0 +param==2.3.1 +ipython==9.7.0 +einops==0.8.1 +aiohttp==3.13.2 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +tabulate==0.9.0 +multidict==6.7.0 +tornado==6.5.2 +rich==14.2.0 +typeguard==4.4.4 +regex==2025.11.3 +prometheus_client==0.23.1 +tomlkit==0.13.2 +python-json-logger==4.0.0 +pluggy==1.6.0 +scipy==1.16.3 +jupyterlab_server==2.28.0 +zipp==3.23.0 +fancy-einsum==0.0.3 +fastjsonschema==2.21.2 +executing==2.2.1 +uc-micro-py==1.0.3 +propcache==0.4.1 +datasets==4.4.1 +mistune==3.1.4 +tzdata==2025.2 +parso==0.8.5 +kiwisolver==1.4.9 +torchvision==0.24.1 +sentry-sdk==2.46.0 +jupyter-lsp==2.3.0 +matplotlib==3.10.7 +panel==1.8.3 +prompt_toolkit==3.0.52 +jsonschema-specifications==2025.9.1 +mdit-py-plugins==0.5.0 +transformers-stream-generator==0.0.5 +xyzservices==2025.11.0 +nvidia-cusparselt-cu12==0.7.1 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +six==1.17.0 +iniconfig==2.3.0 +colorcet==3.1.0 +cfgv==3.5.0 +jupyterlab_widgets==3.0.16 +uri-template==1.3.0 +PyYAML==6.0.3 +frozenlist==1.8.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +traitlets==5.14.3 diff --git a/wandb/run-20251231_041335-rcpwhdwq/files/wandb-metadata.json b/wandb/run-20251231_041335-rcpwhdwq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ada44f089aab3355635e8044ae585f2e796a74 --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-12-31T04:13:35.480922Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "4f9f8fa4b099afc1076224d155d6c8a3785cb4d0" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2", + "host": "ntz6bbfjhf", + "executable": "/notebooks/toy_models/.toy_models_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "152930381824" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-d804d0d5-d870-1347-486a-9c6017366349" + } + ], + "cudaVersion": "12.4", + "writerId": "t15uolpoxil7dt93apt5qafwy48pcz9t" +} \ No newline at end of file diff --git a/wandb/run-20251231_041335-rcpwhdwq/files/wandb-summary.json b/wandb/run-20251231_041335-rcpwhdwq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5bcdbae3c260b97c57c07c98b93e4102912bb435 --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/files/wandb-summary.json @@ -0,0 +1 @@ +{"learning_rate":0.002,"progress":0.999424,"train_loss":3.145951271057129,"tokens_seen":199884800,"train_loss_ewma":3.3306438650925676,"tokens_per_second":32768,"_step":6100,"_wandb":{"runtime":3029},"_runtime":3029.760863604,"step":6100,"_timestamp":1.767157444450835e+09} \ No newline at end of file diff --git a/wandb/run-20251231_041335-rcpwhdwq/logs/debug-internal.log b/wandb/run-20251231_041335-rcpwhdwq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..dbca95af5d1baa491685242c0752cab44fb14abc --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-31T04:13:36.706447837Z","level":"INFO","msg":"stream: starting","core version":"0.23.0"} +{"time":"2025-12-31T04:13:36.934918403Z","level":"INFO","msg":"stream: created new stream","id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.935017542Z","level":"INFO","msg":"handler: started","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.93542806Z","level":"INFO","msg":"stream: started","id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.935448923Z","level":"INFO","msg":"writer: started","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T04:13:36.935464456Z","level":"INFO","msg":"sender: started","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.32044204Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-31T05:04:07.451349813Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-31T05:04:07.458273419Z","level":"INFO","msg":"stream: closing","id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.458313318Z","level":"INFO","msg":"handler: closed","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.458379979Z","level":"INFO","msg":"sender: closed","stream_id":"rcpwhdwq"} +{"time":"2025-12-31T05:04:07.45838614Z","level":"INFO","msg":"stream: closed","id":"rcpwhdwq"} diff --git a/wandb/run-20251231_041335-rcpwhdwq/logs/debug.log b/wandb/run-20251231_041335-rcpwhdwq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..290ba5295b71336c5f7da4f72cf5c24e3c18c9be --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/logs/debug.log @@ -0,0 +1,26 @@ +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Current SDK version is 0.23.0 +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Configure stats pid to 27729 +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2/wandb/settings +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_init.py:setup_run_log_directory():713] Logging user logs to /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2/wandb/run-20251231_041335-rcpwhdwq/logs/debug.log +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_init.py:setup_run_log_directory():714] Logging internal logs to /notebooks/toy_models/model_training/pile_llama_replace_17367_dataset_name_PL_Replace17367_L2/wandb/run-20251231_041335-rcpwhdwq/logs/debug-internal.log +2025-12-31 04:13:35,490 INFO MainThread:27729 [wandb_init.py:init():840] calling init triggers +2025-12-31 04:13:35,491 INFO MainThread:27729 [wandb_init.py:init():845] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_replace_17367', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/PL_Replace17367_L2', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-12-31 04:13:35,491 INFO MainThread:27729 [wandb_init.py:init():888] starting backend +2025-12-31 04:13:36,679 INFO MainThread:27729 [wandb_init.py:init():891] sending inform_init request +2025-12-31 04:13:36,704 INFO MainThread:27729 [wandb_init.py:init():899] backend started and connected +2025-12-31 04:13:36,705 INFO MainThread:27729 [wandb_init.py:init():969] updated telemetry +2025-12-31 04:13:36,977 INFO MainThread:27729 [wandb_init.py:init():993] communicating run to backend with 90.0 second timeout +2025-12-31 04:13:37,188 INFO MainThread:27729 [wandb_init.py:init():1040] starting run threads in backend +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_console_start():2504] atexit reg +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_redirect():2352] redirect: wrap_raw +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_redirect():2421] Wrapping output streams. +2025-12-31 04:13:37,990 INFO MainThread:27729 [wandb_run.py:_redirect():2444] Redirects installed. +2025-12-31 04:13:38,000 INFO MainThread:27729 [wandb_init.py:init():1080] run started, returning control to user process +2025-12-31 05:04:06,859 INFO MainThread:27729 [wandb_run.py:_finish():2270] finishing run eoin/toy-transformer-replication/rcpwhdwq +2025-12-31 05:04:06,866 INFO MainThread:27729 [wandb_run.py:_atexit_cleanup():2469] got exitcode: 0 +2025-12-31 05:04:06,866 INFO MainThread:27729 [wandb_run.py:_restore():2451] restore +2025-12-31 05:04:06,866 INFO MainThread:27729 [wandb_run.py:_restore():2457] restore done +2025-12-31 05:04:07,456 INFO MainThread:27729 [wandb_run.py:_footer_sync_info():3853] logging synced files diff --git a/wandb/run-20251231_041335-rcpwhdwq/run-rcpwhdwq.wandb b/wandb/run-20251231_041335-rcpwhdwq/run-rcpwhdwq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f88040e12b246dee7b213d81bf4f2a826e3b3ca3 --- /dev/null +++ b/wandb/run-20251231_041335-rcpwhdwq/run-rcpwhdwq.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39b6533ea50b7db099e8e9bc5b7c4bc9ab0c561c91cb5da0571593a68c31d546 +size 3651115