diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..3404359a88ca8680f1d67d2122e025f78a03e1bf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20251201_125310-vmyfcav3/run-vmyfcav3.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc16fe61f78a586c44e5d65df4e18ce096edbcd --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.880701065063477} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..3b020b18baec71e9b01658936a7e3b265e16af6c --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.874085762320984} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2600714953abeceb354f41c8317ad340a1a187 --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.871655939854023} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..824332d2662ed07d9914dde365e01d441e0fff26 --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.86451177288292} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..46d5fda7e1dc3555589c296441bcc5051e65bfcd --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.860605996889932} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..70103db35d4d284a0a0cffc4acad248183472817 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.856088806015881} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..3f77cb8ba77ba4f527470c20a55e119223b9bcf9 --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.845433728844982} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json new file mode 100644 index 0000000000000000000000000000000000000000..b4ba142461808f79f3c14748e313d17a05767669 --- /dev/null +++ b/checkpoints/metadata_000000622592.json @@ -0,0 +1 @@ +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.833798608642553} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..4390498d5010783a72b16992b027a6c133157e2b --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.820115133647766} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..c167081354d2057e063c18d65a1eb351ed2bdc0d --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.80443918042839} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..472428394db9f9fe216a22c616cbd8aa37997828 --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.788736199243115} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json new file mode 100644 index 0000000000000000000000000000000000000000..50851cc2d09a5be16c1bb57042bbbd755618a90a --- /dev/null +++ b/checkpoints/metadata_000000917504.json @@ -0,0 +1 @@ +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.762629465300208} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json new file mode 100644 index 0000000000000000000000000000000000000000..9abd5bdfefac0471f454652ad8e1e7ebe0ca535d --- /dev/null +++ b/checkpoints/metadata_000000983040.json @@ -0,0 +1 @@ +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.743774321096343} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..bc0ffafbb14ba8f32b7c899bcff87bfdf2412b0a --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.704827108182956} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..29c6dc48175ca9e22ce59c609194389da6cf4e6a --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.675236911659713} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json new file mode 100644 index 0000000000000000000000000000000000000000..1f31caa8ce84c011388903b4d283a55d65b9fa4c --- /dev/null +++ b/checkpoints/metadata_000001343488.json @@ -0,0 +1 @@ +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.632936919868332} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd0908913984f0edcac900be06ec1a7c10fe204 --- /dev/null +++ b/checkpoints/metadata_000001474560.json @@ -0,0 +1 @@ +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.585650668632995} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json new file mode 100644 index 0000000000000000000000000000000000000000..6deecafe93e70ea03ff73b070a04d4f129eda115 --- /dev/null +++ b/checkpoints/metadata_000001605632.json @@ -0,0 +1 @@ +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.532807250231963} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..f2da2510edadcc4b8ed37c98f6060fc6e930be5c --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.46160945653635} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json new file mode 100644 index 0000000000000000000000000000000000000000..13bfb15a028951c1ce4c469a9e5d2893b4a839a4 --- /dev/null +++ b/checkpoints/metadata_000001966080.json @@ -0,0 +1 @@ +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.364225655611309} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json new file mode 100644 index 0000000000000000000000000000000000000000..caa9573ee83aa3686e51cd0035c5530e67f49768 --- /dev/null +++ b/checkpoints/metadata_000002162688.json @@ -0,0 +1 @@ +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.256680661593931} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json new file mode 100644 index 0000000000000000000000000000000000000000..bf4688d1316139df02d9e5db1960c2c0c513b199 --- /dev/null +++ b/checkpoints/metadata_000002359296.json @@ -0,0 +1 @@ +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.137002222766471} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..0c6271d227d850b0bad603e2cdc889b19fbc30e5 --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.968184119798677} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json new file mode 100644 index 0000000000000000000000000000000000000000..be0b46a93191a059d10f407e536f56922e58cb30 --- /dev/null +++ b/checkpoints/metadata_000002883584.json @@ -0,0 +1 @@ +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.78487706300749} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json new file mode 100644 index 0000000000000000000000000000000000000000..916b591f20634ddeba7ba866d6839319525dbd54 --- /dev/null +++ b/checkpoints/metadata_000003178496.json @@ -0,0 +1 @@ +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.577905105539879} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json new file mode 100644 index 0000000000000000000000000000000000000000..97ca47ae302c534666194137f895156893e5eeb4 --- /dev/null +++ b/checkpoints/metadata_000003473408.json @@ -0,0 +1 @@ +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.367795659755558} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json new file mode 100644 index 0000000000000000000000000000000000000000..859f95e67ce5f7ad15afeb738a26460064e1b45e --- /dev/null +++ b/checkpoints/metadata_000003833856.json @@ -0,0 +1 @@ +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.124259551285764} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json new file mode 100644 index 0000000000000000000000000000000000000000..449151e0a59cb904be28756d252896991593bacb --- /dev/null +++ b/checkpoints/metadata_000004227072.json @@ -0,0 +1 @@ +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.870777312717207} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json new file mode 100644 index 0000000000000000000000000000000000000000..d75130571a261b1db5133032672263df9cba4390 --- /dev/null +++ b/checkpoints/metadata_000004653056.json @@ -0,0 +1 @@ +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.620178344647027} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json new file mode 100644 index 0000000000000000000000000000000000000000..e1e86cef1b7eba5402e3c64516fad71f9996b301 --- /dev/null +++ b/checkpoints/metadata_000005111808.json @@ -0,0 +1 @@ +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.386278807249074} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json new file mode 100644 index 0000000000000000000000000000000000000000..cc3f0e9b83778c19390b152acd9e485f10428d30 --- /dev/null +++ b/checkpoints/metadata_000005603328.json @@ -0,0 +1 @@ +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.162539927160653} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json new file mode 100644 index 0000000000000000000000000000000000000000..423c0698b5e7f320be867fc41041c1384e8b874b --- /dev/null +++ b/checkpoints/metadata_000006193152.json @@ -0,0 +1 @@ +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.926514569834904} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json new file mode 100644 index 0000000000000000000000000000000000000000..65b21f16f4f601217bcf2d29ebf72bb03514fbd3 --- /dev/null +++ b/checkpoints/metadata_000006782976.json @@ -0,0 +1 @@ +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.733811820927238} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json new file mode 100644 index 0000000000000000000000000000000000000000..d9e742ba33d7f3ec5ed8e92d50fbcf671bdc4cd3 --- /dev/null +++ b/checkpoints/metadata_000007471104.json @@ -0,0 +1 @@ +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.532752542822307} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json new file mode 100644 index 0000000000000000000000000000000000000000..0cef5d6a671b3dd961327769bfd1c4caf6599bbd --- /dev/null +++ b/checkpoints/metadata_000008224768.json @@ -0,0 +1 @@ +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.360202823974224} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json new file mode 100644 index 0000000000000000000000000000000000000000..c199cfdfeecb2ddff3ba04478988188bc0510434 --- /dev/null +++ b/checkpoints/metadata_000009043968.json @@ -0,0 +1 @@ +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.203661029297021} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json new file mode 100644 index 0000000000000000000000000000000000000000..40d8e0a2586513efabbf69ead1197d0b4eb80e86 --- /dev/null +++ b/checkpoints/metadata_000009961472.json @@ -0,0 +1 @@ +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.059855035553507} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json new file mode 100644 index 0000000000000000000000000000000000000000..031680e53f04147c34d1edcd1cac78ce9226b727 --- /dev/null +++ b/checkpoints/metadata_000010944512.json @@ -0,0 +1 @@ +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.946378966720816} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2026fb484e169b1ac9bcbee39c95412d6ee682 --- /dev/null +++ b/checkpoints/metadata_000012058624.json @@ -0,0 +1 @@ +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.843224700465778} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json new file mode 100644 index 0000000000000000000000000000000000000000..c3252ca10f849c06916f41821f53f2f504c3d11d --- /dev/null +++ b/checkpoints/metadata_000013271040.json @@ -0,0 +1 @@ +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.759152331680156} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a75f6c5d309f2440da2555627f2d37c36e87ac --- /dev/null +++ b/checkpoints/metadata_000014581760.json @@ -0,0 +1 @@ +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.684683880940126} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json new file mode 100644 index 0000000000000000000000000000000000000000..5b021d66bd62916fdf7b907b7fa4677781785e34 --- /dev/null +++ b/checkpoints/metadata_000016056320.json @@ -0,0 +1 @@ +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.611764182726821} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..398e64b95cd64c2c631a80ea4c79ae92505b2ff5 --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.607439684145686} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json new file mode 100644 index 0000000000000000000000000000000000000000..d6fef1eddf6d2a427c71bc637807f531d7064975 --- /dev/null +++ b/checkpoints/metadata_000017661952.json @@ -0,0 +1 @@ +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.561271751261685} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json new file mode 100644 index 0000000000000000000000000000000000000000..e28fc68be0af50b8ef8cc380bd43e55a249487ca --- /dev/null +++ b/checkpoints/metadata_000019431424.json @@ -0,0 +1 @@ +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.5113422043339595} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json new file mode 100644 index 0000000000000000000000000000000000000000..c20cdfac5a7ad95fe98c4eb05eb2602e265791cd --- /dev/null +++ b/checkpoints/metadata_000021364736.json @@ -0,0 +1 @@ +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.478097135763412} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json new file mode 100644 index 0000000000000000000000000000000000000000..810c0798dceba5d1c88b571105e83edc765ef447 --- /dev/null +++ b/checkpoints/metadata_000023494656.json @@ -0,0 +1 @@ +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.440018214984044} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json new file mode 100644 index 0000000000000000000000000000000000000000..d79466a9eeda8b323da5e5c1b5ed4d35502384f7 --- /dev/null +++ b/checkpoints/metadata_000025853952.json @@ -0,0 +1 @@ +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.393019462761252} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json new file mode 100644 index 0000000000000000000000000000000000000000..ddc1a335a9e4f950f76be45f13d230c691eaa060 --- /dev/null +++ b/checkpoints/metadata_000028442624.json @@ -0,0 +1 @@ +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.343419669475438} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json new file mode 100644 index 0000000000000000000000000000000000000000..368c5bfb3f807d5b1215b0fd26da299527902f22 --- /dev/null +++ b/checkpoints/metadata_000031293440.json @@ -0,0 +1 @@ +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.302454665719291} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..28b6d493f55982a18b5b202a6ed03ab7f9fcc4fd --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.285215335984246} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json new file mode 100644 index 0000000000000000000000000000000000000000..2b19b29bb4a8f2cf3f7b9525b1316a1f5cf2f45d --- /dev/null +++ b/checkpoints/metadata_000034439168.json @@ -0,0 +1 @@ +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.258494130954357} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2beb3babe6f87b678c589c186bb1d0f5ce9e1 --- /dev/null +++ b/checkpoints/metadata_000037879808.json @@ -0,0 +1 @@ +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.219525202924349} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json new file mode 100644 index 0000000000000000000000000000000000000000..c02d900c238a6c3db5c3fc2cf93ae3ca99f2a29f --- /dev/null +++ b/checkpoints/metadata_000041648128.json @@ -0,0 +1 @@ +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.180814860873083} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json new file mode 100644 index 0000000000000000000000000000000000000000..368c5ab252b3d4a0ca45fbe22ecdbf35931fac71 --- /dev/null +++ b/checkpoints/metadata_000045842432.json @@ -0,0 +1 @@ +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.142902416609009} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..92648e19fd07eade3add513635d29fc620c064c9 --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.1204380364008575} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json new file mode 100644 index 0000000000000000000000000000000000000000..2c255ca347125300247c29cf2141b00b00738501 --- /dev/null +++ b/checkpoints/metadata_000050397184.json @@ -0,0 +1 @@ +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.122008556642273} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json new file mode 100644 index 0000000000000000000000000000000000000000..a69206cf7ac0086f3209d057ec28c39077b1bff7 --- /dev/null +++ b/checkpoints/metadata_000055443456.json @@ -0,0 +1 @@ +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.9937206925296325} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json new file mode 100644 index 0000000000000000000000000000000000000000..c16d0cd2634671ecbe9132064077cfed3613a6a2 --- /dev/null +++ b/checkpoints/metadata_000061014016.json @@ -0,0 +1 @@ +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.820489241870171} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..3c2b03fbe357dd73b81dabebf3fcecc2ef7f2a76 --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.758742714157343} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json new file mode 100644 index 0000000000000000000000000000000000000000..e2f1ef19043b58a187caabcf642b79b8c6492ab3 --- /dev/null +++ b/checkpoints/metadata_000067108864.json @@ -0,0 +1 @@ +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.743611303736589} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json new file mode 100644 index 0000000000000000000000000000000000000000..f957ccbe8ac8e3d4e17c93e4a2871d30b51f893e --- /dev/null +++ b/checkpoints/metadata_000073826304.json @@ -0,0 +1 @@ +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.640187058171916} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json new file mode 100644 index 0000000000000000000000000000000000000000..b66319321f1eace39865259c707c592e757abea3 --- /dev/null +++ b/checkpoints/metadata_000081199104.json @@ -0,0 +1 @@ +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.584639670534244} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..6bc246b868e94b6778abc63bb60b12bdff2b4cff --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.577965609636758} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json new file mode 100644 index 0000000000000000000000000000000000000000..094e0a71e5087242e92ee6b2d80d8fb3bd93473e --- /dev/null +++ b/checkpoints/metadata_000089325568.json @@ -0,0 +1 @@ +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.5536165674343385} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json new file mode 100644 index 0000000000000000000000000000000000000000..ab06556ce5052112a9615a294948151111de003a --- /dev/null +++ b/checkpoints/metadata_000098271232.json @@ -0,0 +1 @@ +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.528572409943393} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..682625a018e514c2b253d490d1e2ac6a1b1a5053 --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.527903437514796} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json new file mode 100644 index 0000000000000000000000000000000000000000..f343de9c8336b9d4049c1717ec0395482b15540b --- /dev/null +++ b/checkpoints/metadata_000108068864.json @@ -0,0 +1 @@ +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.474267953908344} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..592d54afffd9aa2f6b354a3e96ad10e59f06be95 --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.451178116069364} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json new file mode 100644 index 0000000000000000000000000000000000000000..f18551fec2dc49fa43fc69037383ef45dde3552a --- /dev/null +++ b/checkpoints/metadata_000118882304.json @@ -0,0 +1 @@ +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.475987472038353} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json new file mode 100644 index 0000000000000000000000000000000000000000..15567c1b78263fb93b3f23710dc4b5003970fd32 --- /dev/null +++ b/checkpoints/metadata_000130777088.json @@ -0,0 +1 @@ +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.452874879120459} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..7f12ee84cb347c82ec18fcfd8973c0ca11161274 --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.461579984809138} \ No newline at end of file diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json new file mode 100644 index 0000000000000000000000000000000000000000..13c03325c5c090fe0696623026551d683eaabe9d --- /dev/null +++ b/checkpoints/metadata_000143851520.json @@ -0,0 +1 @@ +{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.436226535154103} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json new file mode 100644 index 0000000000000000000000000000000000000000..7ad5c49bb41d406016bab4081476ba5faec07fc2 --- /dev/null +++ b/checkpoints/metadata_000147456000.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.429161529630855} \ No newline at end of file diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json new file mode 100644 index 0000000000000000000000000000000000000000..8a480f0d9a2f6e605d93942ade964322a0bd2101 --- /dev/null +++ b/checkpoints/metadata_000158269440.json @@ -0,0 +1 @@ +{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.430090383958139} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json new file mode 100644 index 0000000000000000000000000000000000000000..632f7b4c03159350132aac3a2a7538571010a346 --- /dev/null +++ b/checkpoints/metadata_000163840000.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.415997648967172} \ No newline at end of file diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json new file mode 100644 index 0000000000000000000000000000000000000000..3a06ac873157863c9b6705a9ba874ccfaf782779 --- /dev/null +++ b/checkpoints/metadata_000174096384.json @@ -0,0 +1 @@ +{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.429205909344167} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json new file mode 100644 index 0000000000000000000000000000000000000000..ae3cb9eaaa2d7fbbff8f91856ae82d469b7b8a5e --- /dev/null +++ b/checkpoints/metadata_000180224000.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.401127455728999} \ No newline at end of file diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json new file mode 100644 index 0000000000000000000000000000000000000000..b16935f9524fb79d2770c04d5f084dc7658354d6 --- /dev/null +++ b/checkpoints/metadata_000191496192.json @@ -0,0 +1 @@ +{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.381930973428213} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json new file mode 100644 index 0000000000000000000000000000000000000000..241139a9b46c279ea3061b45c37850d8c8322514 --- /dev/null +++ b/checkpoints/metadata_000196608000.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.387779490557722} \ No newline at end of file diff --git a/checkpoints/metadata_000196706304.json b/checkpoints/metadata_000196706304.json new file mode 100644 index 0000000000000000000000000000000000000000..9d85499a921fc5b0bdd76e618ede15e6b6743945 --- /dev/null +++ b/checkpoints/metadata_000196706304.json @@ -0,0 +1 @@ +{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.389660908312495} \ No newline at end of file diff --git a/checkpoints/metadata_000197361664.json b/checkpoints/metadata_000197361664.json new file mode 100644 index 0000000000000000000000000000000000000000..54822ec880f1f77e55c99dcb7c589c650c2c7f51 --- /dev/null +++ b/checkpoints/metadata_000197361664.json @@ -0,0 +1 @@ +{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.38753160695625} \ No newline at end of file diff --git a/checkpoints/metadata_000198017024.json b/checkpoints/metadata_000198017024.json new file mode 100644 index 0000000000000000000000000000000000000000..5e78823d6c50da0bedb232385457202816e54773 --- /dev/null +++ b/checkpoints/metadata_000198017024.json @@ -0,0 +1 @@ +{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.3960716516071905} \ No newline at end of file diff --git a/checkpoints/metadata_000198672384.json b/checkpoints/metadata_000198672384.json new file mode 100644 index 0000000000000000000000000000000000000000..9739808b73303859aac867cf47d409ec9f135d32 --- /dev/null +++ b/checkpoints/metadata_000198672384.json @@ -0,0 +1 @@ +{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.383908843416934} \ No newline at end of file diff --git a/checkpoints/metadata_000199327744.json b/checkpoints/metadata_000199327744.json new file mode 100644 index 0000000000000000000000000000000000000000..110ae1c517bcc6ca340d902c77af01ce6c788746 --- /dev/null +++ b/checkpoints/metadata_000199327744.json @@ -0,0 +1 @@ +{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.394206951405519} \ No newline at end of file diff --git a/checkpoints/metadata_000199950336.json b/checkpoints/metadata_000199950336.json new file mode 100644 index 0000000000000000000000000000000000000000..079dee70be72f28ab15eb3de19d272e12501c02e --- /dev/null +++ b/checkpoints/metadata_000199950336.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.384666312222928} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..03511d91e914d121706074672ed3717e4b31658a --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ce67efbdfc3c9279831326e38cbb39268eb87b8627e4d8e3239dbba09c7ac9 +size 225208789 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8d16439fc1c1c68d0f7b5b875c5480a5ac901d5 --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12df0a8fd8902db47f090aa6d2e2d2b67769c67e61ea6cc60fe2f88b853cf0df +size 225208789 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..19601a6ea574a0824700f7ff4880728870ea50a4 --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a5c1b07ad12e2e4c58dfda2faa23721a24bbee3b4194895653b5a17450292bb +size 225208789 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5e189e82fc56929fe820ba25db01852077a6a27 --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c699ab12a803dbebbb1622cc72f3ceb99d64aa83b7ff94cf6d38d445df80580f +size 225208789 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..08211d98d4ba9723224e5d5565c8ff5ee3de2162 --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6358b2f93f6f735ca6db8e0a902b32091b72a677ea8797d0b5d478c6fb5c8744 +size 225208789 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..48d070bdcc3c71984f30d083d3b6b22d14e3756b --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f66461409c54a3f4d8ac639b87a9b2c61ad948b091e2e42e253c25c575602506 +size 225208789 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..82c61f0460d01e011427679945de954e08c4ed8b --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dea365bbc7cbf32c583bc058a0ac87e09e1cb298d928aaf51def3019cff6124 +size 225208789 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa9b543d1d12939e432d396e252543493311560a --- /dev/null +++ b/checkpoints/model_weights_000000622592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42c70ea0ad6be7c2b4e477bf35abd6dde90d9985f2805c44332fe3c784290e1 +size 225208789 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..3afda948bb0f11ca86c84ff04e6cd24144776e8a --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2845a554c7923d8d91a7ae162f4845ea4e1275432ce565994fd239c47a6063d +size 225208789 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f9f35a8f9f6bd08233c2c5daed09923c850e24 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd1656e15e9a8a30fd6dbbe273a820eeca19df1cccca00727fe162502b636c52 +size 225208789 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..4981e4c5efe7d38aba5c42e1db4e89a606b59a32 --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bf30d860de2f8164c4f5adef9238062de83687284b7d35d978f899d0a8d309d +size 225208789 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d5db6af699d47c283a0980abca267570078a50 --- /dev/null +++ b/checkpoints/model_weights_000000917504.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27d33d9f4b2b8d24a4ca874b80e2a884004230c2ee50b6dfa3b82281eee5624c +size 225208789 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b98a334b6f6bd6a80e3402c976612e7cc5f1f77 --- /dev/null +++ b/checkpoints/model_weights_000000983040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6a83389de79fe2a4e15885c97229111349c4f9d4733881834b21d67c09bd5e2 +size 225208789 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..2070ced5a3e9449ad17ced0f6836316912a35c0d --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8725a2e43e1c63ed8a2c4dc2430a20ba77433841ce07e769eb434b7d6de72bd5 +size 225208789 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..b54488f2f430e4cfb20ceae66837e7487ef4ce26 --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ce9fa553fcc1aeac6d2fdf61f6885eafa16d12d7a9d55798edd78f4978258dd +size 225208789 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa21d3634bcd764b3814fd88b6ef45f0ab280075 --- /dev/null +++ b/checkpoints/model_weights_000001343488.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20d869b57bebbf082dd8c76fd3ee96d5cfeaf8d990d8c4a5b935b729b26678b +size 225208789 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9d9a8c885a6bc1b8a8dd2ef4402688c86c847cd --- /dev/null +++ b/checkpoints/model_weights_000001474560.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1052592800087d43c6a21e3346c30747a0f8c27e8aea479e2c6982ae417f8cd9 +size 225208789 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt new file mode 100644 index 0000000000000000000000000000000000000000..007b37fa1535c2ff209d52fe83d8235bd449caff --- /dev/null +++ b/checkpoints/model_weights_000001605632.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72470e38f212a3bc5a5c913a0b0dcf238515917663c3758726ca737ac86d8236 +size 225208789 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..0affbfa47adb9718346608478e5e55bf32186b87 --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862a0385ccb1487281d24c3fb14eff056f3a56acf4db33cab0b4e54aee4a2571 +size 225208789 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fd437b34464588f6002d10ba67e7416e1f73090 --- /dev/null +++ b/checkpoints/model_weights_000001966080.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3389c0757d5b7e16a0d9b5afdba3ce4300d88ea25c2e7880e06ae69d3dde3a3 +size 225208789 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c0622663ecd2d6190f094c4e803a4fb13395caf --- /dev/null +++ b/checkpoints/model_weights_000002162688.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48eebad33faaa2bc8fdf45d53bab8d0282c913c4a29d45d804b00b2360c26e98 +size 225208789 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt new file mode 100644 index 0000000000000000000000000000000000000000..59c9dedee2d1d6f44a78b542a2f856679ed189da --- /dev/null +++ b/checkpoints/model_weights_000002359296.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5cdab2417128f99d72045d254890c4022f647ffe3d9d83612d8ba5af62bf5c +size 225208789 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..98d73c0475ee882c1cf8da75136d31af85e791c8 --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee6f1a2d0164ce86db3162b94b884398c578337cdce277903485fc30b95d37e3 +size 225208789 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt new file mode 100644 index 0000000000000000000000000000000000000000..513d3b2c15d865b411f5e6f970edda7b27c93889 --- /dev/null +++ b/checkpoints/model_weights_000002883584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28539a4ee1751bf188999056ca90689120b8f4d911e1f81cf76e14cd705d165 +size 225208789 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt new file mode 100644 index 0000000000000000000000000000000000000000..99c6a92c6d4a9f9e78a42eff2084f0517b96f718 --- /dev/null +++ b/checkpoints/model_weights_000003178496.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69c4ed13572b7f2d98c92c23ec7357c2faca882f5c825bdf169151f29eeb368e +size 225208789 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7f5c8d4e048ceb1a50fe36af30346254d879514 --- /dev/null +++ b/checkpoints/model_weights_000003473408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f5917a1cdd6e9d5cc9543433c403b6dd5806794301ed685ae6bc87c7f252c46 +size 225208789 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt new file mode 100644 index 0000000000000000000000000000000000000000..08749d33e7c3fe5d99f9af4346420d93596ad8df --- /dev/null +++ b/checkpoints/model_weights_000003833856.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045582fce47dd711eb07bee6a6f5355fd49346af9b70e071da80e106e8264298 +size 225208789 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt new file mode 100644 index 0000000000000000000000000000000000000000..71d400ffcb2418759ceecc0d47fda2e22f5fce66 --- /dev/null +++ b/checkpoints/model_weights_000004227072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e093af7a0ba48640b5b427db369db29fd9a3d5a0cb8ce62481b2cc980612d7 +size 225208789 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt new file mode 100644 index 0000000000000000000000000000000000000000..da463c065d816576c3a380489da81455f8a4e25f --- /dev/null +++ b/checkpoints/model_weights_000004653056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e7d0f7f9cc5e110fd14f8e71f5d44ad3d3a85b9b55fea70497a7e2399bb019 +size 225208789 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt new file mode 100644 index 0000000000000000000000000000000000000000..73d555b32801c54ae51fbc39de3154af12df2959 --- /dev/null +++ b/checkpoints/model_weights_000005111808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868f45aeeadcfcf14dca3db48260aec56b2111b041351a57f50f6a11a6c41eb0 +size 225208789 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt new file mode 100644 index 0000000000000000000000000000000000000000..68e726820955c71252c421a61d6127ce093d5187 --- /dev/null +++ b/checkpoints/model_weights_000005603328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8f86007fb03d8305908d7655ff2649957af2a1ae9e6797ff7a512067f65bad +size 225208789 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt new file mode 100644 index 0000000000000000000000000000000000000000..f76f3ad3b2fe44d030c1ad1fe1d97e61fb0dfc23 --- /dev/null +++ b/checkpoints/model_weights_000006193152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d0309f35226d87dba6a94afeca7d42b132181cad2c4cf8ea6e74e474772e26 +size 225208789 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee4ee991f19509cd969565e847b2fb0928a6eadd --- /dev/null +++ b/checkpoints/model_weights_000006782976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa605a4b6a1dec2e8a22a1cd94bf2c3b9fe36641da0f58a9efc6c576b2e5e23b +size 225208789 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt new file mode 100644 index 0000000000000000000000000000000000000000..43e4d8d6a68cf023b6e02fb6975537b9ed2faa10 --- /dev/null +++ b/checkpoints/model_weights_000007471104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae6184ddaf1d957f156835282d6751ebe2242d4e0dae08227fe304e9d591510 +size 225208789 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca5282e3612d7fc5b8dd545c99b792f086d350de --- /dev/null +++ b/checkpoints/model_weights_000008224768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d7300d1d21d7270a76d9e73d86c53a707ed516227bdc10c0d598b1a0a3e1bec +size 225208789 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ae2e8b5791d806b5098932a608ad24b00c9704a --- /dev/null +++ b/checkpoints/model_weights_000009043968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51afc998a904ae5d8283613322f64be695eaf1daad90fd0f86292ea4e72c2791 +size 225208789 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt new file mode 100644 index 0000000000000000000000000000000000000000..48eb34d124b18295bd2d388e75dee7fd8521ab9b --- /dev/null +++ b/checkpoints/model_weights_000009961472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb26389a42f5d03353981885febfa93b5beb959a9132f3d97f03d7c9fa51ed4 +size 225208789 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b3792d30aef9fc736117fcef1528d2ff1c3eafc --- /dev/null +++ b/checkpoints/model_weights_000010944512.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a6a5ba7afa8d9e8cb91a8ef6893355e51b6a06351f2e6713d6e091aa0823227 +size 225208789 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt new file mode 100644 index 0000000000000000000000000000000000000000..6db6e592cb9e675d9eaefff834f253fdeffe2913 --- /dev/null +++ b/checkpoints/model_weights_000012058624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:122da76acc3f53984dd9e9ad90f149e3c1095bcd7dbb507d559d7648b10e3d2c +size 225208789 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe3c2a9ce27039ac62cb3d8813dac4e5cf68cb16 --- /dev/null +++ b/checkpoints/model_weights_000013271040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ba40161ff235cc3b8d168d62d5260e602a32567344180a276c6223507cc884a +size 225208789 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt new file mode 100644 index 0000000000000000000000000000000000000000..a69c4730c08d7a44a26092e9b0063aed9740f762 --- /dev/null +++ b/checkpoints/model_weights_000014581760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece646d1deae9617d2d6fa3db4c075928eac8384612c806aa9084954882305d7 +size 225208789 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d77b9e241101e1286d5d48738e0975104ec68cc --- /dev/null +++ b/checkpoints/model_weights_000016056320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ee198c040354c30524c989af088ad91bd6445f244496a2794955b15816c1a1 +size 225208789 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6efb97635e77d7aae6cdc8a8657fcd9f488ccc59 --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d26fe3e053e6054dd29e3d5c39092012fb2cb733920a6cd753a2243aff4b15 +size 225208789 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt new file mode 100644 index 0000000000000000000000000000000000000000..a65d4462659100f64d7243c2bdb16fd7ac98635d --- /dev/null +++ b/checkpoints/model_weights_000017661952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f8ca7440dc19ee99cfab9dfe8cc49db1067cedf69d17c9434b26ae74bd3e080 +size 225208789 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d414016ffe61801a278f75b523d529b40a650ac --- /dev/null +++ b/checkpoints/model_weights_000019431424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212937ae2a882f8f258f43b4f4f078433f7dc67328312a4a0b793d0747e3a0df +size 225208789 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt new file mode 100644 index 0000000000000000000000000000000000000000..76ae460ec0274af081b4185222911e5e9d6ebe5b --- /dev/null +++ b/checkpoints/model_weights_000021364736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed2f57c9cbef1d07a4a853848c14f76d0d4ff2eac9eaf66dd85c233dcfb5a1e +size 225208789 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt new file mode 100644 index 0000000000000000000000000000000000000000..2af723172ae48f2c924c3ba4961f9bcadf2f2cfa --- /dev/null +++ b/checkpoints/model_weights_000023494656.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e3e568e27421dfc99542290729086c50af7092db8561d5470aeead5a3671f1 +size 225208789 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt new file mode 100644 index 0000000000000000000000000000000000000000..08abfbf6bb64645209c409254f8b3e2ce0fa2f46 --- /dev/null +++ b/checkpoints/model_weights_000025853952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69805666ff3dbd5ae506c0532ee1e681a3351d1188182a552f9d210ba7a85344 +size 225208789 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt new file mode 100644 index 0000000000000000000000000000000000000000..e205c9e4595a379b38fa71c32ec44b7af478bdc5 --- /dev/null +++ b/checkpoints/model_weights_000028442624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c08d10ce0bbf4cfb75929f650cd947f6500a1da008d72c7492f7ac3cf4f5b6 +size 225208789 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc42833e33145c48700b5678f8700353c44598ab --- /dev/null +++ b/checkpoints/model_weights_000031293440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb9080d629907e92df6df107aab097b22334275a15b00be15a59905b09bd37b +size 225208789 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..825c2a61c02df63838f55433a8f71f88f979d58b --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e591dd50fbbb32d182cdc1eded712bffab2dcdc2cf2335766dac04e018980ce7 +size 225208789 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt new file mode 100644 index 0000000000000000000000000000000000000000..256dfeb8ae4615554087551484472abb3abbf52e --- /dev/null +++ b/checkpoints/model_weights_000034439168.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2373f94546fef124d97e89316cd0f4c0d4f3266739f50273cfe2bc2581252ba5 +size 225208789 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt new file mode 100644 index 0000000000000000000000000000000000000000..83859b082006d1c5c54753443d31a1426b1a2414 --- /dev/null +++ b/checkpoints/model_weights_000037879808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816dd333b5996e3bfc2b5cfa14a57b1ab35e9a51784df12d59cde48c3d7c3a98 +size 225208789 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt new file mode 100644 index 0000000000000000000000000000000000000000..94f183d005e72f832af131ad522e1d4469c842ab --- /dev/null +++ b/checkpoints/model_weights_000041648128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aafaeb6a1b87f5409898fe8d49713545d72621842585905493c19343939a9f08 +size 225208789 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt new file mode 100644 index 0000000000000000000000000000000000000000..73dc839c54363e79bc5fab3dfdca4986a7510ab6 --- /dev/null +++ b/checkpoints/model_weights_000045842432.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c14aca8325c16f2dba066f2eeee4764876392d1ff1e8f014fc3bd4195a10f640 +size 225208789 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b23bf7355ac0584a5489a1288272a93d2496c81 --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62080fb5f8fc63ac0d2227530b9b3585d718a6b77b34606a3cc829e65ac87019 +size 225208789 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt new file mode 100644 index 0000000000000000000000000000000000000000..79d4f9400813ee821301307df5d04acd1ab8d518 --- /dev/null +++ b/checkpoints/model_weights_000050397184.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78c7e90e082bf14dc82b0a3526911c9d917492c4ec987bb4f25ec9b614476e82 +size 225208789 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt new file mode 100644 index 0000000000000000000000000000000000000000..df86a3462217727acd7898d436fcb9ff04eacd12 --- /dev/null +++ b/checkpoints/model_weights_000055443456.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3cfa70c81e507d811fe6e8ad0eb5e36f99ecbec56ffdc379dd29e846b27994 +size 225208789 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt new file mode 100644 index 0000000000000000000000000000000000000000..11ae33b46c2992b205b17fcfcdb8b8028c4a1ab8 --- /dev/null +++ b/checkpoints/model_weights_000061014016.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8026791926693032ceca1f850b152a7dbd42e01f4f24b300688f1f92b88a86f3 +size 225208789 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d480e2d6b4bfcf92fe2d8f1c9585f2b5735b9a91 --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2cf594f62721c2793b9180d87244370bb413129ddc7fc2314ec47ccb4f53692 +size 225208789 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3a0f73a45165f24e0c510d774d7ff111f638902 --- /dev/null +++ b/checkpoints/model_weights_000067108864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c5eea36455366fff39ad32ec44f08e0b59c9b1bbfa3c361491bf410233f813 +size 225208789 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c059f82a8c38159e4842f864862357eaaf9145c --- /dev/null +++ b/checkpoints/model_weights_000073826304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89745c5ec220b531110c7b3f98bd9d50804ca8104c7bbe3a6be86ba6773c7c6 +size 225208789 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt new file mode 100644 index 0000000000000000000000000000000000000000..27cec87064aaeda7621dd07234e349cf488f3b47 --- /dev/null +++ b/checkpoints/model_weights_000081199104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87f9f19827b41f08fcd70c02adb622a4be9534b7542906526b256b7f7df70284 +size 225208789 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9562d852f438edd2e836ef8cd56ce1b9f637075a --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2638bced6b2e614668b3ddc25d843e8c77846f180077bc2cf53844e4d64384 +size 225208789 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt new file mode 100644 index 0000000000000000000000000000000000000000..868c6608b4d98f0e26a8f23061eb26dd1f2754fe --- /dev/null +++ b/checkpoints/model_weights_000089325568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bdb2fbcb3ba76c49d1dd3a61b923a4788665244ff2c6eff63371ab3daffaa7 +size 225208789 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt new file mode 100644 index 0000000000000000000000000000000000000000..b793fdb58325810fa72e2cd23624ca084692171d --- /dev/null +++ b/checkpoints/model_weights_000098271232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a23cd7265f44e66bf07b89284be182afc1b7a0165669741066c292a6d61aa1b +size 225208789 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..5191cae64a3b5c598b067faecb62d81724e18164 --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9cdae233277bc108951b88bebc612b89a3dab5af13e5d17c888b97603d9d09a +size 225208789 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt new file mode 100644 index 0000000000000000000000000000000000000000..df7847edf770bbfc4790054e8e3fec3640ae19d9 --- /dev/null +++ b/checkpoints/model_weights_000108068864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce8d4883dd94b81e8613ab4791b092a3500fa09a714d24d53d194d070c85795 +size 225208789 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe81393cb9488f0ac6602aa7da6637a387c312fd --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d7c2236df3199ee26bf65332d45d385736ea76f4b305ad8055c0956fa30db7 +size 225208789 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt new file mode 100644 index 0000000000000000000000000000000000000000..47739d19a0163a2fdb2109626f5686d8b9eafd34 --- /dev/null +++ b/checkpoints/model_weights_000118882304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be1b74bbe7bbc23a6b46af85b3c57503350c96cf28b04a74a68ee5d47faefe33 +size 225208789 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b51b420c3b0c999a2aa57f87281a8c57834550a --- /dev/null +++ b/checkpoints/model_weights_000130777088.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5c5f19298c5027b0f392b79b67befe6f6bf89b36c35f55d888360b8f2c155b +size 225208789 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f16ebd709ee11438bc42068728311b8eb1ef3da6 --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:622c8f23a52513b133f2c4b0e4c725305aad0fec0ad01072c5bd9cd47923fe4a +size 225208789 diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2ab2370a43f7f99c94daa629d3881603f406568 --- /dev/null +++ b/checkpoints/model_weights_000143851520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d44129c64d8bddc649302b49207103a19603a055f4ed787d9ce2ecd38662ad +size 225208789 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt new file mode 100644 index 0000000000000000000000000000000000000000..75e4530395316a2378117d260e59bf2fcb4d93fa --- /dev/null +++ b/checkpoints/model_weights_000147456000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff495cb19507c9bd5a0f9c40676f683bcc41b1a11886e43a8ee8f249e83ac5c5 +size 225208789 diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt new file mode 100644 index 0000000000000000000000000000000000000000..e497ffb0d91731f19e1cf64dfc7dbab15cb4dcfc --- /dev/null +++ b/checkpoints/model_weights_000158269440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c736460b1276318edfdadfd0f5c3a613c96ffe436a253bb1e77b41fe26f8b66 +size 225208789 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d35287e6cea8c92c69c04f86981d693ce2cb0ec --- /dev/null +++ b/checkpoints/model_weights_000163840000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b0c53ad427f90161abbb9a083b6209b2e71bceda5618172f8e0e7a08b63d34 +size 225208789 diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3d042f734f04eae19f14f127165f0cdcef364ed --- /dev/null +++ b/checkpoints/model_weights_000174096384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a99c45c8d3703525aac62a781582a975b9ff74121e5fc938ca335089b06c46f5 +size 225208789 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a42e080f6560f692d1a9ee5abef899b0b66fe168 --- /dev/null +++ b/checkpoints/model_weights_000180224000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a884ff0c615fbd8f42f5f955ce47fe1afea0b3d64f40a758777a92e69e847d +size 225208789 diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt new file mode 100644 index 0000000000000000000000000000000000000000..b935d3c38ec75ccb65465107a00eb8093cc57898 --- /dev/null +++ b/checkpoints/model_weights_000191496192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd13c5c65f70d99b13f808af20de6f07ebf793afbe807cb6f35178796ed40a5 +size 225208789 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt new file mode 100644 index 0000000000000000000000000000000000000000..5efef661d2276495e1da80e5b4cda51b5c05f3ba --- /dev/null +++ b/checkpoints/model_weights_000196608000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8976bd1812dcbc8b8d1768b3632ae87e14328f029949fa51e99438e2340b16 +size 225208789 diff --git a/checkpoints/model_weights_000196706304.pt b/checkpoints/model_weights_000196706304.pt new file mode 100644 index 0000000000000000000000000000000000000000..cee8bb7c3b293b84e3154d3a7570cbc71ae1bfd8 --- /dev/null +++ b/checkpoints/model_weights_000196706304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a45b8a5608849afca87f8943d255d121fa6a94c5b519c0f0bde8aba4a7bc3164 +size 225208789 diff --git a/checkpoints/model_weights_000197361664.pt b/checkpoints/model_weights_000197361664.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d2a56fa74c3e1c659612ce891e950c52aa42779 --- /dev/null +++ b/checkpoints/model_weights_000197361664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39fab2dd7351b6adb584e32022837c364c289be2b41da6fbc3b175f543045c89 +size 225208789 diff --git a/checkpoints/model_weights_000198017024.pt b/checkpoints/model_weights_000198017024.pt new file mode 100644 index 0000000000000000000000000000000000000000..09ccee7f55a870ad3fd9ee5b0d6e8b5273eceec9 --- /dev/null +++ b/checkpoints/model_weights_000198017024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e4decfe126e54fb87caba3833715f43030382fee361b2e3901e8ad94fbd6f58 +size 225208789 diff --git a/checkpoints/model_weights_000198672384.pt b/checkpoints/model_weights_000198672384.pt new file mode 100644 index 0000000000000000000000000000000000000000..705c27bdf2a2be6a8565372612c9f3f2c633aa8c --- /dev/null +++ b/checkpoints/model_weights_000198672384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262e9dad299cecce1359e38770fcc092b892ef489990dc4cb70f32ed1d59de65 +size 225208789 diff --git a/checkpoints/model_weights_000199327744.pt b/checkpoints/model_weights_000199327744.pt new file mode 100644 index 0000000000000000000000000000000000000000..36ac7711fc222d47457f387a7b5f48f648bc6c31 --- /dev/null +++ b/checkpoints/model_weights_000199327744.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e7aaa8aa9d77f7f1fdd7690fae94fbb5a9c837f7c84a58deb818e39d8e57447 +size 225208789 diff --git a/checkpoints/model_weights_000199950336.pt b/checkpoints/model_weights_000199950336.pt new file mode 100644 index 0000000000000000000000000000000000000000..900fcdef9ad45d8d8152d511d9f59e9df146d48b --- /dev/null +++ b/checkpoints/model_weights_000199950336.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8673a79f61ad9190fd7bf55ce5dfef888f05f49e4ea0489775d4c7703f2a27e1 +size 225208789 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..4b18a7b00196a2a69eecf39e9c5ac40411ac254a --- /dev/null +++ b/config.toml @@ -0,0 +1,31 @@ +model_name = "pile_llama_grid" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 48262 +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.1 +save_log_checkpoints = true +dataset_name = "eoinf/PL_SeqUnmodified_NonSeqObserved_L2" \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f6a5ca44bdcae0062eab9eb294c76d8eab99c33 --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb9bb87979e11e71f47cd10a955fb6d6cf5ef15d2d05df994725ede711abb29 +size 225208311 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..079dee70be72f28ab15eb3de19d272e12501c02e --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_grid", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/PL_SeqUnmodified_NonSeqObserved_L2", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.384666312222928} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..32cbe7ccc1c41bfa6cbf09afde1371fc61e09032 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:febf2a7272865337435f9e146cacfadb356154e9fb2b945085875532e34c6cf1 +size 450422547 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f267f430e5b2294c4e308dc61c1c813beb7e718 --- /dev/null +++ b/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.models.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..97ac36e44d6864d6f040652d2372412a88a1e581 --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-01T12:53:10.90397339Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-12-01T12:53:11.146477688Z","level":"INFO","msg":"stream: created new stream","id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.146518395Z","level":"INFO","msg":"stream: started","id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.14653942Z","level":"INFO","msg":"writer: started","stream_id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.146576902Z","level":"INFO","msg":"handler: started","stream_id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.14660377Z","level":"INFO","msg":"sender: started","stream_id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.287122125Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-01T14:01:26.427139132Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-01T14:01:26.429682025Z","level":"INFO","msg":"stream: closing","id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.429706395Z","level":"INFO","msg":"handler: closed","stream_id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.429761128Z","level":"INFO","msg":"sender: closed","stream_id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.429768246Z","level":"INFO","msg":"stream: closed","id":"vmyfcav3"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ac400842409a2893e77a7c9dc71bb53406d73333 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,28 @@ +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Configure stats pid to 20285 +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2/wandb/settings +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2/wandb/run-20251201_125310-vmyfcav3/logs/debug.log +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2/wandb/run-20251201_125310-vmyfcav3/logs/debug-internal.log +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:init():830] calling init triggers +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_grid', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/PL_SeqUnmodified_NonSeqObserved_L2', 'tokenizer_name': '', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:init():871] starting backend +2025-12-01 12:53:10,892 INFO MainThread:20285 [wandb_init.py:init():874] sending inform_init request +2025-12-01 12:53:10,902 INFO MainThread:20285 [wandb_init.py:init():882] backend started and connected +2025-12-01 12:53:10,903 INFO MainThread:20285 [wandb_init.py:init():953] updated telemetry +2025-12-01 12:53:10,971 INFO MainThread:20285 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-12-01 12:53:11,452 INFO MainThread:20285 [wandb_init.py:init():1029] starting run threads in backend +2025-12-01 12:53:12,200 INFO MainThread:20285 [wandb_run.py:_console_start():2494] atexit reg +2025-12-01 12:53:12,201 INFO MainThread:20285 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-12-01 12:53:12,201 INFO MainThread:20285 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-12-01 12:53:12,201 INFO MainThread:20285 [wandb_run.py:_redirect():2434] Redirects installed. +2025-12-01 12:53:12,208 INFO MainThread:20285 [wandb_init.py:init():1075] run started, returning control to user process +2025-12-01 14:01:25,928 INFO MainThread:20285 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/vmyfcav3 +2025-12-01 14:01:25,931 INFO MainThread:20285 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-12-01 14:01:25,931 INFO MainThread:20285 [wandb_run.py:_restore():2441] restore +2025-12-01 14:01:25,931 INFO MainThread:20285 [wandb_run.py:_restore():2447] restore done +2025-12-01 14:01:26,428 INFO MainThread:20285 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-12-01 14:01:26,428 INFO MainThread:20285 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-12-01 14:01:26,429 INFO MainThread:20285 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20251201_125310-vmyfcav3/files/config.yaml b/wandb/run-20251201_125310-vmyfcav3/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5db5fd6907ef0f94fb13fb8f3c752b189188f094 --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/files/config.yaml @@ -0,0 +1,134 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + uxb3ioqlaif63jojqpnysdkwauhpsncj: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "142933057536" + email: efarrel4@tcd.ie + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: 666423afacb222750f02b390eb7c844da5b0afc5 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-119b219a-d9f5-9f14-672c-316f35eafabe + host: nj33madmn6 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2 + startedAt: "2025-12-01T12:53:10.411266Z" + writerId: uxb3ioqlaif63jojqpnysdkwauhpsncj + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 11 + - 49 + - 51 + - 71 + "2": + - 1 + - 11 + - 49 + - 51 + - 71 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.1 + "6": 4.55.4 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.1 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 48262 +dataset_name: + value: eoinf/PL_SeqUnmodified_NonSeqObserved_L2 +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: pile_llama_grid +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: "" +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20251201_125310-vmyfcav3/files/output.log b/wandb/run-20251201_125310-vmyfcav3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0e047b8d7f365cc84d80420c17dee80d8fa7de94 --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.7887 | Learning Rate: 0.000055 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.5186 | Learning Rate: 0.000109 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 10.0754 | Learning Rate: 0.000164 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.5048 | Learning Rate: 0.000219 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 8.9525 | Learning Rate: 0.000273 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.4809 | Learning Rate: 0.000328 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 8.1084 | Learning Rate: 0.000383 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.8048 | Learning Rate: 0.000437 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.5569 | Learning Rate: 0.000492 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 7.3680 | Learning Rate: 0.000546 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 7.2111 | Learning Rate: 0.000601 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 7.0808 | Learning Rate: 0.000656 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.9781 | Learning Rate: 0.000710 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.8965 | Learning Rate: 0.000765 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 6.8284 | Learning Rate: 0.000820 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 6.7719 | Learning Rate: 0.000874 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 6.7162 | Learning Rate: 0.000929 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 6.6773 | Learning Rate: 0.000984 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 6.6348 | Learning Rate: 0.001038 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 6.6074 | Learning Rate: 0.001093 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 6.5788 | Learning Rate: 0.001148 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 6.5508 | Learning Rate: 0.001202 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 6.5198 | Learning Rate: 0.001257 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 6.5032 | Learning Rate: 0.001311 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 6.4847 | Learning Rate: 0.001366 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 6.4792 | Learning Rate: 0.001421 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 6.4609 | Learning Rate: 0.001475 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 6.4504 | Learning Rate: 0.001530 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 6.4322 | Learning Rate: 0.001585 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 6.4119 | Learning Rate: 0.001639 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 6.4007 | Learning Rate: 0.001694 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 6.3889 | Learning Rate: 0.001749 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 6.3731 | Learning Rate: 0.001803 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 6.3603 | Learning Rate: 0.001858 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 6.3426 | Learning Rate: 0.001913 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 6.3292 | Learning Rate: 0.001967 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 6.3070 | Learning Rate: 0.002000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 6.3021 | Learning Rate: 0.002000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 6.2985 | Learning Rate: 0.002000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 6.2852 | Learning Rate: 0.002000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 6.2746 | Learning Rate: 0.002000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 6.2597 | Learning Rate: 0.002000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 6.2508 | Learning Rate: 0.002000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 6.2377 | Learning Rate: 0.002000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 6.2275 | Learning Rate: 0.002000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 6.2208 | Learning Rate: 0.002000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 6.2107 | Learning Rate: 0.002000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 6.2040 | Learning Rate: 0.002000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 6.1971 | Learning Rate: 0.002000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 6.1853 | Learning Rate: 0.002000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 6.1801 | Learning Rate: 0.002000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 6.1721 | Learning Rate: 0.002000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 6.1784 | Learning Rate: 0.002000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 6.1622 | Learning Rate: 0.002000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 6.1522 | Learning Rate: 0.002000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 6.1433 | Learning Rate: 0.002000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 6.1402 | Learning Rate: 0.002000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 6.1361 | Learning Rate: 0.002000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 6.1215 | Learning Rate: 0.002000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 6.1204 | Learning Rate: 0.002000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 6.1281 | Learning Rate: 0.002000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 6.1099 | Learning Rate: 0.002000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 6.0979 | Learning Rate: 0.002000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 6.0805 | Learning Rate: 0.002000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 6.0590 | Learning Rate: 0.002000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 6.0480 | Learning Rate: 0.002000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 6.0210 | Learning Rate: 0.002000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 5.9861 | Learning Rate: 0.002000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 5.9594 | Learning Rate: 0.002000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 5.9253 | Learning Rate: 0.002000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 5.8933 | Learning Rate: 0.002000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 5.8859 | Learning Rate: 0.002000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 5.8544 | Learning Rate: 0.002000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 5.8299 | Learning Rate: 0.002000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 5.8081 | Learning Rate: 0.002000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 5.7967 | Learning Rate: 0.002000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 5.7929 | Learning Rate: 0.002000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 5.7782 | Learning Rate: 0.002000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 5.7689 | Learning Rate: 0.002000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 5.7587 | Learning Rate: 0.002000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 5.7491 | Learning Rate: 0.002000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 5.7433 | Learning Rate: 0.002000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 5.7193 | Learning Rate: 0.002000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 5.6995 | Learning Rate: 0.002000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 5.6988 | Learning Rate: 0.002000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 5.6834 | Learning Rate: 0.002000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 5.6769 | Learning Rate: 0.002000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 5.6703 | Learning Rate: 0.002000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 5.6509 | Learning Rate: 0.002000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 5.6419 | Learning Rate: 0.002000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 5.6387 | Learning Rate: 0.002000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 5.6305 | Learning Rate: 0.002000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 5.6248 | Learning Rate: 0.002000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 5.6259 | Learning Rate: 0.002000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 5.6109 | Learning Rate: 0.002000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 5.6030 | Learning Rate: 0.002000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 5.6055 | Learning Rate: 0.002000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 5.5917 | Learning Rate: 0.002000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 5.5817 | Learning Rate: 0.002000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 5.5780 | Learning Rate: 0.002000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 5.5722 | Learning Rate: 0.002000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 5.5716 | Learning Rate: 0.002000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 5.5602 | Learning Rate: 0.002000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 5.5587 | Learning Rate: 0.002000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 5.5650 | Learning Rate: 0.002000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 5.5640 | Learning Rate: 0.002000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 5.5688 | Learning Rate: 0.002000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 5.5543 | Learning Rate: 0.002000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 5.5522 | Learning Rate: 0.002000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 5.5441 | Learning Rate: 0.002000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 5.5398 | Learning Rate: 0.002000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 5.5404 | Learning Rate: 0.002000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 5.5417 | Learning Rate: 0.002000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 5.5405 | Learning Rate: 0.002000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 5.5338 | Learning Rate: 0.002000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 5.5303 | Learning Rate: 0.002000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 5.5234 | Learning Rate: 0.002000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 5.5249 | Learning Rate: 0.002000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 5.5231 | Learning Rate: 0.002000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 5.5279 | Learning Rate: 0.002000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 5.5328 | Learning Rate: 0.002000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 5.5217 | Learning Rate: 0.002000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 5.5234 | Learning Rate: 0.002000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 5.5243 | Learning Rate: 0.002000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 5.5269 | Learning Rate: 0.002000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 5.5230 | Learning Rate: 0.002000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 5.5250 | Learning Rate: 0.002000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 5.5052 | Learning Rate: 0.002000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 5.4933 | Learning Rate: 0.002000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 5.4924 | Learning Rate: 0.002000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 5.4998 | Learning Rate: 0.002000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 5.4740 | Learning Rate: 0.002000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 5.4797 | Learning Rate: 0.002000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 5.4794 | Learning Rate: 0.002000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 5.4622 | Learning Rate: 0.002000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 5.4546 | Learning Rate: 0.002000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 5.4620 | Learning Rate: 0.002000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 5.4621 | Learning Rate: 0.002000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 5.4600 | Learning Rate: 0.002000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 5.4512 | Learning Rate: 0.002000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 5.4555 | Learning Rate: 0.002000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 5.4540 | Learning Rate: 0.002000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 5.4712 | Learning Rate: 0.002000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 5.4643 | Learning Rate: 0.002000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 5.4717 | Learning Rate: 0.002000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 5.4739 | Learning Rate: 0.002000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 5.4735 | Learning Rate: 0.002000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 5.4739 | Learning Rate: 0.002000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 5.4732 | Learning Rate: 0.002000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 5.4645 | Learning Rate: 0.002000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 5.4660 | Learning Rate: 0.002000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 5.4507 | Learning Rate: 0.002000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 5.4639 | Learning Rate: 0.002000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 5.4688 | Learning Rate: 0.002000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 5.4555 | Learning Rate: 0.002000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 5.4628 | Learning Rate: 0.002000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 5.4560 | Learning Rate: 0.002000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 5.4593 | Learning Rate: 0.002000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 5.4488 | Learning Rate: 0.002000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 5.4616 | Learning Rate: 0.002000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 5.4563 | Learning Rate: 0.002000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 5.4385 | Learning Rate: 0.002000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 5.4382 | Learning Rate: 0.002000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 5.4332 | Learning Rate: 0.002000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 5.4375 | Learning Rate: 0.002000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 5.4523 | Learning Rate: 0.002000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 5.4413 | Learning Rate: 0.002000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 5.4546 | Learning Rate: 0.002000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 5.4421 | Learning Rate: 0.002000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 5.4493 | Learning Rate: 0.002000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 5.4525 | Learning Rate: 0.002000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 5.4376 | Learning Rate: 0.002000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 5.4356 | Learning Rate: 0.002000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 5.4357 | Learning Rate: 0.002000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 5.4328 | Learning Rate: 0.002000 | Progress: 0.71680 +Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 5.4423 | Learning Rate: 0.002000 | Progress: 0.72090 +Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 5.4529 | Learning Rate: 0.002000 | Progress: 0.72499 +Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 5.4435 | Learning Rate: 0.002000 | Progress: 0.72909 +Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 5.4435 | Learning Rate: 0.002000 | Progress: 0.73318 +Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 5.4292 | Learning Rate: 0.002000 | Progress: 0.73728 +Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 5.4085 | Learning Rate: 0.002000 | Progress: 0.74138 +Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 5.4099 | Learning Rate: 0.002000 | Progress: 0.74547 +Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 5.4106 | Learning Rate: 0.002000 | Progress: 0.74957 +Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 5.4258 | Learning Rate: 0.002000 | Progress: 0.75366 +Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 5.4148 | Learning Rate: 0.002000 | Progress: 0.75776 +Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 5.4111 | Learning Rate: 0.002000 | Progress: 0.76186 +Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 5.4041 | Learning Rate: 0.002000 | Progress: 0.76595 +Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 5.4177 | Learning Rate: 0.002000 | Progress: 0.77005 +Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 5.4213 | Learning Rate: 0.002000 | Progress: 0.77414 +Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 5.4155 | Learning Rate: 0.002000 | Progress: 0.77824 +Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 5.4218 | Learning Rate: 0.002000 | Progress: 0.78234 +Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 5.4238 | Learning Rate: 0.002000 | Progress: 0.78643 +Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 5.4277 | Learning Rate: 0.002000 | Progress: 0.79053 +Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 5.4332 | Learning Rate: 0.002000 | Progress: 0.79462 +Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 5.4302 | Learning Rate: 0.002000 | Progress: 0.79872 +Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 5.4330 | Learning Rate: 0.002000 | Progress: 0.80282 +Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 5.4217 | Learning Rate: 0.002000 | Progress: 0.80691 +Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 5.4205 | Learning Rate: 0.002000 | Progress: 0.81101 +Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 5.4243 | Learning Rate: 0.002000 | Progress: 0.81510 +Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 5.4160 | Learning Rate: 0.002000 | Progress: 0.81920 +Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 5.4294 | Learning Rate: 0.002000 | Progress: 0.82330 +Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 5.4196 | Learning Rate: 0.002000 | Progress: 0.82739 +Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 5.4241 | Learning Rate: 0.002000 | Progress: 0.83149 +Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 5.4234 | Learning Rate: 0.002000 | Progress: 0.83558 +Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 5.4181 | Learning Rate: 0.002000 | Progress: 0.83968 +Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 5.4170 | Learning Rate: 0.002000 | Progress: 0.84378 +Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 5.4185 | Learning Rate: 0.002000 | Progress: 0.84787 +Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 5.4212 | Learning Rate: 0.002000 | Progress: 0.85197 +Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 5.4241 | Learning Rate: 0.002000 | Progress: 0.85606 +Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 5.4226 | Learning Rate: 0.002000 | Progress: 0.86016 +Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 5.4243 | Learning Rate: 0.002000 | Progress: 0.86426 +Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 5.4278 | Learning Rate: 0.002000 | Progress: 0.86835 +Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 5.4293 | Learning Rate: 0.002000 | Progress: 0.87245 +Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 5.4223 | Learning Rate: 0.002000 | Progress: 0.87654 +Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 5.4279 | Learning Rate: 0.002000 | Progress: 0.88064 +Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 5.4144 | Learning Rate: 0.002000 | Progress: 0.88474 +Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 5.4108 | Learning Rate: 0.002000 | Progress: 0.88883 +Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 5.4040 | Learning Rate: 0.002000 | Progress: 0.89293 +Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 5.4005 | Learning Rate: 0.002000 | Progress: 0.89702 +Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 5.4011 | Learning Rate: 0.002000 | Progress: 0.90112 +Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 5.4037 | Learning Rate: 0.002000 | Progress: 0.90522 +Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 5.3988 | Learning Rate: 0.002000 | Progress: 0.90931 +Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 5.4053 | Learning Rate: 0.002000 | Progress: 0.91341 +Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 5.4081 | Learning Rate: 0.002000 | Progress: 0.91750 +Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 5.4023 | Learning Rate: 0.002000 | Progress: 0.92160 +Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 5.3903 | Learning Rate: 0.002000 | Progress: 0.92570 +Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 5.3932 | Learning Rate: 0.002000 | Progress: 0.92979 +Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 5.3932 | Learning Rate: 0.002000 | Progress: 0.93389 +Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 5.3947 | Learning Rate: 0.002000 | Progress: 0.93798 +Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 5.4071 | Learning Rate: 0.002000 | Progress: 0.94208 +Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 5.4011 | Learning Rate: 0.002000 | Progress: 0.94618 +Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 5.3906 | Learning Rate: 0.002000 | Progress: 0.95027 +Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 5.3821 | Learning Rate: 0.002000 | Progress: 0.95437 +Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 5.3819 | Learning Rate: 0.002000 | Progress: 0.95846 +Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 5.3861 | Learning Rate: 0.002000 | Progress: 0.96256 +Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 5.3809 | Learning Rate: 0.002000 | Progress: 0.96666 +Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 5.3854 | Learning Rate: 0.002000 | Progress: 0.97075 +Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 5.3970 | Learning Rate: 0.002000 | Progress: 0.97485 +Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 5.3956 | Learning Rate: 0.002000 | Progress: 0.97894 +Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 5.3878 | Learning Rate: 0.002000 | Progress: 0.98304 +Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 5.3924 | Learning Rate: 0.002000 | Progress: 0.98714 +Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 5.3912 | Learning Rate: 0.002000 | Progress: 0.99123 +Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 5.3879 | Learning Rate: 0.002000 | Progress: 0.99533 +Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 5.3863 | Learning Rate: 0.002000 | Progress: 0.99942 diff --git a/wandb/run-20251201_125310-vmyfcav3/files/requirements.txt b/wandb/run-20251201_125310-vmyfcav3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2cbab9ffb6c27c4669296998dc5731d4f57bca4 --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/files/requirements.txt @@ -0,0 +1,217 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +hf-xet==1.1.8 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +circuitsvis==1.43.3 +param==2.2.1 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +pytest==8.4.1 +nvidia-cuda-nvrtc-cu12==12.8.93 +asttokens==3.0.0 +filelock==3.19.1 +jsonschema-specifications==2025.4.1 +types-python-dateutil==2.9.0.20250822 +cycler==0.12.1 +stack-data==0.6.3 +jupyter_server==2.17.0 +aiosignal==1.4.0 +xyzservices==2025.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +mpmath==1.3.0 +seaborn==0.13.2 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +jupyterlab==4.4.6 +plotly==6.3.0 +bokeh==3.7.3 +huggingface-hub==0.34.4 +sentencepiece==0.2.1 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +charset-normalizer==3.4.3 +jupyter-events==0.12.0 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +requests==2.32.5 +tinycss2==1.4.0 +ruff==0.12.10 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +identify==2.6.13 +jedi==0.19.2 +pathspec==0.12.1 +transformer-lens==2.16.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +cfgv==3.4.0 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +panel==1.7.5 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +babel==2.17.0 +transformers==4.55.4 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +zstandard==0.25.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +torchaudio==2.8.0 +pyzmq==27.0.2 +mypy_extensions==1.1.0 +prompt_toolkit==3.0.51 +pytest-cov==6.2.1 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +fonttools==4.59.1 +prometheus_client==0.22.1 +httpx==0.28.1 +setuptools==80.9.0 +argon2-cffi==25.1.0 +multidict==6.6.4 +pyviz_comms==3.0.6 +executing==2.2.0 +arrow==1.3.0 +sentry-sdk==2.35.0 +beartype==0.14.1 +coverage==7.10.4 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +markdown-it-py==4.0.0 +pandas==2.3.2 +virtualenv==20.34.0 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +nodeenv==1.9.1 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +mypy==1.17.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +Send2Trash==1.8.3 +protobuf==6.32.0 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +json5==0.12.1 +linkify-it-py==2.0.3 +importlib_metadata==8.7.0 +nvidia-cufft-cu12==11.3.3.83 +distlib==0.4.0 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +jsonschema==4.25.1 +python-json-logger==3.3.0 +ipympl==0.9.7 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +h5py==3.14.0 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pluggy==1.6.0 +pydantic==2.11.7 +ipython==9.4.0 +zipp==3.23.0 +fancy-einsum==0.0.3 +fastjsonschema==2.21.2 +datasets==4.0.0 +Markdown==3.8.2 +pillow==11.3.0 +uc-micro-py==1.0.3 +pre_commit==4.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +kiwisolver==1.4.9 +idna==3.10 +narwhals==2.1.2 +multiprocess==0.70.16 +dill==0.3.8 +mdit-py-plugins==0.5.0 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +holoviews==1.21.0 +colorcet==3.1.0 +uri-template==1.3.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +iniconfig==2.1.0 +traitlets==5.14.3 +safetensors==0.6.2 +frozenlist==1.7.0 +toy_models==0.1.0 diff --git a/wandb/run-20251201_125310-vmyfcav3/files/wandb-metadata.json b/wandb/run-20251201_125310-vmyfcav3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..451adb7a81810e4954e49afe23a6405718c39629 --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-12-01T12:53:10.411266Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "666423afacb222750f02b390eb7c844da5b0afc5" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2", + "host": "nj33madmn6", + "executable": "/notebooks/toy_models/.toy_models_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "142933057536" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-119b219a-d9f5-9f14-672c-316f35eafabe" + } + ], + "cudaVersion": "12.4", + "writerId": "uxb3ioqlaif63jojqpnysdkwauhpsncj" +} \ No newline at end of file diff --git a/wandb/run-20251201_125310-vmyfcav3/files/wandb-summary.json b/wandb/run-20251201_125310-vmyfcav3/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..de417b867254565a80ccb56d6b5a0fbdfe438c1b --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":4094},"step":6100,"_timestamp":1.7645976826206312e+09,"tokens_per_second":32768,"progress":0.999424,"train_loss":5.109295845031738,"tokens_seen":199884800,"_step":6100,"train_loss_ewma":5.386275135275631,"_runtime":4094.477637336,"learning_rate":0.002} \ No newline at end of file diff --git a/wandb/run-20251201_125310-vmyfcav3/logs/debug-internal.log b/wandb/run-20251201_125310-vmyfcav3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..97ac36e44d6864d6f040652d2372412a88a1e581 --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-01T12:53:10.90397339Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-12-01T12:53:11.146477688Z","level":"INFO","msg":"stream: created new stream","id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.146518395Z","level":"INFO","msg":"stream: started","id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.14653942Z","level":"INFO","msg":"writer: started","stream_id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.146576902Z","level":"INFO","msg":"handler: started","stream_id":"vmyfcav3"} +{"time":"2025-12-01T12:53:11.14660377Z","level":"INFO","msg":"sender: started","stream_id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.287122125Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-01T14:01:26.427139132Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-01T14:01:26.429682025Z","level":"INFO","msg":"stream: closing","id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.429706395Z","level":"INFO","msg":"handler: closed","stream_id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.429761128Z","level":"INFO","msg":"sender: closed","stream_id":"vmyfcav3"} +{"time":"2025-12-01T14:01:26.429768246Z","level":"INFO","msg":"stream: closed","id":"vmyfcav3"} diff --git a/wandb/run-20251201_125310-vmyfcav3/logs/debug.log b/wandb/run-20251201_125310-vmyfcav3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ac400842409a2893e77a7c9dc71bb53406d73333 --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/logs/debug.log @@ -0,0 +1,28 @@ +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Configure stats pid to 20285 +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2/wandb/settings +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2/wandb/run-20251201_125310-vmyfcav3/logs/debug.log +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/model_training/pile_llama_grid_dataset_name_PL_SeqUnmodified_NonSeqObserved_L2/wandb/run-20251201_125310-vmyfcav3/logs/debug-internal.log +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:init():830] calling init triggers +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_grid', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/PL_SeqUnmodified_NonSeqObserved_L2', 'tokenizer_name': '', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-12-01 12:53:10,418 INFO MainThread:20285 [wandb_init.py:init():871] starting backend +2025-12-01 12:53:10,892 INFO MainThread:20285 [wandb_init.py:init():874] sending inform_init request +2025-12-01 12:53:10,902 INFO MainThread:20285 [wandb_init.py:init():882] backend started and connected +2025-12-01 12:53:10,903 INFO MainThread:20285 [wandb_init.py:init():953] updated telemetry +2025-12-01 12:53:10,971 INFO MainThread:20285 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-12-01 12:53:11,452 INFO MainThread:20285 [wandb_init.py:init():1029] starting run threads in backend +2025-12-01 12:53:12,200 INFO MainThread:20285 [wandb_run.py:_console_start():2494] atexit reg +2025-12-01 12:53:12,201 INFO MainThread:20285 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-12-01 12:53:12,201 INFO MainThread:20285 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-12-01 12:53:12,201 INFO MainThread:20285 [wandb_run.py:_redirect():2434] Redirects installed. +2025-12-01 12:53:12,208 INFO MainThread:20285 [wandb_init.py:init():1075] run started, returning control to user process +2025-12-01 14:01:25,928 INFO MainThread:20285 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/vmyfcav3 +2025-12-01 14:01:25,931 INFO MainThread:20285 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-12-01 14:01:25,931 INFO MainThread:20285 [wandb_run.py:_restore():2441] restore +2025-12-01 14:01:25,931 INFO MainThread:20285 [wandb_run.py:_restore():2447] restore done +2025-12-01 14:01:26,428 INFO MainThread:20285 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-12-01 14:01:26,428 INFO MainThread:20285 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-12-01 14:01:26,429 INFO MainThread:20285 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20251201_125310-vmyfcav3/run-vmyfcav3.wandb b/wandb/run-20251201_125310-vmyfcav3/run-vmyfcav3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..22079e65e8520db13f91d9750a7eb90657d67bbd --- /dev/null +++ b/wandb/run-20251201_125310-vmyfcav3/run-vmyfcav3.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0d70dbb546c30f239eda182d0baa6e7e911bceb9435500d0733205ce7db7004 +size 5405055