diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..7a8d481f408f6e8eccf23f91971f83625b3a1c74 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20260227_134232-xgg2g05z/run-xgg2g05z.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..948486a5ac28a7697c81621d37c88c6004ccab55 --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.37802791595459} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe4b7508dde4253b2403fa72eeb2a7ffa57a13a --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.377717663851968} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..75321983f9cf21cd95b715c6d941d30640fbf485 --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.377598009414987} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..6a0fc22fe63c1f0cbebfb56ce4d37edc2bb06d41 --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.377289953169777} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..a36a23c0bd176c3c7304de8a1ee0687ee38c6310 --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.37713998607216} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..9360bea53969c932a489b25152c99fc3149f0e49 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.376938789992566} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..4300560de28b05c3aa19536182157dad183bef9e --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.376426411036498} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json new file mode 100644 index 0000000000000000000000000000000000000000..afb18ea0ff53bb798ac4cfc927f72cb3e0c25643 --- /dev/null +++ b/checkpoints/metadata_000000622592.json @@ -0,0 +1 @@ +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.375824231378044} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..214713b91c599d58de971a5f6cd833b054b165a6 --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.375048947032507} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..f0aaaa29deef575563ff1378b6f3fad0972546af --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.374101782438617} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..4e62e9f1036888746c35c66070042104d97062df --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.373075912404031} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json new file mode 100644 index 0000000000000000000000000000000000000000..1a9e2b68f6226dd0f5fbe11bfcea4fd0167e58d7 --- /dev/null +++ b/checkpoints/metadata_000000917504.json @@ -0,0 +1 @@ +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.371158385457182} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json new file mode 100644 index 0000000000000000000000000000000000000000..355b21a380dc8498bcf1525043bfe1262bab2fe4 --- /dev/null +++ b/checkpoints/metadata_000000983040.json @@ -0,0 +1 @@ +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.369674488318335} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..39b1de1809a49477d4ec4cec3284adb5d1cf9a94 --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.366430059073076} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..9facdc018496ed9aa2cda3b1fba2e32d85ab3e69 --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.363905413956502} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json new file mode 100644 index 0000000000000000000000000000000000000000..15f92181297e7c599a6b7e7a3c47eb09904d67b4 --- /dev/null +++ b/checkpoints/metadata_000001343488.json @@ -0,0 +1 @@ +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.360284077764357} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json new file mode 100644 index 0000000000000000000000000000000000000000..6711e7a90afebdd668c6af1e70a42c1208d10411 --- /dev/null +++ b/checkpoints/metadata_000001474560.json @@ -0,0 +1 @@ +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.356239770337442} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json new file mode 100644 index 0000000000000000000000000000000000000000..799da4d8c5664ec9a6fdd025b604bf5f10f41042 --- /dev/null +++ b/checkpoints/metadata_000001605632.json @@ -0,0 +1 @@ +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.351873034624731} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..3de6a7ef26729e12c7cf1bd0e285de72fe7e86f2 --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.346144734261483} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json new file mode 100644 index 0000000000000000000000000000000000000000..0dc10a1a96915b3dcdee1fa74c479f933fbb7531 --- /dev/null +++ b/checkpoints/metadata_000001966080.json @@ -0,0 +1 @@ +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.338452705876588} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json new file mode 100644 index 0000000000000000000000000000000000000000..804a1f5494082f6eb881a8aadde9b219ced1bcda --- /dev/null +++ b/checkpoints/metadata_000002162688.json @@ -0,0 +1 @@ +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.329902555050262} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json new file mode 100644 index 0000000000000000000000000000000000000000..1d1e1edbb8b6e18f94d1c19deb9202c5c7f171f6 --- /dev/null +++ b/checkpoints/metadata_000002359296.json @@ -0,0 +1 @@ +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.320310576133869} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..fd42a8ebc9ddaffbd36c212093630fc3172c61c3 --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.306776841127029} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json new file mode 100644 index 0000000000000000000000000000000000000000..8f1da07a7f1cbae833339976071a0ebbb62362ad --- /dev/null +++ b/checkpoints/metadata_000002883584.json @@ -0,0 +1 @@ +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.291280306903579} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json new file mode 100644 index 0000000000000000000000000000000000000000..345e2f2aa3e4193f9f2bcf6ed0c2e51cff8a9aa0 --- /dev/null +++ b/checkpoints/metadata_000003178496.json @@ -0,0 +1 @@ +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.272834772368334} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json new file mode 100644 index 0000000000000000000000000000000000000000..3461b5d3fa181d7d5bde88b58f797a700b2c718d --- /dev/null +++ b/checkpoints/metadata_000003473408.json @@ -0,0 +1 @@ +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.251895300224053} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json new file mode 100644 index 0000000000000000000000000000000000000000..d07cf25e987a3534431a8237a5155606e3149831 --- /dev/null +++ b/checkpoints/metadata_000003833856.json @@ -0,0 +1 @@ +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.223635847809346} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json new file mode 100644 index 0000000000000000000000000000000000000000..acbc418410e214b41cd2375adaef9cbd85b4b957 --- /dev/null +++ b/checkpoints/metadata_000004227072.json @@ -0,0 +1 @@ +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.18756683950548} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json new file mode 100644 index 0000000000000000000000000000000000000000..3e3b1c76c515a881dfdadfa496c8d9fa4ea5bce6 --- /dev/null +++ b/checkpoints/metadata_000004653056.json @@ -0,0 +1 @@ +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.143479916836643} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json new file mode 100644 index 0000000000000000000000000000000000000000..cd42f29dcb3d2478943013c8359f051eec501cd1 --- /dev/null +++ b/checkpoints/metadata_000005111808.json @@ -0,0 +1 @@ +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.09093993966237} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json new file mode 100644 index 0000000000000000000000000000000000000000..626146dfa4071c678b7c3034725a660020fd28d2 --- /dev/null +++ b/checkpoints/metadata_000005603328.json @@ -0,0 +1 @@ +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.02597446004432} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json new file mode 100644 index 0000000000000000000000000000000000000000..b59454cf89287edb921168597792e319fea016bd --- /dev/null +++ b/checkpoints/metadata_000006193152.json @@ -0,0 +1 @@ +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.937735552300891} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json new file mode 100644 index 0000000000000000000000000000000000000000..9a0c4d9e0e4c38a735ccfcb8b6c66a432d6747b8 --- /dev/null +++ b/checkpoints/metadata_000006782976.json @@ -0,0 +1 @@ +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.839895830130107} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json new file mode 100644 index 0000000000000000000000000000000000000000..e16ce59a28057a0c3a28e1192400972911c20254 --- /dev/null +++ b/checkpoints/metadata_000007471104.json @@ -0,0 +1 @@ +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.712325186414569} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json new file mode 100644 index 0000000000000000000000000000000000000000..1915fc5248988ebbccf0b5495b27a83a86553fd1 --- /dev/null +++ b/checkpoints/metadata_000008224768.json @@ -0,0 +1 @@ +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.559793606155328} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json new file mode 100644 index 0000000000000000000000000000000000000000..1ad96057760edfc9d7bf0957251bcb8615bc7932 --- /dev/null +++ b/checkpoints/metadata_000009043968.json @@ -0,0 +1 @@ +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.37848964639277} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json new file mode 100644 index 0000000000000000000000000000000000000000..68b9a0600dabe4a7d53ce8569170c4598c7536a4 --- /dev/null +++ b/checkpoints/metadata_000009961472.json @@ -0,0 +1 @@ +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.159768193631905} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json new file mode 100644 index 0000000000000000000000000000000000000000..03c58d3eee782c313d898e5e343dd52667125863 --- /dev/null +++ b/checkpoints/metadata_000010944512.json @@ -0,0 +1 @@ +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.916276569890632} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json new file mode 100644 index 0000000000000000000000000000000000000000..62a8ab833831946d977bc03172905081d948f0be --- /dev/null +++ b/checkpoints/metadata_000012058624.json @@ -0,0 +1 @@ +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.639004766891704} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json new file mode 100644 index 0000000000000000000000000000000000000000..aee78a8d486c8cda680484e6ca15ab5bdfb7f777 --- /dev/null +++ b/checkpoints/metadata_000013271040.json @@ -0,0 +1 @@ +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.342905782277333} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json new file mode 100644 index 0000000000000000000000000000000000000000..e944648c78fe45e3b9cc83139e7b7fa32f8eb253 --- /dev/null +++ b/checkpoints/metadata_000014581760.json @@ -0,0 +1 @@ +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.04564433660178} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json new file mode 100644 index 0000000000000000000000000000000000000000..d8549ff87320d3783b0d628cf79b5a82fd14d0c2 --- /dev/null +++ b/checkpoints/metadata_000016056320.json @@ -0,0 +1 @@ +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.753923420557582} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..1427216e6cf0261772596d611245c34a8751866d --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.69892857486997} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json new file mode 100644 index 0000000000000000000000000000000000000000..5cfeea53c5b21a99267a4f4c5299109af5ba9cfe --- /dev/null +++ b/checkpoints/metadata_000017661952.json @@ -0,0 +1 @@ +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.474507673806345} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb6fc3b225092a24ec1dcd5773d59b5665d0c17 --- /dev/null +++ b/checkpoints/metadata_000019431424.json @@ -0,0 +1 @@ +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.193267811340626} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json new file mode 100644 index 0000000000000000000000000000000000000000..7aaa32801a481b21d9a0fd83e0ca66260fff2349 --- /dev/null +++ b/checkpoints/metadata_000021364736.json @@ -0,0 +1 @@ +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.930870263745759} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json new file mode 100644 index 0000000000000000000000000000000000000000..7fe08ef76e23c583ec64eebe7f6712f3b5a28d91 --- /dev/null +++ b/checkpoints/metadata_000023494656.json @@ -0,0 +1 @@ +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.663814045828802} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json new file mode 100644 index 0000000000000000000000000000000000000000..a011ea534120dc4d599ea96c51bbed057c568c4f --- /dev/null +++ b/checkpoints/metadata_000025853952.json @@ -0,0 +1 @@ +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.410301177091514} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json new file mode 100644 index 0000000000000000000000000000000000000000..a4b63db1cc264dba383b0a1ca7506f10a6b74ae9 --- /dev/null +++ b/checkpoints/metadata_000028442624.json @@ -0,0 +1 @@ +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.198303353421292} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json new file mode 100644 index 0000000000000000000000000000000000000000..5bb6ef3d89b9de3e63dd47f58d6edbc4b8bd0b65 --- /dev/null +++ b/checkpoints/metadata_000031293440.json @@ -0,0 +1 @@ +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.015554921984739} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..24482c2d1d23194981339e62ed83127217ce0573 --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.934235169144132} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json new file mode 100644 index 0000000000000000000000000000000000000000..267ff6967fafe2da603c4ecc87c1ac4f1632d245 --- /dev/null +++ b/checkpoints/metadata_000034439168.json @@ -0,0 +1 @@ +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.86148603257462} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json new file mode 100644 index 0000000000000000000000000000000000000000..57d6aa9ac77fd535241a981f2be76065e423f390 --- /dev/null +++ b/checkpoints/metadata_000037879808.json @@ -0,0 +1 @@ +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.722394680608445} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json new file mode 100644 index 0000000000000000000000000000000000000000..1eebba8516fcfbd47e35b86e5f8a3e0984011e3e --- /dev/null +++ b/checkpoints/metadata_000041648128.json @@ -0,0 +1 @@ +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.595662854172081} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json new file mode 100644 index 0000000000000000000000000000000000000000..0c90f85c240a22a2b1be737d4de70ba99fe517ae --- /dev/null +++ b/checkpoints/metadata_000045842432.json @@ -0,0 +1 @@ +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.481559532027179} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..a9a691d4f182b481dc668b60c3cac8d09a84e200 --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.410202518456709} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json new file mode 100644 index 0000000000000000000000000000000000000000..0c81a1ab77a917f88ae7f316f240b97ccef848a7 --- /dev/null +++ b/checkpoints/metadata_000050397184.json @@ -0,0 +1 @@ +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.391036377053482} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json new file mode 100644 index 0000000000000000000000000000000000000000..79d9a5356b9975307a237ff04bd3dea02eddf8aa --- /dev/null +++ b/checkpoints/metadata_000055443456.json @@ -0,0 +1 @@ +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.264752251073995} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json new file mode 100644 index 0000000000000000000000000000000000000000..22e34a02b9a7c23d2f210da6b908f134dfae41db --- /dev/null +++ b/checkpoints/metadata_000061014016.json @@ -0,0 +1 @@ +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.16835314900116} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..d05db5e3724dcf6795bd53448b9e2f313aabc5d6 --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.128405205875491} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json new file mode 100644 index 0000000000000000000000000000000000000000..cfa5c676396a1fe7df890da03498ac70108a96bf --- /dev/null +++ b/checkpoints/metadata_000067108864.json @@ -0,0 +1 @@ +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.106572343088341} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json new file mode 100644 index 0000000000000000000000000000000000000000..577a61e7924e558ffe22c0f100e1b65db35e748a --- /dev/null +++ b/checkpoints/metadata_000073826304.json @@ -0,0 +1 @@ +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.006926012807003} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json new file mode 100644 index 0000000000000000000000000000000000000000..0974ac2c3474727ea1e877924a5abee36bda0789 --- /dev/null +++ b/checkpoints/metadata_000081199104.json @@ -0,0 +1 @@ +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.933177476443977} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..37c644dd10bcfa36af1355d9444d5a282d676c54 --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.9208537116448525} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json new file mode 100644 index 0000000000000000000000000000000000000000..803829a072be9dd5a6902d342289a827798ae5b1 --- /dev/null +++ b/checkpoints/metadata_000089325568.json @@ -0,0 +1 @@ +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.848176349333717} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json new file mode 100644 index 0000000000000000000000000000000000000000..db96bbdba87e549350965e1db7072d5b6a45df87 --- /dev/null +++ b/checkpoints/metadata_000098271232.json @@ -0,0 +1 @@ +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.779301594018391} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..ba5fcd2ac40858e7446e355e89c2e38ce5f61bc7 --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.77879907967275} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json new file mode 100644 index 0000000000000000000000000000000000000000..262cd1f20621433949c6ad8739614e0b83365066 --- /dev/null +++ b/checkpoints/metadata_000108068864.json @@ -0,0 +1 @@ +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.708962575452245} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc0fd501740b749aa895dbe630b6439638f39d9 --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.6614505142477745} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json new file mode 100644 index 0000000000000000000000000000000000000000..304ff9a35ea9cabc45dadf84b8148641e0776b9f --- /dev/null +++ b/checkpoints/metadata_000118882304.json @@ -0,0 +1 @@ +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.656248985245747} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json new file mode 100644 index 0000000000000000000000000000000000000000..418c765b6fa8be5ea68af850f3e2f7221d0d83ee --- /dev/null +++ b/checkpoints/metadata_000130777088.json @@ -0,0 +1 @@ +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.588666820514513} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e3f755388647db5a6fb12409596c777ef8362e --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.591545362584007} \ No newline at end of file diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json new file mode 100644 index 0000000000000000000000000000000000000000..d077f65ced40a53329162cd85b833933a06aeb35 --- /dev/null +++ b/checkpoints/metadata_000143851520.json @@ -0,0 +1 @@ +{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.536326406227966} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json new file mode 100644 index 0000000000000000000000000000000000000000..97a9eefc9b32968dcfd71d75fde8013a63ab197a --- /dev/null +++ b/checkpoints/metadata_000147456000.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.521793252642749} \ No newline at end of file diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json new file mode 100644 index 0000000000000000000000000000000000000000..fc8bb11671ff46b5b31463a908ee8f6beb4da4e5 --- /dev/null +++ b/checkpoints/metadata_000158269440.json @@ -0,0 +1 @@ +{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.482066852492843} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json new file mode 100644 index 0000000000000000000000000000000000000000..db2cf36ac741c65d70603e1a1f527ee1f72c5045 --- /dev/null +++ b/checkpoints/metadata_000163840000.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.452723213494113} \ No newline at end of file diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json new file mode 100644 index 0000000000000000000000000000000000000000..46994e9a97061d36fc03a72c3bb8171324c20ac2 --- /dev/null +++ b/checkpoints/metadata_000174096384.json @@ -0,0 +1 @@ +{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.427388350671607} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json new file mode 100644 index 0000000000000000000000000000000000000000..09eec1b598cd9b35bc58d39250df3907678a427b --- /dev/null +++ b/checkpoints/metadata_000180224000.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.406859341510559} \ No newline at end of file diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json new file mode 100644 index 0000000000000000000000000000000000000000..46f4e64c36233c6cc826a99daa46e04a5e3b46df --- /dev/null +++ b/checkpoints/metadata_000191496192.json @@ -0,0 +1 @@ +{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.384557168951904} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json new file mode 100644 index 0000000000000000000000000000000000000000..a3155b275fa94aa86e6d38e263a595b5c3328b6d --- /dev/null +++ b/checkpoints/metadata_000196608000.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.364176421519139} \ No newline at end of file diff --git a/checkpoints/metadata_000196706304.json b/checkpoints/metadata_000196706304.json new file mode 100644 index 0000000000000000000000000000000000000000..22d7a852c860ffa22fa0771fd30ba1890c17aa66 --- /dev/null +++ b/checkpoints/metadata_000196706304.json @@ -0,0 +1 @@ +{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3663209643265555} \ No newline at end of file diff --git a/checkpoints/metadata_000197361664.json b/checkpoints/metadata_000197361664.json new file mode 100644 index 0000000000000000000000000000000000000000..aab962daac931b5874b807850d07d51366481e8c --- /dev/null +++ b/checkpoints/metadata_000197361664.json @@ -0,0 +1 @@ +{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.37131507577088} \ No newline at end of file diff --git a/checkpoints/metadata_000198017024.json b/checkpoints/metadata_000198017024.json new file mode 100644 index 0000000000000000000000000000000000000000..b353f007fa661d5ec7154d1c8a1110f69cd6eb2b --- /dev/null +++ b/checkpoints/metadata_000198017024.json @@ -0,0 +1 @@ +{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.365107339055634} \ No newline at end of file diff --git a/checkpoints/metadata_000198672384.json b/checkpoints/metadata_000198672384.json new file mode 100644 index 0000000000000000000000000000000000000000..0886019848f5fd1a749720ef3c00f98938b68feb --- /dev/null +++ b/checkpoints/metadata_000198672384.json @@ -0,0 +1 @@ +{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.357320301139362} \ No newline at end of file diff --git a/checkpoints/metadata_000199327744.json b/checkpoints/metadata_000199327744.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6675d3cfb9a65b14b5caa680e32a94ce70f50b --- /dev/null +++ b/checkpoints/metadata_000199327744.json @@ -0,0 +1 @@ +{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3636480457425755} \ No newline at end of file diff --git a/checkpoints/metadata_000199950336.json b/checkpoints/metadata_000199950336.json new file mode 100644 index 0000000000000000000000000000000000000000..aca77bc4b09d7bfe2a14dc908d8f4db476053780 --- /dev/null +++ b/checkpoints/metadata_000199950336.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.355453716313937} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..2eb37505200ea7e844a42afcd0fce1f2dbd9c75e --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d514be3a7160ea382fb0adcc68308b55ab4e6b3e5fa6e95222dd404be1bfe034 +size 19966357 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..e13170c003647e93ed030698391f7061433f9b0f --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6cfa5f56c1bf623ca9dea05738d704da8d7b8b678c3890dd2cf64933b23bc6 +size 19966357 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..30f33a93a44ccaa8b4884d9de8275d7b56d6f1d1 --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ee97aa2e7544cf0725704efb74a3395317e817d79304ae67c8882dd5a8bd14 +size 19966357 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..87bd36d7a50ff7a4fe0afc721387716f73391858 --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e53a54058442f8a083bc6f9c80e05feeddf23fa62a7d176ab67073bc9da36a6b +size 19966357 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..6323fdef3e4387dfff501aa874f6a02651f9f504 --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572a20f1591daaf4e54ac31fdc8518560b1d72a7aec156bad51b50588cc9ba9e +size 19966357 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc36e3b47ae5eea0f62788fa22e1a1c9e243d2d1 --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81609f965d1fc261c3d814a0cc3b7085db3fe820bf4af509b0cb1de2d96b0eb +size 19966357 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..25c7afa8298123f261ce7c33c107c937693d3d33 --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40f261a3987300f4f4b9acb295e0018613c1b922aa39906b36da2b59da3df14 +size 19966357 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt new file mode 100644 index 0000000000000000000000000000000000000000..a14b11b70b4b0cd9209832674ef8d88148376630 --- /dev/null +++ b/checkpoints/model_weights_000000622592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c261f32f6a072760c85492a491a58ed6a4bdf29694fcddece54f92b9d6029023 +size 19966357 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63fa831ab864f1a9509a28833a712446b5023ca --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c3f6c179a6f53b7cd890d41ff8b0a2506fa2ea2909992106d197115ddcf6f4 +size 19966357 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..d14d693db6618393cb5a9a501060387b92c96971 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbba18c1c4016464f1579e9fe5aaa0076eb8ac8da9d470d0320099f8ed28454f +size 19966357 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ffaa70194dc51f6e091ddf150b83caee93007a3 --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ef4ee5aae7846efe8e71c53da39e94b92931a71ac540efa1ef75fdd6079807a +size 19966357 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt new file mode 100644 index 0000000000000000000000000000000000000000..abdc14b26758da9da85a69aba614360a1da13f63 --- /dev/null +++ b/checkpoints/model_weights_000000917504.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf76c3be0697b211ab62715df6877c409a173773f8b0e5c9828c7c6181edbee +size 19966357 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt new file mode 100644 index 0000000000000000000000000000000000000000..2855fc429031b05c4c1192b35ff4e48cfebbc6dd --- /dev/null +++ b/checkpoints/model_weights_000000983040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5660d71b130882e1021898a0919d5bc0fc1faf92799a3e2b402a950de9fef8e +size 19966357 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..04c4aca53e1472f437053cd61a4b3fc1ed05114c --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf426b34f0944a4a503afcdf39b4c4ddb141a5a1053f5705b588953236575213 +size 19966357 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddeb01fe678b19ff4f6c41ac4cf9a233a0631a21 --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe195777e8b88fbc34ed8e0477fc9c96a03156ceddae422023241d4e854d53f +size 19966357 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e91e63d89d75701dfd596dcd640d61226c19b4f --- /dev/null +++ b/checkpoints/model_weights_000001343488.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07b0d4d4999453cace23112f548e93a8cd7ecd58b4c656ae54ebee10695d08d3 +size 19966357 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a9f34a2203b6a99a64fbefcc87e6286bdafb511 --- /dev/null +++ b/checkpoints/model_weights_000001474560.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ececc03d342cc8505c907bd581fd67e9a1011c157d1fc6952409877183b7932 +size 19966357 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d1aaa9bda252bccedb304eb975e9d0cf59a8441 --- /dev/null +++ b/checkpoints/model_weights_000001605632.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df4efdf61785a4f381c4b0b43903458518e2f23e4aad77af0c4da216803bded +size 19966357 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcbe599d7f5d4ca4cb846842072571bfbb634b0e --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02e97362897c0f329979e1303c72340afbf293ce2a6a9bce7cb887a17bcc12b9 +size 19966357 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b590a37e42b554f83793d4ebd5dc99d3418f204 --- /dev/null +++ b/checkpoints/model_weights_000001966080.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd5ab740eeed841329b8bf0fb001a41cfadf8f32a28ebb76492292ee247ab03 +size 19966357 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3b2e742c1d157699a4903956fc6142ea28cc615 --- /dev/null +++ b/checkpoints/model_weights_000002162688.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3d78f57db53786000941b896e279a5f1d7095f8839b9f11008e657094c5c7d5 +size 19966357 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt new file mode 100644 index 0000000000000000000000000000000000000000..8005aa06d856fc3aac27eb66a0041d00625355f9 --- /dev/null +++ b/checkpoints/model_weights_000002359296.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e64fa44256299a0e5bc6ab253aa902ff95d297448b7c9a67290bccda0d3963 +size 19966357 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb8d45c5f8fb3c0460801268293c8c636c9f534c --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a75c46529f70350a1572d7c576990da5ea98504937c7ce08dc0f226248100c +size 19966357 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt new file mode 100644 index 0000000000000000000000000000000000000000..12413474e184dd0b6629873f2250736617bf97ed --- /dev/null +++ b/checkpoints/model_weights_000002883584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f7c32abc6ad0236b0282743cd6d58744fb7d21469ad0eafe3b44ef823bb7d96 +size 19966357 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt new file mode 100644 index 0000000000000000000000000000000000000000..eab6abb29edb8d233e78a2610accc36603a6155d --- /dev/null +++ b/checkpoints/model_weights_000003178496.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521aae92df17377fc0f50196dd6cf6e7042eac3c7b1364133e471cab8dd62381 +size 19966357 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt new file mode 100644 index 0000000000000000000000000000000000000000..db24f032d4ee8d9822e3e94a6f524cbef3eac423 --- /dev/null +++ b/checkpoints/model_weights_000003473408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e49524260953aac124f843009149704806bbf6b657cd386a82243049251fdf2 +size 19966357 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt new file mode 100644 index 0000000000000000000000000000000000000000..dab9359c1ede0b187feaacdcda8f326dfe15d5d5 --- /dev/null +++ b/checkpoints/model_weights_000003833856.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f649e7aa3b01573964b416409f164f29af52ecdd8a4a5ed5c7e7476fd7177d4 +size 19966357 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f7f28c4e4db32374fabbf79976439695683daab --- /dev/null +++ b/checkpoints/model_weights_000004227072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0631350a6aedd277a7fc580ddf231fdb955382821528091a137c34e6d3fe4a +size 19966357 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c07551ce44b26adecc7d046e1ef88b8fe25f33f --- /dev/null +++ b/checkpoints/model_weights_000004653056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad4208cd0cdfbb262d997c3eb1904af28ccd472be0fcc8b2dc837ea4c29fa12 +size 19966357 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt new file mode 100644 index 0000000000000000000000000000000000000000..c532202b56a43a92d3ff9562522123d89832f56f --- /dev/null +++ b/checkpoints/model_weights_000005111808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d4314ddc9e23f2e83a5847abcea04e705311a4e1df6ab4b16eac1cb9261751 +size 19966357 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfc73cf8fde79efc23933dac478d7d7962d3f815 --- /dev/null +++ b/checkpoints/model_weights_000005603328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e297db998acc2ed290a837474ee15f0e59363cc3acc3cd2377d9cbd3cb0829b +size 19966357 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt new file mode 100644 index 0000000000000000000000000000000000000000..61f227b588c4034a5f421e6ba0e09e355c3a4487 --- /dev/null +++ b/checkpoints/model_weights_000006193152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd5963e5b4969f676d146e32a4df0107848d86fb38914b4acf439085f08d815 +size 19966357 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e74f1ab6318b838d57fb665945c7514426a1154 --- /dev/null +++ b/checkpoints/model_weights_000006782976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e043b043f1732b40cd2afe2520fbd6363fcb2f223e80c242447ce1d7b3374f4f +size 19966357 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt new file mode 100644 index 0000000000000000000000000000000000000000..09dea484dcfd232c52e44b8f1da4e9302ef7b649 --- /dev/null +++ b/checkpoints/model_weights_000007471104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a935ba98101e794d0f8cd284b7fab877bc799f1f8f1fa328c9049e7607772846 +size 19966357 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt new file mode 100644 index 0000000000000000000000000000000000000000..91e33cf67e4b74592f81dbacb7fcb432e8241ea4 --- /dev/null +++ b/checkpoints/model_weights_000008224768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583ac89b8768725fcea33c746e1ca1abcaf06b53c4a1acc074dbba9d271b23e2 +size 19966357 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb1eecfa634b5d954533958a9c4dfd7261806c97 --- /dev/null +++ b/checkpoints/model_weights_000009043968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5e5a0fa693a1401f5e359b7e836ace90dc59cade341835ce4b54266d5c7ea9 +size 19966357 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt new file mode 100644 index 0000000000000000000000000000000000000000..32173421a909bc7cf1e6816198d0e29c2c4fd466 --- /dev/null +++ b/checkpoints/model_weights_000009961472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c447cbf49f33d8b6c9ac55a60072c2455cc045e77afc18ad1cb18c1fdacc202d +size 19966357 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt new file mode 100644 index 0000000000000000000000000000000000000000..65bace7400e71787edce7d9fe97ae0064a393f0c --- /dev/null +++ b/checkpoints/model_weights_000010944512.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db989d60b457ba000ee5beb178ca1c500bc11541a0ab4694e7b65c07bf8a6246 +size 19966357 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c26248f62dc70c3db6d42a29f6dc1b33bbbcebf --- /dev/null +++ b/checkpoints/model_weights_000012058624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08a3318b324adbccd9a6b730788049c3c9ec54789181fce82fef889de8339d1 +size 19966357 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a979c46fb36adf9a1d4552eb926ec1909c858d7 --- /dev/null +++ b/checkpoints/model_weights_000013271040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e590414241fcfab91880a773e3555b914ba177e92d6c4352010c8b1e7cce716 +size 19966357 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt new file mode 100644 index 0000000000000000000000000000000000000000..411536729401f218524deb06f9f8d08b133c1654 --- /dev/null +++ b/checkpoints/model_weights_000014581760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6700b418a7901e09fa6cd7864e2bde7b25e7e28b0bc944b1ea64a0ae411590 +size 19966357 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf5604dc3b418f9dca6c192958db12a5b668278e --- /dev/null +++ b/checkpoints/model_weights_000016056320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0415cb2bad3dcb1af93391f18746f601127ed0ab1fcec95791bb412b456f22d +size 19966357 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e662fc9873123d040259dc55ea3fdb580b411b92 --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef4cd59b0dab911cd79b37930a3e03af4c481dad90cef0ae5efce19383b131ca +size 19966357 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec07e3a868f82bd59b2e6c02972b1826b677a13d --- /dev/null +++ b/checkpoints/model_weights_000017661952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73bbfc5a42d7f9b5151fe303b87eb480e073854c10a85070fb4e4ec0a206e989 +size 19966357 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt new file mode 100644 index 0000000000000000000000000000000000000000..070600537ce6098ca014fa4001c967ca740096f8 --- /dev/null +++ b/checkpoints/model_weights_000019431424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a111cda26e1a35b690e00a700b140d715463d443048858e3b8f1432704cfaa +size 19966357 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8d22e4f3c21aa1ef825346d14facea6ddc3b880 --- /dev/null +++ b/checkpoints/model_weights_000021364736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8483ff52b1fbe39de3c477d4b04ac12f84c4354eefdc21bef846505710c27226 +size 19966357 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt new file mode 100644 index 0000000000000000000000000000000000000000..fab91df40f3be52888d5db0dc1ca56ab1b7d8c78 --- /dev/null +++ b/checkpoints/model_weights_000023494656.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54d6b159bcad936051c497b3ee7777b8b9faed84c53844e4e7a9de2a45c68dab +size 19966357 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt new file mode 100644 index 0000000000000000000000000000000000000000..72eb43706641ead170b33d1fd8b05daffc648ad5 --- /dev/null +++ b/checkpoints/model_weights_000025853952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71fd1ec11c9ab4c58da6d31a4ee5c2cbe4a8dcb37c2089e821bc45a2bf05d638 +size 19966357 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cd5a6fb61f1b402a1cb2c80a791bf09f3ea7a03 --- /dev/null +++ b/checkpoints/model_weights_000028442624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ecf3fa9eeab98225a3bdfa6ddbbe964ec623dade0bd8d37203363bdca064395 +size 19966357 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt new file mode 100644 index 0000000000000000000000000000000000000000..a13dc33a7759511f498efddb2a98e17a06a8fb9a --- /dev/null +++ b/checkpoints/model_weights_000031293440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5440a8e5f52cee785c57b25163551daf717b354e72c8c0bcdd08c61e7a13d098 +size 19966357 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d5833a6ae7a9dd6dc1a962c326c2c45237d3420 --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cadebed7e04c28d31b70cc6891d180f1b077bb826147364e945ddea20b265456 +size 19966357 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c5514730a126aef1cd0585a41beed35a8fb31a7 --- /dev/null +++ b/checkpoints/model_weights_000034439168.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1668c7f64b4ac815f09c02d22bdb9a41e518ee152564c2e94966335fb8fcf914 +size 19966357 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt new file mode 100644 index 0000000000000000000000000000000000000000..a89e924f1edeb6e5874bd68695002d5ee9eba72c --- /dev/null +++ b/checkpoints/model_weights_000037879808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e8f0689a3e1686179a62a5e39cc75d5876e4e76711153af673ed58f23cc24f +size 19966357 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt new file mode 100644 index 0000000000000000000000000000000000000000..6eee256d86dabe586531ae34fb389efe2954b0ab --- /dev/null +++ b/checkpoints/model_weights_000041648128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d552ea1ba3ca3984cf81190e44eb26a8dea42b6bc4fd07f1999c6e59064157c2 +size 19966357 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt new file mode 100644 index 0000000000000000000000000000000000000000..d85d7e97bac8e864f6bb83fa4cf8821589248ed9 --- /dev/null +++ b/checkpoints/model_weights_000045842432.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448a210bd74870cc83438c31f0c22376238ca3e1a5a90cd94b1ec72f683000da +size 19966357 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..488264ff86f178f2b3a3ea0d10f25dc25ca9c780 --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:397d52e5aa8b7e313b1aa3ad0a02d479f7f9a65090d042200ad158c405bf0798 +size 19966357 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt new file mode 100644 index 0000000000000000000000000000000000000000..f056cf28019a25a866e2d36d7565d1e07010472d --- /dev/null +++ b/checkpoints/model_weights_000050397184.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58adbebc817b65c63da4fc26338bd6b79cfe0f5398a53af8dd94a63434d2163f +size 19966357 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1e947d353bc47292d09283638ed5023ac2ecc27 --- /dev/null +++ b/checkpoints/model_weights_000055443456.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08c3513577d94f6c2520525b16b4ee7baa571f355464b6ee5285ef71cf84c730 +size 19966357 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt new file mode 100644 index 0000000000000000000000000000000000000000..33e6e1db25180ad61ebcc74952c0520e69e18740 --- /dev/null +++ b/checkpoints/model_weights_000061014016.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772caf62fd443536dd457f7a7eb40332145c49972b518e5c2692414eecfbbb8e +size 19966357 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1568000ca97552f3cc139e5719cde6a698be361 --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a3addcfec07d538752c330d072ce8a0faab42c2adb49eea979f70b4d45fc6f +size 19966357 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt new file mode 100644 index 0000000000000000000000000000000000000000..72497a690864b8e7539cbf87ab9f055a79c9832c --- /dev/null +++ b/checkpoints/model_weights_000067108864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b909cd0a17cc512647f500ec1ed586f85aed62733f10ccc9c83f2b755f0a9861 +size 19966357 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt new file mode 100644 index 0000000000000000000000000000000000000000..a05d3e56ab21016f7f0f2d28bffa461de8a42e63 --- /dev/null +++ b/checkpoints/model_weights_000073826304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb6db1c0236a3f9a71be1157320b2a500a8f851612ecba77c2604bee46d82adf +size 19966357 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac5a61aa65a4d06f0c765e6f2cbfa5332249efd5 --- /dev/null +++ b/checkpoints/model_weights_000081199104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58f8f7b8c1273fb29fc0a6b13171589d9f6787369348576a25ed68e791ea125a +size 19966357 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..27702de12df6f0a5f6f8d6206ab88a3468f8dc3e --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7072cad2213fde3adb31bf9a3e960c4101b0f386aa2e3092f069f31053b84b94 +size 19966357 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab72a832646f081e482a573d4ec24aa287f1e387 --- /dev/null +++ b/checkpoints/model_weights_000089325568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:963a0856d483c270bf3e27711d4f5f3dc1ef650b66233b983b9305e50e6262c6 +size 19966357 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8cfa7e7af716c32f303dfb0cbfc699c71bb2ee4 --- /dev/null +++ b/checkpoints/model_weights_000098271232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57de53e2a60cb4821b320f896ae4f5107e9587a6562af2c5f2cccd703127e5b6 +size 19966357 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..97409f5d8912644f4698a1af4f7ff1f21af28fe5 --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730863afe5a1645244d4090c7ed2953218381a9d19fa5983943ce463d089eb88 +size 19966357 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt new file mode 100644 index 0000000000000000000000000000000000000000..241d9636d959be1dcc5426a1496545952ee429c7 --- /dev/null +++ b/checkpoints/model_weights_000108068864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a269940cb28a3c5f774e7e4dca18775b77616889f7de3c8686ab4876a8756e +size 19966357 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3b761fda7bafa200d0f081f62b2d4dbfb4d60d3 --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6aed1b6e061b4f5956da295d5bc774ad84ebb0eb90597d89ba46786b9e5faf8 +size 19966357 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b83f5d101a6c5848643ac9adfed132b68e161f1 --- /dev/null +++ b/checkpoints/model_weights_000118882304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa70a5c07a3629d02d019faaa6aa2ec7d50f21bf12cadcc311355a5e4ef80a4 +size 19966357 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt new file mode 100644 index 0000000000000000000000000000000000000000..9340c911abd030ad92a5fafc47374f2bf43fd5c1 --- /dev/null +++ b/checkpoints/model_weights_000130777088.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ef2bbc756928f18c88816307bb0d960204762b8c4caad76df83e639519e438d +size 19966357 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a28607d900d61e0bbe77c041d0b199feabc28218 --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de1cfd263ca0133ac669a342bace34d7a92faae33dc52e7e32eec3af44d321b1 +size 19966357 diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d7c834645e836273d28f8a56454212c09694647 --- /dev/null +++ b/checkpoints/model_weights_000143851520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70dd7492b96d486061822cc62b8a99f33822288e3529c7dcd08155dc64720deb +size 19966357 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d37c64d46b85e452a7de37d0af4f62df1cb832a --- /dev/null +++ b/checkpoints/model_weights_000147456000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6632e6c941553fd54773a6629406169724eb5ae741ce8c3c4bc191606014ddf +size 19966357 diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt new file mode 100644 index 0000000000000000000000000000000000000000..d093261d2612ae93e7ab8a82df0a72de0636464f --- /dev/null +++ b/checkpoints/model_weights_000158269440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fdb55093c259880bcc126eeeac8df9831d4110125358cd17c1fdf12d7996680 +size 19966357 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3c89708d5a532b40cb921e08e51c4a43160f086 --- /dev/null +++ b/checkpoints/model_weights_000163840000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cfff32730574da6eec2d70bb0a8f57a22bdcfa0677f9ecba510b055caf10ccf +size 19966357 diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt new file mode 100644 index 0000000000000000000000000000000000000000..1290c858941c7b6ff55fe15bc26c3d0f2aabfe2c --- /dev/null +++ b/checkpoints/model_weights_000174096384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f39755f66c9a550e85dfdb37ee7e4dfd3dbbb8bc3c2e203ce26e802985091a4 +size 19966357 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt new file mode 100644 index 0000000000000000000000000000000000000000..29afb990ce34f318ec227cb05409cfdf56ec8462 --- /dev/null +++ b/checkpoints/model_weights_000180224000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45f116d1f401f2c9ad05eb380e77bfccad5a6c68062ece817768c6f0269c5b58 +size 19966357 diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt new file mode 100644 index 0000000000000000000000000000000000000000..804d27140f5aaba0a2bac8fa5440144c89c617ca --- /dev/null +++ b/checkpoints/model_weights_000191496192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8982ca505c5bacb0949028b3fd5d2e29fe0dbd5251ab306f2661294be24163 +size 19966357 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c201db0c5ef4e423bbab310c3db3d13d5c8a70eb --- /dev/null +++ b/checkpoints/model_weights_000196608000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49b61beb5e097f3fa4b85b38521abcd346eab7d39ceeabee3ad32c98c1a4a20f +size 19966357 diff --git a/checkpoints/model_weights_000196706304.pt b/checkpoints/model_weights_000196706304.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e1023b65398fff43fd188e1c91c938747c32a06 --- /dev/null +++ b/checkpoints/model_weights_000196706304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997f6848994cbf468ebc2f837315b43bbbbf6b807034614c0dd31d4b5b9c0fff +size 19966357 diff --git a/checkpoints/model_weights_000197361664.pt b/checkpoints/model_weights_000197361664.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd8be00bbfd45f40061138a3e570d139bdf1de0e --- /dev/null +++ b/checkpoints/model_weights_000197361664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9e5e5f6b680a7ce11ce23a4f7df21c2d765fdecbdaaa0436eee6d7d7936939 +size 19966357 diff --git a/checkpoints/model_weights_000198017024.pt b/checkpoints/model_weights_000198017024.pt new file mode 100644 index 0000000000000000000000000000000000000000..a36080b7be23597953e39a210d721129005e220a --- /dev/null +++ b/checkpoints/model_weights_000198017024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb64479fae58fdc8f49d0800ffe57acfed3ac0ce9d55ace358eaeb283e1e296 +size 19966357 diff --git a/checkpoints/model_weights_000198672384.pt b/checkpoints/model_weights_000198672384.pt new file mode 100644 index 0000000000000000000000000000000000000000..41b7d209978bcf0db54419160142e2011df1628a --- /dev/null +++ b/checkpoints/model_weights_000198672384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba45a8151768d1854737aaee96346b2c79a212fc5f6e5666fc8c341bc4541510 +size 19966357 diff --git a/checkpoints/model_weights_000199327744.pt b/checkpoints/model_weights_000199327744.pt new file mode 100644 index 0000000000000000000000000000000000000000..82901650318e691f874d6b059070dee9322508cb --- /dev/null +++ b/checkpoints/model_weights_000199327744.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:febb24f4157a00bbeeb7dcf56b4041fe49562819c8429a6d5013217a48b76070 +size 19966357 diff --git a/checkpoints/model_weights_000199950336.pt b/checkpoints/model_weights_000199950336.pt new file mode 100644 index 0000000000000000000000000000000000000000..aac20336ea769c717925645ad9fda9367e46215a --- /dev/null +++ b/checkpoints/model_weights_000199950336.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52661cdc404f6dc745d5d456f0b676aa203b58514a19d93d73e1c7e3f50704ac +size 19966357 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..2156c9a1ddd544dc1d8dc00cc83abb21ec351aaf --- /dev/null +++ b/config.toml @@ -0,0 +1,31 @@ +model_name = "pile_llama_dmodel" +n_layers = 2 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 32000 +dataset_name = "eoinf/pile_llama" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.001 +lr_vector = 0.0005 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.1 +save_log_checkpoints = true +d_model = 64 \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..c16be1b5905b28f0b6f9207337d7fc013dbba3dc --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:093ba52dc32301e988596bc876cd86bf9054fdc477263a53f920d8a5e1f5875c +size 19965879 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..aca77bc4b09d7bfe2a14dc908d8f4db476053780 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.355453716313937} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..51f84e66ee056263bca65dd1d883268196c33378 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eecc904a5d8e1329afa5802d33912fa6e60f8674766f8d8141e40de669d9660 +size 39937683 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f267f430e5b2294c4e308dc61c1c813beb7e718 --- /dev/null +++ b/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.models.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..4e191fce92d4fc2dbbac82d0125a9bcb6a88135f --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-27T13:42:32.565052008Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-02-27T13:42:32.797022254Z","level":"INFO","msg":"stream: created new stream","id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797085166Z","level":"INFO","msg":"stream: started","id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797094826Z","level":"INFO","msg":"writer: started","stream_id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797122373Z","level":"INFO","msg":"sender: started","stream_id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797166061Z","level":"INFO","msg":"handler: started","stream_id":"xgg2g05z"} +{"time":"2026-02-27T14:14:49.6300194Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.002007355}],"total_operations":1}} +{"time":"2026-02-27T14:14:50.04736134Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-27T14:14:50.391696957Z","level":"INFO","msg":"stream: closing","id":"xgg2g05z"} +{"time":"2026-02-27T14:14:50.391768471Z","level":"INFO","msg":"handler: closed","stream_id":"xgg2g05z"} +{"time":"2026-02-27T14:14:50.39180817Z","level":"INFO","msg":"sender: closed","stream_id":"xgg2g05z"} +{"time":"2026-02-27T14:14:50.39180817Z","level":"INFO","msg":"stream: closed","id":"xgg2g05z"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d5094b43877723353ea9d57ea6337d9391acdfc9 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,26 @@ +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Configure stats pid to 736 +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64/wandb/settings +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64/wandb/run-20260227_134232-xgg2g05z/logs/debug.log +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64/wandb/run-20260227_134232-xgg2g05z/logs/debug-internal.log +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:init():813] calling init triggers +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_dmodel', 'n_layers': 2, 'd_model': 64, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:init():854] starting backend +2026-02-27 13:42:32,551 INFO MainThread:736 [wandb_init.py:init():857] sending inform_init request +2026-02-27 13:42:32,561 INFO MainThread:736 [wandb_init.py:init():865] backend started and connected +2026-02-27 13:42:32,563 INFO MainThread:736 [wandb_init.py:init():936] updated telemetry +2026-02-27 13:42:32,567 INFO MainThread:736 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-02-27 13:42:33,491 INFO MainThread:736 [wandb_init.py:init():1011] starting run threads in backend +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_console_start():2506] atexit reg +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_redirect():2446] Redirects installed. +2026-02-27 13:42:33,611 INFO MainThread:736 [wandb_init.py:init():1049] run started, returning control to user process +2026-02-27 14:14:49,625 INFO MainThread:736 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/xgg2g05z +2026-02-27 14:14:49,627 INFO MainThread:736 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-02-27 14:14:49,628 INFO MainThread:736 [wandb_run.py:_restore():2453] restore +2026-02-27 14:14:49,628 INFO MainThread:736 [wandb_run.py:_restore():2459] restore done +2026-02-27 14:14:50,391 INFO MainThread:736 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260227_134232-xgg2g05z/files/config.yaml b/wandb/run-20260227_134232-xgg2g05z/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0216452b8b20755a47d8893b4b0c86cb31070891 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/files/config.yaml @@ -0,0 +1,140 @@ +_wandb: + value: + cli_version: 0.21.4 + e: + g56c3l0w0nf9nh07zyr2zn93htbfwhf6: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "134969348096" + email: tzfof8@gmail.com + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: d722bb952956265d0387df9c35a76703a66824ec + remote: https://github.com/jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-8bd59a2c-3013-13c5-bfdb-39c73a6c33fa + host: ne5ksb4j1i + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64 + startedAt: "2026-02-27T13:42:32.061506Z" + writerId: g56c3l0w0nf9nh07zyr2zn93htbfwhf6 + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.4 + "6": 4.56.1 + "12": 0.21.4 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.1 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 64 +d_vocab: + value: 32000 +data_seed: + value: 10 +dataset_name: + value: eoinf/pile_llama +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.001 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.0005 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: pile_llama_dmodel +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: "" +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20260227_134232-xgg2g05z/files/output.log b/wandb/run-20260227_134232-xgg2g05z/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..11e78d75d4eba450f89077fc7194f1d75b7dcfd2 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 2L, 64d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.001, Vector: 0.0005 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.3731 | Learning Rate: 0.000027 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.3507 | Learning Rate: 0.000055 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 10.3155 | Learning Rate: 0.000082 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 10.2658 | Learning Rate: 0.000109 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 10.1999 | Learning Rate: 0.000137 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 10.1141 | Learning Rate: 0.000164 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 10.0076 | Learning Rate: 0.000191 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 9.8789 | Learning Rate: 0.000219 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 9.7309 | Learning Rate: 0.000246 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 9.5672 | Learning Rate: 0.000273 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 9.3862 | Learning Rate: 0.000301 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 9.1923 | Learning Rate: 0.000328 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 8.9910 | Learning Rate: 0.000355 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 8.7869 | Learning Rate: 0.000383 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 8.5824 | Learning Rate: 0.000410 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 8.3820 | Learning Rate: 0.000437 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 8.1884 | Learning Rate: 0.000464 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 8.0121 | Learning Rate: 0.000492 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 7.8452 | Learning Rate: 0.000519 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 7.6989 | Learning Rate: 0.000546 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 7.5544 | Learning Rate: 0.000574 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 7.4150 | Learning Rate: 0.000601 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 7.2821 | Learning Rate: 0.000628 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 7.1612 | Learning Rate: 0.000656 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 7.0412 | Learning Rate: 0.000683 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 6.9382 | Learning Rate: 0.000710 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 6.8292 | Learning Rate: 0.000738 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 6.7306 | Learning Rate: 0.000765 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 6.6312 | Learning Rate: 0.000792 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 6.5371 | Learning Rate: 0.000820 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 6.4527 | Learning Rate: 0.000847 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 6.3808 | Learning Rate: 0.000874 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 6.3106 | Learning Rate: 0.000902 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 6.2492 | Learning Rate: 0.000929 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 6.1805 | Learning Rate: 0.000956 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 6.1249 | Learning Rate: 0.000984 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 6.0693 | Learning Rate: 0.001000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 6.0243 | Learning Rate: 0.001000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 5.9807 | Learning Rate: 0.001000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 5.9342 | Learning Rate: 0.001000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 5.8993 | Learning Rate: 0.001000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 5.8621 | Learning Rate: 0.001000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 5.8264 | Learning Rate: 0.001000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 5.7941 | Learning Rate: 0.001000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 5.7584 | Learning Rate: 0.001000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 5.7298 | Learning Rate: 0.001000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 5.6965 | Learning Rate: 0.001000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 5.6721 | Learning Rate: 0.001000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 5.6487 | Learning Rate: 0.001000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 5.6170 | Learning Rate: 0.001000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 5.5936 | Learning Rate: 0.001000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 5.5702 | Learning Rate: 0.001000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 5.5531 | Learning Rate: 0.001000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 5.5231 | Learning Rate: 0.001000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 5.5001 | Learning Rate: 0.001000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 5.4815 | Learning Rate: 0.001000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 5.4639 | Learning Rate: 0.001000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 5.4463 | Learning Rate: 0.001000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 5.4241 | Learning Rate: 0.001000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 5.4102 | Learning Rate: 0.001000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 5.3985 | Learning Rate: 0.001000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 5.3773 | Learning Rate: 0.001000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 5.3559 | Learning Rate: 0.001000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 5.3373 | Learning Rate: 0.001000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 5.3187 | Learning Rate: 0.001000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 5.3045 | Learning Rate: 0.001000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 5.2834 | Learning Rate: 0.001000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 5.2594 | Learning Rate: 0.001000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 5.2499 | Learning Rate: 0.001000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 5.2349 | Learning Rate: 0.001000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 5.2174 | Learning Rate: 0.001000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 5.2145 | Learning Rate: 0.001000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 5.1956 | Learning Rate: 0.001000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 5.1765 | Learning Rate: 0.001000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 5.1614 | Learning Rate: 0.001000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 5.1537 | Learning Rate: 0.001000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 5.1526 | Learning Rate: 0.001000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 5.1453 | Learning Rate: 0.001000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 5.1373 | Learning Rate: 0.001000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 5.1284 | Learning Rate: 0.001000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 5.1152 | Learning Rate: 0.001000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 5.1071 | Learning Rate: 0.001000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 5.0853 | Learning Rate: 0.001000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 5.0669 | Learning Rate: 0.001000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 5.0559 | Learning Rate: 0.001000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 5.0515 | Learning Rate: 0.001000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 5.0384 | Learning Rate: 0.001000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 5.0336 | Learning Rate: 0.001000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 5.0173 | Learning Rate: 0.001000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 5.0069 | Learning Rate: 0.001000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 4.9987 | Learning Rate: 0.001000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 4.9887 | Learning Rate: 0.001000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 4.9778 | Learning Rate: 0.001000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 4.9702 | Learning Rate: 0.001000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 4.9596 | Learning Rate: 0.001000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 4.9489 | Learning Rate: 0.001000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 4.9394 | Learning Rate: 0.001000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 4.9437 | Learning Rate: 0.001000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 4.9330 | Learning Rate: 0.001000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 4.9209 | Learning Rate: 0.001000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 4.9103 | Learning Rate: 0.001000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 4.9061 | Learning Rate: 0.001000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 4.8903 | Learning Rate: 0.001000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 4.8799 | Learning Rate: 0.001000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 4.8783 | Learning Rate: 0.001000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 4.8706 | Learning Rate: 0.001000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 4.8649 | Learning Rate: 0.001000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 4.8482 | Learning Rate: 0.001000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 4.8476 | Learning Rate: 0.001000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 4.8361 | Learning Rate: 0.001000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 4.8287 | Learning Rate: 0.001000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 4.8226 | Learning Rate: 0.001000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 4.8206 | Learning Rate: 0.001000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 4.8140 | Learning Rate: 0.001000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 4.8021 | Learning Rate: 0.001000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 4.7932 | Learning Rate: 0.001000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 4.7879 | Learning Rate: 0.001000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 4.7808 | Learning Rate: 0.001000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 4.7810 | Learning Rate: 0.001000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 4.7788 | Learning Rate: 0.001000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 4.7716 | Learning Rate: 0.001000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 4.7633 | Learning Rate: 0.001000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 4.7549 | Learning Rate: 0.001000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 4.7521 | Learning Rate: 0.001000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 4.7501 | Learning Rate: 0.001000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 4.7424 | Learning Rate: 0.001000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 4.7348 | Learning Rate: 0.001000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 4.7242 | Learning Rate: 0.001000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 4.7159 | Learning Rate: 0.001000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 4.7129 | Learning Rate: 0.001000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 4.7133 | Learning Rate: 0.001000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 4.7071 | Learning Rate: 0.001000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 4.6998 | Learning Rate: 0.001000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 4.7033 | Learning Rate: 0.001000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 4.6888 | Learning Rate: 0.001000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 4.6855 | Learning Rate: 0.001000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 4.6737 | Learning Rate: 0.001000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 4.6720 | Learning Rate: 0.001000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 4.6760 | Learning Rate: 0.001000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 4.6615 | Learning Rate: 0.001000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 4.6591 | Learning Rate: 0.001000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 4.6520 | Learning Rate: 0.001000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 4.6590 | Learning Rate: 0.001000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 4.6533 | Learning Rate: 0.001000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 4.6543 | Learning Rate: 0.001000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 4.6505 | Learning Rate: 0.001000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 4.6473 | Learning Rate: 0.001000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 4.6398 | Learning Rate: 0.001000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 4.6344 | Learning Rate: 0.001000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 4.6246 | Learning Rate: 0.001000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 4.6131 | Learning Rate: 0.001000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 4.6072 | Learning Rate: 0.001000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 4.6021 | Learning Rate: 0.001000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 4.6076 | Learning Rate: 0.001000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 4.6044 | Learning Rate: 0.001000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 4.6096 | Learning Rate: 0.001000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 4.5991 | Learning Rate: 0.001000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 4.6000 | Learning Rate: 0.001000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 4.5876 | Learning Rate: 0.001000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 4.5915 | Learning Rate: 0.001000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 4.5869 | Learning Rate: 0.001000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 4.5746 | Learning Rate: 0.001000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 4.5660 | Learning Rate: 0.001000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 4.5604 | Learning Rate: 0.001000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 4.5650 | Learning Rate: 0.001000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 4.5635 | Learning Rate: 0.001000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 4.5614 | Learning Rate: 0.001000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 4.5692 | Learning Rate: 0.001000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 4.5602 | Learning Rate: 0.001000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 4.5575 | Learning Rate: 0.001000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 4.5542 | Learning Rate: 0.001000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 4.5417 | Learning Rate: 0.001000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 4.5399 | Learning Rate: 0.001000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 4.5342 | Learning Rate: 0.001000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 4.5341 | Learning Rate: 0.001000 | Progress: 0.71680 +Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 4.5384 | Learning Rate: 0.001000 | Progress: 0.72090 +Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 4.5414 | Learning Rate: 0.001000 | Progress: 0.72499 +Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 4.5427 | Learning Rate: 0.001000 | Progress: 0.72909 +Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 4.5404 | Learning Rate: 0.001000 | Progress: 0.73318 +Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 4.5218 | Learning Rate: 0.001000 | Progress: 0.73728 +Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 4.5077 | Learning Rate: 0.001000 | Progress: 0.74138 +Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 4.5063 | Learning Rate: 0.001000 | Progress: 0.74547 +Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 4.4997 | Learning Rate: 0.001000 | Progress: 0.74957 +Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 4.5019 | Learning Rate: 0.001000 | Progress: 0.75366 +Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 4.4966 | Learning Rate: 0.001000 | Progress: 0.75776 +Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 4.4896 | Learning Rate: 0.001000 | Progress: 0.76186 +Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 4.4832 | Learning Rate: 0.001000 | Progress: 0.76595 +Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 4.4832 | Learning Rate: 0.001000 | Progress: 0.77005 +Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 4.4849 | Learning Rate: 0.001000 | Progress: 0.77414 +Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 4.4860 | Learning Rate: 0.001000 | Progress: 0.77824 +Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 4.4853 | Learning Rate: 0.001000 | Progress: 0.78234 +Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 4.4883 | Learning Rate: 0.001000 | Progress: 0.78643 +Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 4.4824 | Learning Rate: 0.001000 | Progress: 0.79053 +Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 4.4788 | Learning Rate: 0.001000 | Progress: 0.79462 +Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 4.4783 | Learning Rate: 0.001000 | Progress: 0.79872 +Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 4.4732 | Learning Rate: 0.001000 | Progress: 0.80282 +Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 4.4556 | Learning Rate: 0.001000 | Progress: 0.80691 +Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 4.4584 | Learning Rate: 0.001000 | Progress: 0.81101 +Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 4.4602 | Learning Rate: 0.001000 | Progress: 0.81510 +Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 4.4527 | Learning Rate: 0.001000 | Progress: 0.81920 +Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 4.4552 | Learning Rate: 0.001000 | Progress: 0.82330 +Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 4.4510 | Learning Rate: 0.001000 | Progress: 0.82739 +Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 4.4539 | Learning Rate: 0.001000 | Progress: 0.83149 +Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 4.4457 | Learning Rate: 0.001000 | Progress: 0.83558 +Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 4.4381 | Learning Rate: 0.001000 | Progress: 0.83968 +Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 4.4350 | Learning Rate: 0.001000 | Progress: 0.84378 +Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 4.4332 | Learning Rate: 0.001000 | Progress: 0.84787 +Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 4.4343 | Learning Rate: 0.001000 | Progress: 0.85197 +Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 4.4301 | Learning Rate: 0.001000 | Progress: 0.85606 +Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 4.4329 | Learning Rate: 0.001000 | Progress: 0.86016 +Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 4.4310 | Learning Rate: 0.001000 | Progress: 0.86426 +Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 4.4280 | Learning Rate: 0.001000 | Progress: 0.86835 +Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 4.4298 | Learning Rate: 0.001000 | Progress: 0.87245 +Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 4.4261 | Learning Rate: 0.001000 | Progress: 0.87654 +Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 4.4252 | Learning Rate: 0.001000 | Progress: 0.88064 +Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 4.4147 | Learning Rate: 0.001000 | Progress: 0.88474 +Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 4.4143 | Learning Rate: 0.001000 | Progress: 0.88883 +Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 4.4076 | Learning Rate: 0.001000 | Progress: 0.89293 +Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 4.4111 | Learning Rate: 0.001000 | Progress: 0.89702 +Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 4.4069 | Learning Rate: 0.001000 | Progress: 0.90112 +Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 4.4031 | Learning Rate: 0.001000 | Progress: 0.90522 +Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 4.3992 | Learning Rate: 0.001000 | Progress: 0.90931 +Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 4.4027 | Learning Rate: 0.001000 | Progress: 0.91341 +Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 4.4040 | Learning Rate: 0.001000 | Progress: 0.91750 +Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 4.3947 | Learning Rate: 0.001000 | Progress: 0.92160 +Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 4.3931 | Learning Rate: 0.001000 | Progress: 0.92570 +Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 4.3874 | Learning Rate: 0.001000 | Progress: 0.92979 +Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 4.3839 | Learning Rate: 0.001000 | Progress: 0.93389 +Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 4.3788 | Learning Rate: 0.001000 | Progress: 0.93798 +Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 4.3831 | Learning Rate: 0.001000 | Progress: 0.94208 +Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 4.3747 | Learning Rate: 0.001000 | Progress: 0.94618 +Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 4.3778 | Learning Rate: 0.001000 | Progress: 0.95027 +Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 4.3797 | Learning Rate: 0.001000 | Progress: 0.95437 +Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 4.3827 | Learning Rate: 0.001000 | Progress: 0.95846 +Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 4.3747 | Learning Rate: 0.001000 | Progress: 0.96256 +Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 4.3698 | Learning Rate: 0.001000 | Progress: 0.96666 +Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 4.3742 | Learning Rate: 0.001000 | Progress: 0.97075 +Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 4.3739 | Learning Rate: 0.001000 | Progress: 0.97485 +Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 4.3732 | Learning Rate: 0.001000 | Progress: 0.97894 +Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 4.3642 | Learning Rate: 0.001000 | Progress: 0.98304 +Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 4.3710 | Learning Rate: 0.001000 | Progress: 0.98714 +Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 4.3608 | Learning Rate: 0.001000 | Progress: 0.99123 +Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 4.3610 | Learning Rate: 0.001000 | Progress: 0.99533 +Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 4.3556 | Learning Rate: 0.001000 | Progress: 0.99942 diff --git a/wandb/run-20260227_134232-xgg2g05z/files/requirements.txt b/wandb/run-20260227_134232-xgg2g05z/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c32285d10ba18c2e783ff2ead305d5976caef668 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/files/requirements.txt @@ -0,0 +1,222 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +circuitsvis==1.43.3 +hf-xet==1.1.9 +param==2.2.1 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +asttokens==3.0.0 +filelock==3.19.1 +types-python-dateutil==2.9.0.20250822 +cycler==0.12.1 +stack-data==0.6.3 +jupyter_server==2.17.0 +aiosignal==1.4.0 +xyzservices==2025.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +mpmath==1.3.0 +seaborn==0.13.2 +wadler_lindig==0.1.7 +nbformat==5.10.4 +panel==1.8.0 +accelerate==1.10.1 +plotly==6.3.0 +narwhals==2.4.0 +huggingface-hub==0.34.4 +sentencepiece==0.2.1 +torchvision==0.23.0 +ipython==9.5.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +charset-normalizer==3.4.3 +jupyter-events==0.12.0 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +threadpoolctl==3.6.0 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +requests==2.32.5 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +rpds-py==0.27.1 +wandb==0.21.4 +jedi==0.19.2 +pathspec==0.12.1 +transformer-lens==2.16.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +jupyter==1.1.1 +protobuf==6.32.1 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +cfgv==3.4.0 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +beautifulsoup4==4.13.5 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +regex==2025.9.1 +pycparser==2.23 +soupsieve==2.8 +pytest-cov==7.0.0 +sniffio==1.3.1 +mypy==1.18.1 +notebook==7.4.5 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +zstandard==0.25.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +pyzmq==27.1.0 +jupyterlab==4.4.7 +toy_models==0.1.0 +torchaudio==2.8.0 +cffi==2.0.0 +mypy_extensions==1.1.0 +attrs==25.3.0 +statsmodels==0.14.6 +transformers==4.56.1 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +bokeh==3.8.0 +httpx==0.28.1 +setuptools==80.9.0 +argon2-cffi==25.1.0 +patsy==1.0.2 +multidict==6.6.4 +pyviz_comms==3.0.6 +arrow==1.3.0 +scikit-learn==1.8.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +markdown-it-py==4.0.0 +pandas==2.3.2 +virtualenv==20.34.0 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +nodeenv==1.9.1 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +Send2Trash==1.8.3 +coverage==7.10.6 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +json5==0.12.1 +linkify-it-py==2.0.3 +importlib_metadata==8.7.0 +nvidia-cufft-cu12==11.3.3.83 +distlib==0.4.0 +typing-inspection==0.4.1 +identify==2.6.14 +nvidia-cufile-cu12==1.13.1.3 +scipy==1.17.0 +mdurl==0.1.2 +websocket-client==1.8.0 +jsonschema==4.25.1 +python-json-logger==3.3.0 +typing_extensions==4.15.0 +tokenizers==0.22.0 +ipympl==0.9.7 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +h5py==3.14.0 +tabulate==0.9.0 +propcache==0.3.2 +ruff==0.13.0 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pluggy==1.6.0 +pydantic==2.11.7 +zipp==3.23.0 +fancy-einsum==0.0.3 +fastjsonschema==2.21.2 +datasets==4.0.0 +fonttools==4.59.2 +executing==2.2.1 +pillow==11.3.0 +uc-micro-py==1.0.3 +Markdown==3.9 +pre_commit==4.3.0 +aiohttp==3.12.15 +mistune==3.1.4 +tzdata==2025.2 +parso==0.8.5 +triton==3.4.0 +kiwisolver==1.4.9 +idna==3.10 +multiprocess==0.70.16 +dill==0.3.8 +jupyter-lsp==2.3.0 +platformdirs==4.4.0 +sentry-sdk==2.37.1 +prompt_toolkit==3.0.52 +jsonschema-specifications==2025.9.1 +pytest==8.4.2 +mdit-py-plugins==0.5.0 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +joblib==1.5.3 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +holoviews==1.21.0 +matplotlib==3.10.6 +colorcet==3.1.0 +uri-template==1.3.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +iniconfig==2.1.0 +traitlets==5.14.3 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20260227_134232-xgg2g05z/files/wandb-metadata.json b/wandb/run-20260227_134232-xgg2g05z/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..616da9240ac1478ca16660c7d6d2d50b8c839709 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2026-02-27T13:42:32.061506Z", + "program": "", + "git": { + "remote": "https://github.com/jgroh3/toy_models.git", + "commit": "d722bb952956265d0387df9c35a76703a66824ec" + }, + "email": "tzfof8@gmail.com", + "root": "/notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64", + "host": "ne5ksb4j1i", + "executable": "/notebooks/toy_models/.toy_models_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "134969348096" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-8bd59a2c-3013-13c5-bfdb-39c73a6c33fa" + } + ], + "cudaVersion": "12.4", + "writerId": "g56c3l0w0nf9nh07zyr2zn93htbfwhf6" +} \ No newline at end of file diff --git a/wandb/run-20260227_134232-xgg2g05z/files/wandb-summary.json b/wandb/run-20260227_134232-xgg2g05z/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e436d2c1a5ca7d6fc0e96c5040fcd4c5b56913d4 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":1936.136135714,"tokens_per_second":32768,"progress":0.999424,"_wandb":{"runtime":1936},"step":6100,"train_loss":4.2433905601501465,"train_loss_ewma":4.355554509012181,"_timestamp":1.7722016885923805e+09,"learning_rate":0.001,"_step":6100,"tokens_seen":199884800} \ No newline at end of file diff --git a/wandb/run-20260227_134232-xgg2g05z/logs/debug-core.log b/wandb/run-20260227_134232-xgg2g05z/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..53ce26780db0c6899de65dcf441dc264f14df8d0 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/logs/debug-core.log @@ -0,0 +1,16 @@ +{"time":"2026-02-27T13:42:32.386451069Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpyac6ju14/port-736.txt","pid":736,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-27T13:42:32.387071218Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":736} +{"time":"2026-02-27T13:42:32.387032177Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-736-795-1318082515/socket","Net":"unix"}} +{"time":"2026-02-27T13:42:32.551643921Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-27T13:42:32.56485775Z","level":"INFO","msg":"handleInformInit: received","streamId":"xgg2g05z","id":"1(@)"} +{"time":"2026-02-27T13:42:32.797085166Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"xgg2g05z","id":"1(@)"} +{"time":"2026-02-27T14:14:50.391682161Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"xgg2g05z","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395041057Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"xgg2g05z","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395055262Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395068402Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395113583Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395127503Z","level":"INFO","msg":"server is shutting down"} +{"time":"2026-02-27T14:14:50.39513038Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395145395Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2026-02-27T14:14:50.395229263Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-736-795-1318082515/socket","Net":"unix"}} +{"time":"2026-02-27T14:14:50.395252595Z","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20260227_134232-xgg2g05z/logs/debug-internal.log b/wandb/run-20260227_134232-xgg2g05z/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..4e191fce92d4fc2dbbac82d0125a9bcb6a88135f --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-27T13:42:32.565052008Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-02-27T13:42:32.797022254Z","level":"INFO","msg":"stream: created new stream","id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797085166Z","level":"INFO","msg":"stream: started","id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797094826Z","level":"INFO","msg":"writer: started","stream_id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797122373Z","level":"INFO","msg":"sender: started","stream_id":"xgg2g05z"} +{"time":"2026-02-27T13:42:32.797166061Z","level":"INFO","msg":"handler: started","stream_id":"xgg2g05z"} +{"time":"2026-02-27T14:14:49.6300194Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.002007355}],"total_operations":1}} +{"time":"2026-02-27T14:14:50.04736134Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-27T14:14:50.391696957Z","level":"INFO","msg":"stream: closing","id":"xgg2g05z"} +{"time":"2026-02-27T14:14:50.391768471Z","level":"INFO","msg":"handler: closed","stream_id":"xgg2g05z"} +{"time":"2026-02-27T14:14:50.39180817Z","level":"INFO","msg":"sender: closed","stream_id":"xgg2g05z"} +{"time":"2026-02-27T14:14:50.39180817Z","level":"INFO","msg":"stream: closed","id":"xgg2g05z"} diff --git a/wandb/run-20260227_134232-xgg2g05z/logs/debug.log b/wandb/run-20260227_134232-xgg2g05z/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d5094b43877723353ea9d57ea6337d9391acdfc9 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/logs/debug.log @@ -0,0 +1,26 @@ +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Configure stats pid to 736 +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64/wandb/settings +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64/wandb/run-20260227_134232-xgg2g05z/logs/debug.log +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/pile_llama_dmodel_d_model_64/wandb/run-20260227_134232-xgg2g05z/logs/debug-internal.log +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:init():813] calling init triggers +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_dmodel', 'n_layers': 2, 'd_model': 64, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-02-27 13:42:32,074 INFO MainThread:736 [wandb_init.py:init():854] starting backend +2026-02-27 13:42:32,551 INFO MainThread:736 [wandb_init.py:init():857] sending inform_init request +2026-02-27 13:42:32,561 INFO MainThread:736 [wandb_init.py:init():865] backend started and connected +2026-02-27 13:42:32,563 INFO MainThread:736 [wandb_init.py:init():936] updated telemetry +2026-02-27 13:42:32,567 INFO MainThread:736 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-02-27 13:42:33,491 INFO MainThread:736 [wandb_init.py:init():1011] starting run threads in backend +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_console_start():2506] atexit reg +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-02-27 13:42:33,601 INFO MainThread:736 [wandb_run.py:_redirect():2446] Redirects installed. +2026-02-27 13:42:33,611 INFO MainThread:736 [wandb_init.py:init():1049] run started, returning control to user process +2026-02-27 14:14:49,625 INFO MainThread:736 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/xgg2g05z +2026-02-27 14:14:49,627 INFO MainThread:736 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-02-27 14:14:49,628 INFO MainThread:736 [wandb_run.py:_restore():2453] restore +2026-02-27 14:14:49,628 INFO MainThread:736 [wandb_run.py:_restore():2459] restore done +2026-02-27 14:14:50,391 INFO MainThread:736 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260227_134232-xgg2g05z/run-xgg2g05z.wandb b/wandb/run-20260227_134232-xgg2g05z/run-xgg2g05z.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b2a328b0460fc736de3e7975e91447eb9e44bcb5 --- /dev/null +++ b/wandb/run-20260227_134232-xgg2g05z/run-xgg2g05z.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fe24dca260ad112bc6bacbe81bac3b3ad90ebc8cb2a5e53af95d91035caf143 +size 4179961