diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..8e6192126b8e99cc94951a04494d7307f1f66497 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20251101_050718-x0he2mby/run-x0he2mby.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..cab9ebf296514cff572e710f33752aabe422c07b --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.504129409790039} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..5bca96ec04bffa07fdb9122839b921bb0fc65f72 --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.495558547202545} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..7eec591e6be7118007fc0043c2812d480569d917 --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.492465802382986} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..88c2f5f6f4036372820ab63de4dec79a50e8b48e --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.48443794637458} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..3252dd237d025b783a393bc85bfaf5e39134a8bf --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.479435028092475} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..b1fead27dce33a61449645773efddabd92eb9077 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.473799117447781} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..bc37be4928c4c5cce76e2d627b4eefb651e59a04 --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.46143174252293} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1bb93f7e0e9a974f69439e8e58b2f04826e367 --- /dev/null +++ b/checkpoints/metadata_000000622592.json @@ -0,0 +1 @@ +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.447867588954788} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..f99e9ff9f9ee391a3570d93c9ccb723fae61839d --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.432014080788836} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..690ef2b0bf5c54d2bd8255f8b3fba1136ccf07da --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.415017053033225} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a026f8d9328da3b12fb322e0520e90362b761d --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.395785776391488} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json new file mode 100644 index 0000000000000000000000000000000000000000..f287066f5137173edf502325188c05d0f6aa5008 --- /dev/null +++ b/checkpoints/metadata_000000917504.json @@ -0,0 +1 @@ +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.367264762902195} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json new file mode 100644 index 0000000000000000000000000000000000000000..6b002d36fb8a81c1c45db84bd08400d275fbb8ea --- /dev/null +++ b/checkpoints/metadata_000000983040.json @@ -0,0 +1 @@ +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.347077211671865} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..a9687fee335d138105f1b973808622bd6c14702e --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.305477788660623} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..309d739eadb50fdb844e6d18f227fb45a7280f53 --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.271077942205011} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json new file mode 100644 index 0000000000000000000000000000000000000000..97583f0d542c39a08772135291d1047ce4c9964e --- /dev/null +++ b/checkpoints/metadata_000001343488.json @@ -0,0 +1 @@ +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.221748212046837} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json new file mode 100644 index 0000000000000000000000000000000000000000..06cead5907046cb008fb7b1bc0d18611b22c74da --- /dev/null +++ b/checkpoints/metadata_000001474560.json @@ -0,0 +1 @@ +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.167200491768405} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json new file mode 100644 index 0000000000000000000000000000000000000000..06ce5670b8a17dbf9ad973681c02cbf047e797c8 --- /dev/null +++ b/checkpoints/metadata_000001605632.json @@ -0,0 +1 @@ +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.107531033976013} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..9e0612bb8a13c7e220fff5007679615caf804a35 --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.026064940279571} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json new file mode 100644 index 0000000000000000000000000000000000000000..92abacb3fc648b0b45796c2f0eafd6f93e25a4e8 --- /dev/null +++ b/checkpoints/metadata_000001966080.json @@ -0,0 +1 @@ +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.92032294310719} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json new file mode 100644 index 0000000000000000000000000000000000000000..f0c2ecf8245c9909f600b1b8dd7f3bac59193e31 --- /dev/null +++ b/checkpoints/metadata_000002162688.json @@ -0,0 +1 @@ +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.801876731497748} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json new file mode 100644 index 0000000000000000000000000000000000000000..1c2f3b9b88121b89f5929864c1cd72cb1470b656 --- /dev/null +++ b/checkpoints/metadata_000002359296.json @@ -0,0 +1 @@ +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.675951440395803} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..40d8df0cdc63ee2e932cc7b3d04f5426055283d6 --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.49608959662518} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json new file mode 100644 index 0000000000000000000000000000000000000000..cac34a8a829cea684b6b29dddbabdb453705a2e1 --- /dev/null +++ b/checkpoints/metadata_000002883584.json @@ -0,0 +1 @@ +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.305093544243261} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json new file mode 100644 index 0000000000000000000000000000000000000000..09f5e308d7e7be26a570eb8a6db65e149f31f6eb --- /dev/null +++ b/checkpoints/metadata_000003178496.json @@ -0,0 +1 @@ +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.088921241499149} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json new file mode 100644 index 0000000000000000000000000000000000000000..7a9df3ea88f450b3de622bc750ae81ef8d30c561 --- /dev/null +++ b/checkpoints/metadata_000003473408.json @@ -0,0 +1 @@ +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.869608010287346} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json new file mode 100644 index 0000000000000000000000000000000000000000..440a9fa14930e9b0bb8684673c446e05b2f1884f --- /dev/null +++ b/checkpoints/metadata_000003833856.json @@ -0,0 +1 @@ +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.61439148235503} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json new file mode 100644 index 0000000000000000000000000000000000000000..b84dc925491ca3663ced18798b689350f8a130e2 --- /dev/null +++ b/checkpoints/metadata_000004227072.json @@ -0,0 +1 @@ +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.35317155220863} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json new file mode 100644 index 0000000000000000000000000000000000000000..ff6a6292b55638d38774da0a497c690b174e4593 --- /dev/null +++ b/checkpoints/metadata_000004653056.json @@ -0,0 +1 @@ +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.09067977685243} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json new file mode 100644 index 0000000000000000000000000000000000000000..12927eca5619c7bfba165471814fe8532d3d5dbd --- /dev/null +++ b/checkpoints/metadata_000005111808.json @@ -0,0 +1 @@ +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.82408076517334} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json new file mode 100644 index 0000000000000000000000000000000000000000..0414adb8a4230d13d3d120e886311c42d9017c05 --- /dev/null +++ b/checkpoints/metadata_000005603328.json @@ -0,0 +1 @@ +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.5609016385690495} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json new file mode 100644 index 0000000000000000000000000000000000000000..c265d436574677d7baed2457b78789869117dd11 --- /dev/null +++ b/checkpoints/metadata_000006193152.json @@ -0,0 +1 @@ +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.27145520178176} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json new file mode 100644 index 0000000000000000000000000000000000000000..edacab22ce286670ebfb9c257ad08fd60770b129 --- /dev/null +++ b/checkpoints/metadata_000006782976.json @@ -0,0 +1 @@ +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.010424915475919} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json new file mode 100644 index 0000000000000000000000000000000000000000..f06ec6b682c2f010fbbef3694e4efcdda9586e08 --- /dev/null +++ b/checkpoints/metadata_000007471104.json @@ -0,0 +1 @@ +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.74492772170685} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json new file mode 100644 index 0000000000000000000000000000000000000000..110d388f05986ce90d3a7a222371dd2e79fb9888 --- /dev/null +++ b/checkpoints/metadata_000008224768.json @@ -0,0 +1 @@ +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.488508960587569} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json new file mode 100644 index 0000000000000000000000000000000000000000..30d730dbf9560ee506e136e05cd969e363b76352 --- /dev/null +++ b/checkpoints/metadata_000009043968.json @@ -0,0 +1 @@ +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.244596883086281} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json new file mode 100644 index 0000000000000000000000000000000000000000..0f1e0ca79c2adcd22d5533eddaacfe56c2557904 --- /dev/null +++ b/checkpoints/metadata_000009961472.json @@ -0,0 +1 @@ +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.01574416153393} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json new file mode 100644 index 0000000000000000000000000000000000000000..28e8e10a62a4a5ff5e44e988b87e59fa679b3fed --- /dev/null +++ b/checkpoints/metadata_000010944512.json @@ -0,0 +1 @@ +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.799362957823811} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json new file mode 100644 index 0000000000000000000000000000000000000000..8e53d32d9fba2bcbfe1d1034d0ccc2cf60b8a84c --- /dev/null +++ b/checkpoints/metadata_000012058624.json @@ -0,0 +1 @@ +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.604772800413414} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json new file mode 100644 index 0000000000000000000000000000000000000000..673d21d0b94c5447acd9200c677f6160a6159654 --- /dev/null +++ b/checkpoints/metadata_000013271040.json @@ -0,0 +1 @@ +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.424803652632412} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json new file mode 100644 index 0000000000000000000000000000000000000000..8c23d38c072c67071c25490d6d39522d3289d29f --- /dev/null +++ b/checkpoints/metadata_000014581760.json @@ -0,0 +1 @@ +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.266946949866824} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json new file mode 100644 index 0000000000000000000000000000000000000000..a78fba29e5dc900ef672fe43a0968c2741adf396 --- /dev/null +++ b/checkpoints/metadata_000016056320.json @@ -0,0 +1 @@ +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.121108634885812} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..0b49771abcc1ec80ebdf990b7f03352cb53eb269 --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.0922184937286685} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json new file mode 100644 index 0000000000000000000000000000000000000000..2977d15940b186372a71aeb114b25b8c19fc53ac --- /dev/null +++ b/checkpoints/metadata_000017661952.json @@ -0,0 +1 @@ +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.992866171579676} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json new file mode 100644 index 0000000000000000000000000000000000000000..73a3de15cf9bf989c06a0e223a4010c16827f141 --- /dev/null +++ b/checkpoints/metadata_000019431424.json @@ -0,0 +1 @@ +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.878253635465713} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json new file mode 100644 index 0000000000000000000000000000000000000000..640c2ab5256ea4e0ca4f7d345fb899e0f6886d13 --- /dev/null +++ b/checkpoints/metadata_000021364736.json @@ -0,0 +1 @@ +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.7473133185118135} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json new file mode 100644 index 0000000000000000000000000000000000000000..11d7ec76a3e410d7a2268650dced6c7598ab3322 --- /dev/null +++ b/checkpoints/metadata_000023494656.json @@ -0,0 +1 @@ +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.627843826891225} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json new file mode 100644 index 0000000000000000000000000000000000000000..99c9605b1d199bb6f38f12f101c9cdab26832741 --- /dev/null +++ b/checkpoints/metadata_000025853952.json @@ -0,0 +1 @@ +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.504817329035256} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json new file mode 100644 index 0000000000000000000000000000000000000000..584cf93dd536bfcdc3640e218dc112de41cc842a --- /dev/null +++ b/checkpoints/metadata_000028442624.json @@ -0,0 +1 @@ +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3874906557707165} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json new file mode 100644 index 0000000000000000000000000000000000000000..2dcfc3f1ab0a51e609828af64a361e59bb2532a5 --- /dev/null +++ b/checkpoints/metadata_000031293440.json @@ -0,0 +1 @@ +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.284133535172052} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..5ea1a28050c80d88a4922aacc51e14ff03b73ed3 --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.2378749848380695} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json new file mode 100644 index 0000000000000000000000000000000000000000..c7ca00b31bc4884340b676de921b80bc1acfa8d8 --- /dev/null +++ b/checkpoints/metadata_000034439168.json @@ -0,0 +1 @@ +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.188817285499057} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json new file mode 100644 index 0000000000000000000000000000000000000000..593e4f70827db3ca295f6a0d8e6c4148eaf6d452 --- /dev/null +++ b/checkpoints/metadata_000037879808.json @@ -0,0 +1 @@ +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.098113963969436} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json new file mode 100644 index 0000000000000000000000000000000000000000..fb6a8c7e372b00a94af1b38e53c33b88068b7d36 --- /dev/null +++ b/checkpoints/metadata_000041648128.json @@ -0,0 +1 @@ +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.006975574043001} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json new file mode 100644 index 0000000000000000000000000000000000000000..bac5951952a380f8f18d446d7f73cb0a1188c031 --- /dev/null +++ b/checkpoints/metadata_000045842432.json @@ -0,0 +1 @@ +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.9261788687882957} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..6161f01d9c04f442d218b3b92a422300e0e400f1 --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8758232860877797} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json new file mode 100644 index 0000000000000000000000000000000000000000..944f36451900ae2f7fdad156f279748bdb79c8b7 --- /dev/null +++ b/checkpoints/metadata_000050397184.json @@ -0,0 +1 @@ +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8591311093268574} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json new file mode 100644 index 0000000000000000000000000000000000000000..701c0cbe74f925c4e6e032600425fba09f3c9519 --- /dev/null +++ b/checkpoints/metadata_000055443456.json @@ -0,0 +1 @@ +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7807525916975844} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json new file mode 100644 index 0000000000000000000000000000000000000000..bf4ae03cdfbadf8c74b17eee202489517b9e740e --- /dev/null +++ b/checkpoints/metadata_000061014016.json @@ -0,0 +1 @@ +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.698363237126387} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..dcbe5d5f051a58f4cc42a6195ae5da8e0546a8c7 --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6417533175405166} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3a6961f7ca1d8a0f445d73ca0ae4a276d4be9e --- /dev/null +++ b/checkpoints/metadata_000067108864.json @@ -0,0 +1 @@ +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.618408535883721} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json new file mode 100644 index 0000000000000000000000000000000000000000..bee16b11cce19febc6608f45e8c5c24e3a691263 --- /dev/null +++ b/checkpoints/metadata_000073826304.json @@ -0,0 +1 @@ +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5608141763861445} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json new file mode 100644 index 0000000000000000000000000000000000000000..253ccf1751ae520f11bc39a015140674e503d558 --- /dev/null +++ b/checkpoints/metadata_000081199104.json @@ -0,0 +1 @@ +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.508173367949673} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..5bbe451cb0cc1004eef014a11395d09ccc346090 --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.503352878625294} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json new file mode 100644 index 0000000000000000000000000000000000000000..50f42c54320de822cdfad3a389d06418357b92b1 --- /dev/null +++ b/checkpoints/metadata_000089325568.json @@ -0,0 +1 @@ +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4590670421463705} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json new file mode 100644 index 0000000000000000000000000000000000000000..d2fb710f19ecb77df80754238724258acf371353 --- /dev/null +++ b/checkpoints/metadata_000098271232.json @@ -0,0 +1 @@ +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4168162174185306} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..c632e4ed405d9c056f1eb4d7946cc7667f095446 --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.416848075080771} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json new file mode 100644 index 0000000000000000000000000000000000000000..8b708cd48d37ddd7b4a4ba020a184be6e04c908d --- /dev/null +++ b/checkpoints/metadata_000108068864.json @@ -0,0 +1 @@ +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3826930983194345} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..adba00e1aec2f3400d2ea9b84d3afe8a1a051054 --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3525031466174857} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json new file mode 100644 index 0000000000000000000000000000000000000000..19b5bad0e8e4a3fe709092dca449f9862481338a --- /dev/null +++ b/checkpoints/metadata_000118882304.json @@ -0,0 +1 @@ +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.340319485464509} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json new file mode 100644 index 0000000000000000000000000000000000000000..899c424f75d33f9e6f911cb8496999406c549a1a --- /dev/null +++ b/checkpoints/metadata_000130777088.json @@ -0,0 +1 @@ +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.305597207622769} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..e688a687c4c99ae320680d7d1ea7d4f83671a2ec --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.302212976728656} \ No newline at end of file diff --git a/checkpoints/metadata_000143834112.json b/checkpoints/metadata_000143834112.json new file mode 100644 index 0000000000000000000000000000000000000000..78e5f9c3fe207f08cbc13b1e4d8d8d4a5ede9886 --- /dev/null +++ b/checkpoints/metadata_000143834112.json @@ -0,0 +1 @@ +{"step": 4390, "tokens_seen": 143834112, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.2657009637928134} \ No newline at end of file diff --git a/checkpoints/metadata_000147438592.json b/checkpoints/metadata_000147438592.json new file mode 100644 index 0000000000000000000000000000000000000000..49bc93d85fdd7cf8d36e19ca0f6f556095528ccf --- /dev/null +++ b/checkpoints/metadata_000147438592.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147438592, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.226117604896941} \ No newline at end of file diff --git a/checkpoints/metadata_000158252032.json b/checkpoints/metadata_000158252032.json new file mode 100644 index 0000000000000000000000000000000000000000..c15e2706d050c838f172e346047fa925af92ac7a --- /dev/null +++ b/checkpoints/metadata_000158252032.json @@ -0,0 +1 @@ +{"step": 4830, "tokens_seen": 158252032, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.2149550469212635} \ No newline at end of file diff --git a/checkpoints/metadata_000163822592.json b/checkpoints/metadata_000163822592.json new file mode 100644 index 0000000000000000000000000000000000000000..abd56a5d28996d7432dc9a1ff10db74dbb643a73 --- /dev/null +++ b/checkpoints/metadata_000163822592.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163822592, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.209756738454079} \ No newline at end of file diff --git a/checkpoints/metadata_000174078976.json b/checkpoints/metadata_000174078976.json new file mode 100644 index 0000000000000000000000000000000000000000..a7055459502d430105f2b0cfe42968848bbba19b --- /dev/null +++ b/checkpoints/metadata_000174078976.json @@ -0,0 +1 @@ +{"step": 5313, "tokens_seen": 174078976, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1901269128420986} \ No newline at end of file diff --git a/checkpoints/metadata_000180206592.json b/checkpoints/metadata_000180206592.json new file mode 100644 index 0000000000000000000000000000000000000000..54d16ac80b70c4528fe300bb2cf22c6dadb4bdbd --- /dev/null +++ b/checkpoints/metadata_000180206592.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180206592, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1945651690385124} \ No newline at end of file diff --git a/checkpoints/metadata_000191478784.json b/checkpoints/metadata_000191478784.json new file mode 100644 index 0000000000000000000000000000000000000000..0540cad19328c1b1f41f960c51821e0de79327a1 --- /dev/null +++ b/checkpoints/metadata_000191478784.json @@ -0,0 +1 @@ +{"step": 5844, "tokens_seen": 191478784, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1820776388158563} \ No newline at end of file diff --git a/checkpoints/metadata_000196590592.json b/checkpoints/metadata_000196590592.json new file mode 100644 index 0000000000000000000000000000000000000000..9608125648cae8cf19d1036b9367326b6cf71075 --- /dev/null +++ b/checkpoints/metadata_000196590592.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196590592, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1693632259766034} \ No newline at end of file diff --git a/checkpoints/metadata_000196688896.json b/checkpoints/metadata_000196688896.json new file mode 100644 index 0000000000000000000000000000000000000000..a5381d1a701d7628cda782854bb30352a0ab57be --- /dev/null +++ b/checkpoints/metadata_000196688896.json @@ -0,0 +1 @@ +{"step": 6003, "tokens_seen": 196688896, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.168952258049862} \ No newline at end of file diff --git a/checkpoints/metadata_000197344256.json b/checkpoints/metadata_000197344256.json new file mode 100644 index 0000000000000000000000000000000000000000..dea715b3847a57d6705a31f30872846e52a43684 --- /dev/null +++ b/checkpoints/metadata_000197344256.json @@ -0,0 +1 @@ +{"step": 6023, "tokens_seen": 197344256, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.171798920569715} \ No newline at end of file diff --git a/checkpoints/metadata_000197999616.json b/checkpoints/metadata_000197999616.json new file mode 100644 index 0000000000000000000000000000000000000000..9f76e3755e977f70163d968e43a58309e79ada50 --- /dev/null +++ b/checkpoints/metadata_000197999616.json @@ -0,0 +1 @@ +{"step": 6043, "tokens_seen": 197999616, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1724705955477557} \ No newline at end of file diff --git a/checkpoints/metadata_000198654976.json b/checkpoints/metadata_000198654976.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ef60939ac226eefe30ece7581a7af4268d48d4 --- /dev/null +++ b/checkpoints/metadata_000198654976.json @@ -0,0 +1 @@ +{"step": 6063, "tokens_seen": 198654976, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.16965488110196} \ No newline at end of file diff --git a/checkpoints/metadata_000199310336.json b/checkpoints/metadata_000199310336.json new file mode 100644 index 0000000000000000000000000000000000000000..a8400ef1dd8f03d4c846ef4614bba062c495c9ea --- /dev/null +++ b/checkpoints/metadata_000199310336.json @@ -0,0 +1 @@ +{"step": 6083, "tokens_seen": 199310336, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.173379423586945} \ No newline at end of file diff --git a/checkpoints/metadata_000199932928.json b/checkpoints/metadata_000199932928.json new file mode 100644 index 0000000000000000000000000000000000000000..b528453dd1f04d2fcb97266a922e54d655100505 --- /dev/null +++ b/checkpoints/metadata_000199932928.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199932928, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1746377496608504} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..b078b2917bda543b2dcc8cbf674e7870d98b0346 --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db649a536cb0a57b27c8d60c5bb440a77954c7b96613bd240a49eda4fca2fb84 +size 158534613 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..633eecd0ff370a80b586996fc3a445c1cee0e672 --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93705569fc1e4ebc703fafa5fca35896d7db01cb668575f779674f4b69cd6ee4 +size 158534613 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..41572c32483acc212bad15980e25fc777adf2b4b --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e1978f27dad69bda25b213601a8878c0a67b20624b38bff35e565925ade323 +size 158534613 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee6bacb05d533e867086f1e7f1b804946a6b3c19 --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2679541973a5efe6807b9e9540b8da36bab972a46c0d60c7731bee82bb426b7 +size 158534613 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..1779aacd8f56efb8994e0878f7a60664cc5867a7 --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e43cc8b07ed482950ebe3ac41c54b4d170cc848ade1ff0649f9e990f74a7d563 +size 158534613 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..c85ec245405884100dc89435579777b36133b07b --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5cd663cbb5b3e8a1293e26fd9e7bc70779316e6969e3a07503dd96508c4562 +size 158534613 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..3886509361ba1a1f560a273589070ce85aebc3b1 --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e31633549f3ac1711987ffd49be0764beeec64d235538e481c243739571c10 +size 158534613 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b651c572d8c747e64b12e65b152bcd7a60c3795 --- /dev/null +++ b/checkpoints/model_weights_000000622592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f24a36e9f8fa514d4b473f27b2803debbee316d86ecabed86af2f7ca6bd6014 +size 158534613 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..22d1dbc988dd1e7042f11fe93638820952a6cc0f --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a12bcbc4bfbba1f2addbc548a3a083a82abe7e03d608f69f6cda13d5ffb1b9c +size 158534613 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..e88c1fdb2bfcac445d064efd4349210c00b83929 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f0be30d47f9f463ee3b15d965e82c9496fb444316424867a11c6818002fa48 +size 158534613 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bdc601fc39a0401dcaafb8ffc462e653f9a9ae0 --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40cb89b1ccfed518f8012726ed8a707119a943452d1bba507fb6cdfdc37949ad +size 158534613 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt new file mode 100644 index 0000000000000000000000000000000000000000..75ad8474620210057031b603546a330c88a7d1b9 --- /dev/null +++ b/checkpoints/model_weights_000000917504.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a015a756bcca55c71d822a457e0be1ffaa2c9b1d2be22e330ef72c0f03d434 +size 158534613 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt new file mode 100644 index 0000000000000000000000000000000000000000..70d6aaff0864b665ae3b97e6b97fb03dc952ca85 --- /dev/null +++ b/checkpoints/model_weights_000000983040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bda5b0d8913da93dd318c6180bd8270becac8e132ed6c81b3a86b9c8d574501b +size 158534613 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..a82f682554e4a4a9e9ac7bb5b1941fb361064a27 --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa806953b63b369ee6b445763dadad58896010aca3d1ca3b5a7e40d4086a4b5c +size 158534613 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..91a678589eb386cd6dc4908db87319c2b31e020d --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62339fb2d4c664a1b3930ab94b7ae0b2d2064ebf311ef2767834bdebce570a93 +size 158534613 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt new file mode 100644 index 0000000000000000000000000000000000000000..25eb7830ec5a54d067ce3084d2b6e7e96b6e9fb9 --- /dev/null +++ b/checkpoints/model_weights_000001343488.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5652faa8d489228e8a17dec005b8fbc6af273a39818fa1d276c4f23970e9860f +size 158534613 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d9f92a3c1a328a66c6835a1afabe315c6248da --- /dev/null +++ b/checkpoints/model_weights_000001474560.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d372dc78e407aa7e3768e4ab84adce7e5246d86affce94f597027b1a230418 +size 158534613 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b562ef8bb9dbc034cdcdc696a89f2505ece2c6a --- /dev/null +++ b/checkpoints/model_weights_000001605632.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7eeab8cb101508e8eaa425b918e5086a0d01b3f92fb0c4aa62596a7e430bafd +size 158534613 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..2128bb29b61b0e48937be28db6ab5f0f29bdd306 --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40710b574cac4324a71293c8ab6427bdde80304c712b5384d4c86e3c85d4d18 +size 158534613 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b084db28242ddf2e8da463d947c49ecef7944b6 --- /dev/null +++ b/checkpoints/model_weights_000001966080.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c35254dce8c3515a1380798310556182e614b42b87e2e28f2ba41eeaab73d644 +size 158534613 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt new file mode 100644 index 0000000000000000000000000000000000000000..60ecba87d0fd4658642bc10995edd17044fcc093 --- /dev/null +++ b/checkpoints/model_weights_000002162688.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dda2eea34d0fa1863767874f3317d631a6329b3dee56f326701dc2ad5484722 +size 158534613 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt new file mode 100644 index 0000000000000000000000000000000000000000..49f641986760dcbaf0fafb7969c3b41b7d8914e7 --- /dev/null +++ b/checkpoints/model_weights_000002359296.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c929a5c5dc6e25ba9d5d650346c8be78a6521d2c5ffc56e30f9255876d00b6a7 +size 158534613 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..f207eb84792088fdea39f63cede9ac0c637fa3f9 --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:498a44285e7de52370cb6dd842e5700104f56eb1ed71af6219c8a2007bf7d5f5 +size 158534613 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4e9feca0e7317705d9584317181380b078b1d1a --- /dev/null +++ b/checkpoints/model_weights_000002883584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2da76eae520bce7e293876e4f77fef2eb48b6b9e4ac123158c495321aa216ee8 +size 158534613 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5878afea092facbdf0efea3716cd76468c228cd --- /dev/null +++ b/checkpoints/model_weights_000003178496.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b9fbbdb9567c1aa42f3dd649876d062559f5aded46971c5ae6f4ac9d588450a +size 158534613 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt new file mode 100644 index 0000000000000000000000000000000000000000..07f03c2025f55c1a0b9feea44f016fc8232958f6 --- /dev/null +++ b/checkpoints/model_weights_000003473408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af18f45ea8e5ef0ad8f3b251e4ef978a5fdf8ea5beee49de0a70a387223dc85 +size 158534613 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt new file mode 100644 index 0000000000000000000000000000000000000000..da1a6bba3a71ed6898d1790db8f5a3c5662be90a --- /dev/null +++ b/checkpoints/model_weights_000003833856.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d333480c9cd89e0427400ad2def62127001e4e6aa7f9868ab370510581befe1 +size 158534613 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt new file mode 100644 index 0000000000000000000000000000000000000000..e83ce9edffe21377da295d83f5fb0c48c704f010 --- /dev/null +++ b/checkpoints/model_weights_000004227072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:553163b300c69ec98048ee615c68f2be113ea6a4899dbc4a1f2766934b6563a5 +size 158534613 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt new file mode 100644 index 0000000000000000000000000000000000000000..cae22ab00991c460aec54a7a101030b58b442929 --- /dev/null +++ b/checkpoints/model_weights_000004653056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b29b081ad0c308ca36cb225850ac1aae13bc5db0d339694469db0a795bdcf40 +size 158534613 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2b89d6b19680ff1e3c846babd308879b517fa01 --- /dev/null +++ b/checkpoints/model_weights_000005111808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5365da2705286687b25c3437dcf3d3712dc2d3d1745692b111819c97475ea286 +size 158534613 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb16dcab539041b9f1020f0fa8e59eaabb193310 --- /dev/null +++ b/checkpoints/model_weights_000005603328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c7b8125d80ea4543a66121d71c7e456326eb624f5dc0f8d4be27e88a51aa9c +size 158534613 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt new file mode 100644 index 0000000000000000000000000000000000000000..d95375fbc1fdb8c886f0467447b3247bacf96f0e --- /dev/null +++ b/checkpoints/model_weights_000006193152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e8966982830b98ddf77aab082bdb02f2971ddb163e5aa89c1646edeb6c28fb +size 158534613 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt new file mode 100644 index 0000000000000000000000000000000000000000..4afae64c26179b0d18853d33b417c00c9fd2a4c3 --- /dev/null +++ b/checkpoints/model_weights_000006782976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc5f92a1d24a5040eda06f9c697e5430cd3c5f0e966ed1e407e99bca951effc +size 158534613 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt new file mode 100644 index 0000000000000000000000000000000000000000..7672f6b625324e5715cc0ebe63e8c7d6ee5a875d --- /dev/null +++ b/checkpoints/model_weights_000007471104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7587ac8b166908e57c2b6d90f5be493d5e62a9fbc0c630989dc800d6ef65fc +size 158534613 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt new file mode 100644 index 0000000000000000000000000000000000000000..283be9d6a513986825cdb2601afe2f1adf19edbf --- /dev/null +++ b/checkpoints/model_weights_000008224768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7161ad423c516924f27783c61bf7d8528de2dbbfffe0aee5a1e3635c473ef655 +size 158534613 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbf54d86ac72c87852323949c3618333edb41033 --- /dev/null +++ b/checkpoints/model_weights_000009043968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3ed26e9bb5299ec4982dac7862604eea1c769c44ab68c06c34f7c479741dd5 +size 158534613 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt new file mode 100644 index 0000000000000000000000000000000000000000..5412f7038c75ce4096aa98beedc00540fff641c0 --- /dev/null +++ b/checkpoints/model_weights_000009961472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:631d7c01e14c8ba59c966ac5f01c818310289b3134727b6eec72a3cc7e5e7f53 +size 158534613 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt new file mode 100644 index 0000000000000000000000000000000000000000..6353cc842e4fe737e9be026f4e718032ab891414 --- /dev/null +++ b/checkpoints/model_weights_000010944512.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc7b76a0196481e6871856585482a5bb8b53062455602941fd218a499347699 +size 158534613 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5de0a5ace20837fb69c7bc8f43c89d514d981d6 --- /dev/null +++ b/checkpoints/model_weights_000012058624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6fb22ca10376719fe4448e46bf591993831d043c5fc0c807568e2a2740a0bbe +size 158534613 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt new file mode 100644 index 0000000000000000000000000000000000000000..f88d7b7b4168dad5d716e396e8a9f7ec9b4218c1 --- /dev/null +++ b/checkpoints/model_weights_000013271040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a97513076280e6ecbc81402e04ff0341a952f9c873bb5866fff26d575999f44 +size 158534613 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ece15e443cf9decf9f6f1ad396561551af82628 --- /dev/null +++ b/checkpoints/model_weights_000014581760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e5e8277338770f79844df7d380bed36727d8597daa7ec0cff4be957cc155e9 +size 158534613 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f8f29f5441580655d7cc89bf3061264ca916f59 --- /dev/null +++ b/checkpoints/model_weights_000016056320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e890cf9b4aae1390dcb23bb305a42e53821dbc3f7cf611b00f76697caa7b38 +size 158534613 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c11a4d0c0903a6d37bbf093574ed2d562500f6d --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc0b771f38cf791c278ae0f8bec164dbbab3be312bcf32ebcff23f9e4bedd9d +size 158534613 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ce038d98cc3d5c7cca6591909608e73df295e18 --- /dev/null +++ b/checkpoints/model_weights_000017661952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24b3007a5cc8234bc24797e3d00cd921057b7f464106b3ad6f438cccf326947a +size 158534613 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt new file mode 100644 index 0000000000000000000000000000000000000000..38b87dbc77dbb861340bfe5c9188ea6e60a87895 --- /dev/null +++ b/checkpoints/model_weights_000019431424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403185f38281ee013e7265938b6f493d8d0931e179b0d04ea6e081e7ee22661e +size 158534613 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1ea9661fdcbeedde60227f6f4939ff37f29572c --- /dev/null +++ b/checkpoints/model_weights_000021364736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7836f8f99907e0dadc124eae4f54ba1306c2f006f6887c8c32ba4b7e32602cc2 +size 158534613 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e0e2520b46f5cac602c38e27826b994da3dbdfa --- /dev/null +++ b/checkpoints/model_weights_000023494656.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8483f205f96a8b4ef297ce19c16c06145a11faacf71d586529a1ee14cdcec12 +size 158534613 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt new file mode 100644 index 0000000000000000000000000000000000000000..8617fe3590fa2ec785f98899cdb6d65bcfe8fe73 --- /dev/null +++ b/checkpoints/model_weights_000025853952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8fe840d6d2b26436d1f199ffbcc2781b6bc864049a3c31eaa3ea778a581ff7 +size 158534613 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt new file mode 100644 index 0000000000000000000000000000000000000000..348a6eb9df1fc648fc6dee38973c42508d65ee64 --- /dev/null +++ b/checkpoints/model_weights_000028442624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90dbe7e27cf79668d59d8983a8447a5f6ad87ba95895b01aaabc4a5509dfc9c1 +size 158534613 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3eedad229c55f0b74d29e126e82a1645d7f5f50 --- /dev/null +++ b/checkpoints/model_weights_000031293440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1247d9acf6c96eb6cc1f59b7330e274553ec6454dea7e9f4c3b36d3884f37f7c +size 158534613 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd58d87d6ee0b248cccf3d6b463d206f16e77e51 --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd98eb7a29f1e179ee7556708e3d231515478918cbb61b9fe7af8b1380612a8d +size 158534613 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b95ffa131ad9ce8d7c6387cb7420fd4a87cfb52 --- /dev/null +++ b/checkpoints/model_weights_000034439168.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed2b4c53be6dbcb787eb4a896b10170b263f27271ec1f97d8a59577c89c9e960 +size 158534613 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt new file mode 100644 index 0000000000000000000000000000000000000000..3054982d53b1e271a48980b625dd8471a260c6b9 --- /dev/null +++ b/checkpoints/model_weights_000037879808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31d01a6334c4c6ae7de5251266437cdc0986ef2d6c365388951edb730dab6aa +size 158534613 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt new file mode 100644 index 0000000000000000000000000000000000000000..759e6edfaf74cf61a3c0893d528dcd24b4339888 --- /dev/null +++ b/checkpoints/model_weights_000041648128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:520738c6b0e3730d5898fa39c5688783ca88eb5229ca9275efe3e647351368c8 +size 158534613 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ebc531fb28eb459b879a7ec4f97c778df287218 --- /dev/null +++ b/checkpoints/model_weights_000045842432.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:260e62dcff3196960a144ab718d80cda02bb9ad6a40657b9997cd9ea7b841934 +size 158534613 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..79056190beb46f3398a309ba05f76688e309649b --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43dcd2a2ad8eb5cdc46915e9e06b39e2f693a26017707ef56a2202af7c25863 +size 158534613 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5f793f3b29a0f3ee5eeb4fc066c2a5b317fb8c5 --- /dev/null +++ b/checkpoints/model_weights_000050397184.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f524978957f245c8d9fd33ee435b919b5b36c0b7b31afc638da60ae4a2616d04 +size 158534613 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt new file mode 100644 index 0000000000000000000000000000000000000000..08d71006d3097d18465d13999e86f94322e3150f --- /dev/null +++ b/checkpoints/model_weights_000055443456.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f20467a98b18ccafa84238e6200d85fc2c5ad811d970052819cafc0c0fcb89 +size 158534613 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bd833ccd4ecae05d5359e2855341ac6cea88448 --- /dev/null +++ b/checkpoints/model_weights_000061014016.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a43884666c1eb4f86908711e789d6bc18e340ddbd9d51c5134e70faa40f5ded +size 158534613 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebf89edd6d251ae66e180c9494a034bc3c4e3b69 --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025ee21bddb9b22ff71a8976e6bd3f9b0a704fc7d87b9e1eb6565fa6a58cd1e4 +size 158534613 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt new file mode 100644 index 0000000000000000000000000000000000000000..872469533de9b4a31598032f8e721ca887453d0a --- /dev/null +++ b/checkpoints/model_weights_000067108864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d386d4ab563511cfd4e725033d9b975c7cf810c58affbadb60d4a906ab93ff4 +size 158534613 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt new file mode 100644 index 0000000000000000000000000000000000000000..03d695d18d8ad04ebb7b5ad18a4a3e0dad436e4b --- /dev/null +++ b/checkpoints/model_weights_000073826304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d92f5b587d1e212a674b6ad73e516308a37b3babcfc6a27269da5718a47ffa3a +size 158534613 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf4d3cab72a937153f10c13efb51a702be780a3a --- /dev/null +++ b/checkpoints/model_weights_000081199104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32b17fe7b47f1e00a31add4e423f891ff5ccb44299af1603739a4709cff7f75 +size 158534613 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..051b9f1fe9a84f6f5044ffe1fdea2586c94f1b32 --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dfc0368d595da6e55217183464955db7738724cd6bfb7b434128ecf15d3a34c +size 158534613 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt new file mode 100644 index 0000000000000000000000000000000000000000..22cb0ab85bc1bd960fed83112ec023e198b7a3a5 --- /dev/null +++ b/checkpoints/model_weights_000089325568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71752a82d85777f897508be1df7536b18719fb065a385e1c0cd62735e2761d01 +size 158534613 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdfb8860f7add4e36bd4da96ee7815700dfb4e02 --- /dev/null +++ b/checkpoints/model_weights_000098271232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9dcd8e48233cb5fe2ccab0d4ee258928ab5209213294f3b07e9bd631ad9281 +size 158534613 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fa33b5779311e95520bb4e0cb44b502b56aca61 --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5380764e94e15a24a8d84ae4cf8500e821e57b97e1ddd1f02de4ad0942ff4d1 +size 158534613 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt new file mode 100644 index 0000000000000000000000000000000000000000..997ce28b84ce276daca472736d4faed329c7d3b9 --- /dev/null +++ b/checkpoints/model_weights_000108068864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13244ccf6fc3e2ed2389c28eff477d270dc5f7a4816a9b6fd50088134600b9cb +size 158534613 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..31a57d7723d77323b0063ffea677a36fbec8617f --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d71ba981c123086931f752623733bd823e05049e2d6b92ad9265976d7782369 +size 158534613 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt new file mode 100644 index 0000000000000000000000000000000000000000..e80d90755c086ddd9b2c1f2c61ede256d5e67c11 --- /dev/null +++ b/checkpoints/model_weights_000118882304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d41d8a7e0e6947589693b9b797f3b222e1f5cfded582e83788aadb3bd4e637 +size 158534613 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b374edc6ab37f58454c7ae47e9491e8f068ad6d --- /dev/null +++ b/checkpoints/model_weights_000130777088.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2bb197cda0bd8c174856ff7e032117def94fbf3bc47c3c884a222f6a93fd7b +size 158534613 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a84eda28ee29dbb34a4f0fa0561e92fac8fbea0 --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d49e919ba740eaec7d2fbcd57da92613c15d04408b654abc811aa3c54f34992 +size 158534613 diff --git a/checkpoints/model_weights_000143834112.pt b/checkpoints/model_weights_000143834112.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f566000bf8b1a141b73afce42856611c5585761 --- /dev/null +++ b/checkpoints/model_weights_000143834112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827a461ee4cfddb85265552f6f7991be86d1e85fe40e3f7b3b12d4d74891a3fd +size 158534613 diff --git a/checkpoints/model_weights_000147438592.pt b/checkpoints/model_weights_000147438592.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1456f5f24b3315aa8a7f6235b3ebd67db42e0ef --- /dev/null +++ b/checkpoints/model_weights_000147438592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:781469d4f1b870a8506c816d89600ed09b3ae40d029fb4c82b7670f3ac25bc9b +size 158534613 diff --git a/checkpoints/model_weights_000158252032.pt b/checkpoints/model_weights_000158252032.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ff009c6ef9ce04218277817f11e62a3c10274e1 --- /dev/null +++ b/checkpoints/model_weights_000158252032.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4d2c4a5c2b9509f2566702c4982a7b112f93d2f9e0969c674d7daeb3e2a3a1b +size 158534613 diff --git a/checkpoints/model_weights_000163822592.pt b/checkpoints/model_weights_000163822592.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec9a7b9f786ce6ca018801696f2ae3edfcae3d2b --- /dev/null +++ b/checkpoints/model_weights_000163822592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7095420dfbd42401f208c074555cf1cad948ff64131097bf1cd5e9b28f243cd +size 158534613 diff --git a/checkpoints/model_weights_000174078976.pt b/checkpoints/model_weights_000174078976.pt new file mode 100644 index 0000000000000000000000000000000000000000..f064ae71ed3ff91c2cd568bb6de874f8248084f3 --- /dev/null +++ b/checkpoints/model_weights_000174078976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbc0f39c5cf5e49c8b6c87dca54cc7324eb8e0e45bf9c74c0616978db1c7c02 +size 158534613 diff --git a/checkpoints/model_weights_000180206592.pt b/checkpoints/model_weights_000180206592.pt new file mode 100644 index 0000000000000000000000000000000000000000..0644f636082bd8760c42cab4d1b4acda4c400b33 --- /dev/null +++ b/checkpoints/model_weights_000180206592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa7a2a7d07c34c9580b654d5d64755e0a67cc5ffbb09637967258e1c6c2b600e +size 158534613 diff --git a/checkpoints/model_weights_000191478784.pt b/checkpoints/model_weights_000191478784.pt new file mode 100644 index 0000000000000000000000000000000000000000..2127cc263a12e40c3d828ecde9d03c5953ead611 --- /dev/null +++ b/checkpoints/model_weights_000191478784.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47fb5ec480684e762c719f1c39efae1e3b3c4819a760ffe56fa2d9af01e3f757 +size 158534613 diff --git a/checkpoints/model_weights_000196590592.pt b/checkpoints/model_weights_000196590592.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fdb3e61c4024862f111a5f291cbaa94a13a7952 --- /dev/null +++ b/checkpoints/model_weights_000196590592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e09dc92e5b2dec7dbf128b66dd5ddd21ce2827686d7cb06d112d3995a216b5d +size 158534613 diff --git a/checkpoints/model_weights_000196688896.pt b/checkpoints/model_weights_000196688896.pt new file mode 100644 index 0000000000000000000000000000000000000000..0da491f7645c5c8375e92acd65fe44992dd3e95d --- /dev/null +++ b/checkpoints/model_weights_000196688896.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d5a5f29f14d746675adff9ee23f3b45302ec592d82c78c9ee856ad494411036 +size 158534613 diff --git a/checkpoints/model_weights_000197344256.pt b/checkpoints/model_weights_000197344256.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ccae960ff192a3a11882c6c899640d31324b55d --- /dev/null +++ b/checkpoints/model_weights_000197344256.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2870f1418b771970706f659d6ec7b79ce5a8aacae1c1b30da4244643951bf984 +size 158534613 diff --git a/checkpoints/model_weights_000197999616.pt b/checkpoints/model_weights_000197999616.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7fff68dbb06b1df75bd417832d4b7d96c55692c --- /dev/null +++ b/checkpoints/model_weights_000197999616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09ae981741a654ab491559fe060db1dfb0c589d4c64daa42393bfced8602baba +size 158534613 diff --git a/checkpoints/model_weights_000198654976.pt b/checkpoints/model_weights_000198654976.pt new file mode 100644 index 0000000000000000000000000000000000000000..19b4575c97a74eec8d44c49cbc90a9bd25fae754 --- /dev/null +++ b/checkpoints/model_weights_000198654976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dff0129fe878e78331eb5e578b5559d63b962a40115f0a47448d0e4f969f5d4 +size 158534613 diff --git a/checkpoints/model_weights_000199310336.pt b/checkpoints/model_weights_000199310336.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc9438eb317d3edc5253b85b31b7c2be0528d64d --- /dev/null +++ b/checkpoints/model_weights_000199310336.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8c07b43278b076551e00ab40c37b36eb2aad700fdabcd42e0aa7f4d7c6e6c5e +size 158534613 diff --git a/checkpoints/model_weights_000199932928.pt b/checkpoints/model_weights_000199932928.pt new file mode 100644 index 0000000000000000000000000000000000000000..389b8b41f05dbbe3a550f7468a8fa46e720c91ae --- /dev/null +++ b/checkpoints/model_weights_000199932928.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9708ef459da72b9d039cef8348006b4b147a8303021c37360e91c2089e33b3dc +size 158534613 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..a70f27ac4c8c12863b1e4f354b9a32168aa42136 --- /dev/null +++ b/config.toml @@ -0,0 +1,31 @@ +model_name = "baseline" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 32000 +dataset_name = "eoinf/wikitext_llama" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.1 +save_log_checkpoints = true \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..89e69cd2f65557a438e82cbbbb314256d452bb19 --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91bc6ef3f0c6e23da7c79d0ec57669d8eeffef70735a2ba0a79921357111bfc4 +size 158534135 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b528453dd1f04d2fcb97266a922e54d655100505 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199932928, "config": {"model_name": "baseline", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/wikitext_llama", "tokenizer_name": "", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1746377496608504} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..37635ebb216f3c8a76760957d0b41df06dc9ab8f --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2199eb89fc69424d828e7b51bc020bb3cdcbacd9750b9408a1eab9051ba8c0d8 +size 317074195 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f267f430e5b2294c4e308dc61c1c813beb7e718 --- /dev/null +++ b/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.models.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..190c8d92cb74318e25bb1a6d2e9afb9ba322984e --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-11-01T05:07:19.401880129Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-11-01T05:07:19.623553191Z","level":"INFO","msg":"stream: created new stream","id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623592042Z","level":"INFO","msg":"stream: started","id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623629201Z","level":"INFO","msg":"writer: started","stream_id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623681532Z","level":"INFO","msg":"handler: started","stream_id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623637897Z","level":"INFO","msg":"sender: started","stream_id":"x0he2mby"} +{"time":"2025-11-01T06:00:07.926698703Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-11-01T06:00:08.601238042Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading history steps 242-243, summary, console lines 251-251","runtime_seconds":0.672901309}],"total_operations":1}} +{"time":"2025-11-01T06:00:13.068341746Z","level":"INFO","msg":"stream: closing","id":"x0he2mby"} +{"time":"2025-11-01T06:00:13.068386588Z","level":"INFO","msg":"handler: closed","stream_id":"x0he2mby"} +{"time":"2025-11-01T06:00:13.068443279Z","level":"INFO","msg":"sender: closed","stream_id":"x0he2mby"} +{"time":"2025-11-01T06:00:13.068447047Z","level":"INFO","msg":"stream: closed","id":"x0he2mby"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..639aa195000734693fe22ae117b938a73e0c1fed --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,28 @@ +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Configure stats pid to 290 +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama/wandb/settings +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama/wandb/run-20251101_050718-x0he2mby/logs/debug.log +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama/wandb/run-20251101_050718-x0he2mby/logs/debug-internal.log +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:init():830] calling init triggers +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'baseline', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/wikitext_llama', 'tokenizer_name': '', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:init():871] starting backend +2025-11-01 05:07:19,390 INFO MainThread:290 [wandb_init.py:init():874] sending inform_init request +2025-11-01 05:07:19,399 INFO MainThread:290 [wandb_init.py:init():882] backend started and connected +2025-11-01 05:07:19,401 INFO MainThread:290 [wandb_init.py:init():953] updated telemetry +2025-11-01 05:07:19,405 INFO MainThread:290 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-11-01 05:07:19,840 INFO MainThread:290 [wandb_init.py:init():1029] starting run threads in backend +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_console_start():2494] atexit reg +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_redirect():2434] Redirects installed. +2025-11-01 05:07:19,964 INFO MainThread:290 [wandb_init.py:init():1075] run started, returning control to user process +2025-11-01 06:00:07,597 INFO MainThread:290 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/x0he2mby +2025-11-01 06:00:07,598 INFO MainThread:290 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-11-01 06:00:07,599 INFO MainThread:290 [wandb_run.py:_restore():2441] restore +2025-11-01 06:00:07,599 INFO MainThread:290 [wandb_run.py:_restore():2447] restore done +2025-11-01 06:00:13,066 INFO MainThread:290 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-11-01 06:00:13,067 INFO MainThread:290 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-11-01 06:00:13,067 INFO MainThread:290 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20251101_050718-x0he2mby/files/config.yaml b/wandb/run-20251101_050718-x0he2mby/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2659f178849e670025c9702dbff11e1395e657f3 --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/files/config.yaml @@ -0,0 +1,134 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + b2kz8h9szdlb2cddiita5qr6fnj2cdux: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "123042988032" + email: efarrel4@tcd.ie + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: f1ab6930b532ac49a882df0914c8b0fd16fa74f5 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-785ade46-5d0b-7a95-8bdc-e144a0a8994c + host: n4q19ari04 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama + startedAt: "2025-11-01T05:07:18.924866Z" + writerId: b2kz8h9szdlb2cddiita5qr6fnj2cdux + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 11 + - 49 + - 51 + - 71 + "2": + - 1 + - 11 + - 49 + - 51 + - 71 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.1 + "6": 4.55.4 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.1 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 32000 +dataset_name: + value: eoinf/wikitext_llama +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: baseline +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: "" +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20251101_050718-x0he2mby/files/output.log b/wandb/run-20251101_050718-x0he2mby/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..de701ce6b466a0273e26a879e17b89281dcef031 --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.3958 | Learning Rate: 0.000055 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.0921 | Learning Rate: 0.000109 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 9.6090 | Learning Rate: 0.000164 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.0145 | Learning Rate: 0.000219 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 8.4374 | Learning Rate: 0.000273 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 7.9337 | Learning Rate: 0.000328 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 7.4945 | Learning Rate: 0.000383 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.1073 | Learning Rate: 0.000437 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 6.7820 | Learning Rate: 0.000492 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 6.4997 | Learning Rate: 0.000546 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 6.2541 | Learning Rate: 0.000601 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.0444 | Learning Rate: 0.000656 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 5.8602 | Learning Rate: 0.000710 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 5.7047 | Learning Rate: 0.000765 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 5.5661 | Learning Rate: 0.000820 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 5.4458 | Learning Rate: 0.000874 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 5.3391 | Learning Rate: 0.000929 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 5.2505 | Learning Rate: 0.000984 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 5.1689 | Learning Rate: 0.001038 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 5.0922 | Learning Rate: 0.001093 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 5.0270 | Learning Rate: 0.001148 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 4.9672 | Learning Rate: 0.001202 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 4.9110 | Learning Rate: 0.001257 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 4.8633 | Learning Rate: 0.001311 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 4.8081 | Learning Rate: 0.001366 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 4.7518 | Learning Rate: 0.001421 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 4.7071 | Learning Rate: 0.001475 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 4.6604 | Learning Rate: 0.001530 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 4.6143 | Learning Rate: 0.001585 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 4.5725 | Learning Rate: 0.001639 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 4.5288 | Learning Rate: 0.001694 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 4.4844 | Learning Rate: 0.001749 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 4.4484 | Learning Rate: 0.001803 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 4.4112 | Learning Rate: 0.001858 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 4.3800 | Learning Rate: 0.001913 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 4.3459 | Learning Rate: 0.001967 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 4.3143 | Learning Rate: 0.002000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 4.2875 | Learning Rate: 0.002000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 4.2620 | Learning Rate: 0.002000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 4.2379 | Learning Rate: 0.002000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 4.2127 | Learning Rate: 0.002000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 4.1897 | Learning Rate: 0.002000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 4.1637 | Learning Rate: 0.002000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 4.1452 | Learning Rate: 0.002000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 4.1252 | Learning Rate: 0.002000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 4.1027 | Learning Rate: 0.002000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 4.0814 | Learning Rate: 0.002000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 4.0591 | Learning Rate: 0.002000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 4.0396 | Learning Rate: 0.002000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 4.0287 | Learning Rate: 0.002000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 4.0064 | Learning Rate: 0.002000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 3.9882 | Learning Rate: 0.002000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 3.9680 | Learning Rate: 0.002000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 3.9531 | Learning Rate: 0.002000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 3.9353 | Learning Rate: 0.002000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 3.9256 | Learning Rate: 0.002000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 3.9198 | Learning Rate: 0.002000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 3.9070 | Learning Rate: 0.002000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 3.8861 | Learning Rate: 0.002000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 3.8758 | Learning Rate: 0.002000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 3.8661 | Learning Rate: 0.002000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 3.8557 | Learning Rate: 0.002000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 3.8409 | Learning Rate: 0.002000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 3.8288 | Learning Rate: 0.002000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 3.8140 | Learning Rate: 0.002000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 3.7998 | Learning Rate: 0.002000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 3.7918 | Learning Rate: 0.002000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 3.7755 | Learning Rate: 0.002000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 3.7622 | Learning Rate: 0.002000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 3.7459 | Learning Rate: 0.002000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 3.7413 | Learning Rate: 0.002000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 3.7282 | Learning Rate: 0.002000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 3.7128 | Learning Rate: 0.002000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 3.7013 | Learning Rate: 0.002000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 3.6976 | Learning Rate: 0.002000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 3.6865 | Learning Rate: 0.002000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 3.6765 | Learning Rate: 0.002000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 3.6649 | Learning Rate: 0.002000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 3.6537 | Learning Rate: 0.002000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 3.6418 | Learning Rate: 0.002000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 3.6311 | Learning Rate: 0.002000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 3.6170 | Learning Rate: 0.002000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 3.6079 | Learning Rate: 0.002000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 3.6044 | Learning Rate: 0.002000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 3.5947 | Learning Rate: 0.002000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 3.5891 | Learning Rate: 0.002000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 3.5906 | Learning Rate: 0.002000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 3.5815 | Learning Rate: 0.002000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 3.5679 | Learning Rate: 0.002000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 3.5613 | Learning Rate: 0.002000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 3.5474 | Learning Rate: 0.002000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 3.5441 | Learning Rate: 0.002000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 3.5364 | Learning Rate: 0.002000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 3.5299 | Learning Rate: 0.002000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 3.5280 | Learning Rate: 0.002000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 3.5255 | Learning Rate: 0.002000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 3.5172 | Learning Rate: 0.002000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 3.5056 | Learning Rate: 0.002000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 3.5076 | Learning Rate: 0.002000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 3.5034 | Learning Rate: 0.002000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 3.4938 | Learning Rate: 0.002000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 3.4903 | Learning Rate: 0.002000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 3.4839 | Learning Rate: 0.002000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 3.4761 | Learning Rate: 0.002000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 3.4799 | Learning Rate: 0.002000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 3.4801 | Learning Rate: 0.002000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 3.4688 | Learning Rate: 0.002000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 3.4628 | Learning Rate: 0.002000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 3.4594 | Learning Rate: 0.002000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 3.4601 | Learning Rate: 0.002000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 3.4629 | Learning Rate: 0.002000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 3.4556 | Learning Rate: 0.002000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 3.4401 | Learning Rate: 0.002000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 3.4317 | Learning Rate: 0.002000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 3.4274 | Learning Rate: 0.002000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 3.4229 | Learning Rate: 0.002000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 3.4160 | Learning Rate: 0.002000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 3.4133 | Learning Rate: 0.002000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 3.4140 | Learning Rate: 0.002000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 3.4168 | Learning Rate: 0.002000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 3.4144 | Learning Rate: 0.002000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 3.4005 | Learning Rate: 0.002000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 3.4022 | Learning Rate: 0.002000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 3.3999 | Learning Rate: 0.002000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 3.3968 | Learning Rate: 0.002000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 3.3883 | Learning Rate: 0.002000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 3.3846 | Learning Rate: 0.002000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 3.3813 | Learning Rate: 0.002000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 3.3809 | Learning Rate: 0.002000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 3.3823 | Learning Rate: 0.002000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 3.3886 | Learning Rate: 0.002000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 3.3806 | Learning Rate: 0.002000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 3.3811 | Learning Rate: 0.002000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 3.3733 | Learning Rate: 0.002000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 3.3739 | Learning Rate: 0.002000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 3.3706 | Learning Rate: 0.002000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 3.3690 | Learning Rate: 0.002000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 3.3667 | Learning Rate: 0.002000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 3.3614 | Learning Rate: 0.002000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 3.3525 | Learning Rate: 0.002000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 3.3532 | Learning Rate: 0.002000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 3.3469 | Learning Rate: 0.002000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 3.3488 | Learning Rate: 0.002000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 3.3447 | Learning Rate: 0.002000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 3.3429 | Learning Rate: 0.002000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 3.3326 | Learning Rate: 0.002000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 3.3384 | Learning Rate: 0.002000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 3.3338 | Learning Rate: 0.002000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 3.3267 | Learning Rate: 0.002000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 3.3225 | Learning Rate: 0.002000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 3.3201 | Learning Rate: 0.002000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 3.3144 | Learning Rate: 0.002000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 3.3126 | Learning Rate: 0.002000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 3.3090 | Learning Rate: 0.002000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 3.3111 | Learning Rate: 0.002000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 3.3070 | Learning Rate: 0.002000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 3.3039 | Learning Rate: 0.002000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 3.3078 | Learning Rate: 0.002000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 3.3087 | Learning Rate: 0.002000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 3.3022 | Learning Rate: 0.002000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 3.3044 | Learning Rate: 0.002000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 3.2951 | Learning Rate: 0.002000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 3.2927 | Learning Rate: 0.002000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 3.2934 | Learning Rate: 0.002000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 3.2963 | Learning Rate: 0.002000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 3.2918 | Learning Rate: 0.002000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 3.2973 | Learning Rate: 0.002000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 3.2989 | Learning Rate: 0.002000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 3.2942 | Learning Rate: 0.002000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 3.2995 | Learning Rate: 0.002000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 3.2938 | Learning Rate: 0.002000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 3.2876 | Learning Rate: 0.002000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 3.2882 | Learning Rate: 0.002000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 3.2798 | Learning Rate: 0.002000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,342,592 | Train Loss EWMA: 3.2721 | Learning Rate: 0.002000 | Progress: 0.71671 +Step 4,400 | Tokens: 144,161,792 | Train Loss EWMA: 3.2638 | Learning Rate: 0.002000 | Progress: 0.72081 +Step 4,425 | Tokens: 144,980,992 | Train Loss EWMA: 3.2593 | Learning Rate: 0.002000 | Progress: 0.72490 +Step 4,450 | Tokens: 145,800,192 | Train Loss EWMA: 3.2444 | Learning Rate: 0.002000 | Progress: 0.72900 +Step 4,475 | Tokens: 146,619,392 | Train Loss EWMA: 3.2393 | Learning Rate: 0.002000 | Progress: 0.73310 +Step 4,500 | Tokens: 147,438,592 | Train Loss EWMA: 3.2261 | Learning Rate: 0.002000 | Progress: 0.73719 +Step 4,525 | Tokens: 148,257,792 | Train Loss EWMA: 3.2224 | Learning Rate: 0.002000 | Progress: 0.74129 +Step 4,550 | Tokens: 149,076,992 | Train Loss EWMA: 3.2185 | Learning Rate: 0.002000 | Progress: 0.74538 +Step 4,575 | Tokens: 149,896,192 | Train Loss EWMA: 3.2182 | Learning Rate: 0.002000 | Progress: 0.74948 +Step 4,600 | Tokens: 150,715,392 | Train Loss EWMA: 3.2215 | Learning Rate: 0.002000 | Progress: 0.75358 +Step 4,625 | Tokens: 151,534,592 | Train Loss EWMA: 3.2165 | Learning Rate: 0.002000 | Progress: 0.75767 +Step 4,650 | Tokens: 152,353,792 | Train Loss EWMA: 3.2140 | Learning Rate: 0.002000 | Progress: 0.76177 +Step 4,675 | Tokens: 153,172,992 | Train Loss EWMA: 3.2135 | Learning Rate: 0.002000 | Progress: 0.76586 +Step 4,700 | Tokens: 153,992,192 | Train Loss EWMA: 3.2162 | Learning Rate: 0.002000 | Progress: 0.76996 +Step 4,725 | Tokens: 154,811,392 | Train Loss EWMA: 3.2217 | Learning Rate: 0.002000 | Progress: 0.77406 +Step 4,750 | Tokens: 155,630,592 | Train Loss EWMA: 3.2209 | Learning Rate: 0.002000 | Progress: 0.77815 +Step 4,775 | Tokens: 156,449,792 | Train Loss EWMA: 3.2170 | Learning Rate: 0.002000 | Progress: 0.78225 +Step 4,800 | Tokens: 157,268,992 | Train Loss EWMA: 3.2196 | Learning Rate: 0.002000 | Progress: 0.78634 +Step 4,825 | Tokens: 158,088,192 | Train Loss EWMA: 3.2165 | Learning Rate: 0.002000 | Progress: 0.79044 +Step 4,850 | Tokens: 158,907,392 | Train Loss EWMA: 3.2126 | Learning Rate: 0.002000 | Progress: 0.79454 +Step 4,875 | Tokens: 159,726,592 | Train Loss EWMA: 3.2123 | Learning Rate: 0.002000 | Progress: 0.79863 +Step 4,900 | Tokens: 160,545,792 | Train Loss EWMA: 3.2094 | Learning Rate: 0.002000 | Progress: 0.80273 +Step 4,925 | Tokens: 161,364,992 | Train Loss EWMA: 3.2051 | Learning Rate: 0.002000 | Progress: 0.80682 +Step 4,950 | Tokens: 162,184,192 | Train Loss EWMA: 3.2039 | Learning Rate: 0.002000 | Progress: 0.81092 +Step 4,975 | Tokens: 163,003,392 | Train Loss EWMA: 3.2032 | Learning Rate: 0.002000 | Progress: 0.81502 +Step 5,000 | Tokens: 163,822,592 | Train Loss EWMA: 3.2098 | Learning Rate: 0.002000 | Progress: 0.81911 +Step 5,025 | Tokens: 164,641,792 | Train Loss EWMA: 3.2065 | Learning Rate: 0.002000 | Progress: 0.82321 +Step 5,050 | Tokens: 165,460,992 | Train Loss EWMA: 3.2021 | Learning Rate: 0.002000 | Progress: 0.82730 +Step 5,075 | Tokens: 166,280,192 | Train Loss EWMA: 3.1928 | Learning Rate: 0.002000 | Progress: 0.83140 +Step 5,100 | Tokens: 167,099,392 | Train Loss EWMA: 3.1951 | Learning Rate: 0.002000 | Progress: 0.83550 +Step 5,125 | Tokens: 167,918,592 | Train Loss EWMA: 3.1988 | Learning Rate: 0.002000 | Progress: 0.83959 +Step 5,150 | Tokens: 168,737,792 | Train Loss EWMA: 3.2011 | Learning Rate: 0.002000 | Progress: 0.84369 +Step 5,175 | Tokens: 169,556,992 | Train Loss EWMA: 3.2016 | Learning Rate: 0.002000 | Progress: 0.84778 +Step 5,200 | Tokens: 170,376,192 | Train Loss EWMA: 3.2027 | Learning Rate: 0.002000 | Progress: 0.85188 +Step 5,225 | Tokens: 171,195,392 | Train Loss EWMA: 3.1932 | Learning Rate: 0.002000 | Progress: 0.85598 +Step 5,250 | Tokens: 172,014,592 | Train Loss EWMA: 3.1943 | Learning Rate: 0.002000 | Progress: 0.86007 +Step 5,275 | Tokens: 172,833,792 | Train Loss EWMA: 3.1941 | Learning Rate: 0.002000 | Progress: 0.86417 +Step 5,300 | Tokens: 173,652,992 | Train Loss EWMA: 3.1889 | Learning Rate: 0.002000 | Progress: 0.86826 +Step 5,325 | Tokens: 174,472,192 | Train Loss EWMA: 3.1910 | Learning Rate: 0.002000 | Progress: 0.87236 +Step 5,350 | Tokens: 175,291,392 | Train Loss EWMA: 3.1920 | Learning Rate: 0.002000 | Progress: 0.87646 +Step 5,375 | Tokens: 176,110,592 | Train Loss EWMA: 3.1873 | Learning Rate: 0.002000 | Progress: 0.88055 +Step 5,400 | Tokens: 176,929,792 | Train Loss EWMA: 3.1876 | Learning Rate: 0.002000 | Progress: 0.88465 +Step 5,425 | Tokens: 177,748,992 | Train Loss EWMA: 3.1825 | Learning Rate: 0.002000 | Progress: 0.88874 +Step 5,450 | Tokens: 178,568,192 | Train Loss EWMA: 3.1891 | Learning Rate: 0.002000 | Progress: 0.89284 +Step 5,475 | Tokens: 179,387,392 | Train Loss EWMA: 3.1937 | Learning Rate: 0.002000 | Progress: 0.89694 +Step 5,500 | Tokens: 180,206,592 | Train Loss EWMA: 3.1946 | Learning Rate: 0.002000 | Progress: 0.90103 +Step 5,525 | Tokens: 181,025,792 | Train Loss EWMA: 3.1965 | Learning Rate: 0.002000 | Progress: 0.90513 +Step 5,550 | Tokens: 181,844,992 | Train Loss EWMA: 3.1995 | Learning Rate: 0.002000 | Progress: 0.90922 +Step 5,575 | Tokens: 182,664,192 | Train Loss EWMA: 3.2000 | Learning Rate: 0.002000 | Progress: 0.91332 +Step 5,600 | Tokens: 183,483,392 | Train Loss EWMA: 3.1984 | Learning Rate: 0.002000 | Progress: 0.91742 +Step 5,625 | Tokens: 184,302,592 | Train Loss EWMA: 3.1955 | Learning Rate: 0.002000 | Progress: 0.92151 +Step 5,650 | Tokens: 185,121,792 | Train Loss EWMA: 3.1921 | Learning Rate: 0.002000 | Progress: 0.92561 +Step 5,675 | Tokens: 185,940,992 | Train Loss EWMA: 3.1872 | Learning Rate: 0.002000 | Progress: 0.92970 +Step 5,700 | Tokens: 186,760,192 | Train Loss EWMA: 3.1847 | Learning Rate: 0.002000 | Progress: 0.93380 +Step 5,725 | Tokens: 187,579,392 | Train Loss EWMA: 3.1857 | Learning Rate: 0.002000 | Progress: 0.93790 +Step 5,750 | Tokens: 188,398,592 | Train Loss EWMA: 3.1903 | Learning Rate: 0.002000 | Progress: 0.94199 +Step 5,775 | Tokens: 189,217,792 | Train Loss EWMA: 3.1887 | Learning Rate: 0.002000 | Progress: 0.94609 +Step 5,800 | Tokens: 190,036,992 | Train Loss EWMA: 3.1849 | Learning Rate: 0.002000 | Progress: 0.95018 +Step 5,825 | Tokens: 190,856,192 | Train Loss EWMA: 3.1829 | Learning Rate: 0.002000 | Progress: 0.95428 +Step 5,850 | Tokens: 191,675,392 | Train Loss EWMA: 3.1812 | Learning Rate: 0.002000 | Progress: 0.95838 +Step 5,875 | Tokens: 192,494,592 | Train Loss EWMA: 3.1824 | Learning Rate: 0.002000 | Progress: 0.96247 +Step 5,900 | Tokens: 193,313,792 | Train Loss EWMA: 3.1795 | Learning Rate: 0.002000 | Progress: 0.96657 +Step 5,925 | Tokens: 194,132,992 | Train Loss EWMA: 3.1843 | Learning Rate: 0.002000 | Progress: 0.97066 +Step 5,950 | Tokens: 194,952,192 | Train Loss EWMA: 3.1816 | Learning Rate: 0.002000 | Progress: 0.97476 +Step 5,975 | Tokens: 195,771,392 | Train Loss EWMA: 3.1812 | Learning Rate: 0.002000 | Progress: 0.97886 +Step 6,000 | Tokens: 196,590,592 | Train Loss EWMA: 3.1694 | Learning Rate: 0.002000 | Progress: 0.98295 +Step 6,025 | Tokens: 197,409,792 | Train Loss EWMA: 3.1715 | Learning Rate: 0.002000 | Progress: 0.98705 +Step 6,050 | Tokens: 198,228,992 | Train Loss EWMA: 3.1714 | Learning Rate: 0.002000 | Progress: 0.99114 +Step 6,075 | Tokens: 199,048,192 | Train Loss EWMA: 3.1720 | Learning Rate: 0.002000 | Progress: 0.99524 +Step 6,100 | Tokens: 199,867,392 | Train Loss EWMA: 3.1732 | Learning Rate: 0.002000 | Progress: 0.99934 diff --git a/wandb/run-20251101_050718-x0he2mby/files/requirements.txt b/wandb/run-20251101_050718-x0he2mby/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2cbab9ffb6c27c4669296998dc5731d4f57bca4 --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/files/requirements.txt @@ -0,0 +1,217 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +hf-xet==1.1.8 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +circuitsvis==1.43.3 +param==2.2.1 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +pytest==8.4.1 +nvidia-cuda-nvrtc-cu12==12.8.93 +asttokens==3.0.0 +filelock==3.19.1 +jsonschema-specifications==2025.4.1 +types-python-dateutil==2.9.0.20250822 +cycler==0.12.1 +stack-data==0.6.3 +jupyter_server==2.17.0 +aiosignal==1.4.0 +xyzservices==2025.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +mpmath==1.3.0 +seaborn==0.13.2 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +jupyterlab==4.4.6 +plotly==6.3.0 +bokeh==3.7.3 +huggingface-hub==0.34.4 +sentencepiece==0.2.1 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +charset-normalizer==3.4.3 +jupyter-events==0.12.0 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +requests==2.32.5 +tinycss2==1.4.0 +ruff==0.12.10 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +identify==2.6.13 +jedi==0.19.2 +pathspec==0.12.1 +transformer-lens==2.16.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +cfgv==3.4.0 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +panel==1.7.5 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +babel==2.17.0 +transformers==4.55.4 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +zstandard==0.25.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +torchaudio==2.8.0 +pyzmq==27.0.2 +mypy_extensions==1.1.0 +prompt_toolkit==3.0.51 +pytest-cov==6.2.1 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +fonttools==4.59.1 +prometheus_client==0.22.1 +httpx==0.28.1 +setuptools==80.9.0 +argon2-cffi==25.1.0 +multidict==6.6.4 +pyviz_comms==3.0.6 +executing==2.2.0 +arrow==1.3.0 +sentry-sdk==2.35.0 +beartype==0.14.1 +coverage==7.10.4 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +markdown-it-py==4.0.0 +pandas==2.3.2 +virtualenv==20.34.0 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +nodeenv==1.9.1 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +mypy==1.17.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +Send2Trash==1.8.3 +protobuf==6.32.0 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +json5==0.12.1 +linkify-it-py==2.0.3 +importlib_metadata==8.7.0 +nvidia-cufft-cu12==11.3.3.83 +distlib==0.4.0 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +jsonschema==4.25.1 +python-json-logger==3.3.0 +ipympl==0.9.7 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +h5py==3.14.0 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pluggy==1.6.0 +pydantic==2.11.7 +ipython==9.4.0 +zipp==3.23.0 +fancy-einsum==0.0.3 +fastjsonschema==2.21.2 +datasets==4.0.0 +Markdown==3.8.2 +pillow==11.3.0 +uc-micro-py==1.0.3 +pre_commit==4.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +kiwisolver==1.4.9 +idna==3.10 +narwhals==2.1.2 +multiprocess==0.70.16 +dill==0.3.8 +mdit-py-plugins==0.5.0 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +holoviews==1.21.0 +colorcet==3.1.0 +uri-template==1.3.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +iniconfig==2.1.0 +traitlets==5.14.3 +safetensors==0.6.2 +frozenlist==1.7.0 +toy_models==0.1.0 diff --git a/wandb/run-20251101_050718-x0he2mby/files/wandb-metadata.json b/wandb/run-20251101_050718-x0he2mby/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d1397e9e5df330abdcdaa1e551bc2c91a8ba15c7 --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-11-01T05:07:18.924866Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "f1ab6930b532ac49a882df0914c8b0fd16fa74f5" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama", + "host": "n4q19ari04", + "executable": "/notebooks/toy_models/.toy_models_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "123042988032" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-785ade46-5d0b-7a95-8bdc-e144a0a8994c" + } + ], + "cudaVersion": "12.4", + "writerId": "b2kz8h9szdlb2cddiita5qr6fnj2cdux" +} \ No newline at end of file diff --git a/wandb/run-20251101_050718-x0he2mby/files/wandb-summary.json b/wandb/run-20251101_050718-x0he2mby/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..35edaf049ce2a9c5c97a69ced7a7ce9332f7ab2d --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":3167.757984591,"tokens_per_second":32765.1462295082,"_wandb":{"runtime":3167},"progress":0.99933696,"_step":6100,"train_loss":3.228931427001953,"_timestamp":1.7619768052479196e+09,"learning_rate":0.002,"tokens_seen":199867392,"step":6100,"train_loss_ewma":3.1731960439116436} \ No newline at end of file diff --git a/wandb/run-20251101_050718-x0he2mby/logs/debug-internal.log b/wandb/run-20251101_050718-x0he2mby/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..190c8d92cb74318e25bb1a6d2e9afb9ba322984e --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-11-01T05:07:19.401880129Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-11-01T05:07:19.623553191Z","level":"INFO","msg":"stream: created new stream","id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623592042Z","level":"INFO","msg":"stream: started","id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623629201Z","level":"INFO","msg":"writer: started","stream_id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623681532Z","level":"INFO","msg":"handler: started","stream_id":"x0he2mby"} +{"time":"2025-11-01T05:07:19.623637897Z","level":"INFO","msg":"sender: started","stream_id":"x0he2mby"} +{"time":"2025-11-01T06:00:07.926698703Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-11-01T06:00:08.601238042Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading history steps 242-243, summary, console lines 251-251","runtime_seconds":0.672901309}],"total_operations":1}} +{"time":"2025-11-01T06:00:13.068341746Z","level":"INFO","msg":"stream: closing","id":"x0he2mby"} +{"time":"2025-11-01T06:00:13.068386588Z","level":"INFO","msg":"handler: closed","stream_id":"x0he2mby"} +{"time":"2025-11-01T06:00:13.068443279Z","level":"INFO","msg":"sender: closed","stream_id":"x0he2mby"} +{"time":"2025-11-01T06:00:13.068447047Z","level":"INFO","msg":"stream: closed","id":"x0he2mby"} diff --git a/wandb/run-20251101_050718-x0he2mby/logs/debug.log b/wandb/run-20251101_050718-x0he2mby/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..639aa195000734693fe22ae117b938a73e0c1fed --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/logs/debug.log @@ -0,0 +1,28 @@ +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Configure stats pid to 290 +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama/wandb/settings +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama/wandb/run-20251101_050718-x0he2mby/logs/debug.log +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/model_training/baseline_dataset_name_wikitext_llama/wandb/run-20251101_050718-x0he2mby/logs/debug-internal.log +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:init():830] calling init triggers +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'baseline', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/wikitext_llama', 'tokenizer_name': '', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-11-01 05:07:18,928 INFO MainThread:290 [wandb_init.py:init():871] starting backend +2025-11-01 05:07:19,390 INFO MainThread:290 [wandb_init.py:init():874] sending inform_init request +2025-11-01 05:07:19,399 INFO MainThread:290 [wandb_init.py:init():882] backend started and connected +2025-11-01 05:07:19,401 INFO MainThread:290 [wandb_init.py:init():953] updated telemetry +2025-11-01 05:07:19,405 INFO MainThread:290 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-11-01 05:07:19,840 INFO MainThread:290 [wandb_init.py:init():1029] starting run threads in backend +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_console_start():2494] atexit reg +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-11-01 05:07:19,956 INFO MainThread:290 [wandb_run.py:_redirect():2434] Redirects installed. +2025-11-01 05:07:19,964 INFO MainThread:290 [wandb_init.py:init():1075] run started, returning control to user process +2025-11-01 06:00:07,597 INFO MainThread:290 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/x0he2mby +2025-11-01 06:00:07,598 INFO MainThread:290 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-11-01 06:00:07,599 INFO MainThread:290 [wandb_run.py:_restore():2441] restore +2025-11-01 06:00:07,599 INFO MainThread:290 [wandb_run.py:_restore():2447] restore done +2025-11-01 06:00:13,066 INFO MainThread:290 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-11-01 06:00:13,067 INFO MainThread:290 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-11-01 06:00:13,067 INFO MainThread:290 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20251101_050718-x0he2mby/run-x0he2mby.wandb b/wandb/run-20251101_050718-x0he2mby/run-x0he2mby.wandb new file mode 100644 index 0000000000000000000000000000000000000000..5d10a7edc01b48899abaff08481a1370a554ac63 --- /dev/null +++ b/wandb/run-20251101_050718-x0he2mby/run-x0he2mby.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ce8ac3cc5035560cb99cfa57518ca32382ab61239d7fec92715e1a8568fc737 +size 7488927