diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..efed6e049f8a1c8c5da4cefe53087ae0856785bd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20250819_121927-k06awv3c/run-k06awv3c.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/.ipynb_checkpoints/config-checkpoint.toml b/.ipynb_checkpoints/config-checkpoint.toml new file mode 100644 index 0000000000000000000000000000000000000000..f4c56ba2c2e5172ed7ca1b52a2c44efc7c774c76 --- /dev/null +++ b/.ipynb_checkpoints/config-checkpoint.toml @@ -0,0 +1,32 @@ +model_name = "gelu_2l_50m_subset" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 48262 +dataset_name = "eoinf/c4-code-test-50m" +tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.08 +save_log_checkpoints = true \ No newline at end of file diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..74a3d5c1d46cb3abdc4f09d2be7992791e3869ca --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.875693321228027} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..b2dc5054150459a10f06c94ce0d99b49a1e121b6 --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.869194659938932} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..1b4b930aa276f889eac362749ba127e8a92846f2 --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.866845906393742} \ No newline at end of file diff --git a/checkpoints/metadata_000000393216.json b/checkpoints/metadata_000000393216.json new file mode 100644 index 0000000000000000000000000000000000000000..b2135fef9378971b38b763aeed21015aa842bb2e --- /dev/null +++ b/checkpoints/metadata_000000393216.json @@ -0,0 +1 @@ +{"step": 12, "tokens_seen": 393216, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.863887085925386} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..c9a9674c2fcdd58287b26b83dbdcc6c0bf52784c --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.860769321786103} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..60c9dae94013183ea75382a8ac0b204f5c447197 --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.857123414838384} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..780628f8658829ae60a5f23d406cb48368bb5310 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.852884313029477} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..6f27b5fe1aea5b710c6be1bde5b116686bc528d8 --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.843942977135617} \ No newline at end of file diff --git a/checkpoints/metadata_000000589824.json b/checkpoints/metadata_000000589824.json new file mode 100644 index 0000000000000000000000000000000000000000..20493df97dd64f1c23767cce04042cde3e441019 --- /dev/null +++ b/checkpoints/metadata_000000589824.json @@ -0,0 +1 @@ +{"step": 18, "tokens_seen": 589824, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.838675801926637} \ No newline at end of file diff --git a/checkpoints/metadata_000000655360.json b/checkpoints/metadata_000000655360.json new file mode 100644 index 0000000000000000000000000000000000000000..a102617520235809d2e032719126f19aac7c8e0e --- /dev/null +++ b/checkpoints/metadata_000000655360.json @@ -0,0 +1 @@ +{"step": 20, "tokens_seen": 655360, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.827581881102727} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..dca98bc561ff1a16d1d8dadad6f4ae28ed9824ef --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.821188396047193} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..1399ada89a59da9998fffa1cf9c2ac2ae91690c0 --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.80767400402686} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..9ba9da14310ad3e87606511aaff43d98aa4a61e7 --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.793338704276259} \ No newline at end of file diff --git a/checkpoints/metadata_000000884736.json b/checkpoints/metadata_000000884736.json new file mode 100644 index 0000000000000000000000000000000000000000..13a6ea26c0c01e53d56e555600822303686c9d8b --- /dev/null +++ b/checkpoints/metadata_000000884736.json @@ -0,0 +1 @@ +{"step": 27, "tokens_seen": 884736, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.778333187641332} \ No newline at end of file diff --git a/checkpoints/metadata_000000950272.json b/checkpoints/metadata_000000950272.json new file mode 100644 index 0000000000000000000000000000000000000000..4eb182fbe0d92e3f54d19db28b26884eea1a1350 --- /dev/null +++ b/checkpoints/metadata_000000950272.json @@ -0,0 +1 @@ +{"step": 29, "tokens_seen": 950272, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.762387405684441} \ No newline at end of file diff --git a/checkpoints/metadata_000001015808.json b/checkpoints/metadata_000001015808.json new file mode 100644 index 0000000000000000000000000000000000000000..2cdecdc6af928ed2ce8d8e027989f2b535ba386c --- /dev/null +++ b/checkpoints/metadata_000001015808.json @@ -0,0 +1 @@ +{"step": 31, "tokens_seen": 1015808, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.745991090765514} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..575ff9d740aaa553cf5822613060223004dabb2a --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.72089704279391} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..83208486fbe4b94466cabb5d03258732f5b6f2d4 --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.693857732332471} \ No newline at end of file diff --git a/checkpoints/metadata_000001310720.json b/checkpoints/metadata_000001310720.json new file mode 100644 index 0000000000000000000000000000000000000000..247ec530cf40536f662565bea390bdb5d7cdf993 --- /dev/null +++ b/checkpoints/metadata_000001310720.json @@ -0,0 +1 @@ +{"step": 40, "tokens_seen": 1310720, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.665191925962642} \ No newline at end of file diff --git a/checkpoints/metadata_000001409024.json b/checkpoints/metadata_000001409024.json new file mode 100644 index 0000000000000000000000000000000000000000..13fa76bf063558b1d6ac2287c2f29ca5e3d682b2 --- /dev/null +++ b/checkpoints/metadata_000001409024.json @@ -0,0 +1 @@ +{"step": 43, "tokens_seen": 1409024, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.633825906491143} \ No newline at end of file diff --git a/checkpoints/metadata_000001507328.json b/checkpoints/metadata_000001507328.json new file mode 100644 index 0000000000000000000000000000000000000000..2ba41f5394564289a6a6ec19bf7e8431d27063e2 --- /dev/null +++ b/checkpoints/metadata_000001507328.json @@ -0,0 +1 @@ +{"step": 46, "tokens_seen": 1507328, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.602067891043278} \ No newline at end of file diff --git a/checkpoints/metadata_000001638400.json b/checkpoints/metadata_000001638400.json new file mode 100644 index 0000000000000000000000000000000000000000..22833a80efec2eb323f4210e8d5cd93afa3d9810 --- /dev/null +++ b/checkpoints/metadata_000001638400.json @@ -0,0 +1 @@ +{"step": 50, "tokens_seen": 1638400, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.556301510054759} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..20d9b6a2c928e4646bbbb054cd1be79f6e1626c4 --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.506479708696956} \ No newline at end of file diff --git a/checkpoints/metadata_000001933312.json b/checkpoints/metadata_000001933312.json new file mode 100644 index 0000000000000000000000000000000000000000..1e08327724937904a40c89849b99f0e1e0ef18a9 --- /dev/null +++ b/checkpoints/metadata_000001933312.json @@ -0,0 +1 @@ +{"step": 59, "tokens_seen": 1933312, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.439339538822045} \ No newline at end of file diff --git a/checkpoints/metadata_000002064384.json b/checkpoints/metadata_000002064384.json new file mode 100644 index 0000000000000000000000000000000000000000..33ca3b2307cd2119bff200297b7eb643eb4565ec --- /dev/null +++ b/checkpoints/metadata_000002064384.json @@ -0,0 +1 @@ +{"step": 63, "tokens_seen": 2064384, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.381791756197856} \ No newline at end of file diff --git a/checkpoints/metadata_000002228224.json b/checkpoints/metadata_000002228224.json new file mode 100644 index 0000000000000000000000000000000000000000..5347b746d704eb812ac56e77fcc671eafc7e7b57 --- /dev/null +++ b/checkpoints/metadata_000002228224.json @@ -0,0 +1 @@ +{"step": 68, "tokens_seen": 2228224, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.305867163846642} \ No newline at end of file diff --git a/checkpoints/metadata_000002424832.json b/checkpoints/metadata_000002424832.json new file mode 100644 index 0000000000000000000000000000000000000000..fcc3bbc0eeb6974bc7f83efe4f85267fc139c6c1 --- /dev/null +++ b/checkpoints/metadata_000002424832.json @@ -0,0 +1 @@ +{"step": 74, "tokens_seen": 2424832, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.20789828227841} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..3b3d76a6abb46c8cedf4d2fa9623c076df9aaa9d --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.102181636865007} \ No newline at end of file diff --git a/checkpoints/metadata_000002818048.json b/checkpoints/metadata_000002818048.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe53520cb6d947d46665f537f1247858c074819 --- /dev/null +++ b/checkpoints/metadata_000002818048.json @@ -0,0 +1 @@ +{"step": 86, "tokens_seen": 2818048, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.98962688875305} \ No newline at end of file diff --git a/checkpoints/metadata_000003047424.json b/checkpoints/metadata_000003047424.json new file mode 100644 index 0000000000000000000000000000000000000000..77ee25e5a7592bad79a59bceac9bb3165ca4564e --- /dev/null +++ b/checkpoints/metadata_000003047424.json @@ -0,0 +1 @@ +{"step": 93, "tokens_seen": 3047424, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.851253248021033} \ No newline at end of file diff --git a/checkpoints/metadata_000003309568.json b/checkpoints/metadata_000003309568.json new file mode 100644 index 0000000000000000000000000000000000000000..6e9e7227847932fdb873bc2b71ca0a9b1e3abeb8 --- /dev/null +++ b/checkpoints/metadata_000003309568.json @@ -0,0 +1 @@ +{"step": 101, "tokens_seen": 3309568, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.687782369547088} \ No newline at end of file diff --git a/checkpoints/metadata_000003571712.json b/checkpoints/metadata_000003571712.json new file mode 100644 index 0000000000000000000000000000000000000000..6fe3e0d1620b613ef01982f843487a84d2c0365c --- /dev/null +++ b/checkpoints/metadata_000003571712.json @@ -0,0 +1 @@ +{"step": 109, "tokens_seen": 3571712, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.520436808701447} \ No newline at end of file diff --git a/checkpoints/metadata_000003866624.json b/checkpoints/metadata_000003866624.json new file mode 100644 index 0000000000000000000000000000000000000000..73c209f35bf126885d54f7d30d8204e00fb70ef7 --- /dev/null +++ b/checkpoints/metadata_000003866624.json @@ -0,0 +1 @@ +{"step": 118, "tokens_seen": 3866624, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.341413630195127} \ No newline at end of file diff --git a/checkpoints/metadata_000004161536.json b/checkpoints/metadata_000004161536.json new file mode 100644 index 0000000000000000000000000000000000000000..b77fc02619a9241ba2c03740e02080b6875f9f85 --- /dev/null +++ b/checkpoints/metadata_000004161536.json @@ -0,0 +1 @@ +{"step": 127, "tokens_seen": 4161536, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.164262377235985} \ No newline at end of file diff --git a/checkpoints/metadata_000004489216.json b/checkpoints/metadata_000004489216.json new file mode 100644 index 0000000000000000000000000000000000000000..50f9ec84870ffc4b00250fdb6b7b972e0ba484c6 --- /dev/null +++ b/checkpoints/metadata_000004489216.json @@ -0,0 +1 @@ +{"step": 137, "tokens_seen": 4489216, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.97327932979284} \ No newline at end of file diff --git a/checkpoints/metadata_000004849664.json b/checkpoints/metadata_000004849664.json new file mode 100644 index 0000000000000000000000000000000000000000..8c16b287fbf763be04c92ac8b8c41569c85c713f --- /dev/null +++ b/checkpoints/metadata_000004849664.json @@ -0,0 +1 @@ +{"step": 148, "tokens_seen": 4849664, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.769313620031904} \ No newline at end of file diff --git a/checkpoints/metadata_000005242880.json b/checkpoints/metadata_000005242880.json new file mode 100644 index 0000000000000000000000000000000000000000..fd4823cd49dc17ca4743c00904cb1f5593644848 --- /dev/null +++ b/checkpoints/metadata_000005242880.json @@ -0,0 +1 @@ +{"step": 160, "tokens_seen": 5242880, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.562050306151097} \ No newline at end of file diff --git a/checkpoints/metadata_000005668864.json b/checkpoints/metadata_000005668864.json new file mode 100644 index 0000000000000000000000000000000000000000..d6aa10bde5ae030f0c08c70a3f3eae6eb8be5cee --- /dev/null +++ b/checkpoints/metadata_000005668864.json @@ -0,0 +1 @@ +{"step": 173, "tokens_seen": 5668864, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.34983091070313} \ No newline at end of file diff --git a/checkpoints/metadata_000006127616.json b/checkpoints/metadata_000006127616.json new file mode 100644 index 0000000000000000000000000000000000000000..a4561169631a35a3e8550eadd16b54b05be22130 --- /dev/null +++ b/checkpoints/metadata_000006127616.json @@ -0,0 +1 @@ +{"step": 187, "tokens_seen": 6127616, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.139855879496354} \ No newline at end of file diff --git a/checkpoints/metadata_000006619136.json b/checkpoints/metadata_000006619136.json new file mode 100644 index 0000000000000000000000000000000000000000..16e3661a9ee30542ce002eb0e4a8fa710bdb5d55 --- /dev/null +++ b/checkpoints/metadata_000006619136.json @@ -0,0 +1 @@ +{"step": 202, "tokens_seen": 6619136, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.930274531158507} \ No newline at end of file diff --git a/checkpoints/metadata_000007143424.json b/checkpoints/metadata_000007143424.json new file mode 100644 index 0000000000000000000000000000000000000000..f3cd39e093495cfbd2b09443cad8eb811f5a34ff --- /dev/null +++ b/checkpoints/metadata_000007143424.json @@ -0,0 +1 @@ +{"step": 218, "tokens_seen": 7143424, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.726464644587039} \ No newline at end of file diff --git a/checkpoints/metadata_000007733248.json b/checkpoints/metadata_000007733248.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7ceae0e5592ac51e16430d1937d5333df0ac97 --- /dev/null +++ b/checkpoints/metadata_000007733248.json @@ -0,0 +1 @@ +{"step": 236, "tokens_seen": 7733248, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.5225022621700415} \ No newline at end of file diff --git a/checkpoints/metadata_000008323072.json b/checkpoints/metadata_000008323072.json new file mode 100644 index 0000000000000000000000000000000000000000..76c4ef737dc385e5a1b684af236234b73fb41254 --- /dev/null +++ b/checkpoints/metadata_000008323072.json @@ -0,0 +1 @@ +{"step": 254, "tokens_seen": 8323072, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.345735066228189} \ No newline at end of file diff --git a/checkpoints/metadata_000009011200.json b/checkpoints/metadata_000009011200.json new file mode 100644 index 0000000000000000000000000000000000000000..68777ceef314844d416eac111d91df1b5f3964ef --- /dev/null +++ b/checkpoints/metadata_000009011200.json @@ -0,0 +1 @@ +{"step": 275, "tokens_seen": 9011200, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.165483392009832} \ No newline at end of file diff --git a/checkpoints/metadata_000009732096.json b/checkpoints/metadata_000009732096.json new file mode 100644 index 0000000000000000000000000000000000000000..59ece5fd5ebee9bd1939493d850a22847b3d27c6 --- /dev/null +++ b/checkpoints/metadata_000009732096.json @@ -0,0 +1 @@ +{"step": 297, "tokens_seen": 9732096, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.0003946644838315} \ No newline at end of file diff --git a/checkpoints/metadata_000010518528.json b/checkpoints/metadata_000010518528.json new file mode 100644 index 0000000000000000000000000000000000000000..c29a3b822b9978291a6d99662b6713d02d039675 --- /dev/null +++ b/checkpoints/metadata_000010518528.json @@ -0,0 +1 @@ +{"step": 321, "tokens_seen": 10518528, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.8489763203609515} \ No newline at end of file diff --git a/checkpoints/metadata_000011337728.json b/checkpoints/metadata_000011337728.json new file mode 100644 index 0000000000000000000000000000000000000000..8acfa4c9583842d59ac3246c216bd46e1562492e --- /dev/null +++ b/checkpoints/metadata_000011337728.json @@ -0,0 +1 @@ +{"step": 346, "tokens_seen": 11337728, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.713418538766802} \ No newline at end of file diff --git a/checkpoints/metadata_000012255232.json b/checkpoints/metadata_000012255232.json new file mode 100644 index 0000000000000000000000000000000000000000..ac75421d3091aba38e4204b1406fd6c6d544bb5a --- /dev/null +++ b/checkpoints/metadata_000012255232.json @@ -0,0 +1 @@ +{"step": 374, "tokens_seen": 12255232, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.586614493978286} \ No newline at end of file diff --git a/checkpoints/metadata_000013238272.json b/checkpoints/metadata_000013238272.json new file mode 100644 index 0000000000000000000000000000000000000000..abfc47d98f4af4a06ac45078c7508e9c66e03f2e --- /dev/null +++ b/checkpoints/metadata_000013238272.json @@ -0,0 +1 @@ +{"step": 404, "tokens_seen": 13238272, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.4722833945745} \ No newline at end of file diff --git a/checkpoints/metadata_000014286848.json b/checkpoints/metadata_000014286848.json new file mode 100644 index 0000000000000000000000000000000000000000..01c0cd7f01b07c2f57685d1175af9e16ea412742 --- /dev/null +++ b/checkpoints/metadata_000014286848.json @@ -0,0 +1 @@ +{"step": 436, "tokens_seen": 14286848, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.375912745241577} \ No newline at end of file diff --git a/checkpoints/metadata_000015433728.json b/checkpoints/metadata_000015433728.json new file mode 100644 index 0000000000000000000000000000000000000000..84b8e2d4d84bc1d33fd9e18d12f8924b85585a47 --- /dev/null +++ b/checkpoints/metadata_000015433728.json @@ -0,0 +1 @@ +{"step": 471, "tokens_seen": 15433728, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.280213734342172} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..503b96e0e68ac1300a7d717e8f11d37843a08c02 --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.211088226493741} \ No newline at end of file diff --git a/checkpoints/metadata_000016678912.json b/checkpoints/metadata_000016678912.json new file mode 100644 index 0000000000000000000000000000000000000000..1b116bd8987a2b8951129fbb713e11e64037632e --- /dev/null +++ b/checkpoints/metadata_000016678912.json @@ -0,0 +1 @@ +{"step": 509, "tokens_seen": 16678912, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.188235117274248} \ No newline at end of file diff --git a/checkpoints/metadata_000018022400.json b/checkpoints/metadata_000018022400.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce4c9ffe1f64b4baede5b97c015764841fb6594 --- /dev/null +++ b/checkpoints/metadata_000018022400.json @@ -0,0 +1 @@ +{"step": 550, "tokens_seen": 18022400, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.115511030314343} \ No newline at end of file diff --git a/checkpoints/metadata_000019464192.json b/checkpoints/metadata_000019464192.json new file mode 100644 index 0000000000000000000000000000000000000000..a793c312461249e78f3120522fb524e43ed815f1 --- /dev/null +++ b/checkpoints/metadata_000019464192.json @@ -0,0 +1 @@ +{"step": 594, "tokens_seen": 19464192, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.031790313666961} \ No newline at end of file diff --git a/checkpoints/metadata_000021037056.json b/checkpoints/metadata_000021037056.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7cbbddfe8bddbe23a5f5b95f6aeeab855f6218 --- /dev/null +++ b/checkpoints/metadata_000021037056.json @@ -0,0 +1 @@ +{"step": 642, "tokens_seen": 21037056, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.961658483293991} \ No newline at end of file diff --git a/checkpoints/metadata_000022708224.json b/checkpoints/metadata_000022708224.json new file mode 100644 index 0000000000000000000000000000000000000000..333aba690127007a3638501c0d5a0c36199e2638 --- /dev/null +++ b/checkpoints/metadata_000022708224.json @@ -0,0 +1 @@ +{"step": 693, "tokens_seen": 22708224, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.894870036597681} \ No newline at end of file diff --git a/checkpoints/metadata_000024510464.json b/checkpoints/metadata_000024510464.json new file mode 100644 index 0000000000000000000000000000000000000000..b713b3d131df33e1ef81a8da672afded8993be29 --- /dev/null +++ b/checkpoints/metadata_000024510464.json @@ -0,0 +1 @@ +{"step": 748, "tokens_seen": 24510464, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.845927044752075} \ No newline at end of file diff --git a/checkpoints/metadata_000026476544.json b/checkpoints/metadata_000026476544.json new file mode 100644 index 0000000000000000000000000000000000000000..c86c2cca3ccabec6ab12d1487f9e9fcd2b8ba38f --- /dev/null +++ b/checkpoints/metadata_000026476544.json @@ -0,0 +1 @@ +{"step": 808, "tokens_seen": 26476544, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.782317226204285} \ No newline at end of file diff --git a/checkpoints/metadata_000028606464.json b/checkpoints/metadata_000028606464.json new file mode 100644 index 0000000000000000000000000000000000000000..2f358945fe4310b216990438de75d01cc46ff6f6 --- /dev/null +++ b/checkpoints/metadata_000028606464.json @@ -0,0 +1 @@ +{"step": 873, "tokens_seen": 28606464, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.721265417156013} \ No newline at end of file diff --git a/checkpoints/metadata_000030900224.json b/checkpoints/metadata_000030900224.json new file mode 100644 index 0000000000000000000000000000000000000000..5d20bc8ea492a0a32daf05d2e4967f010b0237b1 --- /dev/null +++ b/checkpoints/metadata_000030900224.json @@ -0,0 +1 @@ +{"step": 943, "tokens_seen": 30900224, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.63599984546126} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..4a73fe9c6c4332f8c64c6d7efde021b0e5d6703c --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.560508808297965} \ No newline at end of file diff --git a/checkpoints/metadata_000033357824.json b/checkpoints/metadata_000033357824.json new file mode 100644 index 0000000000000000000000000000000000000000..ad0e092c36dff96c1f15fd32f29204068e67c120 --- /dev/null +++ b/checkpoints/metadata_000033357824.json @@ -0,0 +1 @@ +{"step": 1018, "tokens_seen": 33357824, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.536460550454261} \ No newline at end of file diff --git a/checkpoints/metadata_000036044800.json b/checkpoints/metadata_000036044800.json new file mode 100644 index 0000000000000000000000000000000000000000..9649de949e7dfd85598b403af8033b40b6b80e66 --- /dev/null +++ b/checkpoints/metadata_000036044800.json @@ -0,0 +1 @@ +{"step": 1100, "tokens_seen": 36044800, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.453908899435509} \ No newline at end of file diff --git a/checkpoints/metadata_000038928384.json b/checkpoints/metadata_000038928384.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f33802ffdc400907a2218e7d008b6f393fdbba --- /dev/null +++ b/checkpoints/metadata_000038928384.json @@ -0,0 +1 @@ +{"step": 1188, "tokens_seen": 38928384, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.362477480680261} \ No newline at end of file diff --git a/checkpoints/metadata_000042041344.json b/checkpoints/metadata_000042041344.json new file mode 100644 index 0000000000000000000000000000000000000000..2e6bea2b419d2d341ac9b4c7020371c51d95cfed --- /dev/null +++ b/checkpoints/metadata_000042041344.json @@ -0,0 +1 @@ +{"step": 1283, "tokens_seen": 42041344, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.274515374688925} \ No newline at end of file diff --git a/checkpoints/metadata_000045416448.json b/checkpoints/metadata_000045416448.json new file mode 100644 index 0000000000000000000000000000000000000000..42866ce40a33fc912e832c32b4c62ac5ce908476 --- /dev/null +++ b/checkpoints/metadata_000045416448.json @@ -0,0 +1 @@ +{"step": 1386, "tokens_seen": 45416448, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.198322212155322} \ No newline at end of file diff --git a/checkpoints/metadata_000049053696.json b/checkpoints/metadata_000049053696.json new file mode 100644 index 0000000000000000000000000000000000000000..07266fedf8ba0804c68c9eb332d7930df94e07d4 --- /dev/null +++ b/checkpoints/metadata_000049053696.json @@ -0,0 +1 @@ +{"step": 1497, "tokens_seen": 49053696, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.143817127507039} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..4b86b891b0d2674232d0355f2433ea242b1a63e5 --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.142836379121624} \ No newline at end of file diff --git a/checkpoints/metadata_000052936704.json b/checkpoints/metadata_000052936704.json new file mode 100644 index 0000000000000000000000000000000000000000..84301938c5b311e3d20c840c4f80880aacbcb747 --- /dev/null +++ b/checkpoints/metadata_000052936704.json @@ -0,0 +1 @@ +{"step": 1616, "tokens_seen": 52936704, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.025277766427137} \ No newline at end of file diff --git a/checkpoints/metadata_000057196544.json b/checkpoints/metadata_000057196544.json new file mode 100644 index 0000000000000000000000000000000000000000..f4dd5fff7d7502f935516022adaa2d78d802f2c3 --- /dev/null +++ b/checkpoints/metadata_000057196544.json @@ -0,0 +1 @@ +{"step": 1746, "tokens_seen": 57196544, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.93877945750906} \ No newline at end of file diff --git a/checkpoints/metadata_000061751296.json b/checkpoints/metadata_000061751296.json new file mode 100644 index 0000000000000000000000000000000000000000..a1e2b6991e1957a95ebd71f14b2c70d9593d33dc --- /dev/null +++ b/checkpoints/metadata_000061751296.json @@ -0,0 +1 @@ +{"step": 1885, "tokens_seen": 61751296, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.868801570942121} \ No newline at end of file diff --git a/checkpoints/metadata_000065519616.json b/checkpoints/metadata_000065519616.json new file mode 100644 index 0000000000000000000000000000000000000000..d9dbd4f6dca50f6834128a17704c01eaedea2903 --- /dev/null +++ b/checkpoints/metadata_000065519616.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65519616, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.8115727212121815} \ No newline at end of file diff --git a/checkpoints/metadata_000066699264.json b/checkpoints/metadata_000066699264.json new file mode 100644 index 0000000000000000000000000000000000000000..54caad1e7743a54b720c11073c216e8416e74083 --- /dev/null +++ b/checkpoints/metadata_000066699264.json @@ -0,0 +1 @@ +{"step": 2036, "tokens_seen": 66699264, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.795000170217534} \ No newline at end of file diff --git a/checkpoints/metadata_000072040448.json b/checkpoints/metadata_000072040448.json new file mode 100644 index 0000000000000000000000000000000000000000..77bf52745cca303f821807bba9075b34394678ea --- /dev/null +++ b/checkpoints/metadata_000072040448.json @@ -0,0 +1 @@ +{"step": 2199, "tokens_seen": 72040448, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.705779946935515} \ No newline at end of file diff --git a/checkpoints/metadata_000077807616.json b/checkpoints/metadata_000077807616.json new file mode 100644 index 0000000000000000000000000000000000000000..d95e31b2082c4e5fb2533218fe4d00fc61e60ab8 --- /dev/null +++ b/checkpoints/metadata_000077807616.json @@ -0,0 +1 @@ +{"step": 2375, "tokens_seen": 77807616, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.616199421476643} \ No newline at end of file diff --git a/checkpoints/metadata_000081903616.json b/checkpoints/metadata_000081903616.json new file mode 100644 index 0000000000000000000000000000000000000000..0ee28fb722310fefeb502614ed3ce638ae20bff5 --- /dev/null +++ b/checkpoints/metadata_000081903616.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81903616, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.563341343725333} \ No newline at end of file diff --git a/checkpoints/metadata_000084033536.json b/checkpoints/metadata_000084033536.json new file mode 100644 index 0000000000000000000000000000000000000000..2937111f2d0961e578dc35e430f2e8b6b3773994 --- /dev/null +++ b/checkpoints/metadata_000084033536.json @@ -0,0 +1 @@ +{"step": 2565, "tokens_seen": 84033536, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.54012020136703} \ No newline at end of file diff --git a/checkpoints/metadata_000090783744.json b/checkpoints/metadata_000090783744.json new file mode 100644 index 0000000000000000000000000000000000000000..cfa7e778aed9474b045b6e371fe9675657086329 --- /dev/null +++ b/checkpoints/metadata_000090783744.json @@ -0,0 +1 @@ +{"step": 2771, "tokens_seen": 90783744, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.458764986462275} \ No newline at end of file diff --git a/checkpoints/metadata_000098025472.json b/checkpoints/metadata_000098025472.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e241349862051359bb8d86fdc915fb6a4d33ce --- /dev/null +++ b/checkpoints/metadata_000098025472.json @@ -0,0 +1 @@ +{"step": 2992, "tokens_seen": 98025472, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.389518339256737} \ No newline at end of file diff --git a/checkpoints/metadata_000098287616.json b/checkpoints/metadata_000098287616.json new file mode 100644 index 0000000000000000000000000000000000000000..de254590cb861dba9768b85fb7cf52081cc1bc00 --- /dev/null +++ b/checkpoints/metadata_000098287616.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98287616, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.392079796948936} \ No newline at end of file diff --git a/checkpoints/metadata_000105873408.json b/checkpoints/metadata_000105873408.json new file mode 100644 index 0000000000000000000000000000000000000000..0ec2795d3fb642b6701eb53f6d85d6555e0aaac7 --- /dev/null +++ b/checkpoints/metadata_000105873408.json @@ -0,0 +1 @@ +{"step": 3232, "tokens_seen": 105873408, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.282883708821922} \ No newline at end of file diff --git a/checkpoints/metadata_000114327552.json b/checkpoints/metadata_000114327552.json new file mode 100644 index 0000000000000000000000000000000000000000..ef095165f8f112707df93a50e87e5a3f37c565cc --- /dev/null +++ b/checkpoints/metadata_000114327552.json @@ -0,0 +1 @@ +{"step": 3490, "tokens_seen": 114327552, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.21502106987704} \ No newline at end of file diff --git a/checkpoints/metadata_000114655232.json b/checkpoints/metadata_000114655232.json new file mode 100644 index 0000000000000000000000000000000000000000..322edc63a4aa5425b9a32545c531d483ca028490 --- /dev/null +++ b/checkpoints/metadata_000114655232.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114655232, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.209441159851795} \ No newline at end of file diff --git a/checkpoints/metadata_000123502592.json b/checkpoints/metadata_000123502592.json new file mode 100644 index 0000000000000000000000000000000000000000..ca8661b7c81ba3b5fa3d38c4ab075b57d0bb6f11 --- /dev/null +++ b/checkpoints/metadata_000123502592.json @@ -0,0 +1 @@ +{"step": 3770, "tokens_seen": 123502592, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.202868722509909} \ No newline at end of file diff --git a/checkpoints/metadata_000131039232.json b/checkpoints/metadata_000131039232.json new file mode 100644 index 0000000000000000000000000000000000000000..97678a2ab020cfdf691751123e67c11b4f2b9808 --- /dev/null +++ b/checkpoints/metadata_000131039232.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131039232, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.173603163859597} \ No newline at end of file diff --git a/checkpoints/metadata_000133365760.json b/checkpoints/metadata_000133365760.json new file mode 100644 index 0000000000000000000000000000000000000000..a5a69e38cfd86ab526b3bdf7cb5b88987eb8e26f --- /dev/null +++ b/checkpoints/metadata_000133365760.json @@ -0,0 +1 @@ +{"step": 4071, "tokens_seen": 133365760, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.177871291157809} \ No newline at end of file diff --git a/checkpoints/metadata_000144048128.json b/checkpoints/metadata_000144048128.json new file mode 100644 index 0000000000000000000000000000000000000000..897beb3bb12bba16bef96632ece637b4dd30a5fb --- /dev/null +++ b/checkpoints/metadata_000144048128.json @@ -0,0 +1 @@ +{"step": 4397, "tokens_seen": 144048128, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.187769517125681} \ No newline at end of file diff --git a/checkpoints/metadata_000147423232.json b/checkpoints/metadata_000147423232.json new file mode 100644 index 0000000000000000000000000000000000000000..74d37d5078b1580bda89fd6f6b41266fb11ec194 --- /dev/null +++ b/checkpoints/metadata_000147423232.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147423232, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.159381399829371} \ No newline at end of file diff --git a/checkpoints/metadata_000155566080.json b/checkpoints/metadata_000155566080.json new file mode 100644 index 0000000000000000000000000000000000000000..09e972811e91a06eeb09b62a99b815f9e88a0951 --- /dev/null +++ b/checkpoints/metadata_000155566080.json @@ -0,0 +1 @@ +{"step": 4749, "tokens_seen": 155566080, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.08764796663189} \ No newline at end of file diff --git a/checkpoints/metadata_000163790848.json b/checkpoints/metadata_000163790848.json new file mode 100644 index 0000000000000000000000000000000000000000..b847b9460dfe9fd79fe067a00fb9599dfa2d9908 --- /dev/null +++ b/checkpoints/metadata_000163790848.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163790848, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.024841792245207} \ No newline at end of file diff --git a/checkpoints/metadata_000168017920.json b/checkpoints/metadata_000168017920.json new file mode 100644 index 0000000000000000000000000000000000000000..7b22f15bc0cb3ea822ff2330b271691477c93a60 --- /dev/null +++ b/checkpoints/metadata_000168017920.json @@ -0,0 +1 @@ +{"step": 5129, "tokens_seen": 168017920, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.019610716838619} \ No newline at end of file diff --git a/checkpoints/metadata_000180174848.json b/checkpoints/metadata_000180174848.json new file mode 100644 index 0000000000000000000000000000000000000000..e2a5ad2687c549297b98ca28132456b76fdc4259 --- /dev/null +++ b/checkpoints/metadata_000180174848.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180174848, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.016283230235387} \ No newline at end of file diff --git a/checkpoints/metadata_000181452800.json b/checkpoints/metadata_000181452800.json new file mode 100644 index 0000000000000000000000000000000000000000..1ebd22232b77e3cdd27f4490e8eddd15f343e8ab --- /dev/null +++ b/checkpoints/metadata_000181452800.json @@ -0,0 +1 @@ +{"step": 5539, "tokens_seen": 181452800, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.0067767972138775} \ No newline at end of file diff --git a/checkpoints/metadata_000195969024.json b/checkpoints/metadata_000195969024.json new file mode 100644 index 0000000000000000000000000000000000000000..1c7285a4b02c85b52eb4430750daac8447ba02df --- /dev/null +++ b/checkpoints/metadata_000195969024.json @@ -0,0 +1 @@ +{"step": 5982, "tokens_seen": 195969024, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.03058521340059} \ No newline at end of file diff --git a/checkpoints/metadata_000196558848.json b/checkpoints/metadata_000196558848.json new file mode 100644 index 0000000000000000000000000000000000000000..b1d055ec1afdab04f2a6f3fa2d040075fcff51d1 --- /dev/null +++ b/checkpoints/metadata_000196558848.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196558848, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.028505287133009} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..03511d91e914d121706074672ed3717e4b31658a --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ce67efbdfc3c9279831326e38cbb39268eb87b8627e4d8e3239dbba09c7ac9 +size 225208789 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebd6f5756f1b946534887e0d0d990e3ee05c8980 --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633ede9c63a3c6a6371a82e484a4b2e70bddbdc28a060e6fb39ec1fe3af1475a +size 225208789 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f15cf3de072bd04bb2f35e4af5e402ec634595b --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d94210991e8b53ff84b7e2a14ccdaa6f9fc66a15fe4601973ed5119ed8304a +size 225208789 diff --git a/checkpoints/model_weights_000000393216.pt b/checkpoints/model_weights_000000393216.pt new file mode 100644 index 0000000000000000000000000000000000000000..725653478d69a48dda02f1852610793de23c4d2b --- /dev/null +++ b/checkpoints/model_weights_000000393216.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f39136e70cff4159226a70e7d677d1055f8c6a50ff1db1538cb18a43e3132738 +size 225208789 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..2732b4ed87fb0871195b7a7df13f72d61f1ef3b3 --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:696b773336282c2a83c9238bc5120134d69971f3a9acdd83b799784cc084cd72 +size 225208789 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a5ca15ee33fffd931358c71916a1a98060a16a1 --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf39c0cd05f6a46bdb3996f75c7912d08b3acd65c22e9edf568ebd82e879a59 +size 225208789 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..e67cc5f9897fdfeca864e511c489d2dc480b1b3f --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b016c71f2b1e801a51412b9cda141506011ca91419aaf225c9bf32e692ecfef2 +size 225208789 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..de8f262037f0babadea1fb6c3ea1c0ffdf277a8b --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cba99165f97ddf70f87ba136ff83f9ae23fcda3e0fc9bb911b5b3a70526ca9e +size 225208789 diff --git a/checkpoints/model_weights_000000589824.pt b/checkpoints/model_weights_000000589824.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f7b34af1fb33d829a519c98b20d6b8e6d7ae0e3 --- /dev/null +++ b/checkpoints/model_weights_000000589824.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f7b57c3095e72ff1cbc64cae52045bdc2ce2d9ce5491715ca7c91ca3501b078 +size 225208789 diff --git a/checkpoints/model_weights_000000655360.pt b/checkpoints/model_weights_000000655360.pt new file mode 100644 index 0000000000000000000000000000000000000000..93b9def5bbf8bae31c45675b3b3917abfc228c74 --- /dev/null +++ b/checkpoints/model_weights_000000655360.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04b81013586b8e1a18f57fa7e12f1f0204000cdd2afb643296912d715ad0dfac +size 225208789 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e2b9a5babc977799cdda207e0f7f86baaec08ab --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091b5313191ec64b53d59974148187d59d879a9a5f5df981e4a554e12e943f25 +size 225208789 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fe8a72a1e81fa200fe2550badba501ac93fe370 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f249769c67bf430ba07291d14083b1d2281e8e1df4ebdcce7290ad086c3d9c4a +size 225208789 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..69e1a4e4322565bdae586d5a42f7379d8e9de141 --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c64206fec6a90f3a90aa6522046666a1e69ec6a7911d0caa8fa510c151890e8f +size 225208789 diff --git a/checkpoints/model_weights_000000884736.pt b/checkpoints/model_weights_000000884736.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d58036813a7be2557902840a28a93847e18b69b --- /dev/null +++ b/checkpoints/model_weights_000000884736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935d4719b0faab57af8fdf37c086ab2c4f59763c9943da45be013b7c54cd6d54 +size 225208789 diff --git a/checkpoints/model_weights_000000950272.pt b/checkpoints/model_weights_000000950272.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d102dea90ce66149caabb3a1be175c1c2c76017 --- /dev/null +++ b/checkpoints/model_weights_000000950272.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aeb00a5462846cb7e0c49a05919537a462fb1f8b8dbf376851e2f595efdd516 +size 225208789 diff --git a/checkpoints/model_weights_000001015808.pt b/checkpoints/model_weights_000001015808.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdf61d56c14dfde5239636bf964f47af9e5395f3 --- /dev/null +++ b/checkpoints/model_weights_000001015808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d20c33b5f0e73605d03870de42840c84f25839564c91b3845fa17dc79be1b48 +size 225208789 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ccbf0df4c134401fb8a220f9f072a6b29633c52 --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b03116283334821a3d623600c4707078723213495f8c6bc0f299656a41a56d +size 225208789 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0fb628b6574ac90c550977fca77174edc2ba8e7 --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b24265d38b2017e4b8602cb9c60d6bc3a7a6094ab20bc22695ff9041ca66d93d +size 225208789 diff --git a/checkpoints/model_weights_000001310720.pt b/checkpoints/model_weights_000001310720.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6bb01375986211777bbd6db544967be4a8c8149 --- /dev/null +++ b/checkpoints/model_weights_000001310720.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d3dde872d6989533837d4a2e19c2698ef7f2d3b0001af0fa41468d88f96053d +size 225208789 diff --git a/checkpoints/model_weights_000001409024.pt b/checkpoints/model_weights_000001409024.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea24843b2776561a512bcdf33914dd7142b092fa --- /dev/null +++ b/checkpoints/model_weights_000001409024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1559d35e5d6295d0b680855874d51b5d591dc72c99b6972be09c2641d4eb1a0c +size 225208789 diff --git a/checkpoints/model_weights_000001507328.pt b/checkpoints/model_weights_000001507328.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b0b79de2d56421f5f8bd74b6b9cefa1a78be43f --- /dev/null +++ b/checkpoints/model_weights_000001507328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be698e7741c033b9c5393c23e6497fa23dd711f28ebb36c83c675b01a74cc183 +size 225208789 diff --git a/checkpoints/model_weights_000001638400.pt b/checkpoints/model_weights_000001638400.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d32f0c4b1cba737a143cf0301c2ef3adf79a580 --- /dev/null +++ b/checkpoints/model_weights_000001638400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001d84b2cecec626a90aa2c65f74f0272b0eef5c5cfc6184c1d8ef4f0c408bda +size 225208789 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf03a587e8e2ffa78a6343b75bfb7419ba081ff8 --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:252c1873016abb6d372d406426d8295b624e878818146bc6d767b4ccf32f8e39 +size 225208789 diff --git a/checkpoints/model_weights_000001933312.pt b/checkpoints/model_weights_000001933312.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd3f895cd6276c16550c297f499a1f1f7b963a59 --- /dev/null +++ b/checkpoints/model_weights_000001933312.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a4827c4d633e486476f0eb626a2ce634ad0558d205b120e9f844e4bdca8eec3 +size 225208789 diff --git a/checkpoints/model_weights_000002064384.pt b/checkpoints/model_weights_000002064384.pt new file mode 100644 index 0000000000000000000000000000000000000000..627decda89beddc314edb8aadfab6260d7e41d0b --- /dev/null +++ b/checkpoints/model_weights_000002064384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a702bac8bb90ed3c78f9fcdade971b9f41b2fdef0463dc0fac8bb8762761eb8b +size 225208789 diff --git a/checkpoints/model_weights_000002228224.pt b/checkpoints/model_weights_000002228224.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d4d17754a22f86bc5788a76742dc09994f414a5 --- /dev/null +++ b/checkpoints/model_weights_000002228224.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be2c034d9c8524b1bb3c4a191fc7d36526f1314190f19c6df59328e6873bc373 +size 225208789 diff --git a/checkpoints/model_weights_000002424832.pt b/checkpoints/model_weights_000002424832.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f67c3fa99f5d88aed4143c3fe10a539acfc27f6 --- /dev/null +++ b/checkpoints/model_weights_000002424832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25be3b1e270a79116814600acd00082ec1a451f79c5e364f96ed06fd453faf80 +size 225208789 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f86c19e33d6a614683449925010d5f92f67a991 --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efdd1240053c5908090741b93eff80aa38e64f491fa58cae1750f8bee54e0588 +size 225208789 diff --git a/checkpoints/model_weights_000002818048.pt b/checkpoints/model_weights_000002818048.pt new file mode 100644 index 0000000000000000000000000000000000000000..263ef0ad4c64dd2f8d2a1517bb71897825aee3f7 --- /dev/null +++ b/checkpoints/model_weights_000002818048.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08b11c09b7f770102dca4cc0afa4c07fdab651835a29af62aed315d2e3fd3fa +size 225208789 diff --git a/checkpoints/model_weights_000003047424.pt b/checkpoints/model_weights_000003047424.pt new file mode 100644 index 0000000000000000000000000000000000000000..35d74570ad714901a9a510fd5230b5cec70fc8cb --- /dev/null +++ b/checkpoints/model_weights_000003047424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71bf03d0df9757d5e2225dc78ce05f9db7e1c89cf7bdb75f8d8785a621b3c32b +size 225208789 diff --git a/checkpoints/model_weights_000003309568.pt b/checkpoints/model_weights_000003309568.pt new file mode 100644 index 0000000000000000000000000000000000000000..6261ced412e60e97b05e2e43a31df5a6ec3a6683 --- /dev/null +++ b/checkpoints/model_weights_000003309568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2da96faed3b123397abde32176023131f17182abc8457c8dd01fc0e137b47c0 +size 225208789 diff --git a/checkpoints/model_weights_000003571712.pt b/checkpoints/model_weights_000003571712.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c8a3366df4db79ac53987c136c2a35a07b7ba54 --- /dev/null +++ b/checkpoints/model_weights_000003571712.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0ddc1aec12ce3924a54465cb818b1f5da1c2e8a8065dfff4c7c2f39f932e63 +size 225208789 diff --git a/checkpoints/model_weights_000003866624.pt b/checkpoints/model_weights_000003866624.pt new file mode 100644 index 0000000000000000000000000000000000000000..baeb20cf67655a20130b76b4f6482f25c7917aee --- /dev/null +++ b/checkpoints/model_weights_000003866624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb0866e5b0ba12264777af779123dde8b043a22ac04f17e98be37b41f6f1d92 +size 225208789 diff --git a/checkpoints/model_weights_000004161536.pt b/checkpoints/model_weights_000004161536.pt new file mode 100644 index 0000000000000000000000000000000000000000..0209480949343da57259a7c827a782daca0f6bc3 --- /dev/null +++ b/checkpoints/model_weights_000004161536.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aecc0a2a731789e53205ed503bc97437c989f276283c09f1f41e594f91c9f315 +size 225208789 diff --git a/checkpoints/model_weights_000004489216.pt b/checkpoints/model_weights_000004489216.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5982326a06586b679826e1be6e6ce802c072711 --- /dev/null +++ b/checkpoints/model_weights_000004489216.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dc09be8d0f0390d1edd66112a6957072a496c2bd3e7cf266d3e3be3fee7c039 +size 225208789 diff --git a/checkpoints/model_weights_000004849664.pt b/checkpoints/model_weights_000004849664.pt new file mode 100644 index 0000000000000000000000000000000000000000..d575a12e8d912dd81629bf0de58e7e17bbefc0b4 --- /dev/null +++ b/checkpoints/model_weights_000004849664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd99864fcf59d2abf9beae3a66ed33a4d9555b35b0cf126856769e16bd1c4fb +size 225208789 diff --git a/checkpoints/model_weights_000005242880.pt b/checkpoints/model_weights_000005242880.pt new file mode 100644 index 0000000000000000000000000000000000000000..778f89c7e480f5e23e76f4f4d47a1f4dbe9dd997 --- /dev/null +++ b/checkpoints/model_weights_000005242880.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57e246e618e476b4d20322475ce5d51d01263599afe18b63486637f2cd2c5a3d +size 225208789 diff --git a/checkpoints/model_weights_000005668864.pt b/checkpoints/model_weights_000005668864.pt new file mode 100644 index 0000000000000000000000000000000000000000..76ce036a9c2710be4b4e30dfe680dc332a6f2d6d --- /dev/null +++ b/checkpoints/model_weights_000005668864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d2d48b2d95a9d699a7755a93fef68086831873f30c9575158bad97f1d0020a +size 225208789 diff --git a/checkpoints/model_weights_000006127616.pt b/checkpoints/model_weights_000006127616.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a6d09196723b997bfc44c71fbbba467d72c1362 --- /dev/null +++ b/checkpoints/model_weights_000006127616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51e8e1d5ac349979d63f9cceed9f8fa8d83f189055f262659869d26ebd85c187 +size 225208789 diff --git a/checkpoints/model_weights_000006619136.pt b/checkpoints/model_weights_000006619136.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f042185600e6620b9885a2c1268a1eddaf80da4 --- /dev/null +++ b/checkpoints/model_weights_000006619136.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd90a4585ae9061bce88d3c47db69ad09ce155347cf5096d5fe8371af1cd896 +size 225208789 diff --git a/checkpoints/model_weights_000007143424.pt b/checkpoints/model_weights_000007143424.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0b0f43f5afc170d20fb6bb78f522b25eeed2b1a --- /dev/null +++ b/checkpoints/model_weights_000007143424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7282bbc144a5eb5a4cb5fec2fbf5153ad06710562983dba7e06f3612effee062 +size 225208789 diff --git a/checkpoints/model_weights_000007733248.pt b/checkpoints/model_weights_000007733248.pt new file mode 100644 index 0000000000000000000000000000000000000000..7045ea59fc815be0a8e32c4746a5582a3c0ed6d4 --- /dev/null +++ b/checkpoints/model_weights_000007733248.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86ac64192b693e7ef4c6216de9427fd4760f6e5f2ade0c181330583080d68c6d +size 225208789 diff --git a/checkpoints/model_weights_000008323072.pt b/checkpoints/model_weights_000008323072.pt new file mode 100644 index 0000000000000000000000000000000000000000..863300a1a667c0eebe018c6cd1f2ee325c92af02 --- /dev/null +++ b/checkpoints/model_weights_000008323072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060d0c8984ae444fa3ab6ae855fbed7f15d91c174cae0090b6539f8ec6a78880 +size 225208789 diff --git a/checkpoints/model_weights_000009011200.pt b/checkpoints/model_weights_000009011200.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ae47cb9c801473265df2230b5dbe9846316d630 --- /dev/null +++ b/checkpoints/model_weights_000009011200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46206dd3f4c52823cb50812eb7a990b28d9b39ee69a63584d76a045db071813d +size 225208789 diff --git a/checkpoints/model_weights_000009732096.pt b/checkpoints/model_weights_000009732096.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea92cb43617491586f7177712d76600c90de884f --- /dev/null +++ b/checkpoints/model_weights_000009732096.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1dbacbaee0f12e70169b7b379928246c11bfc2b570db6178401d0a13b57d43 +size 225208789 diff --git a/checkpoints/model_weights_000010518528.pt b/checkpoints/model_weights_000010518528.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cccbebfae270e948894dd365887982382b19662 --- /dev/null +++ b/checkpoints/model_weights_000010518528.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940fcac197c1d433930284da7a939c08e3715b680c08c7b066e5878924c42709 +size 225208789 diff --git a/checkpoints/model_weights_000011337728.pt b/checkpoints/model_weights_000011337728.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f85e5fd5c815d7e74c8747384bbee6b9dadf322 --- /dev/null +++ b/checkpoints/model_weights_000011337728.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7074d613aaa17cffd74940b0c58b4c76609a1fbdcbc8ba05f7bbb827166bfbd1 +size 225208789 diff --git a/checkpoints/model_weights_000012255232.pt b/checkpoints/model_weights_000012255232.pt new file mode 100644 index 0000000000000000000000000000000000000000..988742f104d884233fbd5413250bb14ce7ea26ec --- /dev/null +++ b/checkpoints/model_weights_000012255232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458e8fc7115b6831c4881449cf5ba407742a011f407147842734bb6146cffce0 +size 225208789 diff --git a/checkpoints/model_weights_000013238272.pt b/checkpoints/model_weights_000013238272.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9e25cfc8a61ff9b2f20615a26531287ef067d84 --- /dev/null +++ b/checkpoints/model_weights_000013238272.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4976f4673d9d39cb2f40a25831a403cdb206d046ddbe64cd3a90d9cfc8cdf6a6 +size 225208789 diff --git a/checkpoints/model_weights_000014286848.pt b/checkpoints/model_weights_000014286848.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa74c02deffcaa363f65b958d71f280db5422c56 --- /dev/null +++ b/checkpoints/model_weights_000014286848.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28a68dd8eb2ec1adab93f861ae16ba6b900bd6d6c94e2b4a538f9e96c56cf6a9 +size 225208789 diff --git a/checkpoints/model_weights_000015433728.pt b/checkpoints/model_weights_000015433728.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7b118381b0d68ac1da5738febfdcfc2e153ea79 --- /dev/null +++ b/checkpoints/model_weights_000015433728.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87d927988351d49b6cf7c552953d9d8a4a61907d4c730cce1d4e4be19f633b0d +size 225208789 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..206a9f06487274919efea4bb19d2fb605dc09863 --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bfea2eb2e39705636a3e48da4194a01b3337340fb3ff0afc2ad8bc661e3ab45 +size 225208789 diff --git a/checkpoints/model_weights_000016678912.pt b/checkpoints/model_weights_000016678912.pt new file mode 100644 index 0000000000000000000000000000000000000000..604896dd6328fa078b6d786692f09d51682790b9 --- /dev/null +++ b/checkpoints/model_weights_000016678912.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0e0e468164983a67b3c4f53a5e4428377cb47cdbc79cd88e073ceed827a201 +size 225208789 diff --git a/checkpoints/model_weights_000018022400.pt b/checkpoints/model_weights_000018022400.pt new file mode 100644 index 0000000000000000000000000000000000000000..27e6daabd9851d113cf7cd9f18a285d058c1d51d --- /dev/null +++ b/checkpoints/model_weights_000018022400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b199dd46a283d310a3e439f3f2913c76669bca55643a95bb3113ae4ab185c43 +size 225208789 diff --git a/checkpoints/model_weights_000019464192.pt b/checkpoints/model_weights_000019464192.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee211674b291bb7ee6e45d02710ffd0b3b7ddbe8 --- /dev/null +++ b/checkpoints/model_weights_000019464192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b62ca7c2fa44e93a654ac9e819bded455a13fa4673c2910950289042a00bd31 +size 225208789 diff --git a/checkpoints/model_weights_000021037056.pt b/checkpoints/model_weights_000021037056.pt new file mode 100644 index 0000000000000000000000000000000000000000..70f67cbd154702ee7bb737f5c9099e0db1484efc --- /dev/null +++ b/checkpoints/model_weights_000021037056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:401d9a419c5ad511429c8495ecffaeb7ca2fd8f1f77d7f0d7b61bcb7f9d59189 +size 225208789 diff --git a/checkpoints/model_weights_000022708224.pt b/checkpoints/model_weights_000022708224.pt new file mode 100644 index 0000000000000000000000000000000000000000..76d8c4e8c5ecc55db1a357a4ec75eebfd6fd6fe8 --- /dev/null +++ b/checkpoints/model_weights_000022708224.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d95c65fb2de62b8504140e611ea195964d9c56912941acffa7d6c19e8d265cb +size 225208789 diff --git a/checkpoints/model_weights_000024510464.pt b/checkpoints/model_weights_000024510464.pt new file mode 100644 index 0000000000000000000000000000000000000000..7595be4515e59284c2b29970dfa1550b48e2e920 --- /dev/null +++ b/checkpoints/model_weights_000024510464.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49af7a45cd7a4884e8786222d8daa155bf13b81093cf1feb8a72fb7f15e6b66e +size 225208789 diff --git a/checkpoints/model_weights_000026476544.pt b/checkpoints/model_weights_000026476544.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d6ab802c09d1af9f3f70de88e156cf7f59e3212 --- /dev/null +++ b/checkpoints/model_weights_000026476544.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c3ea6cec1ac45cb3f9cb0cc7d17a749e120206d385e41bb46649fef01735fd1 +size 225208789 diff --git a/checkpoints/model_weights_000028606464.pt b/checkpoints/model_weights_000028606464.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f9e14638bf2d72affcdd0d573497c560bfd9114 --- /dev/null +++ b/checkpoints/model_weights_000028606464.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fc2b0b88bdca6daa12ecba1990283f35ea44b5c0e2d4563c5292c435cdab3e3 +size 225208789 diff --git a/checkpoints/model_weights_000030900224.pt b/checkpoints/model_weights_000030900224.pt new file mode 100644 index 0000000000000000000000000000000000000000..2faf9bcfb8bcf953d68437440a2f0dea4b4f8df3 --- /dev/null +++ b/checkpoints/model_weights_000030900224.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5736d1878a3cbe9b044067c30099e866102af154dcbd8975488067f37df2a85d +size 225208789 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a983bce9875eda59f17ec07085fa1b15628d17f3 --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f84b9466dc92f979f967a44f195256edabffbaf49baf8d5b3262c6362d1e0ed2 +size 225208789 diff --git a/checkpoints/model_weights_000033357824.pt b/checkpoints/model_weights_000033357824.pt new file mode 100644 index 0000000000000000000000000000000000000000..322948dffee04a02ffe4909383b5f49907ffa2fb --- /dev/null +++ b/checkpoints/model_weights_000033357824.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bdc81942e6131fa80e5dd555c3f99ceb384833bb7dd356f4b229e0f29944cf7 +size 225208789 diff --git a/checkpoints/model_weights_000036044800.pt b/checkpoints/model_weights_000036044800.pt new file mode 100644 index 0000000000000000000000000000000000000000..6301f32a7cbbfad7ccc1995728cbddecc4b7530a --- /dev/null +++ b/checkpoints/model_weights_000036044800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c2947258748ca51c2a57acc5b292506f70d16fc322b0ba01b2c40858c9a1bc +size 225208789 diff --git a/checkpoints/model_weights_000038928384.pt b/checkpoints/model_weights_000038928384.pt new file mode 100644 index 0000000000000000000000000000000000000000..40ab7e6a6d8a8042cc2a87392eeee4177c149e29 --- /dev/null +++ b/checkpoints/model_weights_000038928384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:578e96f276d3983b0a8fe27435526937a0b7ddc0d457e7c1e80a14146f30f7a8 +size 225208789 diff --git a/checkpoints/model_weights_000042041344.pt b/checkpoints/model_weights_000042041344.pt new file mode 100644 index 0000000000000000000000000000000000000000..9172994ae9f26d965e4dd8b8dee484fd5e2bee1e --- /dev/null +++ b/checkpoints/model_weights_000042041344.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c556a7fda925c2e77bc0749bcc1e8b1f13936d3359d0875f46889703f227807 +size 225208789 diff --git a/checkpoints/model_weights_000045416448.pt b/checkpoints/model_weights_000045416448.pt new file mode 100644 index 0000000000000000000000000000000000000000..94ef2de4da8929485f1e6534ca7996a90a265f04 --- /dev/null +++ b/checkpoints/model_weights_000045416448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38332d13055555a4f1db0770354b847157047205e9545d9ee89cec08fc42543a +size 225208789 diff --git a/checkpoints/model_weights_000049053696.pt b/checkpoints/model_weights_000049053696.pt new file mode 100644 index 0000000000000000000000000000000000000000..4932616a0832c72ba87faa59d7ce965fb4142e91 --- /dev/null +++ b/checkpoints/model_weights_000049053696.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db6206db3d05c6dda9a94b87f7ee86f7f29405497115c8ec3d5cb793539573c +size 225208789 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..55cc3dbce0566d471bb35500f6f5229c3b650ce2 --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab15d5895687edb3dd1f79f7d8919bfbc4a76af3578151648876349a6db07e02 +size 225208789 diff --git a/checkpoints/model_weights_000052936704.pt b/checkpoints/model_weights_000052936704.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ba7aa9efc249bd99ae184d3f19d31312f02d74a --- /dev/null +++ b/checkpoints/model_weights_000052936704.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9143cf88e2653198a0919fbfb19b1af45c722422c8cd812638cab6ee82e7604 +size 225208789 diff --git a/checkpoints/model_weights_000057196544.pt b/checkpoints/model_weights_000057196544.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c42092b514ed129959b06765441ebbd2538a4e1 --- /dev/null +++ b/checkpoints/model_weights_000057196544.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e11da8413e37c12d1afd831d1877bdbd98a5832a29ba8c599d65e30ce5e96b8c +size 225208789 diff --git a/checkpoints/model_weights_000061751296.pt b/checkpoints/model_weights_000061751296.pt new file mode 100644 index 0000000000000000000000000000000000000000..96281a599906c2aa8af51140d5f5d61d72003b58 --- /dev/null +++ b/checkpoints/model_weights_000061751296.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c970cc8db46f2648a267c5d38fce006a0cee5152c5db0700511434965947b7b +size 225208789 diff --git a/checkpoints/model_weights_000065519616.pt b/checkpoints/model_weights_000065519616.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1b103e1bbe018f6024921d2330139f70f3acf70 --- /dev/null +++ b/checkpoints/model_weights_000065519616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c17fa57f3299ca7365961cd7ef46dac25ac0b56c0007e015c7dbdba6b832975 +size 225208789 diff --git a/checkpoints/model_weights_000066699264.pt b/checkpoints/model_weights_000066699264.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbee451fb53da1beca4eacd708190682a732788f --- /dev/null +++ b/checkpoints/model_weights_000066699264.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f553705fbc3fbb386f34ba3d3b37927df30c8775c14fb581c2399715a5566176 +size 225208789 diff --git a/checkpoints/model_weights_000072040448.pt b/checkpoints/model_weights_000072040448.pt new file mode 100644 index 0000000000000000000000000000000000000000..c271172e22ed9b6a803bbb261346370a5f466eaf --- /dev/null +++ b/checkpoints/model_weights_000072040448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8abdf6a7ed2ed2f631c8f7e041f48c466a4ff84a8058a32d3317daf0647ff5 +size 225208789 diff --git a/checkpoints/model_weights_000077807616.pt b/checkpoints/model_weights_000077807616.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0a05d3b11c35b30f03ac0bcf9581acc0dde2283 --- /dev/null +++ b/checkpoints/model_weights_000077807616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab308e55788cde845dc5ced07ec3bb8fd6b5f840f26c4fa2b2c1ed46d935a82 +size 225208789 diff --git a/checkpoints/model_weights_000081903616.pt b/checkpoints/model_weights_000081903616.pt new file mode 100644 index 0000000000000000000000000000000000000000..d316e29899090c2412d2f17a66a0ef8e5fe57117 --- /dev/null +++ b/checkpoints/model_weights_000081903616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857160c1e64dd574bf1abdf795398facc98c23a62999778780cdca803d212469 +size 225208789 diff --git a/checkpoints/model_weights_000084033536.pt b/checkpoints/model_weights_000084033536.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d0e01f5867fad31e92cb6ad43395415f927b881 --- /dev/null +++ b/checkpoints/model_weights_000084033536.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b07aafc6db62edfa6c087f208496d9edc2ad842d0a9fdec2d9e466a70c99edbb +size 225208789 diff --git a/checkpoints/model_weights_000090783744.pt b/checkpoints/model_weights_000090783744.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff17416b31a4833ebbd973826c9753a37b2596e5 --- /dev/null +++ b/checkpoints/model_weights_000090783744.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1672640554a664c2ada5bca190d751fb4f4c1845bbd129c8118635629279c697 +size 225208789 diff --git a/checkpoints/model_weights_000098025472.pt b/checkpoints/model_weights_000098025472.pt new file mode 100644 index 0000000000000000000000000000000000000000..160b2c74ca3280baba3f961236b1cee27bc77a61 --- /dev/null +++ b/checkpoints/model_weights_000098025472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5326bd0a23723abd11b086b5c5cd8586b9364324a5e1176267af1d5f4a6a91ab +size 225208789 diff --git a/checkpoints/model_weights_000098287616.pt b/checkpoints/model_weights_000098287616.pt new file mode 100644 index 0000000000000000000000000000000000000000..eeb3f831e24805e53d6bac5c2077f5333d86f3fd --- /dev/null +++ b/checkpoints/model_weights_000098287616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2211781f77df65cc4074468b7b7a27f271d84b3c5a65370c21323f76b2236d3 +size 225208789 diff --git a/checkpoints/model_weights_000105873408.pt b/checkpoints/model_weights_000105873408.pt new file mode 100644 index 0000000000000000000000000000000000000000..9511d564482ce4702feb51fb8d4275d43f264a3c --- /dev/null +++ b/checkpoints/model_weights_000105873408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dc0af3bc2c86ca0998eff3eb91ba4c638a9d845f3cbe8dbd5b3c3ca66d1d8f0 +size 225208789 diff --git a/checkpoints/model_weights_000114327552.pt b/checkpoints/model_weights_000114327552.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf184a9975db0d696758aab7789ffc590757a6b0 --- /dev/null +++ b/checkpoints/model_weights_000114327552.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0dfdf60588f2dba3052cd687c7abe61baab27e68c4689600a56a4f5124efff +size 225208789 diff --git a/checkpoints/model_weights_000114655232.pt b/checkpoints/model_weights_000114655232.pt new file mode 100644 index 0000000000000000000000000000000000000000..e047c4c0d5ed8c9f53f6a21019eb2e6f7dc6fc83 --- /dev/null +++ b/checkpoints/model_weights_000114655232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf9df84a4672cc2c23851cf0f19857129fa073b3a538e2204489aaa5f67961cc +size 225208789 diff --git a/checkpoints/model_weights_000123502592.pt b/checkpoints/model_weights_000123502592.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb564e8e81ea50c7b87be5e0c0aa722d99984581 --- /dev/null +++ b/checkpoints/model_weights_000123502592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b77c45fe17081b8bfa6bcbd333623d8ba97f7bd31954976f3747a7e6fab8cb2 +size 225208789 diff --git a/checkpoints/model_weights_000131039232.pt b/checkpoints/model_weights_000131039232.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e0fca1054eee5c05299341ae5806076f568c939 --- /dev/null +++ b/checkpoints/model_weights_000131039232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2429ac49c00b536cb6b2abdd3cb83f6b491ba331de1556e7d4bc90359e49d427 +size 225208789 diff --git a/checkpoints/model_weights_000133365760.pt b/checkpoints/model_weights_000133365760.pt new file mode 100644 index 0000000000000000000000000000000000000000..c23a1674c566a18d149d03c19acd1c8537826e7a --- /dev/null +++ b/checkpoints/model_weights_000133365760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f9e0f2afef0b40ccf64eb612c1ccf76ee2fd764a70320134beda0142eaea61 +size 225208789 diff --git a/checkpoints/model_weights_000144048128.pt b/checkpoints/model_weights_000144048128.pt new file mode 100644 index 0000000000000000000000000000000000000000..f64bc4a9397b9953917b9c2b7ce23f8ea6abd431 --- /dev/null +++ b/checkpoints/model_weights_000144048128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15f6cb0c3d43d6bb0edae38a4a7214685c28b8c83632b1adb0f3e9e7e203709e +size 225208789 diff --git a/checkpoints/model_weights_000147423232.pt b/checkpoints/model_weights_000147423232.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec3964595d6bd1bbda4a5b221b9eb93bc6f13288 --- /dev/null +++ b/checkpoints/model_weights_000147423232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c76980dbfca4578b354b8069318797933598b0470192fda3b084a2fdc97d41 +size 225208789 diff --git a/checkpoints/model_weights_000155566080.pt b/checkpoints/model_weights_000155566080.pt new file mode 100644 index 0000000000000000000000000000000000000000..32bfe9c5ce60fc8c3a51089fb8cbed2db61f98e7 --- /dev/null +++ b/checkpoints/model_weights_000155566080.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:345f7c883f307d3cca4b673438e860c6b85c3d5244c907bb0084efb3c8c11467 +size 225208789 diff --git a/checkpoints/model_weights_000163790848.pt b/checkpoints/model_weights_000163790848.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbfea5666dab59378b01f84afcb10d539cb7f24f --- /dev/null +++ b/checkpoints/model_weights_000163790848.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27660ddddf0270c485b24ba28a23d158874f83020624dbca4ce7d1ff5ef1a01a +size 225208789 diff --git a/checkpoints/model_weights_000168017920.pt b/checkpoints/model_weights_000168017920.pt new file mode 100644 index 0000000000000000000000000000000000000000..54e6ec7d0a24eeb5e96be36755dcd4c32c0c0725 --- /dev/null +++ b/checkpoints/model_weights_000168017920.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba16c9ee95cfee356f7b3d94216eaf7c64fc4bbb875174dbb48cba6de6cfa59e +size 225208789 diff --git a/checkpoints/model_weights_000180174848.pt b/checkpoints/model_weights_000180174848.pt new file mode 100644 index 0000000000000000000000000000000000000000..d908269e29ca629e1594f703e5213eae65e671f8 --- /dev/null +++ b/checkpoints/model_weights_000180174848.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4e5138b4cb2546c34ea1bcd400d3cdae77886a15451cebf18b8e3d0288eb1a9 +size 225208789 diff --git a/checkpoints/model_weights_000181452800.pt b/checkpoints/model_weights_000181452800.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ac9c399894e9c1806a79fd7a4dfeac06effe345 --- /dev/null +++ b/checkpoints/model_weights_000181452800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:463afa9d527a7728cb553bde2f2fa1938b2402812d21043a2ae33bbde3afcee0 +size 225208789 diff --git a/checkpoints/model_weights_000195969024.pt b/checkpoints/model_weights_000195969024.pt new file mode 100644 index 0000000000000000000000000000000000000000..77de0b968c377cb0c574dc53add3bdc9d24cd62a --- /dev/null +++ b/checkpoints/model_weights_000195969024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904f1e9aa719818779a7e7fbaaf79444d8ca125d63dc51f4f0c3fe361d4ed145 +size 225208789 diff --git a/checkpoints/model_weights_000196558848.pt b/checkpoints/model_weights_000196558848.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ebb86c03607177da3cc7f40beeb422b5e216d90 --- /dev/null +++ b/checkpoints/model_weights_000196558848.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98304db9410dc86c9a91d1711e05879cf07aced42ee2d59204474ec4dc6a0998 +size 225208789 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..f4c56ba2c2e5172ed7ca1b52a2c44efc7c774c76 --- /dev/null +++ b/config.toml @@ -0,0 +1,32 @@ +model_name = "gelu_2l_50m_subset" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 48262 +dataset_name = "eoinf/c4-code-test-50m" +tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.08 +save_log_checkpoints = true \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbe67e8e442507cf3e89c05ef240d6684de8108a --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa6a2ada466803216f898bc8f0b57648d88c76745f5186ebeff3667c558ab706 +size 225208311 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b1d055ec1afdab04f2a6f3fa2d040075fcff51d1 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196558848, "config": {"model_name": "gelu_2l_50m_subset", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/c4-code-test-50m", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.028505287133009} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6c8a537542b63f642c0a90ba02d98ccee3d5083 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7438256fd14dc1532b819b68c73db8fb7fc7b7482d0042d848553c6278768395 +size 450422547 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..122ddb5e9329c0fcebd4b9d7df66890effe86817 --- /dev/null +++ b/run.sh @@ -0,0 +1,28 @@ + +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.train.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.train.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi \ No newline at end of file diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cfc6cf5ec5c7a6d30ed23caf0df0f146d4864b9f --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-08-19T12:19:27.936258015Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-19T12:19:28.205333845Z","level":"INFO","msg":"stream: created new stream","id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205424727Z","level":"INFO","msg":"stream: started","id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205427856Z","level":"INFO","msg":"writer: started","stream_id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205549372Z","level":"INFO","msg":"sender: started","stream_id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205576435Z","level":"INFO","msg":"handler: started","stream_id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.517899324Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-19T13:28:02.672144864Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-08-19T13:28:02.674610912Z","level":"INFO","msg":"stream: closing","id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.674631235Z","level":"INFO","msg":"handler: closed","stream_id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.674685151Z","level":"INFO","msg":"sender: closed","stream_id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.674698058Z","level":"INFO","msg":"stream: closed","id":"k06awv3c"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..74e30822fd74f3b58596102132adda40098c2c55 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,28 @@ +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Configure stats pid to 387 +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v7_50m_subset/wandb/settings +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v7_50m_subset/wandb/run-20250819_121927-k06awv3c/logs/debug.log +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v7_50m_subset/wandb/run-20250819_121927-k06awv3c/logs/debug-internal.log +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:init():830] calling init triggers +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_50m_subset', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/c4-code-test-50m', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:init():871] starting backend +2025-08-19 12:19:27,906 INFO MainThread:387 [wandb_init.py:init():874] sending inform_init request +2025-08-19 12:19:27,933 INFO MainThread:387 [wandb_init.py:init():882] backend started and connected +2025-08-19 12:19:27,934 INFO MainThread:387 [wandb_init.py:init():953] updated telemetry +2025-08-19 12:19:27,940 INFO MainThread:387 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-19 12:19:28,473 INFO MainThread:387 [wandb_init.py:init():1029] starting run threads in backend +2025-08-19 12:19:29,237 INFO MainThread:387 [wandb_run.py:_console_start():2494] atexit reg +2025-08-19 12:19:29,237 INFO MainThread:387 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-19 12:19:29,238 INFO MainThread:387 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-19 12:19:29,238 INFO MainThread:387 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-19 12:19:29,248 INFO MainThread:387 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-19 13:28:02,197 INFO MainThread:387 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/k06awv3c +2025-08-19 13:28:02,202 INFO MainThread:387 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-08-19 13:28:02,202 INFO MainThread:387 [wandb_run.py:_restore():2441] restore +2025-08-19 13:28:02,202 INFO MainThread:387 [wandb_run.py:_restore():2447] restore done +2025-08-19 13:28:02,673 INFO MainThread:387 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-08-19 13:28:02,674 INFO MainThread:387 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-08-19 13:28:02,674 INFO MainThread:387 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20250819_121927-k06awv3c/files/config.yaml b/wandb/run-20250819_121927-k06awv3c/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17c72f8b981b9f0d8bb5de5918b67c91f067f405 --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/files/config.yaml @@ -0,0 +1,129 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + cqhawoa7kp3h9928du7m9jrbdvr3k0xq: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "119662645248" + email: efarrel4@tcd.ie + executable: /notebooks/clean_env/bin/python + git: + commit: c3cfb768d471036c37848ff2c6d223b68ad88e82 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-5d4a8d59-78f7-4f92-75ec-db7c23705ed5 + host: nrsym6j40o + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/models/gelu_2l_v7_50m_subset + startedAt: "2025-08-19T12:19:27.429840Z" + writerId: cqhawoa7kp3h9928du7m9jrbdvr3k0xq + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 49 + - 51 + "2": + - 1 + - 49 + - 51 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.1 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.08 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 48262 +dataset_name: + value: eoinf/c4-code-test-50m +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: gelu_2l_50m_subset +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: NeelNanda/gpt-neox-tokenizer-digits +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20250819_121927-k06awv3c/files/output.log b/wandb/run-20250819_121927-k06awv3c/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..c2681f97b16669e21d33f82c0bd7a9c63b285564 --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.7933 | Learning Rate: 0.000055 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.5563 | Learning Rate: 0.000109 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 10.1902 | Learning Rate: 0.000164 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.7077 | Learning Rate: 0.000219 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 9.2018 | Learning Rate: 0.000273 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.7332 | Learning Rate: 0.000328 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 8.3194 | Learning Rate: 0.000383 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.9567 | Learning Rate: 0.000437 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.6436 | Learning Rate: 0.000492 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 7.3802 | Learning Rate: 0.000546 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 7.1655 | Learning Rate: 0.000601 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.9792 | Learning Rate: 0.000656 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.8274 | Learning Rate: 0.000710 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.6926 | Learning Rate: 0.000765 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 6.5822 | Learning Rate: 0.000820 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 6.4867 | Learning Rate: 0.000874 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 6.4087 | Learning Rate: 0.000929 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 6.3379 | Learning Rate: 0.000984 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 6.2717 | Learning Rate: 0.001038 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 6.2111 | Learning Rate: 0.001093 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 6.1570 | Learning Rate: 0.001148 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 6.1155 | Learning Rate: 0.001202 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 6.0602 | Learning Rate: 0.001257 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 6.0240 | Learning Rate: 0.001311 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.9864 | Learning Rate: 0.001366 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.9499 | Learning Rate: 0.001421 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.9158 | Learning Rate: 0.001475 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.8866 | Learning Rate: 0.001530 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.8651 | Learning Rate: 0.001585 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.8452 | Learning Rate: 0.001639 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 5.8207 | Learning Rate: 0.001694 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 5.7974 | Learning Rate: 0.001749 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 5.7640 | Learning Rate: 0.001803 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 5.7452 | Learning Rate: 0.001858 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 5.7172 | Learning Rate: 0.001913 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 5.6882 | Learning Rate: 0.001967 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 5.6575 | Learning Rate: 0.002000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 5.6264 | Learning Rate: 0.002000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 5.5938 | Learning Rate: 0.002000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 5.5605 | Learning Rate: 0.002000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 5.5288 | Learning Rate: 0.002000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 5.4973 | Learning Rate: 0.002000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 5.4699 | Learning Rate: 0.002000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 5.4539 | Learning Rate: 0.002000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 5.4213 | Learning Rate: 0.002000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 5.4024 | Learning Rate: 0.002000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 5.3795 | Learning Rate: 0.002000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 5.3501 | Learning Rate: 0.002000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 5.3304 | Learning Rate: 0.002000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 5.3079 | Learning Rate: 0.002000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 5.2877 | Learning Rate: 0.002000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 5.2602 | Learning Rate: 0.002000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 5.2393 | Learning Rate: 0.002000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 5.2261 | Learning Rate: 0.002000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 5.2075 | Learning Rate: 0.002000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 5.1917 | Learning Rate: 0.002000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 5.1755 | Learning Rate: 0.002000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 5.1673 | Learning Rate: 0.002000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 5.1499 | Learning Rate: 0.002000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 5.1428 | Learning Rate: 0.002000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 5.1287 | Learning Rate: 0.002000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 5.1116 | Learning Rate: 0.002000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,593,216 | Train Loss EWMA: 5.0761 | Learning Rate: 0.002000 | Progress: 0.25797 +Step 1,600 | Tokens: 52,412,416 | Train Loss EWMA: 5.0497 | Learning Rate: 0.002000 | Progress: 0.26206 +Step 1,625 | Tokens: 53,231,616 | Train Loss EWMA: 5.0160 | Learning Rate: 0.002000 | Progress: 0.26616 +Step 1,650 | Tokens: 54,050,816 | Train Loss EWMA: 4.9916 | Learning Rate: 0.002000 | Progress: 0.27025 +Step 1,675 | Tokens: 54,870,016 | Train Loss EWMA: 4.9738 | Learning Rate: 0.002000 | Progress: 0.27435 +Step 1,700 | Tokens: 55,689,216 | Train Loss EWMA: 4.9610 | Learning Rate: 0.002000 | Progress: 0.27845 +Step 1,725 | Tokens: 56,508,416 | Train Loss EWMA: 4.9461 | Learning Rate: 0.002000 | Progress: 0.28254 +Step 1,750 | Tokens: 57,327,616 | Train Loss EWMA: 4.9367 | Learning Rate: 0.002000 | Progress: 0.28664 +Step 1,775 | Tokens: 58,146,816 | Train Loss EWMA: 4.9308 | Learning Rate: 0.002000 | Progress: 0.29073 +Step 1,800 | Tokens: 58,966,016 | Train Loss EWMA: 4.9134 | Learning Rate: 0.002000 | Progress: 0.29483 +Step 1,825 | Tokens: 59,785,216 | Train Loss EWMA: 4.9011 | Learning Rate: 0.002000 | Progress: 0.29893 +Step 1,850 | Tokens: 60,604,416 | Train Loss EWMA: 4.8815 | Learning Rate: 0.002000 | Progress: 0.30302 +Step 1,875 | Tokens: 61,423,616 | Train Loss EWMA: 4.8754 | Learning Rate: 0.002000 | Progress: 0.30712 +Step 1,900 | Tokens: 62,242,816 | Train Loss EWMA: 4.8664 | Learning Rate: 0.002000 | Progress: 0.31121 +Step 1,925 | Tokens: 63,062,016 | Train Loss EWMA: 4.8573 | Learning Rate: 0.002000 | Progress: 0.31531 +Step 1,950 | Tokens: 63,881,216 | Train Loss EWMA: 4.8343 | Learning Rate: 0.002000 | Progress: 0.31941 +Step 1,975 | Tokens: 64,700,416 | Train Loss EWMA: 4.8217 | Learning Rate: 0.002000 | Progress: 0.32350 +Step 2,000 | Tokens: 65,519,616 | Train Loss EWMA: 4.8116 | Learning Rate: 0.002000 | Progress: 0.32760 +Step 2,025 | Tokens: 66,338,816 | Train Loss EWMA: 4.8019 | Learning Rate: 0.002000 | Progress: 0.33169 +Step 2,050 | Tokens: 67,158,016 | Train Loss EWMA: 4.7890 | Learning Rate: 0.002000 | Progress: 0.33579 +Step 2,075 | Tokens: 67,977,216 | Train Loss EWMA: 4.7743 | Learning Rate: 0.002000 | Progress: 0.33989 +Step 2,100 | Tokens: 68,796,416 | Train Loss EWMA: 4.7587 | Learning Rate: 0.002000 | Progress: 0.34398 +Step 2,125 | Tokens: 69,615,616 | Train Loss EWMA: 4.7450 | Learning Rate: 0.002000 | Progress: 0.34808 +Step 2,150 | Tokens: 70,434,816 | Train Loss EWMA: 4.7247 | Learning Rate: 0.002000 | Progress: 0.35217 +Step 2,175 | Tokens: 71,254,016 | Train Loss EWMA: 4.7182 | Learning Rate: 0.002000 | Progress: 0.35627 +Step 2,200 | Tokens: 72,073,216 | Train Loss EWMA: 4.7026 | Learning Rate: 0.002000 | Progress: 0.36037 +Step 2,225 | Tokens: 72,892,416 | Train Loss EWMA: 4.6963 | Learning Rate: 0.002000 | Progress: 0.36446 +Step 2,250 | Tokens: 73,711,616 | Train Loss EWMA: 4.6749 | Learning Rate: 0.002000 | Progress: 0.36856 +Step 2,275 | Tokens: 74,530,816 | Train Loss EWMA: 4.6565 | Learning Rate: 0.002000 | Progress: 0.37265 +Step 2,300 | Tokens: 75,350,016 | Train Loss EWMA: 4.6412 | Learning Rate: 0.002000 | Progress: 0.37675 +Step 2,325 | Tokens: 76,169,216 | Train Loss EWMA: 4.6286 | Learning Rate: 0.002000 | Progress: 0.38085 +Step 2,350 | Tokens: 76,988,416 | Train Loss EWMA: 4.6244 | Learning Rate: 0.002000 | Progress: 0.38494 +Step 2,375 | Tokens: 77,807,616 | Train Loss EWMA: 4.6162 | Learning Rate: 0.002000 | Progress: 0.38904 +Step 2,400 | Tokens: 78,626,816 | Train Loss EWMA: 4.6097 | Learning Rate: 0.002000 | Progress: 0.39313 +Step 2,425 | Tokens: 79,446,016 | Train Loss EWMA: 4.6104 | Learning Rate: 0.002000 | Progress: 0.39723 +Step 2,450 | Tokens: 80,265,216 | Train Loss EWMA: 4.5924 | Learning Rate: 0.002000 | Progress: 0.40133 +Step 2,475 | Tokens: 81,084,416 | Train Loss EWMA: 4.5824 | Learning Rate: 0.002000 | Progress: 0.40542 +Step 2,500 | Tokens: 81,903,616 | Train Loss EWMA: 4.5633 | Learning Rate: 0.002000 | Progress: 0.40952 +Step 2,525 | Tokens: 82,722,816 | Train Loss EWMA: 4.5513 | Learning Rate: 0.002000 | Progress: 0.41361 +Step 2,550 | Tokens: 83,542,016 | Train Loss EWMA: 4.5504 | Learning Rate: 0.002000 | Progress: 0.41771 +Step 2,575 | Tokens: 84,361,216 | Train Loss EWMA: 4.5358 | Learning Rate: 0.002000 | Progress: 0.42181 +Step 2,600 | Tokens: 85,180,416 | Train Loss EWMA: 4.5250 | Learning Rate: 0.002000 | Progress: 0.42590 +Step 2,625 | Tokens: 85,999,616 | Train Loss EWMA: 4.5129 | Learning Rate: 0.002000 | Progress: 0.43000 +Step 2,650 | Tokens: 86,818,816 | Train Loss EWMA: 4.5125 | Learning Rate: 0.002000 | Progress: 0.43409 +Step 2,675 | Tokens: 87,638,016 | Train Loss EWMA: 4.5060 | Learning Rate: 0.002000 | Progress: 0.43819 +Step 2,700 | Tokens: 88,457,216 | Train Loss EWMA: 4.4996 | Learning Rate: 0.002000 | Progress: 0.44229 +Step 2,725 | Tokens: 89,276,416 | Train Loss EWMA: 4.4719 | Learning Rate: 0.002000 | Progress: 0.44638 +Step 2,750 | Tokens: 90,095,616 | Train Loss EWMA: 4.4652 | Learning Rate: 0.002000 | Progress: 0.45048 +Step 2,775 | Tokens: 90,914,816 | Train Loss EWMA: 4.4579 | Learning Rate: 0.002000 | Progress: 0.45457 +Step 2,800 | Tokens: 91,734,016 | Train Loss EWMA: 4.4543 | Learning Rate: 0.002000 | Progress: 0.45867 +Step 2,825 | Tokens: 92,553,216 | Train Loss EWMA: 4.4543 | Learning Rate: 0.002000 | Progress: 0.46277 +Step 2,850 | Tokens: 93,372,416 | Train Loss EWMA: 4.4531 | Learning Rate: 0.002000 | Progress: 0.46686 +Step 2,875 | Tokens: 94,191,616 | Train Loss EWMA: 4.4413 | Learning Rate: 0.002000 | Progress: 0.47096 +Step 2,900 | Tokens: 95,010,816 | Train Loss EWMA: 4.4201 | Learning Rate: 0.002000 | Progress: 0.47505 +Step 2,925 | Tokens: 95,830,016 | Train Loss EWMA: 4.4114 | Learning Rate: 0.002000 | Progress: 0.47915 +Step 2,950 | Tokens: 96,649,216 | Train Loss EWMA: 4.3998 | Learning Rate: 0.002000 | Progress: 0.48325 +Step 2,975 | Tokens: 97,468,416 | Train Loss EWMA: 4.3962 | Learning Rate: 0.002000 | Progress: 0.48734 +Step 3,000 | Tokens: 98,287,616 | Train Loss EWMA: 4.3921 | Learning Rate: 0.002000 | Progress: 0.49144 +Step 3,025 | Tokens: 99,106,816 | Train Loss EWMA: 4.3855 | Learning Rate: 0.002000 | Progress: 0.49553 +Step 3,050 | Tokens: 99,926,016 | Train Loss EWMA: 4.3885 | Learning Rate: 0.002000 | Progress: 0.49963 +Step 3,075 | Tokens: 100,745,216 | Train Loss EWMA: 4.3941 | Learning Rate: 0.002000 | Progress: 0.50373 +Step 3,100 | Tokens: 101,564,416 | Train Loss EWMA: 4.4004 | Learning Rate: 0.002000 | Progress: 0.50782 +Step 3,125 | Tokens: 102,383,616 | Train Loss EWMA: 4.3980 | Learning Rate: 0.002000 | Progress: 0.51192 +Step 3,150 | Tokens: 103,186,432 | Train Loss EWMA: 4.3617 | Learning Rate: 0.002000 | Progress: 0.51593 +Step 3,175 | Tokens: 104,005,632 | Train Loss EWMA: 4.3252 | Learning Rate: 0.002000 | Progress: 0.52003 +Step 3,200 | Tokens: 104,824,832 | Train Loss EWMA: 4.3003 | Learning Rate: 0.002000 | Progress: 0.52412 +Step 3,225 | Tokens: 105,644,032 | Train Loss EWMA: 4.2891 | Learning Rate: 0.002000 | Progress: 0.52822 +Step 3,250 | Tokens: 106,463,232 | Train Loss EWMA: 4.2697 | Learning Rate: 0.002000 | Progress: 0.53232 +Step 3,275 | Tokens: 107,282,432 | Train Loss EWMA: 4.2568 | Learning Rate: 0.002000 | Progress: 0.53641 +Step 3,300 | Tokens: 108,101,632 | Train Loss EWMA: 4.2569 | Learning Rate: 0.002000 | Progress: 0.54051 +Step 3,325 | Tokens: 108,920,832 | Train Loss EWMA: 4.2526 | Learning Rate: 0.002000 | Progress: 0.54460 +Step 3,350 | Tokens: 109,740,032 | Train Loss EWMA: 4.2526 | Learning Rate: 0.002000 | Progress: 0.54870 +Step 3,375 | Tokens: 110,559,232 | Train Loss EWMA: 4.2430 | Learning Rate: 0.002000 | Progress: 0.55280 +Step 3,400 | Tokens: 111,378,432 | Train Loss EWMA: 4.2321 | Learning Rate: 0.002000 | Progress: 0.55689 +Step 3,425 | Tokens: 112,197,632 | Train Loss EWMA: 4.2356 | Learning Rate: 0.002000 | Progress: 0.56099 +Step 3,450 | Tokens: 113,016,832 | Train Loss EWMA: 4.2131 | Learning Rate: 0.002000 | Progress: 0.56508 +Step 3,475 | Tokens: 113,836,032 | Train Loss EWMA: 4.2179 | Learning Rate: 0.002000 | Progress: 0.56918 +Step 3,500 | Tokens: 114,655,232 | Train Loss EWMA: 4.2094 | Learning Rate: 0.002000 | Progress: 0.57328 +Step 3,525 | Tokens: 115,474,432 | Train Loss EWMA: 4.2151 | Learning Rate: 0.002000 | Progress: 0.57737 +Step 3,550 | Tokens: 116,293,632 | Train Loss EWMA: 4.2146 | Learning Rate: 0.002000 | Progress: 0.58147 +Step 3,575 | Tokens: 117,112,832 | Train Loss EWMA: 4.2153 | Learning Rate: 0.002000 | Progress: 0.58556 +Step 3,600 | Tokens: 117,932,032 | Train Loss EWMA: 4.2166 | Learning Rate: 0.002000 | Progress: 0.58966 +Step 3,625 | Tokens: 118,751,232 | Train Loss EWMA: 4.2182 | Learning Rate: 0.002000 | Progress: 0.59376 +Step 3,650 | Tokens: 119,570,432 | Train Loss EWMA: 4.2133 | Learning Rate: 0.002000 | Progress: 0.59785 +Step 3,675 | Tokens: 120,389,632 | Train Loss EWMA: 4.2047 | Learning Rate: 0.002000 | Progress: 0.60195 +Step 3,700 | Tokens: 121,208,832 | Train Loss EWMA: 4.2080 | Learning Rate: 0.002000 | Progress: 0.60604 +Step 3,725 | Tokens: 122,028,032 | Train Loss EWMA: 4.2117 | Learning Rate: 0.002000 | Progress: 0.61014 +Step 3,750 | Tokens: 122,847,232 | Train Loss EWMA: 4.2062 | Learning Rate: 0.002000 | Progress: 0.61424 +Step 3,775 | Tokens: 123,666,432 | Train Loss EWMA: 4.1999 | Learning Rate: 0.002000 | Progress: 0.61833 +Step 3,800 | Tokens: 124,485,632 | Train Loss EWMA: 4.2144 | Learning Rate: 0.002000 | Progress: 0.62243 +Step 3,825 | Tokens: 125,304,832 | Train Loss EWMA: 4.2127 | Learning Rate: 0.002000 | Progress: 0.62652 +Step 3,850 | Tokens: 126,124,032 | Train Loss EWMA: 4.2041 | Learning Rate: 0.002000 | Progress: 0.63062 +Step 3,875 | Tokens: 126,943,232 | Train Loss EWMA: 4.1981 | Learning Rate: 0.002000 | Progress: 0.63472 +Step 3,900 | Tokens: 127,762,432 | Train Loss EWMA: 4.1990 | Learning Rate: 0.002000 | Progress: 0.63881 +Step 3,925 | Tokens: 128,581,632 | Train Loss EWMA: 4.1907 | Learning Rate: 0.002000 | Progress: 0.64291 +Step 3,950 | Tokens: 129,400,832 | Train Loss EWMA: 4.1937 | Learning Rate: 0.002000 | Progress: 0.64700 +Step 3,975 | Tokens: 130,220,032 | Train Loss EWMA: 4.1888 | Learning Rate: 0.002000 | Progress: 0.65110 +Step 4,000 | Tokens: 131,039,232 | Train Loss EWMA: 4.1736 | Learning Rate: 0.002000 | Progress: 0.65520 +Step 4,025 | Tokens: 131,858,432 | Train Loss EWMA: 4.1622 | Learning Rate: 0.002000 | Progress: 0.65929 +Step 4,050 | Tokens: 132,677,632 | Train Loss EWMA: 4.1774 | Learning Rate: 0.002000 | Progress: 0.66339 +Step 4,075 | Tokens: 133,496,832 | Train Loss EWMA: 4.1773 | Learning Rate: 0.002000 | Progress: 0.66748 +Step 4,100 | Tokens: 134,316,032 | Train Loss EWMA: 4.1889 | Learning Rate: 0.002000 | Progress: 0.67158 +Step 4,125 | Tokens: 135,135,232 | Train Loss EWMA: 4.1927 | Learning Rate: 0.002000 | Progress: 0.67568 +Step 4,150 | Tokens: 135,954,432 | Train Loss EWMA: 4.1914 | Learning Rate: 0.002000 | Progress: 0.67977 +Step 4,175 | Tokens: 136,773,632 | Train Loss EWMA: 4.1836 | Learning Rate: 0.002000 | Progress: 0.68387 +Step 4,200 | Tokens: 137,592,832 | Train Loss EWMA: 4.1833 | Learning Rate: 0.002000 | Progress: 0.68796 +Step 4,225 | Tokens: 138,412,032 | Train Loss EWMA: 4.1896 | Learning Rate: 0.002000 | Progress: 0.69206 +Step 4,250 | Tokens: 139,231,232 | Train Loss EWMA: 4.1991 | Learning Rate: 0.002000 | Progress: 0.69616 +Step 4,275 | Tokens: 140,050,432 | Train Loss EWMA: 4.2021 | Learning Rate: 0.002000 | Progress: 0.70025 +Step 4,300 | Tokens: 140,869,632 | Train Loss EWMA: 4.2049 | Learning Rate: 0.002000 | Progress: 0.70435 +Step 4,325 | Tokens: 141,688,832 | Train Loss EWMA: 4.1911 | Learning Rate: 0.002000 | Progress: 0.70844 +Step 4,350 | Tokens: 142,508,032 | Train Loss EWMA: 4.1847 | Learning Rate: 0.002000 | Progress: 0.71254 +Step 4,375 | Tokens: 143,327,232 | Train Loss EWMA: 4.1956 | Learning Rate: 0.002000 | Progress: 0.71664 +Step 4,400 | Tokens: 144,146,432 | Train Loss EWMA: 4.1868 | Learning Rate: 0.002000 | Progress: 0.72073 +Step 4,425 | Tokens: 144,965,632 | Train Loss EWMA: 4.1813 | Learning Rate: 0.002000 | Progress: 0.72483 +Step 4,450 | Tokens: 145,784,832 | Train Loss EWMA: 4.1629 | Learning Rate: 0.002000 | Progress: 0.72892 +Step 4,475 | Tokens: 146,604,032 | Train Loss EWMA: 4.1679 | Learning Rate: 0.002000 | Progress: 0.73302 +Step 4,500 | Tokens: 147,423,232 | Train Loss EWMA: 4.1594 | Learning Rate: 0.002000 | Progress: 0.73712 +Step 4,525 | Tokens: 148,242,432 | Train Loss EWMA: 4.1608 | Learning Rate: 0.002000 | Progress: 0.74121 +Step 4,550 | Tokens: 149,061,632 | Train Loss EWMA: 4.1610 | Learning Rate: 0.002000 | Progress: 0.74531 +Step 4,575 | Tokens: 149,880,832 | Train Loss EWMA: 4.1595 | Learning Rate: 0.002000 | Progress: 0.74940 +Step 4,600 | Tokens: 150,700,032 | Train Loss EWMA: 4.1735 | Learning Rate: 0.002000 | Progress: 0.75350 +Step 4,625 | Tokens: 151,519,232 | Train Loss EWMA: 4.1597 | Learning Rate: 0.002000 | Progress: 0.75760 +Step 4,650 | Tokens: 152,338,432 | Train Loss EWMA: 4.1679 | Learning Rate: 0.002000 | Progress: 0.76169 +Step 4,675 | Tokens: 153,157,632 | Train Loss EWMA: 4.1598 | Learning Rate: 0.002000 | Progress: 0.76579 +Step 4,700 | Tokens: 153,960,448 | Train Loss EWMA: 4.1467 | Learning Rate: 0.002000 | Progress: 0.76980 +Step 4,725 | Tokens: 154,779,648 | Train Loss EWMA: 4.1138 | Learning Rate: 0.002000 | Progress: 0.77390 +Step 4,750 | Tokens: 155,598,848 | Train Loss EWMA: 4.0853 | Learning Rate: 0.002000 | Progress: 0.77799 +Step 4,775 | Tokens: 156,418,048 | Train Loss EWMA: 4.0643 | Learning Rate: 0.002000 | Progress: 0.78209 +Step 4,800 | Tokens: 157,237,248 | Train Loss EWMA: 4.0509 | Learning Rate: 0.002000 | Progress: 0.78619 +Step 4,825 | Tokens: 158,056,448 | Train Loss EWMA: 4.0393 | Learning Rate: 0.002000 | Progress: 0.79028 +Step 4,850 | Tokens: 158,875,648 | Train Loss EWMA: 4.0379 | Learning Rate: 0.002000 | Progress: 0.79438 +Step 4,875 | Tokens: 159,694,848 | Train Loss EWMA: 4.0305 | Learning Rate: 0.002000 | Progress: 0.79847 +Step 4,900 | Tokens: 160,514,048 | Train Loss EWMA: 4.0342 | Learning Rate: 0.002000 | Progress: 0.80257 +Step 4,925 | Tokens: 161,333,248 | Train Loss EWMA: 4.0326 | Learning Rate: 0.002000 | Progress: 0.80667 +Step 4,950 | Tokens: 162,152,448 | Train Loss EWMA: 4.0291 | Learning Rate: 0.002000 | Progress: 0.81076 +Step 4,975 | Tokens: 162,971,648 | Train Loss EWMA: 4.0255 | Learning Rate: 0.002000 | Progress: 0.81486 +Step 5,000 | Tokens: 163,790,848 | Train Loss EWMA: 4.0248 | Learning Rate: 0.002000 | Progress: 0.81895 +Step 5,025 | Tokens: 164,610,048 | Train Loss EWMA: 4.0160 | Learning Rate: 0.002000 | Progress: 0.82305 +Step 5,050 | Tokens: 165,429,248 | Train Loss EWMA: 4.0062 | Learning Rate: 0.002000 | Progress: 0.82715 +Step 5,075 | Tokens: 166,248,448 | Train Loss EWMA: 4.0116 | Learning Rate: 0.002000 | Progress: 0.83124 +Step 5,100 | Tokens: 167,067,648 | Train Loss EWMA: 4.0200 | Learning Rate: 0.002000 | Progress: 0.83534 +Step 5,125 | Tokens: 167,886,848 | Train Loss EWMA: 4.0177 | Learning Rate: 0.002000 | Progress: 0.83943 +Step 5,150 | Tokens: 168,706,048 | Train Loss EWMA: 4.0063 | Learning Rate: 0.002000 | Progress: 0.84353 +Step 5,175 | Tokens: 169,525,248 | Train Loss EWMA: 4.0112 | Learning Rate: 0.002000 | Progress: 0.84763 +Step 5,200 | Tokens: 170,344,448 | Train Loss EWMA: 3.9973 | Learning Rate: 0.002000 | Progress: 0.85172 +Step 5,225 | Tokens: 171,163,648 | Train Loss EWMA: 4.0059 | Learning Rate: 0.002000 | Progress: 0.85582 +Step 5,250 | Tokens: 171,982,848 | Train Loss EWMA: 4.0223 | Learning Rate: 0.002000 | Progress: 0.85991 +Step 5,275 | Tokens: 172,802,048 | Train Loss EWMA: 4.0173 | Learning Rate: 0.002000 | Progress: 0.86401 +Step 5,300 | Tokens: 173,621,248 | Train Loss EWMA: 4.0117 | Learning Rate: 0.002000 | Progress: 0.86811 +Step 5,325 | Tokens: 174,440,448 | Train Loss EWMA: 4.0070 | Learning Rate: 0.002000 | Progress: 0.87220 +Step 5,350 | Tokens: 175,259,648 | Train Loss EWMA: 4.0067 | Learning Rate: 0.002000 | Progress: 0.87630 +Step 5,375 | Tokens: 176,078,848 | Train Loss EWMA: 4.0045 | Learning Rate: 0.002000 | Progress: 0.88039 +Step 5,400 | Tokens: 176,898,048 | Train Loss EWMA: 4.0142 | Learning Rate: 0.002000 | Progress: 0.88449 +Step 5,425 | Tokens: 177,717,248 | Train Loss EWMA: 4.0171 | Learning Rate: 0.002000 | Progress: 0.88859 +Step 5,450 | Tokens: 178,536,448 | Train Loss EWMA: 4.0104 | Learning Rate: 0.002000 | Progress: 0.89268 +Step 5,475 | Tokens: 179,355,648 | Train Loss EWMA: 4.0086 | Learning Rate: 0.002000 | Progress: 0.89678 +Step 5,500 | Tokens: 180,174,848 | Train Loss EWMA: 4.0163 | Learning Rate: 0.002000 | Progress: 0.90087 +Step 5,525 | Tokens: 180,994,048 | Train Loss EWMA: 4.0108 | Learning Rate: 0.002000 | Progress: 0.90497 +Step 5,550 | Tokens: 181,813,248 | Train Loss EWMA: 4.0078 | Learning Rate: 0.002000 | Progress: 0.90907 +Step 5,575 | Tokens: 182,632,448 | Train Loss EWMA: 4.0077 | Learning Rate: 0.002000 | Progress: 0.91316 +Step 5,600 | Tokens: 183,451,648 | Train Loss EWMA: 4.0185 | Learning Rate: 0.002000 | Progress: 0.91726 +Step 5,625 | Tokens: 184,270,848 | Train Loss EWMA: 4.0198 | Learning Rate: 0.002000 | Progress: 0.92135 +Step 5,650 | Tokens: 185,090,048 | Train Loss EWMA: 4.0244 | Learning Rate: 0.002000 | Progress: 0.92545 +Step 5,675 | Tokens: 185,909,248 | Train Loss EWMA: 4.0303 | Learning Rate: 0.002000 | Progress: 0.92955 +Step 5,700 | Tokens: 186,728,448 | Train Loss EWMA: 4.0284 | Learning Rate: 0.002000 | Progress: 0.93364 +Step 5,725 | Tokens: 187,547,648 | Train Loss EWMA: 4.0258 | Learning Rate: 0.002000 | Progress: 0.93774 +Step 5,750 | Tokens: 188,366,848 | Train Loss EWMA: 4.0126 | Learning Rate: 0.002000 | Progress: 0.94183 +Step 5,775 | Tokens: 189,186,048 | Train Loss EWMA: 4.0275 | Learning Rate: 0.002000 | Progress: 0.94593 +Step 5,800 | Tokens: 190,005,248 | Train Loss EWMA: 4.0319 | Learning Rate: 0.002000 | Progress: 0.95003 +Step 5,825 | Tokens: 190,824,448 | Train Loss EWMA: 4.0293 | Learning Rate: 0.002000 | Progress: 0.95412 +Step 5,850 | Tokens: 191,643,648 | Train Loss EWMA: 4.0336 | Learning Rate: 0.002000 | Progress: 0.95822 +Step 5,875 | Tokens: 192,462,848 | Train Loss EWMA: 4.0347 | Learning Rate: 0.002000 | Progress: 0.96231 +Step 5,900 | Tokens: 193,282,048 | Train Loss EWMA: 4.0333 | Learning Rate: 0.002000 | Progress: 0.96641 +Step 5,925 | Tokens: 194,101,248 | Train Loss EWMA: 4.0341 | Learning Rate: 0.002000 | Progress: 0.97051 +Step 5,950 | Tokens: 194,920,448 | Train Loss EWMA: 4.0360 | Learning Rate: 0.002000 | Progress: 0.97460 +Step 5,975 | Tokens: 195,739,648 | Train Loss EWMA: 4.0266 | Learning Rate: 0.002000 | Progress: 0.97870 +Step 6,000 | Tokens: 196,558,848 | Train Loss EWMA: 4.0285 | Learning Rate: 0.002000 | Progress: 0.98279 +Step 6,025 | Tokens: 197,378,048 | Train Loss EWMA: 4.0175 | Learning Rate: 0.002000 | Progress: 0.98689 +Step 6,050 | Tokens: 198,197,248 | Train Loss EWMA: 4.0173 | Learning Rate: 0.002000 | Progress: 0.99099 +Step 6,075 | Tokens: 199,016,448 | Train Loss EWMA: 4.0030 | Learning Rate: 0.002000 | Progress: 0.99508 +Step 6,100 | Tokens: 199,835,648 | Train Loss EWMA: 4.0009 | Learning Rate: 0.002000 | Progress: 0.99918 diff --git a/wandb/run-20250819_121927-k06awv3c/files/requirements.txt b/wandb/run-20250819_121927-k06awv3c/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbed86cc72e05aec5c78850f9963d0e3471caff0 --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/files/requirements.txt @@ -0,0 +1,185 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +multidict==6.6.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +pyzmq==27.0.1 +jsonschema==4.25.0 +asttokens==3.0.0 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +stack-data==0.6.3 +aiosignal==1.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +setuptools==65.5.0 +mpmath==1.3.0 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +huggingface-hub==0.34.4 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +jupyter-events==0.12.0 +protobuf==6.31.1 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +hf-xet==1.1.7 +jedi==0.19.2 +transformer-lens==2.16.1 +pandas==2.3.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +dotenv==0.9.9 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +transformers==4.55.0 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +narwhals==2.0.1 +torchaudio==2.8.0 +prompt_toolkit==3.0.51 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +httpx==0.28.1 +requests==2.32.4 +fonttools==4.59.0 +argon2-cffi==25.1.0 +executing==2.2.0 +arrow==1.3.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +pip==23.2.1 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +json5==0.12.0 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +sentry-sdk==2.34.1 +Send2Trash==1.8.3 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +nvidia-cufft-cu12==11.3.3.83 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +python-json-logger==3.3.0 +filelock==3.18.0 +types-python-dateutil==2.9.0.20250809 +kiwisolver==1.4.8 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pydantic==2.11.7 +ipython==9.4.0 +charset-normalizer==3.4.2 +fancy-einsum==0.0.3 +datasets==4.0.0 +pillow==11.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +plotly==6.2.0 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +idna==3.10 +jupyterlab==4.4.5 +multiprocess==0.70.16 +dill==0.3.8 +fastjsonschema==2.21.1 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +uri-template==1.3.0 +sentencepiece==0.2.0 +markdown-it-py==3.0.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +traitlets==5.14.3 +jupyter_server==2.16.0 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20250819_121927-k06awv3c/files/wandb-metadata.json b/wandb/run-20250819_121927-k06awv3c/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f93f4945b77f6338ddc552fc46b11c9130ca47b7 --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-08-19T12:19:27.429840Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "c3cfb768d471036c37848ff2c6d223b68ad88e82" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/models/gelu_2l_v7_50m_subset", + "host": "nrsym6j40o", + "executable": "/notebooks/clean_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "119662645248" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-5d4a8d59-78f7-4f92-75ec-db7c23705ed5" + } + ], + "cudaVersion": "12.4", + "writerId": "cqhawoa7kp3h9928du7m9jrbdvr3k0xq" +} \ No newline at end of file diff --git a/wandb/run-20250819_121927-k06awv3c/files/wandb-summary.json b/wandb/run-20250819_121927-k06awv3c/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..da43e02725c5ce9e3f0b925d84c5e9f3db67f052 --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/files/wandb-summary.json @@ -0,0 +1 @@ +{"_step":6100,"train_loss_ewma":4.000948251654192,"tokens_seen":199835648,"train_loss":3.9788458347320557,"progress":0.99917824,"_wandb":{"runtime":4113},"_runtime":4113.72819522,"step":6100,"learning_rate":0.002,"tokens_per_second":32759.942295081968,"_timestamp":1.7556100802339656e+09} \ No newline at end of file diff --git a/wandb/run-20250819_121927-k06awv3c/logs/debug-internal.log b/wandb/run-20250819_121927-k06awv3c/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cfc6cf5ec5c7a6d30ed23caf0df0f146d4864b9f --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-08-19T12:19:27.936258015Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-19T12:19:28.205333845Z","level":"INFO","msg":"stream: created new stream","id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205424727Z","level":"INFO","msg":"stream: started","id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205427856Z","level":"INFO","msg":"writer: started","stream_id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205549372Z","level":"INFO","msg":"sender: started","stream_id":"k06awv3c"} +{"time":"2025-08-19T12:19:28.205576435Z","level":"INFO","msg":"handler: started","stream_id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.517899324Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-19T13:28:02.672144864Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-08-19T13:28:02.674610912Z","level":"INFO","msg":"stream: closing","id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.674631235Z","level":"INFO","msg":"handler: closed","stream_id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.674685151Z","level":"INFO","msg":"sender: closed","stream_id":"k06awv3c"} +{"time":"2025-08-19T13:28:02.674698058Z","level":"INFO","msg":"stream: closed","id":"k06awv3c"} diff --git a/wandb/run-20250819_121927-k06awv3c/logs/debug.log b/wandb/run-20250819_121927-k06awv3c/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..74e30822fd74f3b58596102132adda40098c2c55 --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/logs/debug.log @@ -0,0 +1,28 @@ +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Configure stats pid to 387 +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v7_50m_subset/wandb/settings +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v7_50m_subset/wandb/run-20250819_121927-k06awv3c/logs/debug.log +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v7_50m_subset/wandb/run-20250819_121927-k06awv3c/logs/debug-internal.log +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:init():830] calling init triggers +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_50m_subset', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/c4-code-test-50m', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-08-19 12:19:27,434 INFO MainThread:387 [wandb_init.py:init():871] starting backend +2025-08-19 12:19:27,906 INFO MainThread:387 [wandb_init.py:init():874] sending inform_init request +2025-08-19 12:19:27,933 INFO MainThread:387 [wandb_init.py:init():882] backend started and connected +2025-08-19 12:19:27,934 INFO MainThread:387 [wandb_init.py:init():953] updated telemetry +2025-08-19 12:19:27,940 INFO MainThread:387 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-19 12:19:28,473 INFO MainThread:387 [wandb_init.py:init():1029] starting run threads in backend +2025-08-19 12:19:29,237 INFO MainThread:387 [wandb_run.py:_console_start():2494] atexit reg +2025-08-19 12:19:29,237 INFO MainThread:387 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-19 12:19:29,238 INFO MainThread:387 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-19 12:19:29,238 INFO MainThread:387 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-19 12:19:29,248 INFO MainThread:387 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-19 13:28:02,197 INFO MainThread:387 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/k06awv3c +2025-08-19 13:28:02,202 INFO MainThread:387 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-08-19 13:28:02,202 INFO MainThread:387 [wandb_run.py:_restore():2441] restore +2025-08-19 13:28:02,202 INFO MainThread:387 [wandb_run.py:_restore():2447] restore done +2025-08-19 13:28:02,673 INFO MainThread:387 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-08-19 13:28:02,674 INFO MainThread:387 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-08-19 13:28:02,674 INFO MainThread:387 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20250819_121927-k06awv3c/run-k06awv3c.wandb b/wandb/run-20250819_121927-k06awv3c/run-k06awv3c.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b525062f63326789d03820633e8878c79eaee29d --- /dev/null +++ b/wandb/run-20250819_121927-k06awv3c/run-k06awv3c.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:639c5bfd6d35e0e4f84711fdca222ce661afd2ac67d94ca4d26b8a8b7c9d5117 +size 3952779