diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ea4b84de47ef7cad73372faa39e7c1a7b4aed3a1 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250817_231334-ztcapltu/run-ztcapltu.wandb filter=lfs diff=lfs merge=lfs -text
diff --git a/.ipynb_checkpoints/config-checkpoint.toml b/.ipynb_checkpoints/config-checkpoint.toml
new file mode 100644
index 0000000000000000000000000000000000000000..c78f8b132c6da4593ac880d47ac2951898146e57
--- /dev/null
+++ b/.ipynb_checkpoints/config-checkpoint.toml
@@ -0,0 +1,32 @@
+model_name = "gelu_2l_v4"
+n_layers = 2
+d_model = 512
+d_mlp = 2048
+d_head = 64
+n_heads = 8
+attn_only = false
+layer_norm_eps = 1e-05
+init_range = 0.02
+n_ctx = 1024
+d_vocab = 48262
+dataset_name = "NeelNanda/c4-code-tokenized-2b"
+tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits"
+seed = 10
+device = "cuda"
+use_bfloat16_matmul = false
+batch_size_per_device = 32
+n_devices = 1
+batches_per_step = 1
+max_tokens = 22000000000
+lr_hidden = 0.002
+lr_vector = 0.001
+lr_schedule = "cosine_warmup"
+warmup_tokens = 30000000
+weight_decay = 0.05
+grad_norm_clip = 1.0
+train_loss_moving_average_beta = 0.99
+log_interval = 25
+save_checkpoints = true
+checkpoint_interval = 500
+checkpoint_interval_ratio = 1.06
+save_log_checkpoints = true
\ No newline at end of file
diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json
new file mode 100644
index 0000000000000000000000000000000000000000..7f2626722f55cb4d71ab8558e4cd027cd9279a57
--- /dev/null
+++ b/checkpoints/metadata_000000032768.json
@@ -0,0 +1 @@
+{"step": 1, "tokens_seen": 32768, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.872627258300781}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json
new file mode 100644
index 0000000000000000000000000000000000000000..8308c6991e20b709ae7d2aa7c6609af1aa6d99bf
--- /dev/null
+++ b/checkpoints/metadata_000000327680.json
@@ -0,0 +1 @@
+{"step": 10, "tokens_seen": 327680, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.866761351790585}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a792ede42ffba13b55755a529647a3f475e87a6
--- /dev/null
+++ b/checkpoints/metadata_000000360448.json
@@ -0,0 +1 @@
+{"step": 11, "tokens_seen": 360448, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.864644593909276}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000393216.json b/checkpoints/metadata_000000393216.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c42aeb5e2c7c31dacef2e93c0464f9843d41ed0
--- /dev/null
+++ b/checkpoints/metadata_000000393216.json
@@ -0,0 +1 @@
+{"step": 12, "tokens_seen": 393216, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.862174114233001}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json
new file mode 100644
index 0000000000000000000000000000000000000000..05aa9fd2e274080044a45f294a2d86953723109b
--- /dev/null
+++ b/checkpoints/metadata_000000425984.json
@@ -0,0 +1 @@
+{"step": 13, "tokens_seen": 425984, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.859094050565647}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json
new file mode 100644
index 0000000000000000000000000000000000000000..94ba89690885c319e7e7e8e1cf62425976a04652
--- /dev/null
+++ b/checkpoints/metadata_000000458752.json
@@ -0,0 +1 @@
+{"step": 14, "tokens_seen": 458752, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.855491923460258}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b81952c020b37b5c1c655b0fa426b6a3713bb83
--- /dev/null
+++ b/checkpoints/metadata_000000491520.json
@@ -0,0 +1 @@
+{"step": 15, "tokens_seen": 491520, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.851487270491525}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000524288.json b/checkpoints/metadata_000000524288.json
new file mode 100644
index 0000000000000000000000000000000000000000..c3d7a7430d963fc3c061578637c93f7ffbc689b1
--- /dev/null
+++ b/checkpoints/metadata_000000524288.json
@@ -0,0 +1 @@
+{"step": 16, "tokens_seen": 524288, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.84698640344762}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4a7e7e3e5a57d37ad514d0920ca8643de9aaeae
--- /dev/null
+++ b/checkpoints/metadata_000000557056.json
@@ -0,0 +1 @@
+{"step": 17, "tokens_seen": 557056, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.84191619761627}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000589824.json b/checkpoints/metadata_000000589824.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1aeb5d3cc82418c321bd17921d1a7763b0c55b4
--- /dev/null
+++ b/checkpoints/metadata_000000589824.json
@@ -0,0 +1 @@
+{"step": 18, "tokens_seen": 589824, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.836490857737884}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json
new file mode 100644
index 0000000000000000000000000000000000000000..ba5b3ec4177ab94594201ce28424ece1c1aaae22
--- /dev/null
+++ b/checkpoints/metadata_000000622592.json
@@ -0,0 +1 @@
+{"step": 19, "tokens_seen": 622592, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.831044011370588}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000655360.json b/checkpoints/metadata_000000655360.json
new file mode 100644
index 0000000000000000000000000000000000000000..eeb587c61f66d449ab60deeb6994c9de116965dd
--- /dev/null
+++ b/checkpoints/metadata_000000655360.json
@@ -0,0 +1 @@
+{"step": 20, "tokens_seen": 655360, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.825376021589523}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json
new file mode 100644
index 0000000000000000000000000000000000000000..34e2c6cbb389571b0bf5079f5f5428a982df53fd
--- /dev/null
+++ b/checkpoints/metadata_000000688128.json
@@ -0,0 +1 @@
+{"step": 21, "tokens_seen": 688128, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.818837015478243}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json
new file mode 100644
index 0000000000000000000000000000000000000000..020ca80f1dcb8382cabe821eb0a21d28b3e9e3bd
--- /dev/null
+++ b/checkpoints/metadata_000000753664.json
@@ -0,0 +1 @@
+{"step": 23, "tokens_seen": 753664, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.805813345984943}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000786432.json b/checkpoints/metadata_000000786432.json
new file mode 100644
index 0000000000000000000000000000000000000000..3ce679fb43a82760af4afcf18f71b2dc668183f7
--- /dev/null
+++ b/checkpoints/metadata_000000786432.json
@@ -0,0 +1 @@
+{"step": 24, "tokens_seen": 786432, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.798696908692087}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000851968.json b/checkpoints/metadata_000000851968.json
new file mode 100644
index 0000000000000000000000000000000000000000..30333182c20668e1a84ced3802d8e865460e2fda
--- /dev/null
+++ b/checkpoints/metadata_000000851968.json
@@ -0,0 +1 @@
+{"step": 26, "tokens_seen": 851968, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.7840466634513}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000884736.json b/checkpoints/metadata_000000884736.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b32ac59aebc6d60b2a4091e968a654339cc7454
--- /dev/null
+++ b/checkpoints/metadata_000000884736.json
@@ -0,0 +1 @@
+{"step": 27, "tokens_seen": 884736, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.776380309136732}
\ No newline at end of file
diff --git a/checkpoints/metadata_000000950272.json b/checkpoints/metadata_000000950272.json
new file mode 100644
index 0000000000000000000000000000000000000000..d2c4bed506c3ec2401816987941f83df17610755
--- /dev/null
+++ b/checkpoints/metadata_000000950272.json
@@ -0,0 +1 @@
+{"step": 29, "tokens_seen": 950272, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.760829628399462}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001015808.json b/checkpoints/metadata_000001015808.json
new file mode 100644
index 0000000000000000000000000000000000000000..126b52e88841502dfcf98053d9eeb83e027c53b0
--- /dev/null
+++ b/checkpoints/metadata_000001015808.json
@@ -0,0 +1 @@
+{"step": 31, "tokens_seen": 1015808, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.744669876221526}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001048576.json b/checkpoints/metadata_000001048576.json
new file mode 100644
index 0000000000000000000000000000000000000000..a240195e61046918a85925f1d5b3a5060d730b14
--- /dev/null
+++ b/checkpoints/metadata_000001048576.json
@@ -0,0 +1 @@
+{"step": 32, "tokens_seen": 1048576, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.73648620655782}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json
new file mode 100644
index 0000000000000000000000000000000000000000..f1a0f9d730500448e64024cccc17c860a93d730e
--- /dev/null
+++ b/checkpoints/metadata_000001114112.json
@@ -0,0 +1 @@
+{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.719629249470781}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json
new file mode 100644
index 0000000000000000000000000000000000000000..f91156c48df713c534b67c16ae0849f83730cb35
--- /dev/null
+++ b/checkpoints/metadata_000001212416.json
@@ -0,0 +1 @@
+{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.692299968132296}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001277952.json b/checkpoints/metadata_000001277952.json
new file mode 100644
index 0000000000000000000000000000000000000000..0078fffe91746d298486b4f7ff6d9b881e7ba94c
--- /dev/null
+++ b/checkpoints/metadata_000001277952.json
@@ -0,0 +1 @@
+{"step": 39, "tokens_seen": 1277952, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.673439214296094}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bec5b4ba8bc9f977a6adca22769c7079e3c9549
--- /dev/null
+++ b/checkpoints/metadata_000001343488.json
@@ -0,0 +1 @@
+{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.653080189741232}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001441792.json b/checkpoints/metadata_000001441792.json
new file mode 100644
index 0000000000000000000000000000000000000000..c875995f0c173b2234ed7de66597788bfde28200
--- /dev/null
+++ b/checkpoints/metadata_000001441792.json
@@ -0,0 +1 @@
+{"step": 44, "tokens_seen": 1441792, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.62168992813459}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001507328.json b/checkpoints/metadata_000001507328.json
new file mode 100644
index 0000000000000000000000000000000000000000..d745052f0f54c73a3c80cfd3a1461045ad7bd731
--- /dev/null
+++ b/checkpoints/metadata_000001507328.json
@@ -0,0 +1 @@
+{"step": 46, "tokens_seen": 1507328, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.599969859538833}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa6d3aad91ebc2d76d42590a4f6034e223ec024b
--- /dev/null
+++ b/checkpoints/metadata_000001605632.json
@@ -0,0 +1 @@
+{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.565739813182214}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001703936.json b/checkpoints/metadata_000001703936.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3228220f17b748726383fe9768a147bec68749d
--- /dev/null
+++ b/checkpoints/metadata_000001703936.json
@@ -0,0 +1 @@
+{"step": 52, "tokens_seen": 1703936, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.530182852771613}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001802240.json b/checkpoints/metadata_000001802240.json
new file mode 100644
index 0000000000000000000000000000000000000000..e33d17af78fe9f96f864577a0a045e6bf98658ec
--- /dev/null
+++ b/checkpoints/metadata_000001802240.json
@@ -0,0 +1 @@
+{"step": 55, "tokens_seen": 1802240, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.492692989176277}
\ No newline at end of file
diff --git a/checkpoints/metadata_000001933312.json b/checkpoints/metadata_000001933312.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0010fe635a0188aa373ba125a15ff58124ad2e4
--- /dev/null
+++ b/checkpoints/metadata_000001933312.json
@@ -0,0 +1 @@
+{"step": 59, "tokens_seen": 1933312, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.438970700236368}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002031616.json b/checkpoints/metadata_000002031616.json
new file mode 100644
index 0000000000000000000000000000000000000000..87b82f4ecbca970582063a66523a5b15775d510f
--- /dev/null
+++ b/checkpoints/metadata_000002031616.json
@@ -0,0 +1 @@
+{"step": 62, "tokens_seen": 2031616, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.396784224738186}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json
new file mode 100644
index 0000000000000000000000000000000000000000..23da03188619a9f4427f4dae14c51774b8904b56
--- /dev/null
+++ b/checkpoints/metadata_000002162688.json
@@ -0,0 +1 @@
+{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.336795163256}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002293760.json b/checkpoints/metadata_000002293760.json
new file mode 100644
index 0000000000000000000000000000000000000000..3029395487f7b57d02b5baee49c122041491dbe4
--- /dev/null
+++ b/checkpoints/metadata_000002293760.json
@@ -0,0 +1 @@
+{"step": 70, "tokens_seen": 2293760, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.274848059392296}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002424832.json b/checkpoints/metadata_000002424832.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c634bb3a6e421197eaf29911b8c57a78fc70e83
--- /dev/null
+++ b/checkpoints/metadata_000002424832.json
@@ -0,0 +1 @@
+{"step": 74, "tokens_seen": 2424832, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.2076555970393}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002588672.json b/checkpoints/metadata_000002588672.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe515c8364bda65b370a6a2fec520434e22f4c2e
--- /dev/null
+++ b/checkpoints/metadata_000002588672.json
@@ -0,0 +1 @@
+{"step": 79, "tokens_seen": 2588672, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.11923377988726}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002719744.json b/checkpoints/metadata_000002719744.json
new file mode 100644
index 0000000000000000000000000000000000000000..d01fb0d2202c681cf59ccafa8a17cfad8f070a74
--- /dev/null
+++ b/checkpoints/metadata_000002719744.json
@@ -0,0 +1 @@
+{"step": 83, "tokens_seen": 2719744, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 10.045257050081743}
\ No newline at end of file
diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json
new file mode 100644
index 0000000000000000000000000000000000000000..53f51028c3ae180f38ea26af84cb8aac410f6313
--- /dev/null
+++ b/checkpoints/metadata_000002883584.json
@@ -0,0 +1 @@
+{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.949561085556693}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003080192.json b/checkpoints/metadata_000003080192.json
new file mode 100644
index 0000000000000000000000000000000000000000..31e153842cc1f24eab5772547d5f448f3fc258ac
--- /dev/null
+++ b/checkpoints/metadata_000003080192.json
@@ -0,0 +1 @@
+{"step": 94, "tokens_seen": 3080192, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.829012673974832}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003244032.json b/checkpoints/metadata_000003244032.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8bc8aab1818adcefb95d08f68b91dbe2e8979ea
--- /dev/null
+++ b/checkpoints/metadata_000003244032.json
@@ -0,0 +1 @@
+{"step": 99, "tokens_seen": 3244032, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.727049145615627}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003440640.json b/checkpoints/metadata_000003440640.json
new file mode 100644
index 0000000000000000000000000000000000000000..b706bc7cf60cd685e7a21672382c08292575e1b4
--- /dev/null
+++ b/checkpoints/metadata_000003440640.json
@@ -0,0 +1 @@
+{"step": 105, "tokens_seen": 3440640, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.603069449854939}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003670016.json b/checkpoints/metadata_000003670016.json
new file mode 100644
index 0000000000000000000000000000000000000000..194170949f2563fc981930e3e6a64f41620ae6b5
--- /dev/null
+++ b/checkpoints/metadata_000003670016.json
@@ -0,0 +1 @@
+{"step": 112, "tokens_seen": 3670016, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.458391938448699}
\ No newline at end of file
diff --git a/checkpoints/metadata_000003866624.json b/checkpoints/metadata_000003866624.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3240eac3459dc04b95d251b6446c5d2fec3283c
--- /dev/null
+++ b/checkpoints/metadata_000003866624.json
@@ -0,0 +1 @@
+{"step": 118, "tokens_seen": 3866624, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.336868231558974}
\ No newline at end of file
diff --git a/checkpoints/metadata_000004128768.json b/checkpoints/metadata_000004128768.json
new file mode 100644
index 0000000000000000000000000000000000000000..59c6fe6f79037745c1b8cc585d12c3fe8833fb66
--- /dev/null
+++ b/checkpoints/metadata_000004128768.json
@@ -0,0 +1 @@
+{"step": 126, "tokens_seen": 4128768, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.178058547104627}
\ No newline at end of file
diff --git a/checkpoints/metadata_000004358144.json b/checkpoints/metadata_000004358144.json
new file mode 100644
index 0000000000000000000000000000000000000000..74c605ec25e4df8e66f774f131f85af867e04071
--- /dev/null
+++ b/checkpoints/metadata_000004358144.json
@@ -0,0 +1 @@
+{"step": 133, "tokens_seen": 4358144, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 9.040881959507956}
\ No newline at end of file
diff --git a/checkpoints/metadata_000004620288.json b/checkpoints/metadata_000004620288.json
new file mode 100644
index 0000000000000000000000000000000000000000..bab02e03680955a7692b26a037b258aed44e5cec
--- /dev/null
+++ b/checkpoints/metadata_000004620288.json
@@ -0,0 +1 @@
+{"step": 141, "tokens_seen": 4620288, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 8.890883279587326}
\ No newline at end of file
diff --git a/checkpoints/metadata_000004915200.json b/checkpoints/metadata_000004915200.json
new file mode 100644
index 0000000000000000000000000000000000000000..795bdb16481e0b303215277c4ae5215a26659de2
--- /dev/null
+++ b/checkpoints/metadata_000004915200.json
@@ -0,0 +1 @@
+{"step": 150, "tokens_seen": 4915200, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 8.731224113445938}
\ No newline at end of file
diff --git a/checkpoints/metadata_000005210112.json b/checkpoints/metadata_000005210112.json
new file mode 100644
index 0000000000000000000000000000000000000000..eef839fe0b2c28097256ee58665e91f4804c3ad5
--- /dev/null
+++ b/checkpoints/metadata_000005210112.json
@@ -0,0 +1 @@
+{"step": 159, "tokens_seen": 5210112, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 8.573775043540326}
\ No newline at end of file
diff --git a/checkpoints/metadata_000005505024.json b/checkpoints/metadata_000005505024.json
new file mode 100644
index 0000000000000000000000000000000000000000..e22118b0533557e8a116a0d2b394b97330e4a202
--- /dev/null
+++ b/checkpoints/metadata_000005505024.json
@@ -0,0 +1 @@
+{"step": 168, "tokens_seen": 5505024, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 8.426512230695142}
\ No newline at end of file
diff --git a/checkpoints/metadata_000005832704.json b/checkpoints/metadata_000005832704.json
new file mode 100644
index 0000000000000000000000000000000000000000..f028324c07ecbfdb5420447a9a753c260b5280fe
--- /dev/null
+++ b/checkpoints/metadata_000005832704.json
@@ -0,0 +1 @@
+{"step": 178, "tokens_seen": 5832704, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 8.266021907850861}
\ No newline at end of file
diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json
new file mode 100644
index 0000000000000000000000000000000000000000..59c127efabc1b92129195a757441055ce5786d65
--- /dev/null
+++ b/checkpoints/metadata_000006193152.json
@@ -0,0 +1 @@
+{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 8.09942541453696}
\ No newline at end of file
diff --git a/checkpoints/metadata_000006553600.json b/checkpoints/metadata_000006553600.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b5def137a28b7ff2adf615d5c762a1cddb32222
--- /dev/null
+++ b/checkpoints/metadata_000006553600.json
@@ -0,0 +1 @@
+{"step": 200, "tokens_seen": 6553600, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.94247200504239}
\ No newline at end of file
diff --git a/checkpoints/metadata_000006946816.json b/checkpoints/metadata_000006946816.json
new file mode 100644
index 0000000000000000000000000000000000000000..258b8b1a81e0af08a0b93df63678773d8f198b71
--- /dev/null
+++ b/checkpoints/metadata_000006946816.json
@@ -0,0 +1 @@
+{"step": 212, "tokens_seen": 6946816, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only":
false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.782110879308245} \ No newline at end of file diff --git a/checkpoints/metadata_000007372800.json b/checkpoints/metadata_000007372800.json new file mode 100644 index 0000000000000000000000000000000000000000..5b102574b43af6bea9845945d7681e9f1bca5f19 --- /dev/null +++ b/checkpoints/metadata_000007372800.json @@ -0,0 +1 @@ +{"step": 225, "tokens_seen": 7372800, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.633753154251662} \ No newline at end of file diff --git a/checkpoints/metadata_000007831552.json b/checkpoints/metadata_000007831552.json new file mode 100644 index 0000000000000000000000000000000000000000..78e39777d4c5261c765efdabcf1a9fa2616a3900 --- /dev/null +++ b/checkpoints/metadata_000007831552.json @@ -0,0 +1 @@ +{"step": 239, "tokens_seen": 7831552, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.479663365788289} \ No newline at end of file diff --git a/checkpoints/metadata_000008290304.json b/checkpoints/metadata_000008290304.json new file mode 100644 index 
0000000000000000000000000000000000000000..ef03cf78ac15bfd41b14765cd9c4c950cb1e4f00 --- /dev/null +++ b/checkpoints/metadata_000008290304.json @@ -0,0 +1 @@ +{"step": 253, "tokens_seen": 8290304, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.343758604848924} \ No newline at end of file diff --git a/checkpoints/metadata_000008781824.json b/checkpoints/metadata_000008781824.json new file mode 100644 index 0000000000000000000000000000000000000000..2fc3b6d6caf90657835c0122b000e1bd86b25881 --- /dev/null +++ b/checkpoints/metadata_000008781824.json @@ -0,0 +1 @@ +{"step": 268, "tokens_seen": 8781824, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.214364006381903} \ No newline at end of file diff --git a/checkpoints/metadata_000009306112.json b/checkpoints/metadata_000009306112.json new file mode 100644 index 0000000000000000000000000000000000000000..08ae71b2b4f119df05015c9048ca40bb03cab101 --- /dev/null +++ b/checkpoints/metadata_000009306112.json @@ -0,0 +1 @@ +{"step": 284, "tokens_seen": 9306112, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": 
true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 7.08434606340243} \ No newline at end of file diff --git a/checkpoints/metadata_000009863168.json b/checkpoints/metadata_000009863168.json new file mode 100644 index 0000000000000000000000000000000000000000..1a21df32561b5cf59a39bdf6b852a22c9b883894 --- /dev/null +++ b/checkpoints/metadata_000009863168.json @@ -0,0 +1 @@ +{"step": 301, "tokens_seen": 9863168, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.962520269909929} \ No newline at end of file diff --git a/checkpoints/metadata_000010485760.json b/checkpoints/metadata_000010485760.json new file mode 100644 index 0000000000000000000000000000000000000000..2150276e1cf3b405de8a1ab7a318dc6b942ef9d1 --- /dev/null +++ b/checkpoints/metadata_000010485760.json @@ -0,0 +1 @@ +{"step": 320, "tokens_seen": 10485760, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.840428571448589} \ No newline at end of file diff --git a/checkpoints/metadata_000011108352.json b/checkpoints/metadata_000011108352.json new file mode 100644 index 0000000000000000000000000000000000000000..fe8c23878879dacf93f5c959dde3a53c15144a30 --- /dev/null +++ b/checkpoints/metadata_000011108352.json @@ -0,0 +1 @@ +{"step": 339, "tokens_seen": 11108352, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, 
"lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.740457772704066} \ No newline at end of file diff --git a/checkpoints/metadata_000011763712.json b/checkpoints/metadata_000011763712.json new file mode 100644 index 0000000000000000000000000000000000000000..47d3befce50b830b821fe0f55a536bfa23703b76 --- /dev/null +++ b/checkpoints/metadata_000011763712.json @@ -0,0 +1 @@ +{"step": 359, "tokens_seen": 11763712, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.642679858642692} \ No newline at end of file diff --git a/checkpoints/metadata_000012484608.json b/checkpoints/metadata_000012484608.json new file mode 100644 index 0000000000000000000000000000000000000000..87c303826e8b06cd495c190fa0f0855ade85e561 --- /dev/null +++ b/checkpoints/metadata_000012484608.json @@ -0,0 +1 @@ +{"step": 381, "tokens_seen": 12484608, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.559482908036395} \ No newline at end of file diff --git a/checkpoints/metadata_000013238272.json b/checkpoints/metadata_000013238272.json new file mode 100644 index 0000000000000000000000000000000000000000..c1c7852816b6f1bdb5029bf1c41319c20020c84c --- /dev/null +++ b/checkpoints/metadata_000013238272.json @@ -0,0 +1 @@ +{"step": 404, "tokens_seen": 13238272, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, 
"d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.47488103894651} \ No newline at end of file diff --git a/checkpoints/metadata_000014024704.json b/checkpoints/metadata_000014024704.json new file mode 100644 index 0000000000000000000000000000000000000000..99551656cc0ffac8577af83675899819ed9a2c6b --- /dev/null +++ b/checkpoints/metadata_000014024704.json @@ -0,0 +1 @@ +{"step": 428, "tokens_seen": 14024704, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.392302586774002} \ No newline at end of file diff --git a/checkpoints/metadata_000014876672.json b/checkpoints/metadata_000014876672.json new file mode 100644 index 0000000000000000000000000000000000000000..5f9bd5b1971daa5511c4e44f1fa0b6637dada173 --- /dev/null +++ b/checkpoints/metadata_000014876672.json @@ -0,0 +1 @@ +{"step": 454, "tokens_seen": 14876672, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.3178709392927255} \ No newline at end of file diff --git a/checkpoints/metadata_000015761408.json b/checkpoints/metadata_000015761408.json new file mode 100644 index 0000000000000000000000000000000000000000..e747998c8437876e5c50ca7c77b8e133465d29f5 --- 
/dev/null +++ b/checkpoints/metadata_000015761408.json @@ -0,0 +1 @@ +{"step": 481, "tokens_seen": 15761408, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.2500705778279295} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..e7d35f783cd03c31f0b6e32f40f6ce905b1c382a --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.207935843384246} \ No newline at end of file diff --git a/checkpoints/metadata_000016711680.json b/checkpoints/metadata_000016711680.json new file mode 100644 index 0000000000000000000000000000000000000000..f24ffb06b57ae97e614e71ed278f1f506021bbcc --- /dev/null +++ b/checkpoints/metadata_000016711680.json @@ -0,0 +1 @@ +{"step": 510, "tokens_seen": 16711680, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 
915, "max_steps": 671386}, "train_loss_ewma": 6.188658376170218} \ No newline at end of file diff --git a/checkpoints/metadata_000017694720.json b/checkpoints/metadata_000017694720.json new file mode 100644 index 0000000000000000000000000000000000000000..6a3c7c7c27da2d87ce1c04a61447e6a7d64fe314 --- /dev/null +++ b/checkpoints/metadata_000017694720.json @@ -0,0 +1 @@ +{"step": 540, "tokens_seen": 17694720, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.135493602559535} \ No newline at end of file diff --git a/checkpoints/metadata_000018776064.json b/checkpoints/metadata_000018776064.json new file mode 100644 index 0000000000000000000000000000000000000000..acba66ce428412cd108c84488bcb9894a6cb2e3d --- /dev/null +++ b/checkpoints/metadata_000018776064.json @@ -0,0 +1 @@ +{"step": 573, "tokens_seen": 18776064, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.080283621895378} \ No newline at end of file diff --git a/checkpoints/metadata_000019890176.json b/checkpoints/metadata_000019890176.json new file mode 100644 index 0000000000000000000000000000000000000000..68518f05433ba8ae1518e0145ffb6b4eff67c1a0 --- /dev/null +++ b/checkpoints/metadata_000019890176.json @@ -0,0 +1 @@ +{"step": 607, "tokens_seen": 19890176, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, 
"weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 6.017403944257985} \ No newline at end of file diff --git a/checkpoints/metadata_000021102592.json b/checkpoints/metadata_000021102592.json new file mode 100644 index 0000000000000000000000000000000000000000..9b82b6805909515d185a5b7d0a83390b0a834b17 --- /dev/null +++ b/checkpoints/metadata_000021102592.json @@ -0,0 +1 @@ +{"step": 644, "tokens_seen": 21102592, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.964554085046414} \ No newline at end of file diff --git a/checkpoints/metadata_000022347776.json b/checkpoints/metadata_000022347776.json new file mode 100644 index 0000000000000000000000000000000000000000..e242c4065db377430dc8ea3c938dada4328ef15b --- /dev/null +++ b/checkpoints/metadata_000022347776.json @@ -0,0 +1 @@ +{"step": 682, "tokens_seen": 22347776, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.914984727344645} \ No newline at end of file diff --git a/checkpoints/metadata_000023691264.json b/checkpoints/metadata_000023691264.json new file mode 100644 index 0000000000000000000000000000000000000000..7810fa0416a0abf4ee3f6019167643840dff4063 --- /dev/null +++ b/checkpoints/metadata_000023691264.json @@ -0,0 +1 @@ +{"step": 723, "tokens_seen": 23691264, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", 
"tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.864252443759749} \ No newline at end of file diff --git a/checkpoints/metadata_000025133056.json b/checkpoints/metadata_000025133056.json new file mode 100644 index 0000000000000000000000000000000000000000..d0b997241d0e10085ed0ff6bf096997a9f3fb7fd --- /dev/null +++ b/checkpoints/metadata_000025133056.json @@ -0,0 +1 @@ +{"step": 767, "tokens_seen": 25133056, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.8238329189844515} \ No newline at end of file diff --git a/checkpoints/metadata_000026640384.json b/checkpoints/metadata_000026640384.json new file mode 100644 index 0000000000000000000000000000000000000000..549ef2f882984bfab0e0ef4eb8f643e59e17303e --- /dev/null +++ b/checkpoints/metadata_000026640384.json @@ -0,0 +1 @@ +{"step": 813, "tokens_seen": 26640384, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.779944595001218} \ No newline at end of file diff --git a/checkpoints/metadata_000028213248.json b/checkpoints/metadata_000028213248.json new file mode 100644 index 0000000000000000000000000000000000000000..b5137943965b9255af8029a4b6fe0fb6d9104fb9 --- /dev/null +++ b/checkpoints/metadata_000028213248.json @@ -0,0 +1 @@ 
+{"step": 861, "tokens_seen": 28213248, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.725379428604655} \ No newline at end of file diff --git a/checkpoints/metadata_000029917184.json b/checkpoints/metadata_000029917184.json new file mode 100644 index 0000000000000000000000000000000000000000..e43a827cbb3ae509a0cb3496126bc71ce577a5ac --- /dev/null +++ b/checkpoints/metadata_000029917184.json @@ -0,0 +1 @@ +{"step": 913, "tokens_seen": 29917184, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.67571176210419} \ No newline at end of file diff --git a/checkpoints/metadata_000031719424.json b/checkpoints/metadata_000031719424.json new file mode 100644 index 0000000000000000000000000000000000000000..63e133c282db5f2e18e276755c5ad364701b73f9 --- /dev/null +++ b/checkpoints/metadata_000031719424.json @@ -0,0 +1 @@ +{"step": 968, "tokens_seen": 31719424, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.613865173596023} \ No 
newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..77b19118db5d0557800d144287af0dd99232de0f --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.573614135354942} \ No newline at end of file diff --git a/checkpoints/metadata_000033619968.json b/checkpoints/metadata_000033619968.json new file mode 100644 index 0000000000000000000000000000000000000000..024f54b1a578e90b8c567a9aafb9abf54264dd0f --- /dev/null +++ b/checkpoints/metadata_000033619968.json @@ -0,0 +1 @@ +{"step": 1026, "tokens_seen": 33619968, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.535977495599602} \ No newline at end of file diff --git a/checkpoints/metadata_000035651584.json b/checkpoints/metadata_000035651584.json new file mode 100644 index 0000000000000000000000000000000000000000..30d8973ced2b535170105ce2d40baa8cb225918a --- /dev/null +++ b/checkpoints/metadata_000035651584.json @@ -0,0 +1 @@ +{"step": 1088, "tokens_seen": 35651584, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, 
"train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.470055864597493} \ No newline at end of file diff --git a/checkpoints/metadata_000037781504.json b/checkpoints/metadata_000037781504.json new file mode 100644 index 0000000000000000000000000000000000000000..8a6cebcd8ed291ab5e5b32c5d4d6e767161fbc25 --- /dev/null +++ b/checkpoints/metadata_000037781504.json @@ -0,0 +1 @@ +{"step": 1153, "tokens_seen": 37781504, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.401063728759256} \ No newline at end of file diff --git a/checkpoints/metadata_000040042496.json b/checkpoints/metadata_000040042496.json new file mode 100644 index 0000000000000000000000000000000000000000..6f82f5041c6cc82ed7605d351b7c157a327b225f --- /dev/null +++ b/checkpoints/metadata_000040042496.json @@ -0,0 +1 @@ +{"step": 1222, "tokens_seen": 40042496, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.33333317257896} \ No newline at end of file diff --git a/checkpoints/metadata_000042467328.json b/checkpoints/metadata_000042467328.json new file mode 100644 index 0000000000000000000000000000000000000000..ef660ebba0a3c3ffb013f060fada46fcf6d44fd5 --- /dev/null +++ b/checkpoints/metadata_000042467328.json @@ -0,0 +1 @@ +{"step": 1296, "tokens_seen": 42467328, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", 
"seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.271726664760353} \ No newline at end of file diff --git a/checkpoints/metadata_000044990464.json b/checkpoints/metadata_000044990464.json new file mode 100644 index 0000000000000000000000000000000000000000..42816832571cfc32fbc1a0fab2e4d180a805931b --- /dev/null +++ b/checkpoints/metadata_000044990464.json @@ -0,0 +1 @@ +{"step": 1373, "tokens_seen": 44990464, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.215914666702584} \ No newline at end of file diff --git a/checkpoints/metadata_000047710208.json b/checkpoints/metadata_000047710208.json new file mode 100644 index 0000000000000000000000000000000000000000..0add02aaf12b37863969689471a66490d5d6ffa9 --- /dev/null +++ b/checkpoints/metadata_000047710208.json @@ -0,0 +1 @@ +{"step": 1456, "tokens_seen": 47710208, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.152133892704035} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..3167db0a04b62c2bf130cd2f24c8907ed4d4abaa --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": 
{"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.12122833972581} \ No newline at end of file diff --git a/checkpoints/metadata_000050561024.json b/checkpoints/metadata_000050561024.json new file mode 100644 index 0000000000000000000000000000000000000000..7003b6b325abd7e7398fc241fef942587d18a969 --- /dev/null +++ b/checkpoints/metadata_000050561024.json @@ -0,0 +1 @@ +{"step": 1543, "tokens_seen": 50561024, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.106327154817548} \ No newline at end of file diff --git a/checkpoints/metadata_000053608448.json b/checkpoints/metadata_000053608448.json new file mode 100644 index 0000000000000000000000000000000000000000..3f38315d5dd626021f0a67fbf01c0081d8423784 --- /dev/null +++ b/checkpoints/metadata_000053608448.json @@ -0,0 +1 @@ +{"step": 1636, "tokens_seen": 53608448, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.0547002217029435} \ No newline at end of file diff --git 
a/checkpoints/metadata_000056819712.json b/checkpoints/metadata_000056819712.json new file mode 100644 index 0000000000000000000000000000000000000000..ddf50d6263e1fbb30afde556e99d3323e3884322 --- /dev/null +++ b/checkpoints/metadata_000056819712.json @@ -0,0 +1 @@ +{"step": 1734, "tokens_seen": 56819712, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 5.009688361385578} \ No newline at end of file diff --git a/checkpoints/metadata_000060227584.json b/checkpoints/metadata_000060227584.json new file mode 100644 index 0000000000000000000000000000000000000000..370e84a1180a06eba31de6aaa91e2118b7c2dbdd --- /dev/null +++ b/checkpoints/metadata_000060227584.json @@ -0,0 +1 @@ +{"step": 1838, "tokens_seen": 60227584, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.9738863397981765} \ No newline at end of file diff --git a/checkpoints/metadata_000063832064.json b/checkpoints/metadata_000063832064.json new file mode 100644 index 0000000000000000000000000000000000000000..b584e1c55272f9c25c057b8b974a013dadb888b3 --- /dev/null +++ b/checkpoints/metadata_000063832064.json @@ -0,0 +1 @@ +{"step": 1948, "tokens_seen": 63832064, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, 
"save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.895221412079541} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..b4573fdaafb990958557cdd6e218856c209681a1 --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.842252290279579} \ No newline at end of file diff --git a/checkpoints/metadata_000067665920.json b/checkpoints/metadata_000067665920.json new file mode 100644 index 0000000000000000000000000000000000000000..eff35cd4b7489d900bea9b9c54355986082e747c --- /dev/null +++ b/checkpoints/metadata_000067665920.json @@ -0,0 +1 @@ +{"step": 2065, "tokens_seen": 67665920, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.80979938805876} \ No newline at end of file diff --git a/checkpoints/metadata_000071729152.json b/checkpoints/metadata_000071729152.json new file mode 100644 index 0000000000000000000000000000000000000000..c2dfb6173cf56140b0c372dbc9e0e90f351eb962 --- /dev/null +++ b/checkpoints/metadata_000071729152.json @@ -0,0 +1 @@ +{"step": 2189, "tokens_seen": 71729152, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.745388721436949} \ No newline at end of file diff --git a/checkpoints/metadata_000076054528.json b/checkpoints/metadata_000076054528.json new file mode 100644 index 0000000000000000000000000000000000000000..17507b0b245a03db196797d1058fdc1ed2d9dfc1 --- /dev/null +++ b/checkpoints/metadata_000076054528.json @@ -0,0 +1 @@ +{"step": 2321, "tokens_seen": 76054528, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.647540353521421} \ No newline at end of file diff --git a/checkpoints/metadata_000080609280.json b/checkpoints/metadata_000080609280.json new file mode 100644 index 0000000000000000000000000000000000000000..d450f7960521f0b7ca3a525181d7d20733196d38 --- /dev/null +++ b/checkpoints/metadata_000080609280.json @@ -0,0 +1 @@ +{"step": 2460, "tokens_seen": 80609280, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.5969105941658714} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..528ceae1e20ec1de6f2657e3cc1cbda04a3cb926 --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.572575401037275} \ No newline at end of file diff --git a/checkpoints/metadata_000085426176.json b/checkpoints/metadata_000085426176.json new file mode 100644 index 0000000000000000000000000000000000000000..a05ffd340956e8e374fbb5ab1d2abec1bcf5006c --- /dev/null +++ b/checkpoints/metadata_000085426176.json @@ -0,0 +1 @@ +{"step": 2607, "tokens_seen": 85426176, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.5213000024585135} \ No newline at end of file diff --git a/checkpoints/metadata_000090570752.json b/checkpoints/metadata_000090570752.json new file mode 100644 index 0000000000000000000000000000000000000000..fa07bc9fba77058498758b400a7f75c0a1416f30 --- /dev/null +++ b/checkpoints/metadata_000090570752.json @@ -0,0 +1 @@ +{"step": 2764, "tokens_seen": 90570752, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.473296220232062} \ No newline at end of file diff --git a/checkpoints/metadata_000096010240.json 
diff --git a/checkpoints/metadata_000096010240.json b/checkpoints/metadata_000096010240.json new file mode 100644 index 0000000000000000000000000000000000000000..c9e40a117ecd3121bfcf8f9d518aa78d086d9a7d --- /dev/null +++ b/checkpoints/metadata_000096010240.json @@ -0,0 +1 @@ +{"step": 2930, "tokens_seen": 96010240, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.427331619131198} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..959b7f322da9239c8036207b5ae806b14334b844 --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.406767716144262} \ No newline at end of file diff --git a/checkpoints/metadata_000101777408.json b/checkpoints/metadata_000101777408.json new file mode 100644 index 0000000000000000000000000000000000000000..3f75e9772c14a47d08ae7d06a63903626a400721 --- /dev/null +++ b/checkpoints/metadata_000101777408.json @@ -0,0 +1 @@ +{"step": 3106, "tokens_seen": 101777408, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true,
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.385128060087921} \ No newline at end of file diff --git a/checkpoints/metadata_000107872256.json b/checkpoints/metadata_000107872256.json new file mode 100644 index 0000000000000000000000000000000000000000..8ec7a9b616dff552e97af3fcb6fb7f75f90e63c7 --- /dev/null +++ b/checkpoints/metadata_000107872256.json @@ -0,0 +1 @@ +{"step": 3292, "tokens_seen": 107872256, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.351529993258676} \ No newline at end of file diff --git a/checkpoints/metadata_000114327552.json b/checkpoints/metadata_000114327552.json new file mode 100644 index 0000000000000000000000000000000000000000..42581c2ba1ecc20ed9c4744ba6e0ce2eafba2104 --- /dev/null +++ b/checkpoints/metadata_000114327552.json @@ -0,0 +1 @@ +{"step": 3489, "tokens_seen": 114327552, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.32237501496495} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..3dc3b976cea59164fc7ac399d003b7b1fd951dae --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.320825203279289} \ No newline at end of file diff --git a/checkpoints/metadata_000121208832.json b/checkpoints/metadata_000121208832.json new file mode 100644 index 0000000000000000000000000000000000000000..4ed820db6b74f50408d292ff212725a67c935942 --- /dev/null +++ b/checkpoints/metadata_000121208832.json @@ -0,0 +1 @@ +{"step": 3699, "tokens_seen": 121208832, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.29075868283881} \ No newline at end of file diff --git a/checkpoints/metadata_000128483328.json b/checkpoints/metadata_000128483328.json new file mode 100644 index 0000000000000000000000000000000000000000..bd12f72f809ebbafaaef839c01991d387dc1d209 --- /dev/null +++ b/checkpoints/metadata_000128483328.json @@ -0,0 +1 @@ +{"step": 3921, "tokens_seen": 128483328, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.246192465723148} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..5424361457f13f22a3581ae0405328c1237d71b6 --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.219468823711385} \ No newline at end of file diff --git a/checkpoints/metadata_000136183808.json b/checkpoints/metadata_000136183808.json new file mode 100644 index 0000000000000000000000000000000000000000..0be56e24bb293ed6092dcbd2ffeeb034f0d46160 --- /dev/null +++ b/checkpoints/metadata_000136183808.json @@ -0,0 +1 @@ +{"step": 4156, "tokens_seen": 136183808, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.212730041098502} \ No newline at end of file diff --git a/checkpoints/metadata_000144375808.json b/checkpoints/metadata_000144375808.json new file mode 100644 index 0000000000000000000000000000000000000000..dd7780c8727a45aee092d3ad5b560709e250d109 --- /dev/null +++ b/checkpoints/metadata_000144375808.json @@ -0,0 +1 @@ +{"step": 4406, "tokens_seen": 144375808, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.185961593339415} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json 
b/checkpoints/metadata_000147456000.json new file mode 100644 index 0000000000000000000000000000000000000000..4fef16bc88d20faf837a93a4e30f97fba3bee3f5 --- /dev/null +++ b/checkpoints/metadata_000147456000.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.174494974734745} \ No newline at end of file diff --git a/checkpoints/metadata_000153026560.json b/checkpoints/metadata_000153026560.json new file mode 100644 index 0000000000000000000000000000000000000000..86c5713398a6de94bdb082b9fc6dd3a600126147 --- /dev/null +++ b/checkpoints/metadata_000153026560.json @@ -0,0 +1 @@ +{"step": 4670, "tokens_seen": 153026560, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.165175524560158} \ No newline at end of file diff --git a/checkpoints/metadata_000162201600.json b/checkpoints/metadata_000162201600.json new file mode 100644 index 0000000000000000000000000000000000000000..f129ab0e7c629f17c02cb681784c41e217f55bfc --- /dev/null +++ b/checkpoints/metadata_000162201600.json @@ -0,0 +1 @@ +{"step": 4950, "tokens_seen": 162201600, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.130839409307733} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json new file mode 100644 index 0000000000000000000000000000000000000000..6d6b2791dc81e079d7785ead79fe18e8ab4c50e3 --- /dev/null +++ b/checkpoints/metadata_000163840000.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.131935032523368} \ No newline at end of file diff --git a/checkpoints/metadata_000171933696.json b/checkpoints/metadata_000171933696.json new file mode 100644 index 0000000000000000000000000000000000000000..c30073a22b8eb5790b143cbfd4afe2f3fed22f3f --- /dev/null +++ b/checkpoints/metadata_000171933696.json @@ -0,0 +1 @@ +{"step": 5247, "tokens_seen": 171933696, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.12925506237479} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json new file mode 100644 index 0000000000000000000000000000000000000000..639b77d240327c584cdfc9831d2eac195e274e80 --- /dev/null +++ b/checkpoints/metadata_000180224000.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.099209511326652} \ No newline at end of file diff --git a/checkpoints/metadata_000182255616.json b/checkpoints/metadata_000182255616.json new file mode 100644 index 0000000000000000000000000000000000000000..c2f7bd2127e0dc8ab2372f3d34f23e19c1bb2827 --- /dev/null +++ b/checkpoints/metadata_000182255616.json @@ -0,0 +1 @@ +{"step": 5562, "tokens_seen": 182255616, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.108095484333142} \ No newline at end of file diff --git a/checkpoints/metadata_000193200128.json b/checkpoints/metadata_000193200128.json new file mode 100644 index 0000000000000000000000000000000000000000..166a5c9ab509b370a8e6b0727aa594c46fbbe520 --- /dev/null +++ b/checkpoints/metadata_000193200128.json @@ -0,0 +1 @@ +{"step": 5896, "tokens_seen": 193200128, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.074571218769802} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json new file mode 100644 index 0000000000000000000000000000000000000000..0432b3c58323cd7f5dd48c27a1e844917fe1d44b --- /dev/null +++ b/checkpoints/metadata_000196608000.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.0832437060491005} \ No newline at end of file diff --git a/checkpoints/metadata_000204767232.json b/checkpoints/metadata_000204767232.json new file mode 100644 index 0000000000000000000000000000000000000000..4a114c9c60fd666f2bd8862462abac8d47912d70 --- /dev/null +++ b/checkpoints/metadata_000204767232.json @@ -0,0 +1 @@ +{"step": 6249, "tokens_seen": 204767232, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.041304005751662} \ No newline at end of file diff --git a/checkpoints/metadata_000212992000.json b/checkpoints/metadata_000212992000.json new file mode 100644 index 0000000000000000000000000000000000000000..1a43f3c10969031b102d60415b91121c6c5bb787 --- /dev/null +++ b/checkpoints/metadata_000212992000.json @@ -0,0 +1 @@ +{"step": 6500, "tokens_seen": 212992000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.0234418663745615} \ No newline at end of file diff --git a/checkpoints/metadata_000217055232.json 
diff --git a/checkpoints/metadata_000217055232.json b/checkpoints/metadata_000217055232.json new file mode 100644 index 0000000000000000000000000000000000000000..4e2cbffe5e1ff62655eaae87a25cd42989a7ac23 --- /dev/null +++ b/checkpoints/metadata_000217055232.json @@ -0,0 +1 @@ +{"step": 6624, "tokens_seen": 217055232, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.036090239519361} \ No newline at end of file diff --git a/checkpoints/metadata_000229376000.json b/checkpoints/metadata_000229376000.json new file mode 100644 index 0000000000000000000000000000000000000000..aeb96f2823435ff40a91b66319c65faf91a38071 --- /dev/null +++ b/checkpoints/metadata_000229376000.json @@ -0,0 +1 @@ +{"step": 7000, "tokens_seen": 229376000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.033123678728216} \ No newline at end of file diff --git a/checkpoints/metadata_000230096896.json b/checkpoints/metadata_000230096896.json new file mode 100644 index 0000000000000000000000000000000000000000..dda3fc30e58dea022f42fbd64b0abe282fe20d10 --- /dev/null +++ b/checkpoints/metadata_000230096896.json @@ -0,0 +1 @@ +{"step": 7022, "tokens_seen": 230096896, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true,
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 4.0227820953767095} \ No newline at end of file diff --git a/checkpoints/metadata_000243892224.json b/checkpoints/metadata_000243892224.json new file mode 100644 index 0000000000000000000000000000000000000000..1b1e7fcfb9cd07264de1efea11ef1aa10720b73d --- /dev/null +++ b/checkpoints/metadata_000243892224.json @@ -0,0 +1 @@ +{"step": 7443, "tokens_seen": 243892224, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9948527031880965} \ No newline at end of file diff --git a/checkpoints/metadata_000245760000.json b/checkpoints/metadata_000245760000.json new file mode 100644 index 0000000000000000000000000000000000000000..a22dbf767f708ce0ef4495533253678ca403c7b8 --- /dev/null +++ b/checkpoints/metadata_000245760000.json @@ -0,0 +1 @@ +{"step": 7500, "tokens_seen": 245760000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.997779211171014} \ No newline at end of file diff --git a/checkpoints/metadata_000258539520.json b/checkpoints/metadata_000258539520.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa5299e1855b2e43a21dfbddc2e41b1be057556 --- /dev/null +++ b/checkpoints/metadata_000258539520.json @@ -0,0 +1 @@ +{"step": 7890, "tokens_seen": 258539520, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.972808340164624} \ No newline at end of file diff --git a/checkpoints/metadata_000262144000.json b/checkpoints/metadata_000262144000.json new file mode 100644 index 0000000000000000000000000000000000000000..a21228ebcf6fcc561191dad749611271a21fa415 --- /dev/null +++ b/checkpoints/metadata_000262144000.json @@ -0,0 +1 @@ +{"step": 8000, "tokens_seen": 262144000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.970252707568018} \ No newline at end of file diff --git a/checkpoints/metadata_000274038784.json b/checkpoints/metadata_000274038784.json new file mode 100644 index 0000000000000000000000000000000000000000..a75f50602af14e271d612c780cd5dce46cfd6869 --- /dev/null +++ b/checkpoints/metadata_000274038784.json @@ -0,0 +1 @@ +{"step": 8363, "tokens_seen": 274038784, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9510172098955425} \ No newline at end of file diff --git a/checkpoints/metadata_000278528000.json b/checkpoints/metadata_000278528000.json new file mode 100644 index 0000000000000000000000000000000000000000..fd8e34a761b7d788501abf54a3045372406b6e37 --- /dev/null +++ b/checkpoints/metadata_000278528000.json @@ -0,0 +1 @@ +{"step": 8500, "tokens_seen": 278528000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9448079199034525} \ No newline at end of file diff --git a/checkpoints/metadata_000290488320.json b/checkpoints/metadata_000290488320.json new file mode 100644 index 0000000000000000000000000000000000000000..f34c3e81ea735d5dc7649e43896c101f059066ed --- /dev/null +++ b/checkpoints/metadata_000290488320.json @@ -0,0 +1 @@ +{"step": 8865, "tokens_seen": 290488320, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.956209574965382} \ No newline at end of file diff --git a/checkpoints/metadata_000294912000.json b/checkpoints/metadata_000294912000.json new file mode 100644 index 0000000000000000000000000000000000000000..ba7835579d400d64fedb6b655348bc4a6a5221ca --- /dev/null +++ b/checkpoints/metadata_000294912000.json @@ -0,0 +1 @@ +{"step": 9000, "tokens_seen": 294912000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9343922139800642} \ No newline at end of file diff --git a/checkpoints/metadata_000307920896.json 
b/checkpoints/metadata_000307920896.json new file mode 100644 index 0000000000000000000000000000000000000000..7b70679070d15e6cdb1d9771cc2c0f819e9b1600 --- /dev/null +++ b/checkpoints/metadata_000307920896.json @@ -0,0 +1 @@ +{"step": 9397, "tokens_seen": 307920896, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9302963166079246} \ No newline at end of file diff --git a/checkpoints/metadata_000311296000.json b/checkpoints/metadata_000311296000.json new file mode 100644 index 0000000000000000000000000000000000000000..35aea622a237c18788e38cf21e42dde08019b6af --- /dev/null +++ b/checkpoints/metadata_000311296000.json @@ -0,0 +1 @@ +{"step": 9500, "tokens_seen": 311296000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9303859482941124} \ No newline at end of file diff --git a/checkpoints/metadata_000326402048.json b/checkpoints/metadata_000326402048.json new file mode 100644 index 0000000000000000000000000000000000000000..23328991d4634bcfa692f2a5e1c16f2bfc4984ec --- /dev/null +++ b/checkpoints/metadata_000326402048.json @@ -0,0 +1 @@ +{"step": 9961, "tokens_seen": 326402048, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.9101114116989684} \ No newline at end of file diff --git a/checkpoints/metadata_000327680000.json b/checkpoints/metadata_000327680000.json new file mode 100644 index 0000000000000000000000000000000000000000..c1924186df7e973e5f9782836adedfe66c160c27 --- /dev/null +++ b/checkpoints/metadata_000327680000.json @@ -0,0 +1 @@ +{"step": 10000, "tokens_seen": 327680000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.914438457069278} \ No newline at end of file diff --git a/checkpoints/metadata_000344064000.json b/checkpoints/metadata_000344064000.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3486eaff5b4fe9a6a20076e248632bb45664d3 --- /dev/null +++ b/checkpoints/metadata_000344064000.json @@ -0,0 +1 @@ +{"step": 10500, "tokens_seen": 344064000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.885792193125623} \ No newline at end of file diff --git a/checkpoints/metadata_000345997312.json b/checkpoints/metadata_000345997312.json new file mode 100644 index 0000000000000000000000000000000000000000..29901e9eae26950a5d4dae73e73a381cf649ef5e --- /dev/null +++ b/checkpoints/metadata_000345997312.json @@ -0,0 +1 @@ +{"step": 10559, "tokens_seen": 345997312, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8769615286193746} \ No newline at end of file diff --git a/checkpoints/metadata_000360448000.json b/checkpoints/metadata_000360448000.json new file mode 100644 index 0000000000000000000000000000000000000000..a0791ca45de5cf308ea3540f92fc2984ddebd4dd --- /dev/null +++ b/checkpoints/metadata_000360448000.json @@ -0,0 +1 @@ +{"step": 11000, "tokens_seen": 360448000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.905163652820761} \ No newline at end of file diff --git a/checkpoints/metadata_000366739456.json b/checkpoints/metadata_000366739456.json new file mode 100644 index 0000000000000000000000000000000000000000..bcdc1d4842cd8fc11970b5bb7380b8bd2b98b6e6 --- /dev/null +++ b/checkpoints/metadata_000366739456.json @@ -0,0 +1 @@ +{"step": 11192, "tokens_seen": 366739456, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.900713432646579} \ No newline at end of file diff --git a/checkpoints/metadata_000376832000.json b/checkpoints/metadata_000376832000.json new file mode 100644 index 0000000000000000000000000000000000000000..664909ad49e5df2c68dc2521d817340a86261b38 --- /dev/null +++ b/checkpoints/metadata_000376832000.json @@ -0,0 +1 @@ +{"step": 11500, "tokens_seen": 376832000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.898993206418362} \ No newline at end of file diff --git a/checkpoints/metadata_000388759552.json b/checkpoints/metadata_000388759552.json new file mode 100644 index 0000000000000000000000000000000000000000..4defc7a2687555f071d5f0303a9f4b678f09524b --- /dev/null +++ b/checkpoints/metadata_000388759552.json @@ -0,0 +1 @@ +{"step": 11864, "tokens_seen": 388759552, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8632653729659356} \ No newline at end of file diff --git a/checkpoints/metadata_000393216000.json b/checkpoints/metadata_000393216000.json new file mode 100644 index 0000000000000000000000000000000000000000..9fbf927a39915987485375d7ffc41aafd2e7ec56 --- /dev/null +++ b/checkpoints/metadata_000393216000.json @@ -0,0 +1 @@ +{"step": 12000, "tokens_seen": 393216000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.864027154106854} \ No newline at end of file diff --git a/checkpoints/metadata_000409600000.json 
b/checkpoints/metadata_000409600000.json new file mode 100644 index 0000000000000000000000000000000000000000..17848ecf99010d9851b719de0d6d57bb4e6de1d1 --- /dev/null +++ b/checkpoints/metadata_000409600000.json @@ -0,0 +1 @@ +{"step": 12500, "tokens_seen": 409600000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8828461697001706} \ No newline at end of file diff --git a/checkpoints/metadata_000412090368.json b/checkpoints/metadata_000412090368.json new file mode 100644 index 0000000000000000000000000000000000000000..a4d2a86317a05a1257a99cc6c8b4404761c79256 --- /dev/null +++ b/checkpoints/metadata_000412090368.json @@ -0,0 +1 @@ +{"step": 12576, "tokens_seen": 412090368, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8828947585945177} \ No newline at end of file diff --git a/checkpoints/metadata_000425984000.json b/checkpoints/metadata_000425984000.json new file mode 100644 index 0000000000000000000000000000000000000000..b86a5c1d2bbb0ea2d8a60da49a9eff4f8dd11248 --- /dev/null +++ b/checkpoints/metadata_000425984000.json @@ -0,0 +1 @@ +{"step": 13000, "tokens_seen": 425984000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.86583878932724} \ No newline at end of file diff --git a/checkpoints/metadata_000436797440.json b/checkpoints/metadata_000436797440.json new file mode 100644 index 0000000000000000000000000000000000000000..1337d36e9ec28145278885e17d89b937d6ea0399 --- /dev/null +++ b/checkpoints/metadata_000436797440.json @@ -0,0 +1 @@ +{"step": 13330, "tokens_seen": 436797440, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8491479497943057} \ No newline at end of file diff --git a/checkpoints/metadata_000442368000.json b/checkpoints/metadata_000442368000.json new file mode 100644 index 0000000000000000000000000000000000000000..9cfb31c94b39b622315434fd38503d8b8106324e --- /dev/null +++ b/checkpoints/metadata_000442368000.json @@ -0,0 +1 @@ +{"step": 13500, "tokens_seen": 442368000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8500606553515695} \ No newline at end of file diff --git a/checkpoints/metadata_000458752000.json b/checkpoints/metadata_000458752000.json new file mode 100644 index 0000000000000000000000000000000000000000..7eb4c7f41eec1ea5e447d985fbd1031c8024a40c --- /dev/null +++ b/checkpoints/metadata_000458752000.json @@ -0,0 +1 @@ +{"step": 14000, "tokens_seen": 458752000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.843897773861025} \ No newline at end of file diff --git a/checkpoints/metadata_000463011840.json b/checkpoints/metadata_000463011840.json new file mode 100644 index 0000000000000000000000000000000000000000..17b1b23d7a2f7f95a2173b0866c725f7fe753dea --- /dev/null +++ b/checkpoints/metadata_000463011840.json @@ -0,0 +1 @@ +{"step": 14130, "tokens_seen": 463011840, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8467455188255424} \ No newline at end of file diff --git a/checkpoints/metadata_000475136000.json b/checkpoints/metadata_000475136000.json new file mode 100644 index 0000000000000000000000000000000000000000..c26c6a419d3bf738b020234f24883b5727311658 --- /dev/null +++ b/checkpoints/metadata_000475136000.json @@ -0,0 +1 @@ +{"step": 14500, "tokens_seen": 475136000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.850923935727171} \ No newline at end of file diff --git a/checkpoints/metadata_000490799104.json b/checkpoints/metadata_000490799104.json new file mode 100644 index 0000000000000000000000000000000000000000..684f40c21df4d50a56c1232977dcc1be65928cb6 --- /dev/null +++ b/checkpoints/metadata_000490799104.json @@ -0,0 +1 @@ +{"step": 14978, "tokens_seen": 490799104, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8490084243306546} \ No newline at end of file diff --git a/checkpoints/metadata_000491520000.json b/checkpoints/metadata_000491520000.json new file mode 100644 index 0000000000000000000000000000000000000000..a467bd99abc5cfa4610f6f74f317bf58d56f7fa3 --- /dev/null +++ b/checkpoints/metadata_000491520000.json @@ -0,0 +1 @@ +{"step": 15000, "tokens_seen": 491520000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.835874819447651} \ No newline at end of file diff --git a/checkpoints/metadata_000507904000.json b/checkpoints/metadata_000507904000.json new file mode 100644 index 0000000000000000000000000000000000000000..fdd41cca74113aeebdbb9763f0fbf1b7988a134e --- /dev/null +++ b/checkpoints/metadata_000507904000.json @@ -0,0 +1 @@ +{"step": 15500, "tokens_seen": 507904000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.808097702327077} \ No newline at end of file diff --git a/checkpoints/metadata_000520257536.json 
b/checkpoints/metadata_000520257536.json new file mode 100644 index 0000000000000000000000000000000000000000..b4b10a9dd31f752a842b466b9f20e19a20a2ecb7 --- /dev/null +++ b/checkpoints/metadata_000520257536.json @@ -0,0 +1 @@ +{"step": 15877, "tokens_seen": 520257536, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.81473325918689} \ No newline at end of file diff --git a/checkpoints/metadata_000524288000.json b/checkpoints/metadata_000524288000.json new file mode 100644 index 0000000000000000000000000000000000000000..45c18fd6a2d015e4cfc79fc331ca6b052d08a14f --- /dev/null +++ b/checkpoints/metadata_000524288000.json @@ -0,0 +1 @@ +{"step": 16000, "tokens_seen": 524288000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8238459922026427} \ No newline at end of file diff --git a/checkpoints/metadata_000540672000.json b/checkpoints/metadata_000540672000.json new file mode 100644 index 0000000000000000000000000000000000000000..aede56380ebe2bb7aaa1de44af1d42b99d43a165 --- /dev/null +++ b/checkpoints/metadata_000540672000.json @@ -0,0 +1 @@ +{"step": 16500, "tokens_seen": 540672000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.81268601542414} \ No newline at end of file diff --git a/checkpoints/metadata_000551452672.json b/checkpoints/metadata_000551452672.json new file mode 100644 index 0000000000000000000000000000000000000000..52da24d5094455ae77cfb550cdb9d498e0accab2 --- /dev/null +++ b/checkpoints/metadata_000551452672.json @@ -0,0 +1 @@ +{"step": 16829, "tokens_seen": 551452672, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.8149298887722742} \ No newline at end of file diff --git a/checkpoints/metadata_000557056000.json b/checkpoints/metadata_000557056000.json new file mode 100644 index 0000000000000000000000000000000000000000..85d383c0dd7f833b0f1f9762c376150144af46be --- /dev/null +++ b/checkpoints/metadata_000557056000.json @@ -0,0 +1 @@ +{"step": 17000, "tokens_seen": 557056000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.808461619421968} \ No newline at end of file diff --git a/checkpoints/metadata_000573440000.json b/checkpoints/metadata_000573440000.json new file mode 100644 index 0000000000000000000000000000000000000000..1a1fdfc62ccbbe08594e99b81498127cf1d45ba0 --- /dev/null +++ b/checkpoints/metadata_000573440000.json @@ -0,0 +1 @@ +{"step": 17500, "tokens_seen": 573440000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.805556827585357} \ No newline at end of file diff --git a/checkpoints/metadata_000584548352.json b/checkpoints/metadata_000584548352.json new file mode 100644 index 0000000000000000000000000000000000000000..adb58bcdabb8c7a6b003ddf167b3e759a22a36b9 --- /dev/null +++ b/checkpoints/metadata_000584548352.json @@ -0,0 +1 @@ +{"step": 17839, "tokens_seen": 584548352, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.792718748641675} \ No newline at end of file diff --git a/checkpoints/metadata_000589824000.json b/checkpoints/metadata_000589824000.json new file mode 100644 index 0000000000000000000000000000000000000000..96c594999ac7114b5e1a11e0ecab6df552e7340c --- /dev/null +++ b/checkpoints/metadata_000589824000.json @@ -0,0 +1 @@ +{"step": 18000, "tokens_seen": 589824000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7863804430856365} \ No newline at end of file diff --git a/checkpoints/metadata_000606208000.json b/checkpoints/metadata_000606208000.json new file mode 100644 index 0000000000000000000000000000000000000000..ef74ecc079d35d8a77a5c43ebeab7f88a341140b --- /dev/null +++ b/checkpoints/metadata_000606208000.json @@ -0,0 +1 @@ +{"step": 18500, "tokens_seen": 606208000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.796104354278756} \ No newline at end of file diff --git a/checkpoints/metadata_000619610112.json b/checkpoints/metadata_000619610112.json new file mode 100644 index 0000000000000000000000000000000000000000..efdbf08eebafebce222be97f4562e18bb264f35f --- /dev/null +++ b/checkpoints/metadata_000619610112.json @@ -0,0 +1 @@ +{"step": 18909, "tokens_seen": 619610112, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7990546419835507} \ No newline at end of file diff --git a/checkpoints/metadata_000622592000.json b/checkpoints/metadata_000622592000.json new file mode 100644 index 0000000000000000000000000000000000000000..e77687a03a4cfd70a3dd7d011459f4f6f0388331 --- /dev/null +++ b/checkpoints/metadata_000622592000.json @@ -0,0 +1 @@ +{"step": 19000, "tokens_seen": 622592000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.796924407855164} \ No newline at end of file diff --git a/checkpoints/metadata_000638976000.json 
b/checkpoints/metadata_000638976000.json new file mode 100644 index 0000000000000000000000000000000000000000..81c3fac7eafa2ee7a44e036f68ccc207042c4430 --- /dev/null +++ b/checkpoints/metadata_000638976000.json @@ -0,0 +1 @@ +{"step": 19500, "tokens_seen": 638976000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.788573007063493} \ No newline at end of file diff --git a/checkpoints/metadata_000655360000.json b/checkpoints/metadata_000655360000.json new file mode 100644 index 0000000000000000000000000000000000000000..e6c7a921fb4c66e6257dd68b10cc7152c8f54da3 --- /dev/null +++ b/checkpoints/metadata_000655360000.json @@ -0,0 +1 @@ +{"step": 20000, "tokens_seen": 655360000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.777318569743358} \ No newline at end of file diff --git a/checkpoints/metadata_000656801792.json b/checkpoints/metadata_000656801792.json new file mode 100644 index 0000000000000000000000000000000000000000..e73dd76174bec8fe5c7dc387413612d686bc73f5 --- /dev/null +++ b/checkpoints/metadata_000656801792.json @@ -0,0 +1 @@ +{"step": 20044, "tokens_seen": 656801792, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7824693473711153} \ No newline at end of file diff --git a/checkpoints/metadata_000671744000.json b/checkpoints/metadata_000671744000.json new file mode 100644 index 0000000000000000000000000000000000000000..a8d3920f933f44d4ce2a4ac91f225e908d8134c7 --- /dev/null +++ b/checkpoints/metadata_000671744000.json @@ -0,0 +1 @@ +{"step": 20500, "tokens_seen": 671744000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.798922608613771} \ No newline at end of file diff --git a/checkpoints/metadata_000688128000.json b/checkpoints/metadata_000688128000.json new file mode 100644 index 0000000000000000000000000000000000000000..30c4a1cc737cd3e6108dbaa6302796f7f269d994 --- /dev/null +++ b/checkpoints/metadata_000688128000.json @@ -0,0 +1 @@ +{"step": 21000, "tokens_seen": 688128000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7927069691251267} \ No newline at end of file diff --git a/checkpoints/metadata_000696221696.json b/checkpoints/metadata_000696221696.json new file mode 100644 index 0000000000000000000000000000000000000000..9bde840f933acb577814e997b9e0e044c5b08af1 --- /dev/null +++ b/checkpoints/metadata_000696221696.json @@ -0,0 +1 @@ +{"step": 21247, "tokens_seen": 696221696, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.782485726713721} \ No newline at end of file diff --git a/checkpoints/metadata_000704512000.json b/checkpoints/metadata_000704512000.json new file mode 100644 index 0000000000000000000000000000000000000000..dc6b59666321b60405f973f4d2969d684a4d0df8 --- /dev/null +++ b/checkpoints/metadata_000704512000.json @@ -0,0 +1 @@ +{"step": 21500, "tokens_seen": 704512000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7788728458018146} \ No newline at end of file diff --git a/checkpoints/metadata_000720896000.json b/checkpoints/metadata_000720896000.json new file mode 100644 index 0000000000000000000000000000000000000000..5c26b88d287db3695e23be3860349cd8114d2154 --- /dev/null +++ b/checkpoints/metadata_000720896000.json @@ -0,0 +1 @@ +{"step": 22000, "tokens_seen": 720896000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.769537903981933} \ No newline at end of file diff --git a/checkpoints/metadata_000737280000.json b/checkpoints/metadata_000737280000.json new file mode 100644 index 0000000000000000000000000000000000000000..9b4bafde43cc863f19eaaa9fdb3aebc333ff58cb --- /dev/null +++ b/checkpoints/metadata_000737280000.json @@ -0,0 +1 @@ +{"step": 22500, "tokens_seen": 737280000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, 
"d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7742409902862373} \ No newline at end of file diff --git a/checkpoints/metadata_000738000896.json b/checkpoints/metadata_000738000896.json new file mode 100644 index 0000000000000000000000000000000000000000..27b371814a6306e4f2e9cadf8fc94514dfa889a3 --- /dev/null +++ b/checkpoints/metadata_000738000896.json @@ -0,0 +1 @@ +{"step": 22522, "tokens_seen": 738000896, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7736800340420373} \ No newline at end of file diff --git a/checkpoints/metadata_000753664000.json b/checkpoints/metadata_000753664000.json new file mode 100644 index 0000000000000000000000000000000000000000..e9902c9a5950c6fb300a96962240deeafb4e0bea --- /dev/null +++ b/checkpoints/metadata_000753664000.json @@ -0,0 +1 @@ +{"step": 23000, "tokens_seen": 753664000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7704362950826225} \ No newline at end of file diff --git a/checkpoints/metadata_000770048000.json 
b/checkpoints/metadata_000770048000.json new file mode 100644 index 0000000000000000000000000000000000000000..e1dbe1f88f507717647f2b82743dbcf50842d5ec --- /dev/null +++ b/checkpoints/metadata_000770048000.json @@ -0,0 +1 @@ +{"step": 23500, "tokens_seen": 770048000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.769726706476158} \ No newline at end of file diff --git a/checkpoints/metadata_000782270464.json b/checkpoints/metadata_000782270464.json new file mode 100644 index 0000000000000000000000000000000000000000..ae196e954c167fef9aff24c477be52648ac53657 --- /dev/null +++ b/checkpoints/metadata_000782270464.json @@ -0,0 +1 @@ +{"step": 23873, "tokens_seen": 782270464, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7606527127206024} \ No newline at end of file diff --git a/checkpoints/metadata_000786432000.json b/checkpoints/metadata_000786432000.json new file mode 100644 index 0000000000000000000000000000000000000000..aaf8c98cb3a2ea2547ee9ace1c3d635ba7992644 --- /dev/null +++ b/checkpoints/metadata_000786432000.json @@ -0,0 +1 @@ +{"step": 24000, "tokens_seen": 786432000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7616511859688004} \ No newline at end of file diff --git a/checkpoints/metadata_000802816000.json b/checkpoints/metadata_000802816000.json new file mode 100644 index 0000000000000000000000000000000000000000..336f41aec442e218025e5bb3a10bab429646cf98 --- /dev/null +++ b/checkpoints/metadata_000802816000.json @@ -0,0 +1 @@ +{"step": 24500, "tokens_seen": 802816000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.746339684783483} \ No newline at end of file diff --git a/checkpoints/metadata_000819200000.json b/checkpoints/metadata_000819200000.json new file mode 100644 index 0000000000000000000000000000000000000000..f115432dd4236c0e0b9e124cf7530af3a1cb0488 --- /dev/null +++ b/checkpoints/metadata_000819200000.json @@ -0,0 +1 @@ +{"step": 25000, "tokens_seen": 819200000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7673903613775885} \ No newline at end of file diff --git a/checkpoints/metadata_000829194240.json b/checkpoints/metadata_000829194240.json new file mode 100644 index 0000000000000000000000000000000000000000..4b92e435f838bdb01559d11333b46afe843ee896 --- /dev/null +++ b/checkpoints/metadata_000829194240.json @@ -0,0 +1 @@ +{"step": 25305, "tokens_seen": 829194240, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, 
"batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7740983256536427} \ No newline at end of file diff --git a/checkpoints/metadata_000835584000.json b/checkpoints/metadata_000835584000.json new file mode 100644 index 0000000000000000000000000000000000000000..e4fda317b9fd828774f3751824aeaa2be8494394 --- /dev/null +++ b/checkpoints/metadata_000835584000.json @@ -0,0 +1 @@ +{"step": 25500, "tokens_seen": 835584000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7642748790318996} \ No newline at end of file diff --git a/checkpoints/metadata_000851968000.json b/checkpoints/metadata_000851968000.json new file mode 100644 index 0000000000000000000000000000000000000000..d3c48fcb32361b41df0b7aa0b5b1638af22fe60f --- /dev/null +++ b/checkpoints/metadata_000851968000.json @@ -0,0 +1 @@ +{"step": 26000, "tokens_seen": 851968000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7511879017603555} \ No newline at end of file diff --git a/checkpoints/metadata_000868352000.json b/checkpoints/metadata_000868352000.json new file mode 100644 index 0000000000000000000000000000000000000000..9d02f2059fd20d46f9876308f64b086c9ca6463c --- /dev/null +++ b/checkpoints/metadata_000868352000.json @@ -0,0 +1 @@ +{"step": 26500, "tokens_seen": 868352000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 
512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.753367624283758} \ No newline at end of file diff --git a/checkpoints/metadata_000878968832.json b/checkpoints/metadata_000878968832.json new file mode 100644 index 0000000000000000000000000000000000000000..aefbec088e982a81b504d78d560ba2bce50ca124 --- /dev/null +++ b/checkpoints/metadata_000878968832.json @@ -0,0 +1 @@ +{"step": 26824, "tokens_seen": 878968832, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.759942734654088} \ No newline at end of file diff --git a/checkpoints/metadata_000884736000.json b/checkpoints/metadata_000884736000.json new file mode 100644 index 0000000000000000000000000000000000000000..c91f3955193ae9ed8097e4caac003c0a4b892c6b --- /dev/null +++ b/checkpoints/metadata_000884736000.json @@ -0,0 +1 @@ +{"step": 27000, "tokens_seen": 884736000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.746616014112979} \ No newline at end of file diff --git a/checkpoints/metadata_000901120000.json 
b/checkpoints/metadata_000901120000.json new file mode 100644 index 0000000000000000000000000000000000000000..eb02d1e081533673129eb6e18d0e2f685daf7c96 --- /dev/null +++ b/checkpoints/metadata_000901120000.json @@ -0,0 +1 @@ +{"step": 27500, "tokens_seen": 901120000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7678496723953496} \ No newline at end of file diff --git a/checkpoints/metadata_000917504000.json b/checkpoints/metadata_000917504000.json new file mode 100644 index 0000000000000000000000000000000000000000..d2667bec0bc421e8f88e84ced1ce6cd704efdf92 --- /dev/null +++ b/checkpoints/metadata_000917504000.json @@ -0,0 +1 @@ +{"step": 28000, "tokens_seen": 917504000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.736450704236086} \ No newline at end of file diff --git a/checkpoints/metadata_000931692544.json b/checkpoints/metadata_000931692544.json new file mode 100644 index 0000000000000000000000000000000000000000..71d3091448d1c21ab1221bf41569db22b1818759 --- /dev/null +++ b/checkpoints/metadata_000931692544.json @@ -0,0 +1 @@ +{"step": 28433, "tokens_seen": 931692544, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, 
"checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.744846356522433} \ No newline at end of file diff --git a/checkpoints/metadata_000933888000.json b/checkpoints/metadata_000933888000.json new file mode 100644 index 0000000000000000000000000000000000000000..ef30fbe160ad4d08c2a2d25ce4f1c87815e84574 --- /dev/null +++ b/checkpoints/metadata_000933888000.json @@ -0,0 +1 @@ +{"step": 28500, "tokens_seen": 933888000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7600422555952} \ No newline at end of file diff --git a/checkpoints/metadata_000950272000.json b/checkpoints/metadata_000950272000.json new file mode 100644 index 0000000000000000000000000000000000000000..e0723c2391c35dde156e8b72df144e1e9121c5a0 --- /dev/null +++ b/checkpoints/metadata_000950272000.json @@ -0,0 +1 @@ +{"step": 29000, "tokens_seen": 950272000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7399442408263717} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..03511d91e914d121706074672ed3717e4b31658a --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ce67efbdfc3c9279831326e38cbb39268eb87b8627e4d8e3239dbba09c7ac9 +size 225208789 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..136e23b4e34678332482b287b0570aba65b1021b --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:abd8224967ef39ff5ce2998bc1336b7b256e4c96f3cec5544a8ae74d50ec371c +size 225208789 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..c549d4f9976f13ccbbd13bfd13f0fdfa456a03ae --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd03f58d8b2b7678be07b44f500190eb19838966b4d24c832b885f2bc68fa1d +size 225208789 diff --git a/checkpoints/model_weights_000000393216.pt b/checkpoints/model_weights_000000393216.pt new file mode 100644 index 0000000000000000000000000000000000000000..d24a9bdd57c34164e363302d52990e1fe1d3957d --- /dev/null +++ b/checkpoints/model_weights_000000393216.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbdfa3d5c09b20066255edac890d58eeffbe7f2d57af8d3406f3ce423ba59864 +size 225208789 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8409b5f1c7c240e634797a9292cec3e3c719f3b --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e28146fcd9777a2a9655d9393140078bc3bf6956f4f7e2e1b410a9866c6ec8 +size 225208789 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..52bfb38f907e8295fa27092987f10ff5f38c5e9b --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615af50552c12006c49986b9fc452be6a69a9c474deb1eb2f25968f42bc9cda5 +size 225208789 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..72919458ca10df4e01198cde97e537cc34a38b2c --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02dfad7fd5fd026b6250dcbe2eae4976ddf4640128bd473b807d705b529fb307 +size 225208789 diff --git a/checkpoints/model_weights_000000524288.pt b/checkpoints/model_weights_000000524288.pt new file mode 100644 index 0000000000000000000000000000000000000000..317251cd7f351a9fac15f0fd405b22dc2630dd16 --- /dev/null +++ b/checkpoints/model_weights_000000524288.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d660e08391626af8af4175a8c21376e46c73072c3a037b170cd0d20585949c +size 225208789 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..06ec647ab9af9616e35a7502bf62390e44bf53db --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb9740b1c817e6e4bdc418b49131190249f18b6901942f5ab3db05cef46a20a +size 225208789 diff --git a/checkpoints/model_weights_000000589824.pt b/checkpoints/model_weights_000000589824.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f65dfacb7c27a20969886e167f26086d3a97ef0 --- /dev/null +++ b/checkpoints/model_weights_000000589824.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f9ba4f93c60c130837ad4670cd613b1db1e8a38dbc25c0088148f543561bd7e +size 225208789 diff --git 
a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt new file mode 100644 index 0000000000000000000000000000000000000000..74c04819d69b5b4db542ed39ae2f817f3502fac8 --- /dev/null +++ b/checkpoints/model_weights_000000622592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d7b30ca477bcbcff681ce439245b42e43b2408e878efe1175edfda3f9f5c36c +size 225208789 diff --git a/checkpoints/model_weights_000000655360.pt b/checkpoints/model_weights_000000655360.pt new file mode 100644 index 0000000000000000000000000000000000000000..7df57917fe04327b9f7738291c548145c502d7d5 --- /dev/null +++ b/checkpoints/model_weights_000000655360.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b6a35a6103806a4831a21c40b0acc2d0871867fc091c2e369abd38a7876e638 +size 225208789 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..4196c717cb4a0086ec62b8c85fa894ee50321999 --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1e0cbe7625a0198a8433eed2d77ea0e4a4ab1b51fd9819e82f6b68d5c05066 +size 225208789 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea468d010398536bbee506a2a84b0a0d4b095e8 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7c44174f03318fef39f0cc303d3030e9de915bd205f9c21d08946406bab55b +size 225208789 diff --git a/checkpoints/model_weights_000000786432.pt b/checkpoints/model_weights_000000786432.pt new file mode 100644 index 0000000000000000000000000000000000000000..d25edcf47f39840d655f2cd91cf1e93b017b2a56 --- /dev/null +++ b/checkpoints/model_weights_000000786432.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0998d8d494a50b511096a54859f543b02aa1f93720f4ddd1056ad47525bed868 +size 225208789 diff --git a/checkpoints/model_weights_000000851968.pt b/checkpoints/model_weights_000000851968.pt new file mode 100644 index 0000000000000000000000000000000000000000..a30103a4ce32c6d3058f8bad83ab206dffdff19d --- /dev/null +++ b/checkpoints/model_weights_000000851968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc237541f66602853ac1e9545b425a641cd46ae63f03fcad95546197eb87418e +size 225208789 diff --git a/checkpoints/model_weights_000000884736.pt b/checkpoints/model_weights_000000884736.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0470affcfb0495b7c19bc3c07e775d00916bfa1 --- /dev/null +++ b/checkpoints/model_weights_000000884736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06007542bf6b787dacf8e4e3d5bf1ef31222aaff4d41920af9a3cb2ecc58d769 +size 225208789 diff --git a/checkpoints/model_weights_000000950272.pt b/checkpoints/model_weights_000000950272.pt new file mode 100644 index 0000000000000000000000000000000000000000..113849db0c86eb4fb83a8b215c5b0a6fa635c2be --- /dev/null +++ b/checkpoints/model_weights_000000950272.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:526d45839b1faa9166cd4de418ad5de6f5ca85cff088297dc680430738c63c8e +size 225208789 diff --git a/checkpoints/model_weights_000001015808.pt b/checkpoints/model_weights_000001015808.pt new file mode 100644 index 
0000000000000000000000000000000000000000..83132b65992cb2b12cea0f619a89b99403d7c424 --- /dev/null +++ b/checkpoints/model_weights_000001015808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c993eed398231c30494f46866ecd1c0228ab179160114280accdcb67fcf96263 +size 225208789 diff --git a/checkpoints/model_weights_000001048576.pt b/checkpoints/model_weights_000001048576.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ee5fc3a075b750dd90e8214a0aeeccd32fc9379 --- /dev/null +++ b/checkpoints/model_weights_000001048576.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f890b363ed1f602c13bcfbf79d4ef72c0322685b12e326e699f16502dce2d6 +size 225208789 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..92d3af9da5f57e09f4f9c81d30729fccb2f61f9b --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73363e61f893789d38ab79631c441aa3a8a5aa9c97f4f6f03f072b60ef1a2f15 +size 225208789 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd85fe85a0283037493224366f65c4fd702d65b1 --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:285ac1a591080a20e6d3cf467226d137209f01d6d4c6541b5d2f1e6ed4386434 +size 225208789 diff --git a/checkpoints/model_weights_000001277952.pt b/checkpoints/model_weights_000001277952.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3f2244dca0c17ea83995a70c1a27c6466e74a82 --- /dev/null +++ b/checkpoints/model_weights_000001277952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd846d5cd0dbe9bba064220b8fa1257b3da340b1dad5ef4d39190d96feb0d7b +size 225208789 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef24f83c2eb0a93feb2cd6a324a562fdab8406d5 --- /dev/null +++ b/checkpoints/model_weights_000001343488.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c76557a69ca761b697594b3906c4807154a1c6ac1b06639028c13b137ff7f81a +size 225208789 diff --git a/checkpoints/model_weights_000001441792.pt b/checkpoints/model_weights_000001441792.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a5cd8335c9e0c8725d5b39904185af7d55cd678 --- /dev/null +++ b/checkpoints/model_weights_000001441792.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70db509e6c1f2f2b85007135bec56fe4dfc13bb37a204a077f2fc46b257463df +size 225208789 diff --git a/checkpoints/model_weights_000001507328.pt b/checkpoints/model_weights_000001507328.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cc4e60f281691b040f6271affe0f300f16c129c --- /dev/null +++ b/checkpoints/model_weights_000001507328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cb472ac3709e7e9c58461f569ce9f2a51d660e6946ca42f26d8fff7462a7329 +size 225208789 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f31f277eae59d24e2b8e8f39559cc3c37403be --- /dev/null +++ b/checkpoints/model_weights_000001605632.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:44769e5f4bc45ee6d5a5917d948cca26e237d20c700bcf6530808a27163d5813 +size 225208789 diff --git a/checkpoints/model_weights_000001703936.pt b/checkpoints/model_weights_000001703936.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a3c094264a0a18e4eff971da73f80adfb6e5d66 --- /dev/null +++ b/checkpoints/model_weights_000001703936.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:982c66ea6ba4b4582e6a90a0f676ae667eae9fdd7efc822dc9e127ebc492d9f6 +size 225208789 diff --git a/checkpoints/model_weights_000001802240.pt b/checkpoints/model_weights_000001802240.pt new file mode 100644 index 0000000000000000000000000000000000000000..531903abd1d3c8302c0200133d8e8168bc2aee9c --- /dev/null +++ b/checkpoints/model_weights_000001802240.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1820d4a3c51c3a6827ebcd10ef6bb98028e11cdcd5c5f4a43e28beb5a8139283 +size 225208789 diff --git a/checkpoints/model_weights_000001933312.pt b/checkpoints/model_weights_000001933312.pt new file mode 100644 index 0000000000000000000000000000000000000000..3315439829dbfb6b252d66343ee124501ed2328a --- /dev/null +++ b/checkpoints/model_weights_000001933312.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fdefb9adf4a0c9c727ae33ef4b902a4861cdbd951e219bb4bbbf14f3a8510a +size 225208789 diff --git a/checkpoints/model_weights_000002031616.pt b/checkpoints/model_weights_000002031616.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b581720c4bae53ffeae359ee2aa5f908d11c9bd --- /dev/null +++ b/checkpoints/model_weights_000002031616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0541b1b5b6efab277da95e3cdc5d279471c123a5198cba6d90e72ab191315308 +size 225208789 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt new file mode 100644 index 0000000000000000000000000000000000000000..90250ad20793763bdb511232522707d4b2826b8a --- /dev/null +++ b/checkpoints/model_weights_000002162688.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a401f080e17bb1c655276c32c5ff0cc8f60d49b58b936eca937585e912ae7b28 +size 225208789 diff --git a/checkpoints/model_weights_000002293760.pt b/checkpoints/model_weights_000002293760.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f1235106cf151dc4ce686a3d37d9847da499b1c --- /dev/null +++ b/checkpoints/model_weights_000002293760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe73ff40decc088564210407daf648bb813a675455345d2bb5db115966bb450 +size 225208789 diff --git a/checkpoints/model_weights_000002424832.pt b/checkpoints/model_weights_000002424832.pt new file mode 100644 index 0000000000000000000000000000000000000000..3442e3d9e8c079043d74d4ee11ecdf039cec3a14 --- /dev/null +++ b/checkpoints/model_weights_000002424832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:304878f12fa1a3bbc92e562e5d78a0f66fd67071d95dec14ffcaa8a7cb06c546 +size 225208789 diff --git a/checkpoints/model_weights_000002588672.pt b/checkpoints/model_weights_000002588672.pt new file mode 100644 index 0000000000000000000000000000000000000000..4847400bcfa77c928d5c09850e4c45637bd25238 --- /dev/null +++ b/checkpoints/model_weights_000002588672.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b196e20035265e0db7c60b538592dfd9cba868ad8ced45a266aec80d9ad8131d +size 225208789 diff --git 
a/checkpoints/model_weights_000002719744.pt b/checkpoints/model_weights_000002719744.pt new file mode 100644 index 0000000000000000000000000000000000000000..afc0821754a2252e53d5292d258aaa3be5075736 --- /dev/null +++ b/checkpoints/model_weights_000002719744.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df379d0de5fd2d829b32f2193cc65d0b20b51d69eac9858ed07163895539beaa +size 225208789 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt new file mode 100644 index 0000000000000000000000000000000000000000..88b96a44e12af691c1e59d9a8c91af61aa03ae97 --- /dev/null +++ b/checkpoints/model_weights_000002883584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba85e7ca9938aff9f3eb0fa8b5537a29455eecd049020928a9f4102c54fe6f94 +size 225208789 diff --git a/checkpoints/model_weights_000003080192.pt b/checkpoints/model_weights_000003080192.pt new file mode 100644 index 0000000000000000000000000000000000000000..810c0f97aadaeed1a5a0a84e728673bd251b0dd2 --- /dev/null +++ b/checkpoints/model_weights_000003080192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fdc1e5b3181d28130a35c5f9418632a1fb4416b4560a15d70dd77e777841de4 +size 225208789 diff --git a/checkpoints/model_weights_000003244032.pt b/checkpoints/model_weights_000003244032.pt new file mode 100644 index 0000000000000000000000000000000000000000..98e2301c30f3ae88dba3f3d00863a72feebbb6a9 --- /dev/null +++ b/checkpoints/model_weights_000003244032.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cbfc63f7483efe932253b18043bdea698b07a60302af98d26924248545a9113 +size 225208789 diff --git a/checkpoints/model_weights_000003440640.pt b/checkpoints/model_weights_000003440640.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3b9260b5eb13835403af2d6af3b4dbf376ad69e --- /dev/null +++ b/checkpoints/model_weights_000003440640.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e53cb83f9630665d5833cf13a1a6ff8f7c9505e9792f769b5361e7f409138f4 +size 225208789 diff --git a/checkpoints/model_weights_000003670016.pt b/checkpoints/model_weights_000003670016.pt new file mode 100644 index 0000000000000000000000000000000000000000..888c478d694cf0764040a4c834812812bc61cc12 --- /dev/null +++ b/checkpoints/model_weights_000003670016.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4082322483859fdfcb1c4aa70367393bf62fae2faf1542597863c6e8d844014a +size 225208789 diff --git a/checkpoints/model_weights_000003866624.pt b/checkpoints/model_weights_000003866624.pt new file mode 100644 index 0000000000000000000000000000000000000000..4af9ce4fb39bb192c32b8ecd1255d876365c49f8 --- /dev/null +++ b/checkpoints/model_weights_000003866624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5f01cb6987d38bf247a3261ba4ce980f9754278f145e7443994446f808f630 +size 225208789 diff --git a/checkpoints/model_weights_000004128768.pt b/checkpoints/model_weights_000004128768.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcf7f0a9df3eb292935634f06fa9c757f8049bc8 --- /dev/null +++ b/checkpoints/model_weights_000004128768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:573b3ac1b1f11f9aa279d25c8727ef67b2e135770bea7a516c9b4e5049a3e403 +size 225208789 diff --git a/checkpoints/model_weights_000004358144.pt b/checkpoints/model_weights_000004358144.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d4e2e902c601e859c01c38022207578de38d1f6a --- /dev/null +++ b/checkpoints/model_weights_000004358144.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8342b981f49ba9b79b23eb9b6a794c8d3fc04a2dbb9bb79136e20fa840a6539e +size 225208789 diff --git a/checkpoints/model_weights_000004620288.pt b/checkpoints/model_weights_000004620288.pt new file mode 100644 index 0000000000000000000000000000000000000000..e57bded1141f27a1113dd4c2fe99cdd7757f4675 --- /dev/null +++ b/checkpoints/model_weights_000004620288.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:744799480631ff631e3568684390ba0fb5f7a404f8ff21b21e863a58e66df125 +size 225208789 diff --git a/checkpoints/model_weights_000004915200.pt b/checkpoints/model_weights_000004915200.pt new file mode 100644 index 0000000000000000000000000000000000000000..72477f169382ff330c18bf761fc40a5273048b09 --- /dev/null +++ b/checkpoints/model_weights_000004915200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f15030a55ad6c05ee9d2c1993fa51c097f9f6e9bcc0d05ace49a4ebb51dc388f +size 225208789 diff --git a/checkpoints/model_weights_000005210112.pt b/checkpoints/model_weights_000005210112.pt new file mode 100644 index 0000000000000000000000000000000000000000..465c40a8272a483f45e9de6dab26b363ddeaa3ed --- /dev/null +++ b/checkpoints/model_weights_000005210112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28f31d0c3babd1600ae6842392073fd61c2165126c1d291942ee6576165f3ddb +size 225208789 diff --git a/checkpoints/model_weights_000005505024.pt b/checkpoints/model_weights_000005505024.pt new file mode 100644 index 0000000000000000000000000000000000000000..20dd6a73a1f2a338aa9ef5109f88241cd488ab4d --- /dev/null +++ b/checkpoints/model_weights_000005505024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:175e861a1ad08877b8735e33f2350b378ea745329c5a310b474e00c4a9c8ef95 +size 225208789 diff --git a/checkpoints/model_weights_000005832704.pt b/checkpoints/model_weights_000005832704.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0ed34c356d497d2ddfc05d39606a0f265cab388 --- /dev/null +++ b/checkpoints/model_weights_000005832704.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a862bfd0efd8478bc3f5874174ae7a58ab5c1408308596505cd30235ce7f7a94 +size 225208789 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt new file mode 100644 index 0000000000000000000000000000000000000000..95ba225c2be5e9738bc8e97946676e2780adcffb --- /dev/null +++ b/checkpoints/model_weights_000006193152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ed0bcf667d5af51a7574e8d6fdccc7b040030f1f94d6dd961cecbce18a3d44 +size 225208789 diff --git a/checkpoints/model_weights_000006553600.pt b/checkpoints/model_weights_000006553600.pt new file mode 100644 index 0000000000000000000000000000000000000000..add098ebcaf53757d6d3d346191acc159052f8d0 --- /dev/null +++ b/checkpoints/model_weights_000006553600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33d9f8d958c2197b8730710ffbafe74da4b5556493dfa20c9567c1732b376f3c +size 225208789 diff --git a/checkpoints/model_weights_000006946816.pt b/checkpoints/model_weights_000006946816.pt new file mode 100644 index 0000000000000000000000000000000000000000..b689475ad3b4b42629d223d76b5e56be9327e59c --- /dev/null +++ b/checkpoints/model_weights_000006946816.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:3dec246db875985d780bec1ee54d4c66063bffc8b0f1df641d428be277abd4fa +size 225208789 diff --git a/checkpoints/model_weights_000007372800.pt b/checkpoints/model_weights_000007372800.pt new file mode 100644 index 0000000000000000000000000000000000000000..27eac09d2a95ac314606682f2656a10caf6d5776 --- /dev/null +++ b/checkpoints/model_weights_000007372800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26693e0ca01c89c9609e5108adda0bb8cbabbe4c04abd2013d28192861b9b5e1 +size 225208789 diff --git a/checkpoints/model_weights_000007831552.pt b/checkpoints/model_weights_000007831552.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a5fb79589a1d4a007b8fda742921799811513d5 --- /dev/null +++ b/checkpoints/model_weights_000007831552.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec85aaa3a3fdf87a3f2632b6a9a530e20c1602d33fe09dd05dfc39a8ed532be +size 225208789 diff --git a/checkpoints/model_weights_000008290304.pt b/checkpoints/model_weights_000008290304.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cfffc2ed7c35d9773e71d98061444d30c6a044f --- /dev/null +++ b/checkpoints/model_weights_000008290304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7d75d34b61cd80c6d958b81e3b0f62b25491edbf90d73124b5791f8f38c460 +size 225208789 diff --git a/checkpoints/model_weights_000008781824.pt b/checkpoints/model_weights_000008781824.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01af38ebf1a019c27cde3e03453a75207f2549a --- /dev/null +++ b/checkpoints/model_weights_000008781824.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab9d19c57f3f015784d0d8eabfd06851730f5e6ef47d04ebe871b0d0b6568fef +size 225208789 diff --git a/checkpoints/model_weights_000009306112.pt b/checkpoints/model_weights_000009306112.pt new file mode 100644 index 0000000000000000000000000000000000000000..587e13c376c35c734f4ae0e0fdb45a74b213efdb --- /dev/null +++ b/checkpoints/model_weights_000009306112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8111272f10fea4c14dfd9e68c14ed398b9caa6ad0491ab30df98147f7c3ccc31 +size 225208789 diff --git a/checkpoints/model_weights_000009863168.pt b/checkpoints/model_weights_000009863168.pt new file mode 100644 index 0000000000000000000000000000000000000000..cecc5c942401c3108c4f082718cb1bb78257bb7e --- /dev/null +++ b/checkpoints/model_weights_000009863168.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e368090d364ad8ccf89617d81d271182d05666d0c118a63357f84d833e7a3c9 +size 225208789 diff --git a/checkpoints/model_weights_000010485760.pt b/checkpoints/model_weights_000010485760.pt new file mode 100644 index 0000000000000000000000000000000000000000..a38d1928d1a33e1145763ed5b5e0d60994943ccb --- /dev/null +++ b/checkpoints/model_weights_000010485760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435b5111a21ba5d5aff066022a7faefa1a748cf7fbc8145cf059146ae1c45f89 +size 225208789 diff --git a/checkpoints/model_weights_000011108352.pt b/checkpoints/model_weights_000011108352.pt new file mode 100644 index 0000000000000000000000000000000000000000..92d3630bfaca975b5083a1d24925ce143ecbbe61 --- /dev/null +++ b/checkpoints/model_weights_000011108352.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9311c383b0353662b9e3eb99d4d85263da40b1d59dab5b3156260166db03badf +size 225208789 diff --git 
a/checkpoints/model_weights_000011763712.pt b/checkpoints/model_weights_000011763712.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a442831d1a4d4a2c714bd60c77082bf37f67915 --- /dev/null +++ b/checkpoints/model_weights_000011763712.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca114c72747a1d2606fd266321b204db3207ba6bddf9362e6ecb176ee9be7ef3 +size 225208789 diff --git a/checkpoints/model_weights_000012484608.pt b/checkpoints/model_weights_000012484608.pt new file mode 100644 index 0000000000000000000000000000000000000000..18597cb5f8a326545352b93b17b0548cf6a9a864 --- /dev/null +++ b/checkpoints/model_weights_000012484608.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14fafc8649eb76127ec8e98e79c2e50a77e09b2dfc9166cf976f1c2de030a63 +size 225208789 diff --git a/checkpoints/model_weights_000013238272.pt b/checkpoints/model_weights_000013238272.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a35e3b342101ff3b287f24e73b2e257531a5773 --- /dev/null +++ b/checkpoints/model_weights_000013238272.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d760600baee7cf8291aef70b952392a64ae7aa9b62b82070c49be3501cdb7fb +size 225208789 diff --git a/checkpoints/model_weights_000014024704.pt b/checkpoints/model_weights_000014024704.pt new file mode 100644 index 0000000000000000000000000000000000000000..79d06b58756f653fd836b8d7ae358f64640385c3 --- /dev/null +++ b/checkpoints/model_weights_000014024704.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ac4f1611e5fddc411216829337383a3b153a6573f9eab768b27eef2ef883fb +size 225208789 diff --git a/checkpoints/model_weights_000014876672.pt b/checkpoints/model_weights_000014876672.pt new file mode 100644 index 0000000000000000000000000000000000000000..7eac7b187de90fafd506ee840e1e90632151f99f --- /dev/null +++ b/checkpoints/model_weights_000014876672.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d57fb21df8124f39c413f67dcceed190fc85d395d664efda769b76e8b54516 +size 225208789 diff --git a/checkpoints/model_weights_000015761408.pt b/checkpoints/model_weights_000015761408.pt new file mode 100644 index 0000000000000000000000000000000000000000..6178e62ccc7bd249f8fa9e58454e8478714c207a --- /dev/null +++ b/checkpoints/model_weights_000015761408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e90dc2b939a2b7155a6ad6f89ce9c513454e7ccd29b4c217f67f5343f432db +size 225208789 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9772d9cc29921bf1b2ca38431029622a5c48eac2 --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c008b7df1799920208ad60d706bbfd13a92845d0a5dd9a40092f819d053e8e +size 225208789 diff --git a/checkpoints/model_weights_000016711680.pt b/checkpoints/model_weights_000016711680.pt new file mode 100644 index 0000000000000000000000000000000000000000..55b77973e5c658092520db084220141a5a3d0668 --- /dev/null +++ b/checkpoints/model_weights_000016711680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509aeff8a5a9b23df5402077d801ef7c044d50100c073bc51f4201ea9a1e6e91 +size 225208789 diff --git a/checkpoints/model_weights_000017694720.pt b/checkpoints/model_weights_000017694720.pt new file mode 100644 index 
0000000000000000000000000000000000000000..08d457abd543867d5b27d49b599676f47ab05abc --- /dev/null +++ b/checkpoints/model_weights_000017694720.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc90cf365e5b0e37f5a9e19dd90ef120674819cc5b379a387a9ccd1ce0903a2 +size 225208789 diff --git a/checkpoints/model_weights_000018776064.pt b/checkpoints/model_weights_000018776064.pt new file mode 100644 index 0000000000000000000000000000000000000000..69e637b28069d47e918e8a187f89935164d46e65 --- /dev/null +++ b/checkpoints/model_weights_000018776064.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab2ed59d0521753a05b062abf2bd37d21abe103bfedc3a17ee5e71f1b3ca32b0 +size 225208789 diff --git a/checkpoints/model_weights_000019890176.pt b/checkpoints/model_weights_000019890176.pt new file mode 100644 index 0000000000000000000000000000000000000000..dac6b80fc73f6b10e2a9b32fb8c9c4c4e024be1d --- /dev/null +++ b/checkpoints/model_weights_000019890176.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c46985c8f243a7fad43cb60fa284b2046932998f3152e4efd3b7eb18c4452f53 +size 225208789 diff --git a/checkpoints/model_weights_000021102592.pt b/checkpoints/model_weights_000021102592.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b902377edd2a5495e9a653a26437104f72bd8d3 --- /dev/null +++ b/checkpoints/model_weights_000021102592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eedf46f4fe85659cd6fd768f9fa9e8addbc4e96666d647f74c2c0c0ca59969a +size 225208789 diff --git a/checkpoints/model_weights_000022347776.pt b/checkpoints/model_weights_000022347776.pt new file mode 100644 index 0000000000000000000000000000000000000000..43c3f9ef728b64632b549a08477aa12601596dc5 --- /dev/null +++ b/checkpoints/model_weights_000022347776.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bdb2ed43cf8b63f9c3f7d2b970790940bd916e865ed721ae9fb04877b6640ee +size 225208789 diff --git a/checkpoints/model_weights_000023691264.pt b/checkpoints/model_weights_000023691264.pt new file mode 100644 index 0000000000000000000000000000000000000000..977a0f47e84c5c4ab6be1c68faed4b868a54b532 --- /dev/null +++ b/checkpoints/model_weights_000023691264.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771767973ee705051f0b79e43df4d19940f5816719245485bc148302f8009870 +size 225208789 diff --git a/checkpoints/model_weights_000025133056.pt b/checkpoints/model_weights_000025133056.pt new file mode 100644 index 0000000000000000000000000000000000000000..0548d3490ffb048bde0dfcf2ebd48679e9453d52 --- /dev/null +++ b/checkpoints/model_weights_000025133056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd73567cfa7f4919c3de364d67bec351465f63439a9ad529e717687b2ed6c11e +size 225208789 diff --git a/checkpoints/model_weights_000026640384.pt b/checkpoints/model_weights_000026640384.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc114716c63f35597b1bc6b3e1774af217ff9704 --- /dev/null +++ b/checkpoints/model_weights_000026640384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9585e952561e57c177cf09b1cb38fd05466c5baa2dd57dd4cede7406ecadd38d +size 225208789 diff --git a/checkpoints/model_weights_000028213248.pt b/checkpoints/model_weights_000028213248.pt new file mode 100644 index 0000000000000000000000000000000000000000..734ac0b275222c032ccff7244fd6d15aceab36e2 --- /dev/null +++ b/checkpoints/model_weights_000028213248.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7c30c3c7b1ce2ec07a9bd88444094632153df3946221daf0a4fc3630ee8a73ad +size 225208789 diff --git a/checkpoints/model_weights_000029917184.pt b/checkpoints/model_weights_000029917184.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f186faabbd39f7032a24dcda647adf484fbcf38 --- /dev/null +++ b/checkpoints/model_weights_000029917184.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ee785d6beaa741a4a9664f31d2c7c5324efbf19e2d450703d5a3f3328e3151a +size 225208789 diff --git a/checkpoints/model_weights_000031719424.pt b/checkpoints/model_weights_000031719424.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ed2208202f326a1aaefca83c9a627fc2688fefb --- /dev/null +++ b/checkpoints/model_weights_000031719424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46786429889b78601f447309321d224f77113e0802ecf7d8eebb28fb9dac7f8b +size 225208789 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f12759a55abaeb4e5b9cafe23e2c02d520e3b28 --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de4138c0800435881478449561fe0b4e001d84ab4186e0086668b911813cd47 +size 225208789 diff --git a/checkpoints/model_weights_000033619968.pt b/checkpoints/model_weights_000033619968.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ace05a8cf49f27e866c9667b9a7cbb3646842cf --- /dev/null +++ b/checkpoints/model_weights_000033619968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2e8f64768e727a0f5c02e7817f52be88d471a40f5470555bb2146bd9e1772bd +size 225208789 diff --git a/checkpoints/model_weights_000035651584.pt b/checkpoints/model_weights_000035651584.pt new file mode 100644 index 0000000000000000000000000000000000000000..83cd5ca9a8907ba88d61c5632a085dada4a13ad4 --- /dev/null +++ b/checkpoints/model_weights_000035651584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70392440064b8aeb98cae054c1bec58e7b17d1779aa35fa026374a4f29ff1a4c +size 225208789 diff --git a/checkpoints/model_weights_000037781504.pt b/checkpoints/model_weights_000037781504.pt new file mode 100644 index 0000000000000000000000000000000000000000..656a614209f4e2e323bfbd91ce8d609d7e3b8fa8 --- /dev/null +++ b/checkpoints/model_weights_000037781504.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7cd07f120be65dd3fb50862e6ef2442e1a88c235a9f61de357e0b4c09b091df +size 225208789 diff --git a/checkpoints/model_weights_000040042496.pt b/checkpoints/model_weights_000040042496.pt new file mode 100644 index 0000000000000000000000000000000000000000..45f3ef4d2517574a26d27101b46d1a49096241c3 --- /dev/null +++ b/checkpoints/model_weights_000040042496.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fddd980aeb57e76ecd0af009e28c47bf51a15e5b8db48fb9a50f7e1110787fb +size 225208789 diff --git a/checkpoints/model_weights_000042467328.pt b/checkpoints/model_weights_000042467328.pt new file mode 100644 index 0000000000000000000000000000000000000000..65a47b54db65bb97c39ea17fa4a8278accc476f9 --- /dev/null +++ b/checkpoints/model_weights_000042467328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cb691f24c837bb102f1ef9ee3fd163f85a163edb896fab6d99686ebf0ef2cd7 +size 225208789 diff --git 
a/checkpoints/model_weights_000044990464.pt b/checkpoints/model_weights_000044990464.pt new file mode 100644 index 0000000000000000000000000000000000000000..00850978b383e9d3266dc095c2e1e82b41a461a7 --- /dev/null +++ b/checkpoints/model_weights_000044990464.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25e03dfd162d106ad2bb0373657ebc6679ad7e19fc89d9d6f311b435f4695235 +size 225208789 diff --git a/checkpoints/model_weights_000047710208.pt b/checkpoints/model_weights_000047710208.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c77dbf89e8ff7ee79f9184c57083d4b993fe990 --- /dev/null +++ b/checkpoints/model_weights_000047710208.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d16efe46011da3f5e422cd7c18f581070355fd3c3134ddf4b95d87e09bceab5 +size 225208789 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c65279be307da999aa923ad86d86fdb44d35d89a --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534e3fa12e23836163ee7b34be711d74bea1f21b35b3fe4a51c3ecf16907a048 +size 225208789 diff --git a/checkpoints/model_weights_000050561024.pt b/checkpoints/model_weights_000050561024.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5263600daaafe686c3a940ac28e9f63d30b5cc0 --- /dev/null +++ b/checkpoints/model_weights_000050561024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004de060cb7c1eeac96d102f678b162492004f74976b0364ec80d25c9ec6aff8 +size 225208789 diff --git a/checkpoints/model_weights_000053608448.pt b/checkpoints/model_weights_000053608448.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4cf26a5e2a058125e2d0014d94f7d18b777aca4 --- /dev/null +++ b/checkpoints/model_weights_000053608448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e482bb2bac9e944324365b5f2f86dec8741673f0a3aa9f470906478e11cc76ed +size 225208789 diff --git a/checkpoints/model_weights_000056819712.pt b/checkpoints/model_weights_000056819712.pt new file mode 100644 index 0000000000000000000000000000000000000000..657461471cb0d6143777921c71a445ee8489705c --- /dev/null +++ b/checkpoints/model_weights_000056819712.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4461231ffce3c5974cb6e7f81c521282343f0b3d517415f20972d2a584f35738 +size 225208789 diff --git a/checkpoints/model_weights_000060227584.pt b/checkpoints/model_weights_000060227584.pt new file mode 100644 index 0000000000000000000000000000000000000000..92661309d67e2719bb14b9316b529fc50147e189 --- /dev/null +++ b/checkpoints/model_weights_000060227584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3871fd5a05874c0c94da1d0b28230e7b2b7e1465f48c35269c4277f626ef9e0a +size 225208789 diff --git a/checkpoints/model_weights_000063832064.pt b/checkpoints/model_weights_000063832064.pt new file mode 100644 index 0000000000000000000000000000000000000000..061ed0955e6224a837813b095808c81cd9dee123 --- /dev/null +++ b/checkpoints/model_weights_000063832064.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08b7c5fbb89ccf5d767969105a27c503b49836520c5d05de47eba4cb424b0ed7 +size 225208789 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0aef0fb5752d7ac56081269000642f3af54f0861 --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03be6a31eb94772ffeb216b78728e2f361db00ae5f629b545affc915de5eb821 +size 225208789 diff --git a/checkpoints/model_weights_000067665920.pt b/checkpoints/model_weights_000067665920.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f65e96df27d27ca075439bbba40733bcce9e27d --- /dev/null +++ b/checkpoints/model_weights_000067665920.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ebbf88845600ddb1ec22f8ee4163cef6b3d0dbecd32d4c9354f59279796beac +size 225208789 diff --git a/checkpoints/model_weights_000071729152.pt b/checkpoints/model_weights_000071729152.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8565699a74abc29093572b0b0cc795aee4fc999 --- /dev/null +++ b/checkpoints/model_weights_000071729152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2376b16ea3cf1542fca9755be1a3153bdb7e5ec1cbffa2369ec5addf3d3630d9 +size 225208789 diff --git a/checkpoints/model_weights_000076054528.pt b/checkpoints/model_weights_000076054528.pt new file mode 100644 index 0000000000000000000000000000000000000000..11b7195b579e4835de831e7ad5667c2599ffed9b --- /dev/null +++ b/checkpoints/model_weights_000076054528.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe117e429bd8f11566d724acea1575a67caea6de3cfef64974ea6e0ab0e43495 +size 225208789 diff --git a/checkpoints/model_weights_000080609280.pt b/checkpoints/model_weights_000080609280.pt new file mode 100644 index 0000000000000000000000000000000000000000..36de24ca756a75d3b9f3c6842eb76a7497e4f035 --- /dev/null +++ b/checkpoints/model_weights_000080609280.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d0ca3c6506c30303503700aa752cf9106617b92cf808174116598adbd6c49be +size 225208789 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3aec5fbae0267e12e29619a14d8216cfc95f0995 --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c46f6788c439437544d3766bc54772b76f3a3b5dabb65604597dd696818a783 +size 225208789 diff --git a/checkpoints/model_weights_000085426176.pt b/checkpoints/model_weights_000085426176.pt new file mode 100644 index 0000000000000000000000000000000000000000..09a83b78741b6e9a29639103ae5e871091734469 --- /dev/null +++ b/checkpoints/model_weights_000085426176.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca173bd6612dbe044991216778a99c263bb63a4ffccec9bcaaf0af0b0cf17892 +size 225208789 diff --git a/checkpoints/model_weights_000090570752.pt b/checkpoints/model_weights_000090570752.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d29c4a63679903192cf6c80aa400e89b52da2d6 --- /dev/null +++ b/checkpoints/model_weights_000090570752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6761eb0f375e70c390c32918896cb7cd3b37e95a304e54bd18535290d4d2636b +size 225208789 diff --git a/checkpoints/model_weights_000096010240.pt b/checkpoints/model_weights_000096010240.pt new file mode 100644 index 0000000000000000000000000000000000000000..65958a73e24533a6ed2198cca42164c302ba2559 --- /dev/null +++ b/checkpoints/model_weights_000096010240.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7b680ae4705dbfd658b3429941cdaacfc4da62a965e980c5f88599408db69dab +size 225208789 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca1db4416aa4461e1204a9aeef15eff1be5398f0 --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c5fde42c6eac28f287c910f3863f6c7d3a2a817c7b8d32c78a5c2fb559fbce +size 225208789 diff --git a/checkpoints/model_weights_000101777408.pt b/checkpoints/model_weights_000101777408.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3924083c9b5ce3c2870686d7f28c1effd47f57b --- /dev/null +++ b/checkpoints/model_weights_000101777408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a002ecfce8e9923f4a1d5f9e7b150bd407173c5726834c1b0f418bc39a24c41 +size 225208789 diff --git a/checkpoints/model_weights_000107872256.pt b/checkpoints/model_weights_000107872256.pt new file mode 100644 index 0000000000000000000000000000000000000000..c601ddbefb41394797790923e8c8f38b60d9c410 --- /dev/null +++ b/checkpoints/model_weights_000107872256.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014575aae81585c76dbdfacb3f479a4f5d479fe34411c155262e8717d03b4bde +size 225208789 diff --git a/checkpoints/model_weights_000114327552.pt b/checkpoints/model_weights_000114327552.pt new file mode 100644 index 0000000000000000000000000000000000000000..54579dfe4cd37001daa3e14c3f49a827d2d3a33d --- /dev/null +++ b/checkpoints/model_weights_000114327552.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da678be9b1295802af5c765086393a5e283034bf980f97fd2aaa444193ed091 +size 225208789 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fab7dc07e2b5031b7ed6a17c0835db6f096cba7 --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4a15f9f2850717b97020a5f9009c4dca9e916a96257372234e8f6df1cd264f0 +size 225208789 diff --git a/checkpoints/model_weights_000121208832.pt b/checkpoints/model_weights_000121208832.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5cfac762ed45ab25e08f152d13aba27ac9d8370 --- /dev/null +++ b/checkpoints/model_weights_000121208832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb922eba0dda9c61b41cdd348d9b96e1df7c4aecc859de31089f29d4d4369b3 +size 225208789 diff --git a/checkpoints/model_weights_000128483328.pt b/checkpoints/model_weights_000128483328.pt new file mode 100644 index 0000000000000000000000000000000000000000..777a83d03bf054d7d2eebfd6bc15246b2d84a2cc --- /dev/null +++ b/checkpoints/model_weights_000128483328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f174fe7891fe1a814c072a0e7a96d4e8cc35a182f7888cc726c94b3c5c17f9b +size 225208789 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1767e4ad0251668f51e9d68f102ed8779964f115 --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64bc3de5f812edcee9e5cdbc52fe5d424abb9f14222cd714a56bdbfd3e6e0d47 +size 225208789 diff --git 
a/checkpoints/model_weights_000136183808.pt b/checkpoints/model_weights_000136183808.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a916bd8c9995bec9f75f0f32801b4f2412da036 --- /dev/null +++ b/checkpoints/model_weights_000136183808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8150489124f94f20425730f37a457404eb41bcd5a09346a5846744ef2bfdc3ff +size 225208789 diff --git a/checkpoints/model_weights_000144375808.pt b/checkpoints/model_weights_000144375808.pt new file mode 100644 index 0000000000000000000000000000000000000000..666283f58c767ff8fb9532e07a995edbf20aeeff --- /dev/null +++ b/checkpoints/model_weights_000144375808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b2c20541231ec4f7abd228ac7fa410301153a34646dd1bc0150cacfc0b9082a +size 225208789 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c479a6039abf98339590f66792198bbaf7c5af7d --- /dev/null +++ b/checkpoints/model_weights_000147456000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988153414e808c19b39569c1d17b2709b2af95612e210deba4d64e9716f49d51 +size 225208789 diff --git a/checkpoints/model_weights_000153026560.pt b/checkpoints/model_weights_000153026560.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdd66fee066364708b4fad9a8119e6cb9b29c970 --- /dev/null +++ b/checkpoints/model_weights_000153026560.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f06a7e98e8410830e402dbe6495b1d981281137af2780503a166311d83eae0f +size 225208789 diff --git a/checkpoints/model_weights_000162201600.pt b/checkpoints/model_weights_000162201600.pt new file mode 100644 index 0000000000000000000000000000000000000000..23bdc7f8b21d9de492a1c597b004562d3daf85bb --- /dev/null +++ b/checkpoints/model_weights_000162201600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342f8904a6ed6efb9ba5ee09a462c629290616a80b6467299ae26d88a9c9c0d2 +size 225208789 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb1c7acb37ebaa583d564f16cf43d12a9aec8575 --- /dev/null +++ b/checkpoints/model_weights_000163840000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f3e72f01ef497aa322bdc6c4d4d8fd75f3239d35668ae1cbf100b6bad4836b +size 225208789 diff --git a/checkpoints/model_weights_000171933696.pt b/checkpoints/model_weights_000171933696.pt new file mode 100644 index 0000000000000000000000000000000000000000..809d22c54a6ddf496041321ef1f4298f9c89d9b8 --- /dev/null +++ b/checkpoints/model_weights_000171933696.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfb38c36331e58d6329534d938cd9afbc822b2a9dbcb7e5e608fd02b6d119dd3 +size 225208789 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt new file mode 100644 index 0000000000000000000000000000000000000000..966c6e6a96b4937be4239d4708274f3109f8696e --- /dev/null +++ b/checkpoints/model_weights_000180224000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98efbc73c8bd75ae1f6a64d34912bc4e367e07250bec5ba49a155309ed7b554f +size 225208789 diff --git a/checkpoints/model_weights_000182255616.pt b/checkpoints/model_weights_000182255616.pt new file mode 100644 index 
0000000000000000000000000000000000000000..a967f5a766eb78edd92d0565e9db004beefb7eb1 --- /dev/null +++ b/checkpoints/model_weights_000182255616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6941d9b701ff934fdb82cb1b2a908c4c105b72851d4ae9bdbde60666a41c6165 +size 225208789 diff --git a/checkpoints/model_weights_000193200128.pt b/checkpoints/model_weights_000193200128.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffcdf2846444ccbe490662b7f8b64c7090db9c8b --- /dev/null +++ b/checkpoints/model_weights_000193200128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62a41e96e16bbd20e69231ebfd1f4d7e1002985f0af17cee31d3faded06710a +size 225208789 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt new file mode 100644 index 0000000000000000000000000000000000000000..59a4d6bd8e0a2c01e4daf98145e8b39784bb4ca5 --- /dev/null +++ b/checkpoints/model_weights_000196608000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30dae573118a847dcda414267251683b122c008d58916b05790be0413d96b99b +size 225208789 diff --git a/checkpoints/model_weights_000204767232.pt b/checkpoints/model_weights_000204767232.pt new file mode 100644 index 0000000000000000000000000000000000000000..7816d9f3938aa0df8a8962f43ce13789e853767f --- /dev/null +++ b/checkpoints/model_weights_000204767232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b66c9074be9fdd616cca1b995c8eb4ddc959353ec40c981d9b921a0b803f8b1 +size 225208789 diff --git a/checkpoints/model_weights_000212992000.pt b/checkpoints/model_weights_000212992000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c18b56ff86d24db97decf852727bb2d84e84001c --- /dev/null +++ b/checkpoints/model_weights_000212992000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0a2231981309d122960144b6aa749235999b75a54d2c4026e2dfad06e960143 +size 225208789 diff --git a/checkpoints/model_weights_000217055232.pt b/checkpoints/model_weights_000217055232.pt new file mode 100644 index 0000000000000000000000000000000000000000..380ece17bccde8069ad123497386d07619699d3a --- /dev/null +++ b/checkpoints/model_weights_000217055232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25a1a11855951e51171f0d6a9c304986d4d7c900111468ba542c5ecb0390348c +size 225208789 diff --git a/checkpoints/model_weights_000229376000.pt b/checkpoints/model_weights_000229376000.pt new file mode 100644 index 0000000000000000000000000000000000000000..00374b7ac256dfc757cc677990f51503f2812a5a --- /dev/null +++ b/checkpoints/model_weights_000229376000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7536cbf9dde1bab9119b71efe5d342bd12b6b9d4fc85ae5c18a6b0f15753a605 +size 225208789 diff --git a/checkpoints/model_weights_000230096896.pt b/checkpoints/model_weights_000230096896.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1ab7b3155f583b8a689d930996d5bf7fd5f108d --- /dev/null +++ b/checkpoints/model_weights_000230096896.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed952cdb42e2bd35a1b8aa1767decaf9063072fae6c44bf0adfdfd7535b20b53 +size 225208789 diff --git a/checkpoints/model_weights_000243892224.pt b/checkpoints/model_weights_000243892224.pt new file mode 100644 index 0000000000000000000000000000000000000000..80e4fc228668330bbb73c1f71b9ed6cd578f9c1f --- /dev/null +++ b/checkpoints/model_weights_000243892224.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:25fe7e7720a3cb6da33428f070fdd302d00a98d366e481c9b3cd997b6f1ddf8f +size 225208789 diff --git a/checkpoints/model_weights_000245760000.pt b/checkpoints/model_weights_000245760000.pt new file mode 100644 index 0000000000000000000000000000000000000000..34994d417d645c8632df0e21a68b3d13699faa2b --- /dev/null +++ b/checkpoints/model_weights_000245760000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c89b8c2a9199a685bc2fbccfaa77ff463b6bd2c980e5d3d1c93f33fc3f98f56 +size 225208789 diff --git a/checkpoints/model_weights_000258539520.pt b/checkpoints/model_weights_000258539520.pt new file mode 100644 index 0000000000000000000000000000000000000000..79bf28be32953bb51c518cbc17dc25819338799b --- /dev/null +++ b/checkpoints/model_weights_000258539520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e384f70869817eb8f5092c0ee0078e5d3a8f91d374ed0dd06fbb9e13e4f09036 +size 225208789 diff --git a/checkpoints/model_weights_000262144000.pt b/checkpoints/model_weights_000262144000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffa21f2fda5ee19829872df2247e678b1a3c2565 --- /dev/null +++ b/checkpoints/model_weights_000262144000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793f9f6dd1c57c51913f35a8f0aa293522fd889a5bb63f67893b945dfdde6611 +size 225208789 diff --git a/checkpoints/model_weights_000274038784.pt b/checkpoints/model_weights_000274038784.pt new file mode 100644 index 0000000000000000000000000000000000000000..436af133ef22bc413175882334438f5ca2676ea8 --- /dev/null +++ b/checkpoints/model_weights_000274038784.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2034aeaf85316b806421607da72519fc6e3422418a0ad3c98ff003c0bf11787b +size 225208789 diff --git a/checkpoints/model_weights_000278528000.pt b/checkpoints/model_weights_000278528000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e993083e8556ff4ef14eb1261f76f824dd9b560 --- /dev/null +++ b/checkpoints/model_weights_000278528000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38473322c6c85189b95afa3f64591c0156e6005c0e9b3ee96e3b2cd0a0d1f47c +size 225208789 diff --git a/checkpoints/model_weights_000290488320.pt b/checkpoints/model_weights_000290488320.pt new file mode 100644 index 0000000000000000000000000000000000000000..412e6c5e4e67df0d2c43ad5ff75e3dfee7210cf7 --- /dev/null +++ b/checkpoints/model_weights_000290488320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17e54a2ebaaa3cf3c115198d93b6d616dae1528a3cb485b430401270c7439ea4 +size 225208789 diff --git a/checkpoints/model_weights_000294912000.pt b/checkpoints/model_weights_000294912000.pt new file mode 100644 index 0000000000000000000000000000000000000000..087461cabb20bfc0dca8a539a329272e3f471eba --- /dev/null +++ b/checkpoints/model_weights_000294912000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebab956e86a7026080f8ac1e237f83576be99229e5abdad4fdaf937d09eff660 +size 225208789 diff --git a/checkpoints/model_weights_000307920896.pt b/checkpoints/model_weights_000307920896.pt new file mode 100644 index 0000000000000000000000000000000000000000..84fad7a8e100893d48cb929b223e98b24a9ce817 --- /dev/null +++ b/checkpoints/model_weights_000307920896.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8726feb82730e36c127636db19fb3bb5768e7c77038c061dced4056e49198d7 +size 225208789 diff --git 
a/checkpoints/model_weights_000311296000.pt b/checkpoints/model_weights_000311296000.pt new file mode 100644 index 0000000000000000000000000000000000000000..caf9e93083010db30bfd6acbf00b26f30250bd53 --- /dev/null +++ b/checkpoints/model_weights_000311296000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d4cf6f5b4802d293de2a6eb76cce055e2c654fd211cd3c337c4f530a7d8e16f +size 225208789 diff --git a/checkpoints/model_weights_000326402048.pt b/checkpoints/model_weights_000326402048.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee3399b05ae4b3d9f2c687ec58e1e442fd5945f5 --- /dev/null +++ b/checkpoints/model_weights_000326402048.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e3e2991e57133e176da3ad69b1f6d3e2a693c761d5e1d15072b4fcf5dcfcb8b +size 225208789 diff --git a/checkpoints/model_weights_000327680000.pt b/checkpoints/model_weights_000327680000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f5079d803c3558f4eb04348f5d8fdfa14c4f130 --- /dev/null +++ b/checkpoints/model_weights_000327680000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47383fc43976b03715681b34e003f1440cd5fd4fb98bf3363548b46f8d76bc74 +size 225208789 diff --git a/checkpoints/model_weights_000344064000.pt b/checkpoints/model_weights_000344064000.pt new file mode 100644 index 0000000000000000000000000000000000000000..49e035e6712b4aea672ec71922e736559929503d --- /dev/null +++ b/checkpoints/model_weights_000344064000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc83d82c4f4dac896fc57425d908306232bef4926ff66bb53530c67db2d43ae +size 225208789 diff --git a/checkpoints/model_weights_000345997312.pt b/checkpoints/model_weights_000345997312.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fbd14fa1cfed99e1175c1f2b5571fc7b71470c8 --- /dev/null +++ b/checkpoints/model_weights_000345997312.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:595ef5ba8335fa66eb8c9e03f8f13abb75bf099d3be434cd66da64dd8a23b86a +size 225208789 diff --git a/checkpoints/model_weights_000360448000.pt b/checkpoints/model_weights_000360448000.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ad04be13d7197f43a82e336f01ec75502accce5 --- /dev/null +++ b/checkpoints/model_weights_000360448000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19f90a73116d0e3b4a59b04d4f3347cd0acf30455d6fb0d44fd258eb1b0af641 +size 225208789 diff --git a/checkpoints/model_weights_000366739456.pt b/checkpoints/model_weights_000366739456.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a6ba34c7450497c95fe5f2d44fb621525678bee --- /dev/null +++ b/checkpoints/model_weights_000366739456.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf2a6b8b2bb027d895b72243e2e77dfb54a3f2870cd4ad6034987bed5779031 +size 225208789 diff --git a/checkpoints/model_weights_000376832000.pt b/checkpoints/model_weights_000376832000.pt new file mode 100644 index 0000000000000000000000000000000000000000..152f9211f0d0454372f32ea30b3e4dd8ff805df9 --- /dev/null +++ b/checkpoints/model_weights_000376832000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac322191b3f45cfc0c06afc6fc11910edd7690162cb950ead7c6a2d6ccd40a7c +size 225208789 diff --git a/checkpoints/model_weights_000388759552.pt b/checkpoints/model_weights_000388759552.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ac267c76665b4fec73543b1cc3cd715e70a694d1 --- /dev/null +++ b/checkpoints/model_weights_000388759552.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a33579ac60c1a5bac2eb0b0cad6f2fa4249abc7b286cd70077e315800b55636 +size 225208789 diff --git a/checkpoints/model_weights_000393216000.pt b/checkpoints/model_weights_000393216000.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc49ff1dd01fc6587a6849fcf6b3821cca1b6a1c --- /dev/null +++ b/checkpoints/model_weights_000393216000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e07f590dca2f2d496c7d2ff03772749ede5089ed8905845611efdf51ed1315c5 +size 225208789 diff --git a/checkpoints/model_weights_000409600000.pt b/checkpoints/model_weights_000409600000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0272f4f8590bb4ec4e7023c33ff28315921ff2d4 --- /dev/null +++ b/checkpoints/model_weights_000409600000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8465162339dd32f22f250cb9b48f021cedce33cf252b139c6cb028feb3d6aab7 +size 225208789 diff --git a/checkpoints/model_weights_000412090368.pt b/checkpoints/model_weights_000412090368.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c543b2d1f67b453d0c3ac3737b44ab3f5c3b01f --- /dev/null +++ b/checkpoints/model_weights_000412090368.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f50510cf83bef4ad73ac51424cc662e7876efa5bf4e23faf79f81402d128991e +size 225208789 diff --git a/checkpoints/model_weights_000425984000.pt b/checkpoints/model_weights_000425984000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddcc60f501e9eea9f08bfa52ae8758da8912e7fa --- /dev/null +++ b/checkpoints/model_weights_000425984000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe75357a4466469eba6f8cbe0d7edf7d46ecb83cdc24f4a033b9d488265fd4c6 +size 225208789 diff --git a/checkpoints/model_weights_000436797440.pt b/checkpoints/model_weights_000436797440.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b1cbe6c5bd410accf81b0e8f5385eab9840bb05 --- /dev/null +++ b/checkpoints/model_weights_000436797440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a09e47e097cbfe76d90880eeb40dd8e763d1817ad966bed4b7a24e6a19d82c5 +size 225208789 diff --git a/checkpoints/model_weights_000442368000.pt b/checkpoints/model_weights_000442368000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8ac6af80fbf0cc57b28dcfa5419ca2413c82c54 --- /dev/null +++ b/checkpoints/model_weights_000442368000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1846e30f5121441009838e7e5fdf1e1469c2908150b16c0a720a9ced3f951b51 +size 225208789 diff --git a/checkpoints/model_weights_000458752000.pt b/checkpoints/model_weights_000458752000.pt new file mode 100644 index 0000000000000000000000000000000000000000..7dc5e5bd79dff4b0c55fe79f79ca3ea07d0028af --- /dev/null +++ b/checkpoints/model_weights_000458752000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d087e598bada66063ced75c851b7060dc96c04b43ae375e9627a1ec84e222ed +size 225208789 diff --git a/checkpoints/model_weights_000463011840.pt b/checkpoints/model_weights_000463011840.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c289d8656bcb3451d938e1bceefbad04d3d4276 --- /dev/null +++ b/checkpoints/model_weights_000463011840.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:9c4320083721495e70d87e2669bc8d09c0e48916e167608a76e22bf08ca70d22 +size 225208789 diff --git a/checkpoints/model_weights_000475136000.pt b/checkpoints/model_weights_000475136000.pt new file mode 100644 index 0000000000000000000000000000000000000000..123255a7de328a45bebe823a7240eaa9c4f351d2 --- /dev/null +++ b/checkpoints/model_weights_000475136000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24a0444d15bae20c88af91e31b6cffca3bd0fc441d779ac7de95eb06f9183ac +size 225208789 diff --git a/checkpoints/model_weights_000490799104.pt b/checkpoints/model_weights_000490799104.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8f23b45581a8b3940865775306044a306720784 --- /dev/null +++ b/checkpoints/model_weights_000490799104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547b326786b323c0168fcc750886c291f44b7f30f247783991fdc604f591dec2 +size 225208789 diff --git a/checkpoints/model_weights_000491520000.pt b/checkpoints/model_weights_000491520000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad7c1764f8ff1c4b70b6a82973ce72151ea766bf --- /dev/null +++ b/checkpoints/model_weights_000491520000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cdb2125e6b9b4a99758cc812730578c07ce0213e3d166cd78606cb9c5a9b6a5 +size 225208789 diff --git a/checkpoints/model_weights_000507904000.pt b/checkpoints/model_weights_000507904000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e19a30ec7962511490cc4927f5f1e648cfe08e6c --- /dev/null +++ b/checkpoints/model_weights_000507904000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2b5fd3686b56b08f044bcca0930808631073f419c70648cf7258a0073e93de0 +size 225208789 diff --git a/checkpoints/model_weights_000520257536.pt b/checkpoints/model_weights_000520257536.pt new file mode 100644 index 0000000000000000000000000000000000000000..859fcaf16c45e742f4f0879cd4d01dc0e16582e0 --- /dev/null +++ b/checkpoints/model_weights_000520257536.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c94bd3f07e3b946ece8613ed9435defc1dcf6e3d946a735af2b3cca456d484e +size 225208789 diff --git a/checkpoints/model_weights_000524288000.pt b/checkpoints/model_weights_000524288000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b36a1fe149c227d614f445add6bebe5790daebe --- /dev/null +++ b/checkpoints/model_weights_000524288000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f50afbb1604ef3b168508c05983e50efff199785c2747646d7e108b24b673a1 +size 225208789 diff --git a/checkpoints/model_weights_000540672000.pt b/checkpoints/model_weights_000540672000.pt new file mode 100644 index 0000000000000000000000000000000000000000..459dfcf2a7a7cdb2b877f9fad9b4b4bdfe3faef7 --- /dev/null +++ b/checkpoints/model_weights_000540672000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d55489cd4ee932d70008857a220b512a1cae62af4ed807902d2a074cde10258 +size 225208789 diff --git a/checkpoints/model_weights_000551452672.pt b/checkpoints/model_weights_000551452672.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0e5cb7b70fd31a247b6d41911e2274759b27eb5 --- /dev/null +++ b/checkpoints/model_weights_000551452672.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ae3a9f4cf949ed29255b5ea14a1cebea761960f31c3281101fd3912a7e16a97 +size 225208789 diff --git 
a/checkpoints/model_weights_000557056000.pt b/checkpoints/model_weights_000557056000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6c75f71fbd4421aac0e74ecb8fd976c7afec824 --- /dev/null +++ b/checkpoints/model_weights_000557056000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c52c005987e4313c06aaa9bf600959e99f766f12d40e23f66d4ed3b2d327a6 +size 225208789 diff --git a/checkpoints/model_weights_000573440000.pt b/checkpoints/model_weights_000573440000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e56274fe7a6838695325a59cfb10ef27e3b1b34e --- /dev/null +++ b/checkpoints/model_weights_000573440000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c86b870157a5ee266481813e03d8983fd2598a695abcf5883d8ece7028e3ea +size 225208789 diff --git a/checkpoints/model_weights_000584548352.pt b/checkpoints/model_weights_000584548352.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d01f3f087467b39b8d01fb3c04979674ae75900 --- /dev/null +++ b/checkpoints/model_weights_000584548352.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72d86d490dce39fec66c82191fa473f7489576629f4bda3c5488a06cd5a2e462 +size 225208789 diff --git a/checkpoints/model_weights_000589824000.pt b/checkpoints/model_weights_000589824000.pt new file mode 100644 index 0000000000000000000000000000000000000000..db2c2ade1250220d97968ddaa38ec85dd61cd26f --- /dev/null +++ b/checkpoints/model_weights_000589824000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b4875670490e1f6b222ddce70ad291576d5904df59dccf838c0ecd517fd4134 +size 225208789 diff --git a/checkpoints/model_weights_000606208000.pt b/checkpoints/model_weights_000606208000.pt new file mode 100644 index 0000000000000000000000000000000000000000..23cbdc3c4ef9044f81831ccd1a564f4a49647b6c --- /dev/null +++ b/checkpoints/model_weights_000606208000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3fc4201a19af5a2e3a8066b42a716987a9d867d94d8aa20f7872ae58a8e151 +size 225208789 diff --git a/checkpoints/model_weights_000619610112.pt b/checkpoints/model_weights_000619610112.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2a24205d52d2b75c06fcc3dfecfd517f73da739 --- /dev/null +++ b/checkpoints/model_weights_000619610112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6169edb7586bb61da1cea79875ac60f008b41ff2ae70ea332130fadeb3635eba +size 225208789 diff --git a/checkpoints/model_weights_000622592000.pt b/checkpoints/model_weights_000622592000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d39da7daca5a837521d775bdd10595c9165ce40f --- /dev/null +++ b/checkpoints/model_weights_000622592000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed57f3468ff7f2457abb0f5bbfc4ff898353f147833d33fba24189d5527747c4 +size 225208789 diff --git a/checkpoints/model_weights_000638976000.pt b/checkpoints/model_weights_000638976000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a280ae5cf3d35c9fc3d87ad46ba4574aabce122e --- /dev/null +++ b/checkpoints/model_weights_000638976000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2d56e2688d3955baa40abce798f98bd41845c7a137bb2abdff39fec35f369b2 +size 225208789 diff --git a/checkpoints/model_weights_000655360000.pt b/checkpoints/model_weights_000655360000.pt new file mode 100644 index 
0000000000000000000000000000000000000000..fa0422c3b424e0f6e6391bdf75803311daa470e6 --- /dev/null +++ b/checkpoints/model_weights_000655360000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b871940e332627b814fd8ded62042d3cac4ee15884e7aab41c6dc01e25dfe7a3 +size 225208789 diff --git a/checkpoints/model_weights_000656801792.pt b/checkpoints/model_weights_000656801792.pt new file mode 100644 index 0000000000000000000000000000000000000000..e011fa7585ff2cb6a12f606af9dd4e47ec26cd16 --- /dev/null +++ b/checkpoints/model_weights_000656801792.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c5ebf06af5adf3f7dae09d00c4a98d052fd22c0ec7f86ec5ebe9c009500f750 +size 225208789 diff --git a/checkpoints/model_weights_000671744000.pt b/checkpoints/model_weights_000671744000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a58e2569f0617ed43806a2bda91ebb93f5d271d --- /dev/null +++ b/checkpoints/model_weights_000671744000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b88a29cac229cb9ff37be51822780a2222aa54ba1acc6209603dbdbd6698eb5c +size 225208789 diff --git a/checkpoints/model_weights_000688128000.pt b/checkpoints/model_weights_000688128000.pt new file mode 100644 index 0000000000000000000000000000000000000000..24b0a4cbc0eb3bc3418ba39620bd806e66596eae --- /dev/null +++ b/checkpoints/model_weights_000688128000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff82244659c41c870405ac677790c505f40617a6e521cb18b8c31bd6571bc47 +size 225208789 diff --git a/checkpoints/model_weights_000696221696.pt b/checkpoints/model_weights_000696221696.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c1eca6b4e6bf5938f8664eea00c5ff5fc05a8e2 --- /dev/null +++ b/checkpoints/model_weights_000696221696.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a1cf62ed23da8b27d967e1291ddf862e7e8c75bf32b90d79694a30003af234 +size 225208789 diff --git a/checkpoints/model_weights_000704512000.pt b/checkpoints/model_weights_000704512000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1128501e524dff7b4634d23a11688e4fe09d5655 --- /dev/null +++ b/checkpoints/model_weights_000704512000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fdb6220254494d6ac8effda811baf32f2b36716c7b0fc6892cd7d689ae3a71b +size 225208789 diff --git a/checkpoints/model_weights_000720896000.pt b/checkpoints/model_weights_000720896000.pt new file mode 100644 index 0000000000000000000000000000000000000000..7567ae56d3c11524806f1acc76f0b0fd25925c6c --- /dev/null +++ b/checkpoints/model_weights_000720896000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001f674a4e3710e74d393655472d0d8805dc7b35164d8c84f91988f9d8fd581d +size 225208789 diff --git a/checkpoints/model_weights_000737280000.pt b/checkpoints/model_weights_000737280000.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc13acf60fb018d6b674241fdd719b206c92751e --- /dev/null +++ b/checkpoints/model_weights_000737280000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de17a117ee6c4e254eef9616905f6369e5ed920314d066091b3a9857c3de4281 +size 225208789 diff --git a/checkpoints/model_weights_000738000896.pt b/checkpoints/model_weights_000738000896.pt new file mode 100644 index 0000000000000000000000000000000000000000..93c6059750e79be43d998586e8baea68268f41d0 --- /dev/null +++ b/checkpoints/model_weights_000738000896.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:bd08724a88fd0e90cdec481f1fb952eca1645de977cacaa9748eea8cebc6a284 +size 225208789 diff --git a/checkpoints/model_weights_000753664000.pt b/checkpoints/model_weights_000753664000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3aa0c1be8be02ec67ed00496536010c597a6bd1 --- /dev/null +++ b/checkpoints/model_weights_000753664000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d453d14f79f1da58936bcda2f2365b978ab4c6ba8ddd9a3a7ac8cdbf29e30142 +size 225208789 diff --git a/checkpoints/model_weights_000770048000.pt b/checkpoints/model_weights_000770048000.pt new file mode 100644 index 0000000000000000000000000000000000000000..59dc3e2382557614cabc0ef000c713131167bd1d --- /dev/null +++ b/checkpoints/model_weights_000770048000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0046a11b85de49cdb068f0724bbb347b8b5ac5dbf7229c991d711fae97c2104 +size 225208789 diff --git a/checkpoints/model_weights_000782270464.pt b/checkpoints/model_weights_000782270464.pt new file mode 100644 index 0000000000000000000000000000000000000000..d301f38ff7ece616cf5899a9e3da8a220da6ab92 --- /dev/null +++ b/checkpoints/model_weights_000782270464.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d10d6d1d92fa0945170c7eeb0054b8815546815aba08bcbac0a87eb216d743bb +size 225208789 diff --git a/checkpoints/model_weights_000786432000.pt b/checkpoints/model_weights_000786432000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9631d3f20e85873fab41b81257ecb5354a1d3c1d --- /dev/null +++ b/checkpoints/model_weights_000786432000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77db166be7a81b5dbce75eaf33e69856bce275619624a1c7d3050b9e0e827085 +size 225208789 diff --git a/checkpoints/model_weights_000802816000.pt b/checkpoints/model_weights_000802816000.pt new file mode 100644 index 0000000000000000000000000000000000000000..67ea356ad994dace215dbbe80c8cea629f675001 --- /dev/null +++ b/checkpoints/model_weights_000802816000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a0d207e4bf835055ad4d9ccca54288a8f3ced6722471c8f724bb0a927407dfd +size 225208789 diff --git a/checkpoints/model_weights_000819200000.pt b/checkpoints/model_weights_000819200000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce313cb070287b439902df5efa49d56de7fe6f53 --- /dev/null +++ b/checkpoints/model_weights_000819200000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d09476af62d740e8da02d3ed0e97f72ee25f2a7b0e2a20563d071f79285539c +size 225208789 diff --git a/checkpoints/model_weights_000829194240.pt b/checkpoints/model_weights_000829194240.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d323a190777997f73a001ba24b8b576bd948f23 --- /dev/null +++ b/checkpoints/model_weights_000829194240.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd742647c30464a46d677a6c3200d4f76f38c49d68b6af0e16327ba0e4d959c +size 225208789 diff --git a/checkpoints/model_weights_000835584000.pt b/checkpoints/model_weights_000835584000.pt new file mode 100644 index 0000000000000000000000000000000000000000..accd2163359990212bb41ad3184e96b39ea2b1d7 --- /dev/null +++ b/checkpoints/model_weights_000835584000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61de62994f7d8c971cd24b744fd614048c1feb0f91281737d354e62ab1df90bd +size 225208789 diff --git 
a/checkpoints/model_weights_000851968000.pt b/checkpoints/model_weights_000851968000.pt new file mode 100644 index 0000000000000000000000000000000000000000..33ea53b23229825fdb13046f058645728a3e5aa6 --- /dev/null +++ b/checkpoints/model_weights_000851968000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e78d8f9e3530ee21c64a977c23756ca7e00b67d96e164527082531f382dda975 +size 225208789 diff --git a/checkpoints/model_weights_000868352000.pt b/checkpoints/model_weights_000868352000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b59af56a35edbf765c78c5ef963370a3e8d5cc08 --- /dev/null +++ b/checkpoints/model_weights_000868352000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc56dea13a63e3917e7c4d4159447d765cbb3610678d7645bc167a9c2b04b77 +size 225208789 diff --git a/checkpoints/model_weights_000878968832.pt b/checkpoints/model_weights_000878968832.pt new file mode 100644 index 0000000000000000000000000000000000000000..554f801a05092c1b9aa131a00952b8accea30c82 --- /dev/null +++ b/checkpoints/model_weights_000878968832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85fa6645708d7883c2b4d1b4333c290bc682a36ca63ee92613074d4b74d3ecca +size 225208789 diff --git a/checkpoints/model_weights_000884736000.pt b/checkpoints/model_weights_000884736000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d09dfaeea87143d1f91c2c3c2f6e1afc8f8de3a --- /dev/null +++ b/checkpoints/model_weights_000884736000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca307a2f46e72836d218b5cd93a0fdf7a5a0281432c70e944b082220011356a7 +size 225208789 diff --git a/checkpoints/model_weights_000901120000.pt b/checkpoints/model_weights_000901120000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d66eb4908e5d68a3d38dcfc72c977e33d5988971 --- /dev/null +++ b/checkpoints/model_weights_000901120000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:664a27ae328f89414040e35b119503953c3185e23f11927215e2b67e2af75fac +size 225208789 diff --git a/checkpoints/model_weights_000917504000.pt b/checkpoints/model_weights_000917504000.pt new file mode 100644 index 0000000000000000000000000000000000000000..79407fa659b7462096d49da2b3c4246f71678f4a --- /dev/null +++ b/checkpoints/model_weights_000917504000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:166c1ae1a10294290d7b95a9e792e5306d742686304cbedea9107fa83b0df386 +size 225208789 diff --git a/checkpoints/model_weights_000931692544.pt b/checkpoints/model_weights_000931692544.pt new file mode 100644 index 0000000000000000000000000000000000000000..a00b9c979508455d471386c7cade32da65fcd07e --- /dev/null +++ b/checkpoints/model_weights_000931692544.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dac6e928a1331569e454d1470a9375d9309bc460dfc43cd3007e3a0d2c6e72ff +size 225208789 diff --git a/checkpoints/model_weights_000933888000.pt b/checkpoints/model_weights_000933888000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a30a21153a50ac299d273e1d6984d5108f89ba35 --- /dev/null +++ b/checkpoints/model_weights_000933888000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7917aeddf87e74c77dbc683041cf235a1dd3bdd646cfc64da1c6dab81ea5ce5b +size 225208789 diff --git a/checkpoints/model_weights_000950272000.pt b/checkpoints/model_weights_000950272000.pt new file mode 100644 index 
0000000000000000000000000000000000000000..caa7b5f4ee8d41a6d03fe8d203d8328cd62a2f5a --- /dev/null +++ b/checkpoints/model_weights_000950272000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba6792139d25d07443bfdff93b3465a3d23cc4e4b09b81e1a499954a29017c9 +size 225208789 diff --git a/checkpoints/model_weights_000952696832.pt b/checkpoints/model_weights_000952696832.pt new file mode 100644 index 0000000000000000000000000000000000000000..17bafe77180f3387af9cba18f4802adb85f8b448 --- /dev/null +++ b/checkpoints/model_weights_000952696832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62b7a32029b35adf4d000bd6a6ed504ee115c9c40e39f6f197e4e0ae4804c77 +size 225015557 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..c78f8b132c6da4593ac880d47ac2951898146e57 --- /dev/null +++ b/config.toml @@ -0,0 +1,32 @@ +model_name = "gelu_2l_v4" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 48262 +dataset_name = "NeelNanda/c4-code-tokenized-2b" +tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 22000000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "cosine_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.06 +save_log_checkpoints = true \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..06a34b8573898d8361daffb3c347457616d2288a --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a8c8281264b7fe4f7493bf0dff463d7198da7cb8b072f9aaf975fe015c70bc +size 225208311 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e0723c2391c35dde156e8b72df144e1e9121c5a0 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 29000, "tokens_seen": 950272000, "config": {"model_name": "gelu_2l_v4", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "NeelNanda/c4-code-tokenized-2b", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 22000000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "cosine_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.06, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 671386}, "train_loss_ewma": 3.7399442408263717} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d12d47d47926be8c79e4b26f96a09ee3600537e8 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d60a7ba56039bba9cc9bc422c091cca771cddcfae0c89415328a389e4137c12d +size 450422803 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..122ddb5e9329c0fcebd4b9d7df66890effe86817 --- /dev/null +++ b/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.train.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.train.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from the beginning ...') + train_transformer_from_config('config.toml', current_dir) +" +fi \ No newline at end of file diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9e609479874574a6a45dc17f5fd98cc47b701851 --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-08-17T23:13:35.027401827Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-17T23:13:35.390074378Z","level":"INFO","msg":"stream: created new stream","id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390133919Z","level":"INFO","msg":"stream: started","id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390167795Z","level":"INFO","msg":"writer: started","stream_id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390220023Z","level":"INFO","msg":"handler: started","stream_id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390286334Z","level":"INFO","msg":"sender: started","stream_id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.836761784Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-18T04:37:57.992197787Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-08-18T04:37:57.99521591Z","level":"INFO","msg":"stream: closing","id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.995240541Z","level":"INFO","msg":"handler: closed","stream_id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.995293994Z","level":"INFO","msg":"sender: closed","stream_id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.995303271Z","level":"INFO","msg":"stream: closed","id":"ztcapltu"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..71497b7e16be463200d033d42be9d95b9ef4377b --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,28 @@ +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Configure stats pid to 155 +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v4/wandb/settings +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Loading settings
from environment variables +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v4/wandb/run-20250817_231334-ztcapltu/logs/debug.log +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v4/wandb/run-20250817_231334-ztcapltu/logs/debug-internal.log +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:init():830] calling init triggers +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_v4', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'NeelNanda/c4-code-tokenized-2b', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 22000000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'cosine_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.06, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 671386, '_wandb': {}} +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:init():871] starting backend +2025-08-17 23:13:35,012 INFO MainThread:155 [wandb_init.py:init():874] sending inform_init request +2025-08-17 23:13:35,025 INFO MainThread:155 [wandb_init.py:init():882] backend started and connected +2025-08-17 23:13:35,026 INFO MainThread:155 [wandb_init.py:init():953] updated telemetry +2025-08-17 23:13:35,032 INFO MainThread:155 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-17 23:13:35,613 INFO MainThread:155 [wandb_init.py:init():1029] starting run threads in backend +2025-08-17 23:13:36,205 INFO MainThread:155 [wandb_run.py:_console_start():2494] atexit reg +2025-08-17 23:13:36,205 INFO MainThread:155 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-17 23:13:36,205 INFO MainThread:155 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-17 23:13:36,206 INFO MainThread:155 [wandb_run.py:_redirect():2434] Redirects installed. 
+2025-08-17 23:13:36,219 INFO MainThread:155 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-18 04:37:57,417 INFO MainThread:155 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/ztcapltu +2025-08-18 04:37:57,422 INFO MainThread:155 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-08-18 04:37:57,422 INFO MainThread:155 [wandb_run.py:_restore():2441] restore +2025-08-18 04:37:57,422 INFO MainThread:155 [wandb_run.py:_restore():2447] restore done +2025-08-18 04:37:57,993 INFO MainThread:155 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-08-18 04:37:57,994 INFO MainThread:155 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-08-18 04:37:57,994 INFO MainThread:155 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20250817_231334-ztcapltu/files/config.yaml b/wandb/run-20250817_231334-ztcapltu/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ecf76f01c56fc29be90c85179dc09c277b4f2641 --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/files/config.yaml @@ -0,0 +1,129 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + 62x2pn3fvnbphvb760dj9u9xjsfoa3n2: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "119256522752" + email: efarrel4@tcd.ie + executable: /notebooks/clean_env/bin/python + git: + commit: d64789fa6192a1a6beb031f1d38c9cfcfa725511 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-7cbd3200-160d-1175-56c6-469e5662f695 + host: negsg2jtgf + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/models/gelu_2l_v4 + startedAt: "2025-08-17T23:13:34.541556Z" + writerId: 62x2pn3fvnbphvb760dj9u9xjsfoa3n2 + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 49 + - 51 + "2": + - 1 + - 49 + - 51 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.1 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.06 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 48262 +dataset_name: + value: NeelNanda/c4-code-tokenized-2b +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: cosine_warmup +lr_vector: + value: 0.001 +max_steps: + value: 671386 +max_tokens: + value: 22000000000 +model_name: + value: gelu_2l_v4 +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: NeelNanda/gpt-neox-tokenizer-digits +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20250817_231334-ztcapltu/files/output.log b/wandb/run-20250817_231334-ztcapltu/files/output.log new file mode 100644 index 
0000000000000000000000000000000000000000..426b4639026855562b062a6f49d254719a03da76 --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/files/output.log @@ -0,0 +1,1172 @@ +Training on cuda +Model: 2L, 512d, 8h +Max steps: 671,386, Max tokens: 22,000,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.7912 | Learning Rate: 0.000055 | Progress: 0.00004 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.5542 | Learning Rate: 0.000109 | Progress: 0.00007 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 10.1907 | Learning Rate: 0.000164 | Progress: 0.00011 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.7065 | Learning Rate: 0.000219 | Progress: 0.00015 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 9.1967 | Learning Rate: 0.000273 | Progress: 0.00019 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.7312 | Learning Rate: 0.000328 | Progress: 0.00022 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 8.3128 | Learning Rate: 0.000383 | Progress: 0.00026 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.9425 | Learning Rate: 0.000437 | Progress: 0.00030 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.6338 | Learning Rate: 0.000492 | Progress: 0.00034 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 7.3691 | Learning Rate: 0.000546 | Progress: 0.00037 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 7.1527 | Learning Rate: 0.000601 | Progress: 0.00041 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.9707 | Learning Rate: 0.000656 | Progress: 0.00045 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.8138 | Learning Rate: 0.000710 | Progress: 0.00048 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.6855 | Learning Rate: 0.000765 | Progress: 0.00052 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 6.5790 | Learning Rate: 0.000820 | Progress: 0.00056 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 6.4915 | Learning Rate: 0.000874 | Progress: 0.00060 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 6.4028 | Learning Rate: 0.000929 | Progress: 0.00063 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 6.3279 | Learning Rate: 0.000984 | Progress: 0.00067 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 6.2639 | Learning Rate: 0.001038 | Progress: 0.00071 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 6.2079 | Learning Rate: 0.001093 | Progress: 0.00074 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 6.1643 | Learning Rate: 0.001148 | Progress: 0.00078 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 6.1188 | Learning Rate: 0.001202 | Progress: 0.00082 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 6.0746 | Learning Rate: 0.001257 | Progress: 0.00086 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 6.0286 | Learning Rate: 0.001311 | Progress: 0.00089 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.9939 | Learning Rate: 0.001366 | Progress: 0.00093 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.9554 | Learning Rate: 0.001421 | Progress: 0.00097 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.9233 | Learning Rate: 0.001475 | Progress: 0.00101 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.8890 | Learning Rate: 0.001530 | Progress: 0.00104 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.8623 | Learning Rate: 0.001585 | Progress: 0.00108 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.8382 | Learning Rate: 0.001639 | Progress: 0.00112 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 5.8180 | Learning Rate: 
0.001694 | Progress: 0.00115 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 5.7921 | Learning Rate: 0.001749 | Progress: 0.00119 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 5.7636 | Learning Rate: 0.001803 | Progress: 0.00123 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 5.7382 | Learning Rate: 0.001858 | Progress: 0.00127 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 5.7103 | Learning Rate: 0.001913 | Progress: 0.00130 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 5.6889 | Learning Rate: 0.001967 | Progress: 0.00134 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 5.6649 | Learning Rate: 0.002000 | Progress: 0.00138 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 5.6362 | Learning Rate: 0.002000 | Progress: 0.00141 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 5.6077 | Learning Rate: 0.002000 | Progress: 0.00145 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 5.5736 | Learning Rate: 0.002000 | Progress: 0.00149 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 5.5384 | Learning Rate: 0.002000 | Progress: 0.00153 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 5.5140 | Learning Rate: 0.002000 | Progress: 0.00156 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 5.4891 | Learning Rate: 0.002000 | Progress: 0.00160 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 5.4566 | Learning Rate: 0.002000 | Progress: 0.00164 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 5.4370 | Learning Rate: 0.002000 | Progress: 0.00168 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 5.4059 | Learning Rate: 0.002000 | Progress: 0.00171 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 5.3765 | Learning Rate: 0.002000 | Progress: 0.00175 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 5.3588 | Learning Rate: 0.002000 | Progress: 0.00179 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 5.3307 | Learning Rate: 0.002000 | Progress: 0.00182 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 5.3099 | Learning Rate: 0.002000 | Progress: 0.00186 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 5.2815 | Learning Rate: 0.002000 | Progress: 0.00190 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 5.2658 | Learning Rate: 0.002000 | Progress: 0.00194 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 5.2482 | Learning Rate: 0.002000 | Progress: 0.00197 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 5.2329 | Learning Rate: 0.002000 | Progress: 0.00201 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 5.2153 | Learning Rate: 0.002000 | Progress: 0.00205 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 5.1967 | Learning Rate: 0.002000 | Progress: 0.00209 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 5.1778 | Learning Rate: 0.002000 | Progress: 0.00212 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 5.1586 | Learning Rate: 0.002000 | Progress: 0.00216 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 5.1395 | Learning Rate: 0.002000 | Progress: 0.00220 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 5.1212 | Learning Rate: 0.002000 | Progress: 0.00223 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 5.1130 | Learning Rate: 0.002000 | Progress: 0.00227 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 5.1056 | Learning Rate: 0.002000 | Progress: 0.00231 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 5.0853 | Learning Rate: 0.002000 | Progress: 0.00235 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 5.0718 | Learning Rate: 0.002000 | Progress: 0.00238 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 5.0590 | Learning Rate: 
0.002000 | Progress: 0.00242 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 5.0556 | Learning Rate: 0.002000 | Progress: 0.00246 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 5.0493 | Learning Rate: 0.002000 | Progress: 0.00249 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 5.0393 | Learning Rate: 0.002000 | Progress: 0.00253 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 5.0186 | Learning Rate: 0.002000 | Progress: 0.00257 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 5.0060 | Learning Rate: 0.002000 | Progress: 0.00261 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 4.9968 | Learning Rate: 0.002000 | Progress: 0.00264 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 4.9907 | Learning Rate: 0.002000 | Progress: 0.00268 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 4.9783 | Learning Rate: 0.002000 | Progress: 0.00272 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 4.9612 | Learning Rate: 0.002000 | Progress: 0.00276 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 4.9420 | Learning Rate: 0.002000 | Progress: 0.00279 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 4.9201 | Learning Rate: 0.002000 | Progress: 0.00283 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 4.9114 | Learning Rate: 0.002000 | Progress: 0.00287 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 4.8954 | Learning Rate: 0.002000 | Progress: 0.00290 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 4.8654 | Learning Rate: 0.002000 | Progress: 0.00294 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 4.8423 | Learning Rate: 0.002000 | Progress: 0.00298 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 4.8300 | Learning Rate: 0.002000 | Progress: 0.00302 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 4.8224 | Learning Rate: 0.002000 | Progress: 0.00305 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 4.8059 | Learning Rate: 0.002000 | Progress: 0.00309 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 4.7922 | Learning Rate: 0.002000 | Progress: 0.00313 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 4.7768 | Learning Rate: 0.002000 | Progress: 0.00317 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 4.7689 | Learning Rate: 0.002000 | Progress: 0.00320 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 4.7559 | Learning Rate: 0.002000 | Progress: 0.00324 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 4.7384 | Learning Rate: 0.002000 | Progress: 0.00328 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 4.7111 | Learning Rate: 0.002000 | Progress: 0.00331 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 4.6991 | Learning Rate: 0.002000 | Progress: 0.00335 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 4.6869 | Learning Rate: 0.002000 | Progress: 0.00339 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 4.6668 | Learning Rate: 0.002000 | Progress: 0.00343 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 4.6494 | Learning Rate: 0.002000 | Progress: 0.00346 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 4.6313 | Learning Rate: 0.002000 | Progress: 0.00350 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 4.6194 | Learning Rate: 0.002000 | Progress: 0.00354 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 4.6112 | Learning Rate: 0.002000 | Progress: 0.00357 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 4.6000 | Learning Rate: 0.002000 | Progress: 0.00361 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 4.5893 | Learning Rate: 0.002000 | Progress: 0.00365 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 4.5839 | 
Learning Rate: 0.002000 | Progress: 0.00369
+Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 4.5726 | Learning Rate: 0.002000 | Progress: 0.00372
[training log continues every 25 steps between the milestone entries shown below; Train Loss EWMA falls from ~4.57 to ~3.79 over steps 2,500–19,150]
+Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 4.4068 | Learning Rate: 0.002000 | Progress: 0.00447
+Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 4.2195 | Learning Rate: 0.002000 | Progress: 0.00596
+Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 4.1319 | Learning Rate: 0.002000 | Progress: 0.00745
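The step, token, and progress columns are locked together: each 25-step log interval adds exactly 819,200 tokens, i.e. 32,768 tokens per step, and the Progress column is consistent with tokens seen divided by a budget of about 22 B tokens (at step 13,425 the log reads 439,910,400 tokens at Progress 0.02000, and 439,910,400 / 22e9 ≈ 0.02000). A minimal sketch of that bookkeeping, with both constants inferred from the log rather than taken from the training code:

```python
# Sketch of the bookkeeping behind the log lines above. Both constants are
# inferred from the logged values themselves, not read from the training code:
#   tokens per step : 819,200 tokens per 25 steps = 32,768
#   token budget    : Progress ~= tokens_seen / 22e9
TOKENS_PER_STEP = 32_768
MAX_TOKENS = 22_000_000_000

def log_line(step: int, loss_ewma: float, lr: float) -> str:
    tokens_seen = step * TOKENS_PER_STEP
    progress = tokens_seen / MAX_TOKENS
    return (f"Step {step:,} | Tokens: {tokens_seen:,} | "
            f"Train Loss EWMA: {loss_ewma:.4f} | "
            f"Learning Rate: {lr:.6f} | Progress: {progress:.5f}")

# Reproduces e.g. "Step 10,000 | Tokens: 327,680,000 | ... | Progress: 0.01489"
print(log_line(10_000, 3.9144, 0.001999))
```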
+Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 4.0832 | Learning Rate: 0.002000 | Progress: 0.00894
+Step 7,000 | Tokens: 229,376,000 | Train Loss EWMA: 4.0331 | Learning Rate: 0.002000 | Progress: 0.01043
+Step 8,000 | Tokens: 262,144,000 | Train Loss EWMA: 3.9703 | Learning Rate: 0.001999 | Progress: 0.01192
+Step 9,000 | Tokens: 294,912,000 | Train Loss EWMA: 3.9344 | Learning Rate: 0.001999 | Progress: 0.01341
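"Train Loss EWMA" is an exponentially weighted moving average of the per-step loss, which is why the column drifts smoothly and can tick upward for stretches (as between steps 8,500 and 8,600) even while the underlying loss trends down. A minimal sketch of the update; the smoothing factor is not recoverable from the log, so β = 0.99 below is an assumed, typical value, giving an effective averaging window of roughly 100 steps:

```python
# Sketch of the "Train Loss EWMA" column: an exponentially weighted moving
# average of the per-step training loss. beta = 0.99 is an assumption
# (a common choice), not a value recoverable from this log.

def ewma_update(prev_ewma: float, step_loss: float, beta: float = 0.99) -> float:
    """Keep `beta` of the running average, blend in `1 - beta` of the new loss."""
    return beta * prev_ewma + (1.0 - beta) * step_loss

ewma = 4.5726                            # the value logged at step 2,500 above
for step_loss in (4.51, 4.62, 4.48):     # illustrative per-step losses
    ewma = ewma_update(ewma, step_loss)
print(f"Train Loss EWMA: {ewma:.4f}")    # moves only ~1% toward each new loss
```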
+Step 10,000 | Tokens: 327,680,000 | Train Loss EWMA: 3.9144 | Learning Rate: 0.001999 | Progress: 0.01489
+Step 11,000 | Tokens: 360,448,000 | Train Loss EWMA: 3.9052 | Learning Rate: 0.001999 | Progress: 0.01638
+Step 12,000 | Tokens: 393,216,000 | Train Loss EWMA: 3.8640 | Learning Rate: 0.001999 | Progress: 0.01787
+Step 13,000 | Tokens: 425,984,000 | Train Loss EWMA: 3.8658 | Learning Rate: 0.001998 | Progress: 0.01936
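The learning-rate column is effectively flat: it reads 0.002000 for the first ~7,650 steps, ticks to 0.001999 at step 7,675, 0.001998 around step 12,625, and 0.001996 by step 18,800. That is what the very start of a cosine decay looks like when the end of the schedule is hundreds of thousands of steps away. A sketch of a warmup-plus-cosine schedule consistent with these readings; the warmup length (915 steps) and total step count (671,386 ≈ 22 B tokens / 32,768 tokens per step) are assumptions used for illustration:

```python
# Sketch of a cosine learning-rate schedule with linear warmup that matches
# the near-flat column above. WARMUP_STEPS and MAX_STEPS are assumptions
# chosen for illustration, not values read from the training code.
import math

PEAK_LR = 0.002
WARMUP_STEPS = 915        # assumed warmup length
MAX_STEPS = 671_386       # assumed total steps (~22B tokens / 32,768 per step)

def cosine_warmup_lr(step: int) -> float:
    if step < WARMUP_STEPS:
        return PEAK_LR * step / WARMUP_STEPS      # linear warmup to the peak
    # cosine decay from the peak over the remaining steps
    t = (step - WARMUP_STEPS) / (MAX_STEPS - WARMUP_STEPS)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * t))

# Early in the run the cosine is nearly flat: ~0.002000 at step 5,000,
# ~0.001999 by step 7,700, ~0.001998 by step 12,650 -- matching the log.
for step in (5_000, 7_700, 12_650):
    print(step, f"{cosine_warmup_lr(step):.6f}")
```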
+Step 14,000 | Tokens: 458,752,000 | Train Loss EWMA: 3.8439 | Learning Rate: 0.001998 | Progress: 0.02085
+Step 15,000 | Tokens: 491,520,000 | Train Loss EWMA: 3.8359 | Learning Rate: 0.001998 | Progress: 0.02234
+Step 16,000 | Tokens: 524,288,000 | Train Loss EWMA: 3.8238 | Learning Rate: 0.001998 | Progress: 0.02383
+Step 17,000 | Tokens: 557,056,000 | Train Loss EWMA: 3.8085 | Learning Rate: 0.001997 | Progress: 0.02532
+Step 18,000 | Tokens: 589,824,000 | Train Loss EWMA: 3.7864 | Learning Rate: 0.001997 | Progress: 0.02681
+Step 19,000 | Tokens: 622,592,000 | Train Loss EWMA: 3.7969 | Learning Rate: 0.001996 | Progress: 0.02830
+Step 19,150 | Tokens: 627,507,200 | Train Loss EWMA: 3.7859
| Learning Rate: 0.001996 | Progress: 0.02852 +Step 19,175 | Tokens: 628,326,400 | Train Loss EWMA: 3.7845 | Learning Rate: 0.001996 | Progress: 0.02856 +Step 19,200 | Tokens: 629,145,600 | Train Loss EWMA: 3.7943 | Learning Rate: 0.001996 | Progress: 0.02860 +Step 19,225 | Tokens: 629,964,800 | Train Loss EWMA: 3.8008 | Learning Rate: 0.001996 | Progress: 0.02863 +Step 19,250 | Tokens: 630,784,000 | Train Loss EWMA: 3.7986 | Learning Rate: 0.001996 | Progress: 0.02867 +Step 19,275 | Tokens: 631,603,200 | Train Loss EWMA: 3.8034 | Learning Rate: 0.001996 | Progress: 0.02871 +Step 19,300 | Tokens: 632,422,400 | Train Loss EWMA: 3.7983 | Learning Rate: 0.001996 | Progress: 0.02875 +Step 19,325 | Tokens: 633,241,600 | Train Loss EWMA: 3.7948 | Learning Rate: 0.001996 | Progress: 0.02878 +Step 19,350 | Tokens: 634,060,800 | Train Loss EWMA: 3.7915 | Learning Rate: 0.001996 | Progress: 0.02882 +Step 19,375 | Tokens: 634,880,000 | Train Loss EWMA: 3.7831 | Learning Rate: 0.001996 | Progress: 0.02886 +Step 19,400 | Tokens: 635,699,200 | Train Loss EWMA: 3.7863 | Learning Rate: 0.001996 | Progress: 0.02890 +Step 19,425 | Tokens: 636,518,400 | Train Loss EWMA: 3.7934 | Learning Rate: 0.001996 | Progress: 0.02893 +Step 19,450 | Tokens: 637,337,600 | Train Loss EWMA: 3.7952 | Learning Rate: 0.001996 | Progress: 0.02897 +Step 19,475 | Tokens: 638,156,800 | Train Loss EWMA: 3.7862 | Learning Rate: 0.001996 | Progress: 0.02901 +Step 19,500 | Tokens: 638,976,000 | Train Loss EWMA: 3.7886 | Learning Rate: 0.001996 | Progress: 0.02904 +Step 19,525 | Tokens: 639,795,200 | Train Loss EWMA: 3.7784 | Learning Rate: 0.001996 | Progress: 0.02908 +Step 19,550 | Tokens: 640,614,400 | Train Loss EWMA: 3.7805 | Learning Rate: 0.001996 | Progress: 0.02912 +Step 19,575 | Tokens: 641,433,600 | Train Loss EWMA: 3.7761 | Learning Rate: 0.001996 | Progress: 0.02916 +Step 19,600 | Tokens: 642,252,800 | Train Loss EWMA: 3.7791 | Learning Rate: 0.001996 | Progress: 0.02919 +Step 19,625 | Tokens: 643,072,000 | Train Loss EWMA: 3.7729 | Learning Rate: 0.001996 | Progress: 0.02923 +Step 19,650 | Tokens: 643,891,200 | Train Loss EWMA: 3.7800 | Learning Rate: 0.001996 | Progress: 0.02927 +Step 19,675 | Tokens: 644,710,400 | Train Loss EWMA: 3.7749 | Learning Rate: 0.001996 | Progress: 0.02931 +Step 19,700 | Tokens: 645,529,600 | Train Loss EWMA: 3.7703 | Learning Rate: 0.001996 | Progress: 0.02934 +Step 19,725 | Tokens: 646,348,800 | Train Loss EWMA: 3.7665 | Learning Rate: 0.001996 | Progress: 0.02938 +Step 19,750 | Tokens: 647,168,000 | Train Loss EWMA: 3.7653 | Learning Rate: 0.001996 | Progress: 0.02942 +Step 19,775 | Tokens: 647,987,200 | Train Loss EWMA: 3.7673 | Learning Rate: 0.001996 | Progress: 0.02945 +Step 19,800 | Tokens: 648,806,400 | Train Loss EWMA: 3.7715 | Learning Rate: 0.001996 | Progress: 0.02949 +Step 19,825 | Tokens: 649,625,600 | Train Loss EWMA: 3.7832 | Learning Rate: 0.001996 | Progress: 0.02953 +Step 19,850 | Tokens: 650,444,800 | Train Loss EWMA: 3.7836 | Learning Rate: 0.001996 | Progress: 0.02957 +Step 19,875 | Tokens: 651,264,000 | Train Loss EWMA: 3.7749 | Learning Rate: 0.001996 | Progress: 0.02960 +Step 19,900 | Tokens: 652,083,200 | Train Loss EWMA: 3.7784 | Learning Rate: 0.001996 | Progress: 0.02964 +Step 19,925 | Tokens: 652,902,400 | Train Loss EWMA: 3.7909 | Learning Rate: 0.001996 | Progress: 0.02968 +Step 19,950 | Tokens: 653,721,600 | Train Loss EWMA: 3.7905 | Learning Rate: 0.001996 | Progress: 0.02971 +Step 19,975 | Tokens: 654,540,800 | Train Loss EWMA: 3.7877 | Learning Rate: 
0.001996 | Progress: 0.02975 +Step 20,000 | Tokens: 655,360,000 | Train Loss EWMA: 3.7773 | Learning Rate: 0.001996 | Progress: 0.02979 +Step 20,025 | Tokens: 656,179,200 | Train Loss EWMA: 3.7834 | Learning Rate: 0.001996 | Progress: 0.02983 +Step 20,050 | Tokens: 656,998,400 | Train Loss EWMA: 3.7805 | Learning Rate: 0.001996 | Progress: 0.02986 +Step 20,075 | Tokens: 657,817,600 | Train Loss EWMA: 3.7800 | Learning Rate: 0.001996 | Progress: 0.02990 +Step 20,100 | Tokens: 658,636,800 | Train Loss EWMA: 3.7836 | Learning Rate: 0.001996 | Progress: 0.02994 +Step 20,125 | Tokens: 659,456,000 | Train Loss EWMA: 3.7862 | Learning Rate: 0.001996 | Progress: 0.02998 +Step 20,150 | Tokens: 660,275,200 | Train Loss EWMA: 3.7869 | Learning Rate: 0.001996 | Progress: 0.03001 +Step 20,175 | Tokens: 661,094,400 | Train Loss EWMA: 3.7860 | Learning Rate: 0.001996 | Progress: 0.03005 +Step 20,200 | Tokens: 661,913,600 | Train Loss EWMA: 3.7943 | Learning Rate: 0.001996 | Progress: 0.03009 +Step 20,225 | Tokens: 662,732,800 | Train Loss EWMA: 3.7919 | Learning Rate: 0.001996 | Progress: 0.03012 +Step 20,250 | Tokens: 663,552,000 | Train Loss EWMA: 3.7828 | Learning Rate: 0.001996 | Progress: 0.03016 +Step 20,275 | Tokens: 664,371,200 | Train Loss EWMA: 3.7795 | Learning Rate: 0.001996 | Progress: 0.03020 +Step 20,300 | Tokens: 665,190,400 | Train Loss EWMA: 3.7772 | Learning Rate: 0.001996 | Progress: 0.03024 +Step 20,325 | Tokens: 666,009,600 | Train Loss EWMA: 3.7811 | Learning Rate: 0.001996 | Progress: 0.03027 +Step 20,350 | Tokens: 666,828,800 | Train Loss EWMA: 3.7873 | Learning Rate: 0.001996 | Progress: 0.03031 +Step 20,375 | Tokens: 667,648,000 | Train Loss EWMA: 3.7895 | Learning Rate: 0.001996 | Progress: 0.03035 +Step 20,400 | Tokens: 668,467,200 | Train Loss EWMA: 3.7858 | Learning Rate: 0.001996 | Progress: 0.03038 +Step 20,425 | Tokens: 669,286,400 | Train Loss EWMA: 3.7919 | Learning Rate: 0.001996 | Progress: 0.03042 +Step 20,450 | Tokens: 670,105,600 | Train Loss EWMA: 3.7912 | Learning Rate: 0.001996 | Progress: 0.03046 +Step 20,475 | Tokens: 670,924,800 | Train Loss EWMA: 3.8000 | Learning Rate: 0.001996 | Progress: 0.03050 +Step 20,500 | Tokens: 671,744,000 | Train Loss EWMA: 3.7989 | Learning Rate: 0.001996 | Progress: 0.03053 +Step 20,525 | Tokens: 672,563,200 | Train Loss EWMA: 3.7852 | Learning Rate: 0.001996 | Progress: 0.03057 +Step 20,550 | Tokens: 673,382,400 | Train Loss EWMA: 3.7888 | Learning Rate: 0.001996 | Progress: 0.03061 +Step 20,575 | Tokens: 674,201,600 | Train Loss EWMA: 3.7905 | Learning Rate: 0.001996 | Progress: 0.03065 +Step 20,600 | Tokens: 675,020,800 | Train Loss EWMA: 3.7904 | Learning Rate: 0.001996 | Progress: 0.03068 +Step 20,625 | Tokens: 675,840,000 | Train Loss EWMA: 3.7907 | Learning Rate: 0.001996 | Progress: 0.03072 +Step 20,650 | Tokens: 676,659,200 | Train Loss EWMA: 3.7944 | Learning Rate: 0.001996 | Progress: 0.03076 +Step 20,675 | Tokens: 677,478,400 | Train Loss EWMA: 3.7885 | Learning Rate: 0.001996 | Progress: 0.03079 +Step 20,700 | Tokens: 678,297,600 | Train Loss EWMA: 3.7841 | Learning Rate: 0.001996 | Progress: 0.03083 +Step 20,725 | Tokens: 679,116,800 | Train Loss EWMA: 3.7825 | Learning Rate: 0.001996 | Progress: 0.03087 +Step 20,750 | Tokens: 679,936,000 | Train Loss EWMA: 3.7879 | Learning Rate: 0.001996 | Progress: 0.03091 +Step 20,775 | Tokens: 680,755,200 | Train Loss EWMA: 3.7904 | Learning Rate: 0.001996 | Progress: 0.03094 +Step 20,800 | Tokens: 681,574,400 | Train Loss EWMA: 3.7812 | Learning Rate: 0.001996 | Progress: 
0.03098 +Step 20,825 | Tokens: 682,393,600 | Train Loss EWMA: 3.7776 | Learning Rate: 0.001996 | Progress: 0.03102 +Step 20,850 | Tokens: 683,212,800 | Train Loss EWMA: 3.7682 | Learning Rate: 0.001996 | Progress: 0.03106 +Step 20,875 | Tokens: 684,032,000 | Train Loss EWMA: 3.7746 | Learning Rate: 0.001996 | Progress: 0.03109 +Step 20,900 | Tokens: 684,851,200 | Train Loss EWMA: 3.7787 | Learning Rate: 0.001996 | Progress: 0.03113 +Step 20,925 | Tokens: 685,670,400 | Train Loss EWMA: 3.7779 | Learning Rate: 0.001996 | Progress: 0.03117 +Step 20,950 | Tokens: 686,489,600 | Train Loss EWMA: 3.7835 | Learning Rate: 0.001996 | Progress: 0.03120 +Step 20,975 | Tokens: 687,308,800 | Train Loss EWMA: 3.7914 | Learning Rate: 0.001996 | Progress: 0.03124 +Step 21,000 | Tokens: 688,128,000 | Train Loss EWMA: 3.7927 | Learning Rate: 0.001996 | Progress: 0.03128 +Step 21,025 | Tokens: 688,947,200 | Train Loss EWMA: 3.7988 | Learning Rate: 0.001996 | Progress: 0.03132 +Step 21,050 | Tokens: 689,766,400 | Train Loss EWMA: 3.7894 | Learning Rate: 0.001996 | Progress: 0.03135 +Step 21,075 | Tokens: 690,585,600 | Train Loss EWMA: 3.7827 | Learning Rate: 0.001996 | Progress: 0.03139 +Step 21,100 | Tokens: 691,404,800 | Train Loss EWMA: 3.7826 | Learning Rate: 0.001996 | Progress: 0.03143 +Step 21,125 | Tokens: 692,224,000 | Train Loss EWMA: 3.7806 | Learning Rate: 0.001996 | Progress: 0.03146 +Step 21,150 | Tokens: 693,043,200 | Train Loss EWMA: 3.7934 | Learning Rate: 0.001996 | Progress: 0.03150 +Step 21,175 | Tokens: 693,862,400 | Train Loss EWMA: 3.7951 | Learning Rate: 0.001995 | Progress: 0.03154 +Step 21,200 | Tokens: 694,681,600 | Train Loss EWMA: 3.7815 | Learning Rate: 0.001995 | Progress: 0.03158 +Step 21,225 | Tokens: 695,500,800 | Train Loss EWMA: 3.7782 | Learning Rate: 0.001995 | Progress: 0.03161 +Step 21,250 | Tokens: 696,320,000 | Train Loss EWMA: 3.7809 | Learning Rate: 0.001995 | Progress: 0.03165 +Step 21,275 | Tokens: 697,139,200 | Train Loss EWMA: 3.7761 | Learning Rate: 0.001995 | Progress: 0.03169 +Step 21,300 | Tokens: 697,958,400 | Train Loss EWMA: 3.7760 | Learning Rate: 0.001995 | Progress: 0.03173 +Step 21,325 | Tokens: 698,777,600 | Train Loss EWMA: 3.7738 | Learning Rate: 0.001995 | Progress: 0.03176 +Step 21,350 | Tokens: 699,596,800 | Train Loss EWMA: 3.7793 | Learning Rate: 0.001995 | Progress: 0.03180 +Step 21,375 | Tokens: 700,416,000 | Train Loss EWMA: 3.7724 | Learning Rate: 0.001995 | Progress: 0.03184 +Step 21,400 | Tokens: 701,235,200 | Train Loss EWMA: 3.7680 | Learning Rate: 0.001995 | Progress: 0.03187 +Step 21,425 | Tokens: 702,054,400 | Train Loss EWMA: 3.7790 | Learning Rate: 0.001995 | Progress: 0.03191 +Step 21,450 | Tokens: 702,873,600 | Train Loss EWMA: 3.7757 | Learning Rate: 0.001995 | Progress: 0.03195 +Step 21,475 | Tokens: 703,692,800 | Train Loss EWMA: 3.7804 | Learning Rate: 0.001995 | Progress: 0.03199 +Step 21,500 | Tokens: 704,512,000 | Train Loss EWMA: 3.7789 | Learning Rate: 0.001995 | Progress: 0.03202 +Step 21,525 | Tokens: 705,331,200 | Train Loss EWMA: 3.7702 | Learning Rate: 0.001995 | Progress: 0.03206 +Step 21,550 | Tokens: 706,150,400 | Train Loss EWMA: 3.7652 | Learning Rate: 0.001995 | Progress: 0.03210 +Step 21,575 | Tokens: 706,969,600 | Train Loss EWMA: 3.7652 | Learning Rate: 0.001995 | Progress: 0.03213 +Step 21,600 | Tokens: 707,788,800 | Train Loss EWMA: 3.7654 | Learning Rate: 0.001995 | Progress: 0.03217 +Step 21,625 | Tokens: 708,608,000 | Train Loss EWMA: 3.7600 | Learning Rate: 0.001995 | Progress: 0.03221 +Step 21,650 | 
Tokens: 709,427,200 | Train Loss EWMA: 3.7658 | Learning Rate: 0.001995 | Progress: 0.03225 +Step 21,675 | Tokens: 710,246,400 | Train Loss EWMA: 3.7714 | Learning Rate: 0.001995 | Progress: 0.03228 +Step 21,700 | Tokens: 711,065,600 | Train Loss EWMA: 3.7733 | Learning Rate: 0.001995 | Progress: 0.03232 +Step 21,725 | Tokens: 711,884,800 | Train Loss EWMA: 3.7703 | Learning Rate: 0.001995 | Progress: 0.03236 +Step 21,750 | Tokens: 712,704,000 | Train Loss EWMA: 3.7648 | Learning Rate: 0.001995 | Progress: 0.03240 +Step 21,775 | Tokens: 713,523,200 | Train Loss EWMA: 3.7604 | Learning Rate: 0.001995 | Progress: 0.03243 +Step 21,800 | Tokens: 714,342,400 | Train Loss EWMA: 3.7625 | Learning Rate: 0.001995 | Progress: 0.03247 +Step 21,825 | Tokens: 715,161,600 | Train Loss EWMA: 3.7627 | Learning Rate: 0.001995 | Progress: 0.03251 +Step 21,850 | Tokens: 715,980,800 | Train Loss EWMA: 3.7485 | Learning Rate: 0.001995 | Progress: 0.03254 +Step 21,875 | Tokens: 716,800,000 | Train Loss EWMA: 3.7464 | Learning Rate: 0.001995 | Progress: 0.03258 +Step 21,900 | Tokens: 717,619,200 | Train Loss EWMA: 3.7506 | Learning Rate: 0.001995 | Progress: 0.03262 +Step 21,925 | Tokens: 718,438,400 | Train Loss EWMA: 3.7660 | Learning Rate: 0.001995 | Progress: 0.03266 +Step 21,950 | Tokens: 719,257,600 | Train Loss EWMA: 3.7745 | Learning Rate: 0.001995 | Progress: 0.03269 +Step 21,975 | Tokens: 720,076,800 | Train Loss EWMA: 3.7777 | Learning Rate: 0.001995 | Progress: 0.03273 +Step 22,000 | Tokens: 720,896,000 | Train Loss EWMA: 3.7695 | Learning Rate: 0.001995 | Progress: 0.03277 +Step 22,025 | Tokens: 721,715,200 | Train Loss EWMA: 3.7690 | Learning Rate: 0.001995 | Progress: 0.03281 +Step 22,050 | Tokens: 722,534,400 | Train Loss EWMA: 3.7620 | Learning Rate: 0.001995 | Progress: 0.03284 +Step 22,075 | Tokens: 723,353,600 | Train Loss EWMA: 3.7600 | Learning Rate: 0.001995 | Progress: 0.03288 +Step 22,100 | Tokens: 724,172,800 | Train Loss EWMA: 3.7640 | Learning Rate: 0.001995 | Progress: 0.03292 +Step 22,125 | Tokens: 724,992,000 | Train Loss EWMA: 3.7661 | Learning Rate: 0.001995 | Progress: 0.03295 +Step 22,150 | Tokens: 725,811,200 | Train Loss EWMA: 3.7654 | Learning Rate: 0.001995 | Progress: 0.03299 +Step 22,175 | Tokens: 726,630,400 | Train Loss EWMA: 3.7623 | Learning Rate: 0.001995 | Progress: 0.03303 +Step 22,200 | Tokens: 727,449,600 | Train Loss EWMA: 3.7702 | Learning Rate: 0.001995 | Progress: 0.03307 +Step 22,225 | Tokens: 728,268,800 | Train Loss EWMA: 3.7643 | Learning Rate: 0.001995 | Progress: 0.03310 +Step 22,250 | Tokens: 729,088,000 | Train Loss EWMA: 3.7640 | Learning Rate: 0.001995 | Progress: 0.03314 +Step 22,275 | Tokens: 729,907,200 | Train Loss EWMA: 3.7595 | Learning Rate: 0.001995 | Progress: 0.03318 +Step 22,300 | Tokens: 730,726,400 | Train Loss EWMA: 3.7602 | Learning Rate: 0.001995 | Progress: 0.03321 +Step 22,325 | Tokens: 731,545,600 | Train Loss EWMA: 3.7591 | Learning Rate: 0.001995 | Progress: 0.03325 +Step 22,350 | Tokens: 732,364,800 | Train Loss EWMA: 3.7468 | Learning Rate: 0.001995 | Progress: 0.03329 +Step 22,375 | Tokens: 733,184,000 | Train Loss EWMA: 3.7552 | Learning Rate: 0.001995 | Progress: 0.03333 +Step 22,400 | Tokens: 734,003,200 | Train Loss EWMA: 3.7665 | Learning Rate: 0.001995 | Progress: 0.03336 +Step 22,425 | Tokens: 734,822,400 | Train Loss EWMA: 3.7662 | Learning Rate: 0.001995 | Progress: 0.03340 +Step 22,450 | Tokens: 735,641,600 | Train Loss EWMA: 3.7685 | Learning Rate: 0.001995 | Progress: 0.03344 +Step 22,475 | Tokens: 736,460,800 | 
Train Loss EWMA: 3.7762 | Learning Rate: 0.001995 | Progress: 0.03348 +Step 22,500 | Tokens: 737,280,000 | Train Loss EWMA: 3.7742 | Learning Rate: 0.001995 | Progress: 0.03351 +Step 22,525 | Tokens: 738,099,200 | Train Loss EWMA: 3.7710 | Learning Rate: 0.001995 | Progress: 0.03355 +Step 22,550 | Tokens: 738,918,400 | Train Loss EWMA: 3.7575 | Learning Rate: 0.001995 | Progress: 0.03359 +Step 22,575 | Tokens: 739,737,600 | Train Loss EWMA: 3.7579 | Learning Rate: 0.001995 | Progress: 0.03362 +Step 22,600 | Tokens: 740,556,800 | Train Loss EWMA: 3.7624 | Learning Rate: 0.001995 | Progress: 0.03366 +Step 22,625 | Tokens: 741,376,000 | Train Loss EWMA: 3.7615 | Learning Rate: 0.001995 | Progress: 0.03370 +Step 22,650 | Tokens: 742,195,200 | Train Loss EWMA: 3.7657 | Learning Rate: 0.001995 | Progress: 0.03374 +Step 22,675 | Tokens: 743,014,400 | Train Loss EWMA: 3.7693 | Learning Rate: 0.001995 | Progress: 0.03377 +Step 22,700 | Tokens: 743,833,600 | Train Loss EWMA: 3.7751 | Learning Rate: 0.001995 | Progress: 0.03381 +Step 22,725 | Tokens: 744,652,800 | Train Loss EWMA: 3.7772 | Learning Rate: 0.001995 | Progress: 0.03385 +Step 22,750 | Tokens: 745,472,000 | Train Loss EWMA: 3.7792 | Learning Rate: 0.001995 | Progress: 0.03389 +Step 22,775 | Tokens: 746,291,200 | Train Loss EWMA: 3.7718 | Learning Rate: 0.001995 | Progress: 0.03392 +Step 22,800 | Tokens: 747,110,400 | Train Loss EWMA: 3.7712 | Learning Rate: 0.001995 | Progress: 0.03396 +Step 22,825 | Tokens: 747,929,600 | Train Loss EWMA: 3.7820 | Learning Rate: 0.001995 | Progress: 0.03400 +Step 22,850 | Tokens: 748,748,800 | Train Loss EWMA: 3.7892 | Learning Rate: 0.001995 | Progress: 0.03403 +Step 22,875 | Tokens: 749,568,000 | Train Loss EWMA: 3.7831 | Learning Rate: 0.001995 | Progress: 0.03407 +Step 22,900 | Tokens: 750,387,200 | Train Loss EWMA: 3.7837 | Learning Rate: 0.001995 | Progress: 0.03411 +Step 22,925 | Tokens: 751,206,400 | Train Loss EWMA: 3.7893 | Learning Rate: 0.001995 | Progress: 0.03415 +Step 22,950 | Tokens: 752,025,600 | Train Loss EWMA: 3.7745 | Learning Rate: 0.001995 | Progress: 0.03418 +Step 22,975 | Tokens: 752,844,800 | Train Loss EWMA: 3.7779 | Learning Rate: 0.001995 | Progress: 0.03422 +Step 23,000 | Tokens: 753,664,000 | Train Loss EWMA: 3.7704 | Learning Rate: 0.001995 | Progress: 0.03426 +Step 23,025 | Tokens: 754,483,200 | Train Loss EWMA: 3.7851 | Learning Rate: 0.001995 | Progress: 0.03429 +Step 23,050 | Tokens: 755,302,400 | Train Loss EWMA: 3.7804 | Learning Rate: 0.001995 | Progress: 0.03433 +Step 23,075 | Tokens: 756,121,600 | Train Loss EWMA: 3.7686 | Learning Rate: 0.001995 | Progress: 0.03437 +Step 23,100 | Tokens: 756,940,800 | Train Loss EWMA: 3.7674 | Learning Rate: 0.001995 | Progress: 0.03441 +Step 23,125 | Tokens: 757,760,000 | Train Loss EWMA: 3.7638 | Learning Rate: 0.001995 | Progress: 0.03444 +Step 23,150 | Tokens: 758,579,200 | Train Loss EWMA: 3.7677 | Learning Rate: 0.001995 | Progress: 0.03448 +Step 23,175 | Tokens: 759,398,400 | Train Loss EWMA: 3.7625 | Learning Rate: 0.001995 | Progress: 0.03452 +Step 23,200 | Tokens: 760,217,600 | Train Loss EWMA: 3.7807 | Learning Rate: 0.001995 | Progress: 0.03456 +Step 23,225 | Tokens: 761,036,800 | Train Loss EWMA: 3.7901 | Learning Rate: 0.001995 | Progress: 0.03459 +Step 23,250 | Tokens: 761,856,000 | Train Loss EWMA: 3.7771 | Learning Rate: 0.001995 | Progress: 0.03463 +Step 23,275 | Tokens: 762,675,200 | Train Loss EWMA: 3.7691 | Learning Rate: 0.001995 | Progress: 0.03467 +Step 23,300 | Tokens: 763,494,400 | Train Loss EWMA: 3.7714 
| Learning Rate: 0.001995 | Progress: 0.03470 +Step 23,325 | Tokens: 764,313,600 | Train Loss EWMA: 3.7672 | Learning Rate: 0.001994 | Progress: 0.03474 +Step 23,350 | Tokens: 765,132,800 | Train Loss EWMA: 3.7754 | Learning Rate: 0.001994 | Progress: 0.03478 +Step 23,375 | Tokens: 765,952,000 | Train Loss EWMA: 3.7822 | Learning Rate: 0.001994 | Progress: 0.03482 +Step 23,400 | Tokens: 766,771,200 | Train Loss EWMA: 3.7887 | Learning Rate: 0.001994 | Progress: 0.03485 +Step 23,425 | Tokens: 767,590,400 | Train Loss EWMA: 3.7830 | Learning Rate: 0.001994 | Progress: 0.03489 +Step 23,450 | Tokens: 768,409,600 | Train Loss EWMA: 3.7785 | Learning Rate: 0.001994 | Progress: 0.03493 +Step 23,475 | Tokens: 769,228,800 | Train Loss EWMA: 3.7852 | Learning Rate: 0.001994 | Progress: 0.03496 +Step 23,500 | Tokens: 770,048,000 | Train Loss EWMA: 3.7697 | Learning Rate: 0.001994 | Progress: 0.03500 +Step 23,525 | Tokens: 770,867,200 | Train Loss EWMA: 3.7734 | Learning Rate: 0.001994 | Progress: 0.03504 +Step 23,550 | Tokens: 771,686,400 | Train Loss EWMA: 3.7585 | Learning Rate: 0.001994 | Progress: 0.03508 +Step 23,575 | Tokens: 772,505,600 | Train Loss EWMA: 3.7581 | Learning Rate: 0.001994 | Progress: 0.03511 +Step 23,600 | Tokens: 773,324,800 | Train Loss EWMA: 3.7626 | Learning Rate: 0.001994 | Progress: 0.03515 +Step 23,625 | Tokens: 774,144,000 | Train Loss EWMA: 3.7639 | Learning Rate: 0.001994 | Progress: 0.03519 +Step 23,650 | Tokens: 774,963,200 | Train Loss EWMA: 3.7621 | Learning Rate: 0.001994 | Progress: 0.03523 +Step 23,675 | Tokens: 775,782,400 | Train Loss EWMA: 3.7553 | Learning Rate: 0.001994 | Progress: 0.03526 +Step 23,700 | Tokens: 776,601,600 | Train Loss EWMA: 3.7624 | Learning Rate: 0.001994 | Progress: 0.03530 +Step 23,725 | Tokens: 777,420,800 | Train Loss EWMA: 3.7703 | Learning Rate: 0.001994 | Progress: 0.03534 +Step 23,750 | Tokens: 778,240,000 | Train Loss EWMA: 3.7612 | Learning Rate: 0.001994 | Progress: 0.03537 +Step 23,775 | Tokens: 779,059,200 | Train Loss EWMA: 3.7523 | Learning Rate: 0.001994 | Progress: 0.03541 +Step 23,800 | Tokens: 779,878,400 | Train Loss EWMA: 3.7468 | Learning Rate: 0.001994 | Progress: 0.03545 +Step 23,825 | Tokens: 780,697,600 | Train Loss EWMA: 3.7486 | Learning Rate: 0.001994 | Progress: 0.03549 +Step 23,850 | Tokens: 781,516,800 | Train Loss EWMA: 3.7607 | Learning Rate: 0.001994 | Progress: 0.03552 +Step 23,875 | Tokens: 782,336,000 | Train Loss EWMA: 3.7585 | Learning Rate: 0.001994 | Progress: 0.03556 +Step 23,900 | Tokens: 783,155,200 | Train Loss EWMA: 3.7571 | Learning Rate: 0.001994 | Progress: 0.03560 +Step 23,925 | Tokens: 783,974,400 | Train Loss EWMA: 3.7555 | Learning Rate: 0.001994 | Progress: 0.03564 +Step 23,950 | Tokens: 784,793,600 | Train Loss EWMA: 3.7553 | Learning Rate: 0.001994 | Progress: 0.03567 +Step 23,975 | Tokens: 785,612,800 | Train Loss EWMA: 3.7640 | Learning Rate: 0.001994 | Progress: 0.03571 +Step 24,000 | Tokens: 786,432,000 | Train Loss EWMA: 3.7617 | Learning Rate: 0.001994 | Progress: 0.03575 +Step 24,025 | Tokens: 787,251,200 | Train Loss EWMA: 3.7649 | Learning Rate: 0.001994 | Progress: 0.03578 +Step 24,050 | Tokens: 788,070,400 | Train Loss EWMA: 3.7667 | Learning Rate: 0.001994 | Progress: 0.03582 +Step 24,075 | Tokens: 788,889,600 | Train Loss EWMA: 3.7649 | Learning Rate: 0.001994 | Progress: 0.03586 +Step 24,100 | Tokens: 789,708,800 | Train Loss EWMA: 3.7686 | Learning Rate: 0.001994 | Progress: 0.03590 +Step 24,125 | Tokens: 790,528,000 | Train Loss EWMA: 3.7649 | Learning Rate: 
0.001994 | Progress: 0.03593 +Step 24,150 | Tokens: 791,347,200 | Train Loss EWMA: 3.7687 | Learning Rate: 0.001994 | Progress: 0.03597 +Step 24,175 | Tokens: 792,166,400 | Train Loss EWMA: 3.7807 | Learning Rate: 0.001994 | Progress: 0.03601 +Step 24,200 | Tokens: 792,985,600 | Train Loss EWMA: 3.7744 | Learning Rate: 0.001994 | Progress: 0.03604 +Step 24,225 | Tokens: 793,804,800 | Train Loss EWMA: 3.7663 | Learning Rate: 0.001994 | Progress: 0.03608 +Step 24,250 | Tokens: 794,624,000 | Train Loss EWMA: 3.7748 | Learning Rate: 0.001994 | Progress: 0.03612 +Step 24,275 | Tokens: 795,443,200 | Train Loss EWMA: 3.7759 | Learning Rate: 0.001994 | Progress: 0.03616 +Step 24,300 | Tokens: 796,262,400 | Train Loss EWMA: 3.7723 | Learning Rate: 0.001994 | Progress: 0.03619 +Step 24,325 | Tokens: 797,081,600 | Train Loss EWMA: 3.7700 | Learning Rate: 0.001994 | Progress: 0.03623 +Step 24,350 | Tokens: 797,900,800 | Train Loss EWMA: 3.7619 | Learning Rate: 0.001994 | Progress: 0.03627 +Step 24,375 | Tokens: 798,720,000 | Train Loss EWMA: 3.7527 | Learning Rate: 0.001994 | Progress: 0.03631 +Step 24,400 | Tokens: 799,539,200 | Train Loss EWMA: 3.7619 | Learning Rate: 0.001994 | Progress: 0.03634 +Step 24,425 | Tokens: 800,358,400 | Train Loss EWMA: 3.7576 | Learning Rate: 0.001994 | Progress: 0.03638 +Step 24,450 | Tokens: 801,177,600 | Train Loss EWMA: 3.7524 | Learning Rate: 0.001994 | Progress: 0.03642 +Step 24,475 | Tokens: 801,996,800 | Train Loss EWMA: 3.7436 | Learning Rate: 0.001994 | Progress: 0.03645 +Step 24,500 | Tokens: 802,816,000 | Train Loss EWMA: 3.7463 | Learning Rate: 0.001994 | Progress: 0.03649 +Step 24,525 | Tokens: 803,635,200 | Train Loss EWMA: 3.7543 | Learning Rate: 0.001994 | Progress: 0.03653 +Step 24,550 | Tokens: 804,454,400 | Train Loss EWMA: 3.7520 | Learning Rate: 0.001994 | Progress: 0.03657 +Step 24,575 | Tokens: 805,273,600 | Train Loss EWMA: 3.7477 | Learning Rate: 0.001994 | Progress: 0.03660 +Step 24,600 | Tokens: 806,092,800 | Train Loss EWMA: 3.7616 | Learning Rate: 0.001994 | Progress: 0.03664 +Step 24,625 | Tokens: 806,912,000 | Train Loss EWMA: 3.7562 | Learning Rate: 0.001994 | Progress: 0.03668 +Step 24,650 | Tokens: 807,731,200 | Train Loss EWMA: 3.7583 | Learning Rate: 0.001994 | Progress: 0.03672 +Step 24,675 | Tokens: 808,550,400 | Train Loss EWMA: 3.7667 | Learning Rate: 0.001994 | Progress: 0.03675 +Step 24,700 | Tokens: 809,369,600 | Train Loss EWMA: 3.7669 | Learning Rate: 0.001994 | Progress: 0.03679 +Step 24,725 | Tokens: 810,188,800 | Train Loss EWMA: 3.7552 | Learning Rate: 0.001994 | Progress: 0.03683 +Step 24,750 | Tokens: 811,008,000 | Train Loss EWMA: 3.7553 | Learning Rate: 0.001994 | Progress: 0.03686 +Step 24,775 | Tokens: 811,827,200 | Train Loss EWMA: 3.7588 | Learning Rate: 0.001994 | Progress: 0.03690 +Step 24,800 | Tokens: 812,646,400 | Train Loss EWMA: 3.7662 | Learning Rate: 0.001994 | Progress: 0.03694 +Step 24,825 | Tokens: 813,465,600 | Train Loss EWMA: 3.7635 | Learning Rate: 0.001994 | Progress: 0.03698 +Step 24,850 | Tokens: 814,284,800 | Train Loss EWMA: 3.7772 | Learning Rate: 0.001994 | Progress: 0.03701 +Step 24,875 | Tokens: 815,104,000 | Train Loss EWMA: 3.7759 | Learning Rate: 0.001994 | Progress: 0.03705 +Step 24,900 | Tokens: 815,923,200 | Train Loss EWMA: 3.7743 | Learning Rate: 0.001994 | Progress: 0.03709 +Step 24,925 | Tokens: 816,742,400 | Train Loss EWMA: 3.7711 | Learning Rate: 0.001994 | Progress: 0.03712 +Step 24,950 | Tokens: 817,561,600 | Train Loss EWMA: 3.7686 | Learning Rate: 0.001994 | Progress: 
0.03716 +Step 24,975 | Tokens: 818,380,800 | Train Loss EWMA: 3.7639 | Learning Rate: 0.001994 | Progress: 0.03720 +Step 25,000 | Tokens: 819,200,000 | Train Loss EWMA: 3.7674 | Learning Rate: 0.001994 | Progress: 0.03724 +Step 25,025 | Tokens: 820,019,200 | Train Loss EWMA: 3.7612 | Learning Rate: 0.001994 | Progress: 0.03727 +Step 25,050 | Tokens: 820,838,400 | Train Loss EWMA: 3.7609 | Learning Rate: 0.001994 | Progress: 0.03731 +Step 25,075 | Tokens: 821,657,600 | Train Loss EWMA: 3.7531 | Learning Rate: 0.001994 | Progress: 0.03735 +Step 25,100 | Tokens: 822,476,800 | Train Loss EWMA: 3.7442 | Learning Rate: 0.001994 | Progress: 0.03739 +Step 25,125 | Tokens: 823,296,000 | Train Loss EWMA: 3.7465 | Learning Rate: 0.001994 | Progress: 0.03742 +Step 25,150 | Tokens: 824,115,200 | Train Loss EWMA: 3.7409 | Learning Rate: 0.001994 | Progress: 0.03746 +Step 25,175 | Tokens: 824,934,400 | Train Loss EWMA: 3.7480 | Learning Rate: 0.001994 | Progress: 0.03750 +Step 25,200 | Tokens: 825,753,600 | Train Loss EWMA: 3.7584 | Learning Rate: 0.001994 | Progress: 0.03753 +Step 25,225 | Tokens: 826,572,800 | Train Loss EWMA: 3.7547 | Learning Rate: 0.001994 | Progress: 0.03757 +Step 25,250 | Tokens: 827,392,000 | Train Loss EWMA: 3.7603 | Learning Rate: 0.001994 | Progress: 0.03761 +Step 25,275 | Tokens: 828,211,200 | Train Loss EWMA: 3.7544 | Learning Rate: 0.001993 | Progress: 0.03765 +Step 25,300 | Tokens: 829,030,400 | Train Loss EWMA: 3.7704 | Learning Rate: 0.001993 | Progress: 0.03768 +Step 25,325 | Tokens: 829,849,600 | Train Loss EWMA: 3.7750 | Learning Rate: 0.001993 | Progress: 0.03772 +Step 25,350 | Tokens: 830,668,800 | Train Loss EWMA: 3.7799 | Learning Rate: 0.001993 | Progress: 0.03776 +Step 25,375 | Tokens: 831,488,000 | Train Loss EWMA: 3.7732 | Learning Rate: 0.001993 | Progress: 0.03779 +Step 25,400 | Tokens: 832,307,200 | Train Loss EWMA: 3.7615 | Learning Rate: 0.001993 | Progress: 0.03783 +Step 25,425 | Tokens: 833,126,400 | Train Loss EWMA: 3.7518 | Learning Rate: 0.001993 | Progress: 0.03787 +Step 25,450 | Tokens: 833,945,600 | Train Loss EWMA: 3.7611 | Learning Rate: 0.001993 | Progress: 0.03791 +Step 25,475 | Tokens: 834,764,800 | Train Loss EWMA: 3.7665 | Learning Rate: 0.001993 | Progress: 0.03794 +Step 25,500 | Tokens: 835,584,000 | Train Loss EWMA: 3.7643 | Learning Rate: 0.001993 | Progress: 0.03798 +Step 25,525 | Tokens: 836,403,200 | Train Loss EWMA: 3.7704 | Learning Rate: 0.001993 | Progress: 0.03802 +Step 25,550 | Tokens: 837,222,400 | Train Loss EWMA: 3.7654 | Learning Rate: 0.001993 | Progress: 0.03806 +Step 25,575 | Tokens: 838,041,600 | Train Loss EWMA: 3.7678 | Learning Rate: 0.001993 | Progress: 0.03809 +Step 25,600 | Tokens: 838,860,800 | Train Loss EWMA: 3.7610 | Learning Rate: 0.001993 | Progress: 0.03813 +Step 25,625 | Tokens: 839,680,000 | Train Loss EWMA: 3.7566 | Learning Rate: 0.001993 | Progress: 0.03817 +Step 25,650 | Tokens: 840,499,200 | Train Loss EWMA: 3.7623 | Learning Rate: 0.001993 | Progress: 0.03820 +Step 25,675 | Tokens: 841,318,400 | Train Loss EWMA: 3.7669 | Learning Rate: 0.001993 | Progress: 0.03824 +Step 25,700 | Tokens: 842,137,600 | Train Loss EWMA: 3.7705 | Learning Rate: 0.001993 | Progress: 0.03828 +Step 25,725 | Tokens: 842,956,800 | Train Loss EWMA: 3.7694 | Learning Rate: 0.001993 | Progress: 0.03832 +Step 25,750 | Tokens: 843,776,000 | Train Loss EWMA: 3.7741 | Learning Rate: 0.001993 | Progress: 0.03835 +Step 25,775 | Tokens: 844,595,200 | Train Loss EWMA: 3.7760 | Learning Rate: 0.001993 | Progress: 0.03839 +Step 25,800 | 
Tokens: 845,414,400 | Train Loss EWMA: 3.7756 | Learning Rate: 0.001993 | Progress: 0.03843 +Step 25,825 | Tokens: 846,233,600 | Train Loss EWMA: 3.7726 | Learning Rate: 0.001993 | Progress: 0.03847 +Step 25,850 | Tokens: 847,052,800 | Train Loss EWMA: 3.7606 | Learning Rate: 0.001993 | Progress: 0.03850 +Step 25,875 | Tokens: 847,872,000 | Train Loss EWMA: 3.7625 | Learning Rate: 0.001993 | Progress: 0.03854 +Step 25,900 | Tokens: 848,691,200 | Train Loss EWMA: 3.7602 | Learning Rate: 0.001993 | Progress: 0.03858 +Step 25,925 | Tokens: 849,510,400 | Train Loss EWMA: 3.7588 | Learning Rate: 0.001993 | Progress: 0.03861 +Step 25,950 | Tokens: 850,329,600 | Train Loss EWMA: 3.7644 | Learning Rate: 0.001993 | Progress: 0.03865 +Step 25,975 | Tokens: 851,148,800 | Train Loss EWMA: 3.7600 | Learning Rate: 0.001993 | Progress: 0.03869 +Step 26,000 | Tokens: 851,968,000 | Train Loss EWMA: 3.7512 | Learning Rate: 0.001993 | Progress: 0.03873 +Step 26,025 | Tokens: 852,787,200 | Train Loss EWMA: 3.7386 | Learning Rate: 0.001993 | Progress: 0.03876 +Step 26,050 | Tokens: 853,606,400 | Train Loss EWMA: 3.7418 | Learning Rate: 0.001993 | Progress: 0.03880 +Step 26,075 | Tokens: 854,425,600 | Train Loss EWMA: 3.7383 | Learning Rate: 0.001993 | Progress: 0.03884 +Step 26,100 | Tokens: 855,244,800 | Train Loss EWMA: 3.7451 | Learning Rate: 0.001993 | Progress: 0.03887 +Step 26,125 | Tokens: 856,064,000 | Train Loss EWMA: 3.7332 | Learning Rate: 0.001993 | Progress: 0.03891 +Step 26,150 | Tokens: 856,883,200 | Train Loss EWMA: 3.7446 | Learning Rate: 0.001993 | Progress: 0.03895 +Step 26,175 | Tokens: 857,702,400 | Train Loss EWMA: 3.7398 | Learning Rate: 0.001993 | Progress: 0.03899 +Step 26,200 | Tokens: 858,521,600 | Train Loss EWMA: 3.7512 | Learning Rate: 0.001993 | Progress: 0.03902 +Step 26,225 | Tokens: 859,340,800 | Train Loss EWMA: 3.7449 | Learning Rate: 0.001993 | Progress: 0.03906 +Step 26,250 | Tokens: 860,160,000 | Train Loss EWMA: 3.7514 | Learning Rate: 0.001993 | Progress: 0.03910 +Step 26,275 | Tokens: 860,979,200 | Train Loss EWMA: 3.7561 | Learning Rate: 0.001993 | Progress: 0.03914 +Step 26,300 | Tokens: 861,798,400 | Train Loss EWMA: 3.7650 | Learning Rate: 0.001993 | Progress: 0.03917 +Step 26,325 | Tokens: 862,617,600 | Train Loss EWMA: 3.7654 | Learning Rate: 0.001993 | Progress: 0.03921 +Step 26,350 | Tokens: 863,436,800 | Train Loss EWMA: 3.7704 | Learning Rate: 0.001993 | Progress: 0.03925 +Step 26,375 | Tokens: 864,256,000 | Train Loss EWMA: 3.7815 | Learning Rate: 0.001993 | Progress: 0.03928 +Step 26,400 | Tokens: 865,075,200 | Train Loss EWMA: 3.7845 | Learning Rate: 0.001993 | Progress: 0.03932 +Step 26,425 | Tokens: 865,894,400 | Train Loss EWMA: 3.7649 | Learning Rate: 0.001993 | Progress: 0.03936 +Step 26,450 | Tokens: 866,713,600 | Train Loss EWMA: 3.7628 | Learning Rate: 0.001993 | Progress: 0.03940 +Step 26,475 | Tokens: 867,532,800 | Train Loss EWMA: 3.7558 | Learning Rate: 0.001993 | Progress: 0.03943 +Step 26,500 | Tokens: 868,352,000 | Train Loss EWMA: 3.7534 | Learning Rate: 0.001993 | Progress: 0.03947 +Step 26,525 | Tokens: 869,171,200 | Train Loss EWMA: 3.7494 | Learning Rate: 0.001993 | Progress: 0.03951 +Step 26,550 | Tokens: 869,990,400 | Train Loss EWMA: 3.7492 | Learning Rate: 0.001993 | Progress: 0.03955 +Step 26,575 | Tokens: 870,809,600 | Train Loss EWMA: 3.7559 | Learning Rate: 0.001993 | Progress: 0.03958 +Step 26,600 | Tokens: 871,628,800 | Train Loss EWMA: 3.7586 | Learning Rate: 0.001993 | Progress: 0.03962 +Step 26,625 | Tokens: 872,448,000 | 
Train Loss EWMA: 3.7629 | Learning Rate: 0.001993 | Progress: 0.03966 +Step 26,650 | Tokens: 873,267,200 | Train Loss EWMA: 3.7663 | Learning Rate: 0.001993 | Progress: 0.03969 +Step 26,675 | Tokens: 874,086,400 | Train Loss EWMA: 3.7670 | Learning Rate: 0.001993 | Progress: 0.03973 +Step 26,700 | Tokens: 874,905,600 | Train Loss EWMA: 3.7678 | Learning Rate: 0.001993 | Progress: 0.03977 +Step 26,725 | Tokens: 875,724,800 | Train Loss EWMA: 3.7779 | Learning Rate: 0.001993 | Progress: 0.03981 +Step 26,750 | Tokens: 876,544,000 | Train Loss EWMA: 3.7629 | Learning Rate: 0.001993 | Progress: 0.03984 +Step 26,775 | Tokens: 877,363,200 | Train Loss EWMA: 3.7609 | Learning Rate: 0.001993 | Progress: 0.03988 +Step 26,800 | Tokens: 878,182,400 | Train Loss EWMA: 3.7512 | Learning Rate: 0.001993 | Progress: 0.03992 +Step 26,825 | Tokens: 879,001,600 | Train Loss EWMA: 3.7594 | Learning Rate: 0.001993 | Progress: 0.03995 +Step 26,850 | Tokens: 879,820,800 | Train Loss EWMA: 3.7531 | Learning Rate: 0.001993 | Progress: 0.03999 +Step 26,875 | Tokens: 880,640,000 | Train Loss EWMA: 3.7616 | Learning Rate: 0.001993 | Progress: 0.04003 +Step 26,900 | Tokens: 881,459,200 | Train Loss EWMA: 3.7611 | Learning Rate: 0.001993 | Progress: 0.04007 +Step 26,925 | Tokens: 882,278,400 | Train Loss EWMA: 3.7578 | Learning Rate: 0.001993 | Progress: 0.04010 +Step 26,950 | Tokens: 883,097,600 | Train Loss EWMA: 3.7456 | Learning Rate: 0.001993 | Progress: 0.04014 +Step 26,975 | Tokens: 883,916,800 | Train Loss EWMA: 3.7438 | Learning Rate: 0.001993 | Progress: 0.04018 +Step 27,000 | Tokens: 884,736,000 | Train Loss EWMA: 3.7466 | Learning Rate: 0.001993 | Progress: 0.04022 +Step 27,025 | Tokens: 885,555,200 | Train Loss EWMA: 3.7518 | Learning Rate: 0.001993 | Progress: 0.04025 +Step 27,050 | Tokens: 886,374,400 | Train Loss EWMA: 3.7544 | Learning Rate: 0.001993 | Progress: 0.04029 +Step 27,075 | Tokens: 887,193,600 | Train Loss EWMA: 3.7619 | Learning Rate: 0.001992 | Progress: 0.04033 +Step 27,100 | Tokens: 888,012,800 | Train Loss EWMA: 3.7572 | Learning Rate: 0.001992 | Progress: 0.04036 +Step 27,125 | Tokens: 888,832,000 | Train Loss EWMA: 3.7515 | Learning Rate: 0.001992 | Progress: 0.04040 +Step 27,150 | Tokens: 889,651,200 | Train Loss EWMA: 3.7633 | Learning Rate: 0.001992 | Progress: 0.04044 +Step 27,175 | Tokens: 890,470,400 | Train Loss EWMA: 3.7646 | Learning Rate: 0.001992 | Progress: 0.04048 +Step 27,200 | Tokens: 891,289,600 | Train Loss EWMA: 3.7609 | Learning Rate: 0.001992 | Progress: 0.04051 +Step 27,225 | Tokens: 892,108,800 | Train Loss EWMA: 3.7624 | Learning Rate: 0.001992 | Progress: 0.04055 +Step 27,250 | Tokens: 892,928,000 | Train Loss EWMA: 3.7635 | Learning Rate: 0.001992 | Progress: 0.04059 +Step 27,275 | Tokens: 893,747,200 | Train Loss EWMA: 3.7621 | Learning Rate: 0.001992 | Progress: 0.04062 +Step 27,300 | Tokens: 894,566,400 | Train Loss EWMA: 3.7622 | Learning Rate: 0.001992 | Progress: 0.04066 +Step 27,325 | Tokens: 895,385,600 | Train Loss EWMA: 3.7687 | Learning Rate: 0.001992 | Progress: 0.04070 +Step 27,350 | Tokens: 896,204,800 | Train Loss EWMA: 3.7664 | Learning Rate: 0.001992 | Progress: 0.04074 +Step 27,375 | Tokens: 897,024,000 | Train Loss EWMA: 3.7582 | Learning Rate: 0.001992 | Progress: 0.04077 +Step 27,400 | Tokens: 897,843,200 | Train Loss EWMA: 3.7513 | Learning Rate: 0.001992 | Progress: 0.04081 +Step 27,425 | Tokens: 898,662,400 | Train Loss EWMA: 3.7520 | Learning Rate: 0.001992 | Progress: 0.04085 +Step 27,450 | Tokens: 899,481,600 | Train Loss EWMA: 3.7588 
| Learning Rate: 0.001992 | Progress: 0.04089 +Step 27,475 | Tokens: 900,300,800 | Train Loss EWMA: 3.7485 | Learning Rate: 0.001992 | Progress: 0.04092 +Step 27,500 | Tokens: 901,120,000 | Train Loss EWMA: 3.7678 | Learning Rate: 0.001992 | Progress: 0.04096 +Step 27,525 | Tokens: 901,939,200 | Train Loss EWMA: 3.7688 | Learning Rate: 0.001992 | Progress: 0.04100 +Step 27,550 | Tokens: 902,758,400 | Train Loss EWMA: 3.7569 | Learning Rate: 0.001992 | Progress: 0.04103 +Step 27,575 | Tokens: 903,577,600 | Train Loss EWMA: 3.7506 | Learning Rate: 0.001992 | Progress: 0.04107 +Step 27,600 | Tokens: 904,396,800 | Train Loss EWMA: 3.7592 | Learning Rate: 0.001992 | Progress: 0.04111 +Step 27,625 | Tokens: 905,216,000 | Train Loss EWMA: 3.7512 | Learning Rate: 0.001992 | Progress: 0.04115 +Step 27,650 | Tokens: 906,035,200 | Train Loss EWMA: 3.7495 | Learning Rate: 0.001992 | Progress: 0.04118 +Step 27,675 | Tokens: 906,854,400 | Train Loss EWMA: 3.7463 | Learning Rate: 0.001992 | Progress: 0.04122 +Step 27,700 | Tokens: 907,673,600 | Train Loss EWMA: 3.7498 | Learning Rate: 0.001992 | Progress: 0.04126 +Step 27,725 | Tokens: 908,492,800 | Train Loss EWMA: 3.7497 | Learning Rate: 0.001992 | Progress: 0.04130 +Step 27,750 | Tokens: 909,312,000 | Train Loss EWMA: 3.7488 | Learning Rate: 0.001992 | Progress: 0.04133 +Step 27,775 | Tokens: 910,131,200 | Train Loss EWMA: 3.7552 | Learning Rate: 0.001992 | Progress: 0.04137 +Step 27,800 | Tokens: 910,950,400 | Train Loss EWMA: 3.7490 | Learning Rate: 0.001992 | Progress: 0.04141 +Step 27,825 | Tokens: 911,769,600 | Train Loss EWMA: 3.7361 | Learning Rate: 0.001992 | Progress: 0.04144 +Step 27,850 | Tokens: 912,588,800 | Train Loss EWMA: 3.7371 | Learning Rate: 0.001992 | Progress: 0.04148 +Step 27,875 | Tokens: 913,408,000 | Train Loss EWMA: 3.7476 | Learning Rate: 0.001992 | Progress: 0.04152 +Step 27,900 | Tokens: 914,227,200 | Train Loss EWMA: 3.7440 | Learning Rate: 0.001992 | Progress: 0.04156 +Step 27,925 | Tokens: 915,046,400 | Train Loss EWMA: 3.7468 | Learning Rate: 0.001992 | Progress: 0.04159 +Step 27,950 | Tokens: 915,865,600 | Train Loss EWMA: 3.7380 | Learning Rate: 0.001992 | Progress: 0.04163 +Step 27,975 | Tokens: 916,684,800 | Train Loss EWMA: 3.7301 | Learning Rate: 0.001992 | Progress: 0.04167 +Step 28,000 | Tokens: 917,504,000 | Train Loss EWMA: 3.7365 | Learning Rate: 0.001992 | Progress: 0.04170 +Step 28,025 | Tokens: 918,323,200 | Train Loss EWMA: 3.7442 | Learning Rate: 0.001992 | Progress: 0.04174 +Step 28,050 | Tokens: 919,142,400 | Train Loss EWMA: 3.7461 | Learning Rate: 0.001992 | Progress: 0.04178 +Step 28,075 | Tokens: 919,961,600 | Train Loss EWMA: 3.7528 | Learning Rate: 0.001992 | Progress: 0.04182 +Step 28,100 | Tokens: 920,780,800 | Train Loss EWMA: 3.7486 | Learning Rate: 0.001992 | Progress: 0.04185 +Step 28,125 | Tokens: 921,600,000 | Train Loss EWMA: 3.7492 | Learning Rate: 0.001992 | Progress: 0.04189 +Step 28,150 | Tokens: 922,419,200 | Train Loss EWMA: 3.7575 | Learning Rate: 0.001992 | Progress: 0.04193 +Step 28,175 | Tokens: 923,238,400 | Train Loss EWMA: 3.7540 | Learning Rate: 0.001992 | Progress: 0.04197 +Step 28,200 | Tokens: 924,057,600 | Train Loss EWMA: 3.7540 | Learning Rate: 0.001992 | Progress: 0.04200 +Step 28,225 | Tokens: 924,876,800 | Train Loss EWMA: 3.7669 | Learning Rate: 0.001992 | Progress: 0.04204 +Step 28,250 | Tokens: 925,696,000 | Train Loss EWMA: 3.7641 | Learning Rate: 0.001992 | Progress: 0.04208 +Step 28,275 | Tokens: 926,515,200 | Train Loss EWMA: 3.7461 | Learning Rate: 
0.001992 | Progress: 0.04211 +Step 28,300 | Tokens: 927,334,400 | Train Loss EWMA: 3.7460 | Learning Rate: 0.001992 | Progress: 0.04215 +Step 28,325 | Tokens: 928,153,600 | Train Loss EWMA: 3.7490 | Learning Rate: 0.001992 | Progress: 0.04219 +Step 28,350 | Tokens: 928,972,800 | Train Loss EWMA: 3.7416 | Learning Rate: 0.001992 | Progress: 0.04223 +Step 28,375 | Tokens: 929,792,000 | Train Loss EWMA: 3.7468 | Learning Rate: 0.001992 | Progress: 0.04226 +Step 28,400 | Tokens: 930,611,200 | Train Loss EWMA: 3.7521 | Learning Rate: 0.001992 | Progress: 0.04230 +Step 28,425 | Tokens: 931,430,400 | Train Loss EWMA: 3.7490 | Learning Rate: 0.001992 | Progress: 0.04234 +Step 28,450 | Tokens: 932,249,600 | Train Loss EWMA: 3.7456 | Learning Rate: 0.001992 | Progress: 0.04237 +Step 28,475 | Tokens: 933,068,800 | Train Loss EWMA: 3.7493 | Learning Rate: 0.001992 | Progress: 0.04241 +Step 28,500 | Tokens: 933,888,000 | Train Loss EWMA: 3.7600 | Learning Rate: 0.001992 | Progress: 0.04245 +Step 28,525 | Tokens: 934,707,200 | Train Loss EWMA: 3.7603 | Learning Rate: 0.001992 | Progress: 0.04249 +Step 28,550 | Tokens: 935,526,400 | Train Loss EWMA: 3.7443 | Learning Rate: 0.001992 | Progress: 0.04252 +Step 28,575 | Tokens: 936,345,600 | Train Loss EWMA: 3.7511 | Learning Rate: 0.001992 | Progress: 0.04256 +Step 28,600 | Tokens: 937,164,800 | Train Loss EWMA: 3.7467 | Learning Rate: 0.001992 | Progress: 0.04260 +Step 28,625 | Tokens: 937,984,000 | Train Loss EWMA: 3.7439 | Learning Rate: 0.001992 | Progress: 0.04264 +Step 28,650 | Tokens: 938,803,200 | Train Loss EWMA: 3.7532 | Learning Rate: 0.001992 | Progress: 0.04267 +Step 28,675 | Tokens: 939,622,400 | Train Loss EWMA: 3.7539 | Learning Rate: 0.001992 | Progress: 0.04271 +Step 28,700 | Tokens: 940,441,600 | Train Loss EWMA: 3.7513 | Learning Rate: 0.001992 | Progress: 0.04275 +Step 28,725 | Tokens: 941,260,800 | Train Loss EWMA: 3.7644 | Learning Rate: 0.001992 | Progress: 0.04278 +Step 28,750 | Tokens: 942,080,000 | Train Loss EWMA: 3.7631 | Learning Rate: 0.001992 | Progress: 0.04282 +Step 28,775 | Tokens: 942,899,200 | Train Loss EWMA: 3.7551 | Learning Rate: 0.001991 | Progress: 0.04286 +Step 28,800 | Tokens: 943,718,400 | Train Loss EWMA: 3.7512 | Learning Rate: 0.001991 | Progress: 0.04290 +Step 28,825 | Tokens: 944,537,600 | Train Loss EWMA: 3.7548 | Learning Rate: 0.001991 | Progress: 0.04293 +Step 28,850 | Tokens: 945,356,800 | Train Loss EWMA: 3.7570 | Learning Rate: 0.001991 | Progress: 0.04297 +Step 28,875 | Tokens: 946,176,000 | Train Loss EWMA: 3.7419 | Learning Rate: 0.001991 | Progress: 0.04301 +Step 28,900 | Tokens: 946,995,200 | Train Loss EWMA: 3.7460 | Learning Rate: 0.001991 | Progress: 0.04305 +Step 28,925 | Tokens: 947,814,400 | Train Loss EWMA: 3.7441 | Learning Rate: 0.001991 | Progress: 0.04308 +Step 28,950 | Tokens: 948,633,600 | Train Loss EWMA: 3.7407 | Learning Rate: 0.001991 | Progress: 0.04312 +Step 28,975 | Tokens: 949,452,800 | Train Loss EWMA: 3.7443 | Learning Rate: 0.001991 | Progress: 0.04316 +Step 29,000 | Tokens: 950,272,000 | Train Loss EWMA: 3.7399 | Learning Rate: 0.001991 | Progress: 0.04319 +Step 29,025 | Tokens: 951,091,200 | Train Loss EWMA: 3.7453 | Learning Rate: 0.001991 | Progress: 0.04323 +Step 29,050 | Tokens: 951,910,400 | Train Loss EWMA: 3.7469 | Learning Rate: 0.001991 | Progress: 0.04327
+
+Training interrupted by user
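The training output above is a flat dump of the trainer's progress lines. A minimal sketch for loading it into structured records follows; it assumes Python 3 and one "Step ... | Progress: ..." record per printed entry, with the field names exactly as logged. The helper name parse_training_log is hypothetical, and the EWMA update noted in the comments is the standard form implied by train_loss_moving_average_beta = 0.99 in the config, not code taken from this repo.

import re

LOG_PATTERN = re.compile(
    r"Step ([\d,]+) \| Tokens: ([\d,]+) \| Train Loss EWMA: ([\d.]+)"
    r" \| Learning Rate: ([\d.]+) \| Progress: ([\d.]+)"
)

def parse_training_log(text):
    # One dict per logged entry; thousands separators are stripped.
    records = []
    for m in LOG_PATTERN.finditer(text):
        step = int(m.group(1).replace(",", ""))
        tokens = int(m.group(2).replace(",", ""))
        # Sanity check against the config: tokens_per_step = 32768, so
        # tokens_seen should equal step * 32768 (it does, e.g.
        # 29,050 * 32,768 = 951,910,400 at the final logged step).
        assert tokens == step * 32768
        records.append({
            "step": step,
            "tokens_seen": tokens,
            # Presumably updated as ewma <- 0.99 * ewma + 0.01 * loss,
            # per train_loss_moving_average_beta = 0.99 (an assumption).
            "train_loss_ewma": float(m.group(3)),
            "learning_rate": float(m.group(4)),
            "progress": float(m.group(5)),
        })
    return records

diff --git a/wandb/run-20250817_231334-ztcapltu/files/requirements.txt b/wandb/run-20250817_231334-ztcapltu/files/requirements.txt new file mode 100644 index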
0000000000000000000000000000000000000000..bbed86cc72e05aec5c78850f9963d0e3471caff0 --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/files/requirements.txt @@ -0,0 +1,185 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +multidict==6.6.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +pyzmq==27.0.1 +jsonschema==4.25.0 +asttokens==3.0.0 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +stack-data==0.6.3 +aiosignal==1.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +setuptools==65.5.0 +mpmath==1.3.0 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +huggingface-hub==0.34.4 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +jupyter-events==0.12.0 +protobuf==6.31.1 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +hf-xet==1.1.7 +jedi==0.19.2 +transformer-lens==2.16.1 +pandas==2.3.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +dotenv==0.9.9 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +transformers==4.55.0 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +narwhals==2.0.1 +torchaudio==2.8.0 +prompt_toolkit==3.0.51 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +httpx==0.28.1 +requests==2.32.4 +fonttools==4.59.0 +argon2-cffi==25.1.0 +executing==2.2.0 +arrow==1.3.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +pip==23.2.1 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +json5==0.12.0 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +sentry-sdk==2.34.1 +Send2Trash==1.8.3 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +nvidia-cufft-cu12==11.3.3.83 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +python-json-logger==3.3.0 +filelock==3.18.0 +types-python-dateutil==2.9.0.20250809 +kiwisolver==1.4.8 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pydantic==2.11.7 +ipython==9.4.0 +charset-normalizer==3.4.2 +fancy-einsum==0.0.3 +datasets==4.0.0 +pillow==11.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +plotly==6.2.0 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +idna==3.10 +jupyterlab==4.4.5 +multiprocess==0.70.16 +dill==0.3.8 +fastjsonschema==2.21.1 
+transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +uri-template==1.3.0 +sentencepiece==0.2.0 +markdown-it-py==3.0.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +traitlets==5.14.3 +jupyter_server==2.16.0 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20250817_231334-ztcapltu/files/wandb-metadata.json b/wandb/run-20250817_231334-ztcapltu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d8db966efc583ca2da62920393d3cb957da33e7d --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-08-17T23:13:34.541556Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "d64789fa6192a1a6beb031f1d38c9cfcfa725511" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/models/gelu_2l_v4", + "host": "negsg2jtgf", + "executable": "/notebooks/clean_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "119256522752" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-7cbd3200-160d-1175-56c6-469e5662f695" + } + ], + "cudaVersion": "12.4", + "writerId": "62x2pn3fvnbphvb760dj9u9xjsfoa3n2" +} \ No newline at end of file diff --git a/wandb/run-20250817_231334-ztcapltu/files/wandb-summary.json b/wandb/run-20250817_231334-ztcapltu/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d42a2ef62a22b6d54eadcd468420a1d7c8b45468 --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/files/wandb-summary.json @@ -0,0 +1 @@ +{"progress":0.043268654545454546,"tokens_seen":951910400,"tokens_per_second":32768,"_runtime":19461.869757642,"_wandb":{"runtime":19461},"train_loss_ewma":3.7469153827782615,"learning_rate":0.001991322901611416,"train_loss":3.805884838104248,"_timestamp":1.7554918606117625e+09,"_step":29050,"step":29050} \ No newline at end of file diff --git a/wandb/run-20250817_231334-ztcapltu/logs/debug-internal.log b/wandb/run-20250817_231334-ztcapltu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9e609479874574a6a45dc17f5fd98cc47b701851 --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-08-17T23:13:35.027401827Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-17T23:13:35.390074378Z","level":"INFO","msg":"stream: created new stream","id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390133919Z","level":"INFO","msg":"stream: started","id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390167795Z","level":"INFO","msg":"writer: started","stream_id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390220023Z","level":"INFO","msg":"handler: started","stream_id":"ztcapltu"} +{"time":"2025-08-17T23:13:35.390286334Z","level":"INFO","msg":"sender: started","stream_id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.836761784Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-18T04:37:57.992197787Z","level":"INFO","msg":"handler: operation stats","stats":{}} 
+{"time":"2025-08-18T04:37:57.99521591Z","level":"INFO","msg":"stream: closing","id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.995240541Z","level":"INFO","msg":"handler: closed","stream_id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.995293994Z","level":"INFO","msg":"sender: closed","stream_id":"ztcapltu"} +{"time":"2025-08-18T04:37:57.995303271Z","level":"INFO","msg":"stream: closed","id":"ztcapltu"} diff --git a/wandb/run-20250817_231334-ztcapltu/logs/debug.log b/wandb/run-20250817_231334-ztcapltu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..71497b7e16be463200d033d42be9d95b9ef4377b --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/logs/debug.log @@ -0,0 +1,28 @@ +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Configure stats pid to 155 +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v4/wandb/settings +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v4/wandb/run-20250817_231334-ztcapltu/logs/debug.log +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v4/wandb/run-20250817_231334-ztcapltu/logs/debug-internal.log +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:init():830] calling init triggers +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_v4', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'NeelNanda/c4-code-tokenized-2b', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 22000000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'cosine_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.06, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 671386, '_wandb': {}} +2025-08-17 23:13:34,546 INFO MainThread:155 [wandb_init.py:init():871] starting backend +2025-08-17 23:13:35,012 INFO MainThread:155 [wandb_init.py:init():874] sending inform_init request +2025-08-17 23:13:35,025 INFO MainThread:155 [wandb_init.py:init():882] backend started and connected +2025-08-17 23:13:35,026 INFO MainThread:155 [wandb_init.py:init():953] updated telemetry +2025-08-17 23:13:35,032 INFO MainThread:155 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-17 23:13:35,613 INFO MainThread:155 [wandb_init.py:init():1029] starting run threads in backend +2025-08-17 23:13:36,205 INFO MainThread:155 [wandb_run.py:_console_start():2494] atexit 
reg +2025-08-17 23:13:36,205 INFO MainThread:155 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-17 23:13:36,205 INFO MainThread:155 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-17 23:13:36,206 INFO MainThread:155 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-17 23:13:36,219 INFO MainThread:155 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-18 04:37:57,417 INFO MainThread:155 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/ztcapltu +2025-08-18 04:37:57,422 INFO MainThread:155 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-08-18 04:37:57,422 INFO MainThread:155 [wandb_run.py:_restore():2441] restore +2025-08-18 04:37:57,422 INFO MainThread:155 [wandb_run.py:_restore():2447] restore done +2025-08-18 04:37:57,993 INFO MainThread:155 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-08-18 04:37:57,994 INFO MainThread:155 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-08-18 04:37:57,994 INFO MainThread:155 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20250817_231334-ztcapltu/run-ztcapltu.wandb b/wandb/run-20250817_231334-ztcapltu/run-ztcapltu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9376aca4b83904d09a33c26e98cae564c5941f5f --- /dev/null +++ b/wandb/run-20250817_231334-ztcapltu/run-ztcapltu.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655ac7d5bf6cb95d1e2dc3919583a8ab29ef622665400a2de2f429599e49c191 +size 16039398
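
The run stops at step 29,050, and wandb-summary.json records learning_rate 0.001991322901611416 and progress 0.043268654545454546. Both values are reproducible from the config alone. The sketch below is an inferred reconstruction of the "cosine_warmup" schedule named in the config; the trainer source itself is not part of this diff, so treat the exact functional form as an assumption that happens to match the logged numbers.

import math

# From the run config / checkpoint metadata: lr_hidden = 0.002,
# warmup_steps = 915 (30,000,000 warmup tokens / 32,768 tokens per step),
# max_steps = 671386 (22,000,000,000 max tokens / 32,768 tokens per step).
def cosine_warmup_lr(step, lr_max=0.002, warmup_steps=915, max_steps=671386):
    # Assumed form: linear warmup to lr_max, then cosine decay to zero
    # at max_steps.
    if step < warmup_steps:
        return lr_max * step / warmup_steps
    frac = (step - warmup_steps) / (max_steps - warmup_steps)
    return lr_max * 0.5 * (1.0 + math.cos(math.pi * frac))

print(cosine_warmup_lr(29050))   # ~0.0019913229, matching the summary's
                                 # "learning_rate": 0.001991322901611416
print(951910400 / 22000000000)   # 0.04326865454..., matching "progress"
                                 # (tokens_seen / max_tokens)

The agreement to roughly nine significant figures makes the cosine-to-zero form a safe bet for this run, but it remains an inference from the logged values rather than a copy of the trainer's actual schedule code.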