diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..a237ac0fae30b8161be31da5d46018fc81377239 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20250819_063627-4rs47wj5/run-4rs47wj5.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/.ipynb_checkpoints/config-checkpoint.toml b/.ipynb_checkpoints/config-checkpoint.toml new file mode 100644 index 0000000000000000000000000000000000000000..165cc88299d1c0dbf158b9a6be3d43f70773519c --- /dev/null +++ b/.ipynb_checkpoints/config-checkpoint.toml @@ -0,0 +1,32 @@ +model_name = "gelu_2l_v5_random_above_15000" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 48262 +dataset_name = "eoinf/unprocessed-c4-code-test" +tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.08 +save_log_checkpoints = true \ No newline at end of file diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..88c3c4540c7b82532a6284835eeea5423d09b5d5 --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.868239402770996} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..8afd64c07028973d77b96708fc94748b9708fc34 --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.86287446604396} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..6641e51460339b44ff786c280b2eee610d4fe48b --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.860849172158668} \ No newline at end of file diff --git a/checkpoints/metadata_000000393216.json b/checkpoints/metadata_000000393216.json new file mode 100644 index 0000000000000000000000000000000000000000..b75048276184e01a85d30e44f4357a843b028fab --- /dev/null +++ b/checkpoints/metadata_000000393216.json @@ -0,0 +1 @@ +{"step": 12, "tokens_seen": 393216, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.858176158647531} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..51bf5269d0772c2382ee5b05b02d124e68bc895d --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.855003761761374} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..91bec8bcb901b16f9adffc096d5875f0852b2c29 --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.851786966560143} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..e7b0b95d933c606a45ce9cd01c4a3fb21ae279f2 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.847815453095714} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..e56e3f944c38c450ae3f94bcfc406131bc408fe0 --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.838863768585243} \ No newline at end of file diff --git a/checkpoints/metadata_000000589824.json b/checkpoints/metadata_000000589824.json new file mode 100644 index 0000000000000000000000000000000000000000..c1023c08d8ed6b7893a7b35256b76ea24140a4dd --- /dev/null +++ b/checkpoints/metadata_000000589824.json @@ -0,0 +1 @@ +{"step": 18, "tokens_seen": 589824, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.834144211633644} \ No newline at end of file diff --git a/checkpoints/metadata_000000655360.json b/checkpoints/metadata_000000655360.json new file mode 100644 index 0000000000000000000000000000000000000000..603d6b9624aa0e5fefa2415b4433f74311983e8c --- /dev/null +++ b/checkpoints/metadata_000000655360.json @@ -0,0 +1 @@ +{"step": 20, "tokens_seen": 655360, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.822769889832388} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..07b77a8e1e4eafc406122a2a77c3c432c2baf19e --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.81652450790489} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..18352ed8099c991762eaa605aeb6a525f2b7d7a7 --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.803400143292523} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..e6f48f8ce8056321606349afcc61709ba97fcada --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.789313247099784} \ No newline at end of file diff --git a/checkpoints/metadata_000000884736.json b/checkpoints/metadata_000000884736.json new file mode 100644 index 0000000000000000000000000000000000000000..4f8d596f767f7588d62b7bc7ecb38007fde9dcb7 --- /dev/null +++ b/checkpoints/metadata_000000884736.json @@ -0,0 +1 @@ +{"step": 27, "tokens_seen": 884736, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.774260997711016} \ No newline at end of file diff --git a/checkpoints/metadata_000000950272.json b/checkpoints/metadata_000000950272.json new file mode 100644 index 0000000000000000000000000000000000000000..a190a2d07ca5135a590770ec623f1ebab5fd95e5 --- /dev/null +++ b/checkpoints/metadata_000000950272.json @@ -0,0 +1 @@ +{"step": 29, "tokens_seen": 950272, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.758691686808685} \ No newline at end of file diff --git a/checkpoints/metadata_000001015808.json b/checkpoints/metadata_000001015808.json new file mode 100644 index 0000000000000000000000000000000000000000..9800e061859f6f3c2e9c1bd4dc90ea8515111c8b --- /dev/null +++ b/checkpoints/metadata_000001015808.json @@ -0,0 +1 @@ +{"step": 31, "tokens_seen": 1015808, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.742747327988415} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..450f4f5a0886f787e7cc04d37a54213da83d4705 --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.717705506825098} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..85f07019157c97d86e1023debb81edcf11ecf4e0 --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.690564109185202} \ No newline at end of file diff --git a/checkpoints/metadata_000001310720.json b/checkpoints/metadata_000001310720.json new file mode 100644 index 0000000000000000000000000000000000000000..8eba25616b8e0f9aa35325d2fafd7c7ce3c89eb2 --- /dev/null +++ b/checkpoints/metadata_000001310720.json @@ -0,0 +1 @@ +{"step": 40, "tokens_seen": 1310720, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.66166241962609} \ No newline at end of file diff --git a/checkpoints/metadata_000001409024.json b/checkpoints/metadata_000001409024.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9dad3e5b5725ceb23d06fa181d99e9c95967cf --- /dev/null +++ b/checkpoints/metadata_000001409024.json @@ -0,0 +1 @@ +{"step": 43, "tokens_seen": 1409024, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.631708621668675} \ No newline at end of file diff --git a/checkpoints/metadata_000001507328.json b/checkpoints/metadata_000001507328.json new file mode 100644 index 0000000000000000000000000000000000000000..5e114567d5db09eb33ccb14430f1d88d33b0d860 --- /dev/null +++ b/checkpoints/metadata_000001507328.json @@ -0,0 +1 @@ +{"step": 46, "tokens_seen": 1507328, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.599057965991031} \ No newline at end of file diff --git a/checkpoints/metadata_000001638400.json b/checkpoints/metadata_000001638400.json new file mode 100644 index 0000000000000000000000000000000000000000..b1f05fc8925e7477e0105c3fc10f01e73a4a317b --- /dev/null +++ b/checkpoints/metadata_000001638400.json @@ -0,0 +1 @@ +{"step": 50, "tokens_seen": 1638400, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.553797967122502} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..ad91c44bc4748e6cfbc22158209c4dd233e28bd1 --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.50501859749949} \ No newline at end of file diff --git a/checkpoints/metadata_000001933312.json b/checkpoints/metadata_000001933312.json new file mode 100644 index 0000000000000000000000000000000000000000..d6545da32e6bcc2aa9d719b4dca20a414732ef0a --- /dev/null +++ b/checkpoints/metadata_000001933312.json @@ -0,0 +1 @@ +{"step": 59, "tokens_seen": 1933312, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.438414175730971} \ No newline at end of file diff --git a/checkpoints/metadata_000002064384.json b/checkpoints/metadata_000002064384.json new file mode 100644 index 0000000000000000000000000000000000000000..f54cff06c454d0f55bcf398eec7ebdb824a4503d --- /dev/null +++ b/checkpoints/metadata_000002064384.json @@ -0,0 +1 @@ +{"step": 63, "tokens_seen": 2064384, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.38184663052806} \ No newline at end of file diff --git a/checkpoints/metadata_000002228224.json b/checkpoints/metadata_000002228224.json new file mode 100644 index 0000000000000000000000000000000000000000..478114544bb6c5a683523f702e060e8b23b382fc --- /dev/null +++ b/checkpoints/metadata_000002228224.json @@ -0,0 +1 @@ +{"step": 68, "tokens_seen": 2228224, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.305938791827995} \ No newline at end of file diff --git a/checkpoints/metadata_000002424832.json b/checkpoints/metadata_000002424832.json new file mode 100644 index 0000000000000000000000000000000000000000..4d2a905d1a59acb26fc6ce378635124e073280cd --- /dev/null +++ b/checkpoints/metadata_000002424832.json @@ -0,0 +1 @@ +{"step": 74, "tokens_seen": 2424832, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.207305079593207} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..aac1622e7974ee989ee66b169015fbca9cd3247f --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.100438468498478} \ No newline at end of file diff --git a/checkpoints/metadata_000002818048.json b/checkpoints/metadata_000002818048.json new file mode 100644 index 0000000000000000000000000000000000000000..ae1ae58e54a4b7e3cfa744f2737f590b1b4478af --- /dev/null +++ b/checkpoints/metadata_000002818048.json @@ -0,0 +1 @@ +{"step": 86, "tokens_seen": 2818048, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.989904257278067} \ No newline at end of file diff --git a/checkpoints/metadata_000003047424.json b/checkpoints/metadata_000003047424.json new file mode 100644 index 0000000000000000000000000000000000000000..939fa3cf299739081001157496b68975eb1a6820 --- /dev/null +++ b/checkpoints/metadata_000003047424.json @@ -0,0 +1 @@ +{"step": 93, "tokens_seen": 3047424, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.850987174384615} \ No newline at end of file diff --git a/checkpoints/metadata_000003309568.json b/checkpoints/metadata_000003309568.json new file mode 100644 index 0000000000000000000000000000000000000000..8a14b5a744c18fe602c6ccf062bd72e495933750 --- /dev/null +++ b/checkpoints/metadata_000003309568.json @@ -0,0 +1 @@ +{"step": 101, "tokens_seen": 3309568, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.687185549744209} \ No newline at end of file diff --git a/checkpoints/metadata_000003571712.json b/checkpoints/metadata_000003571712.json new file mode 100644 index 0000000000000000000000000000000000000000..5a4ade98a0e52a2dabcea505c9a4c080d133228b --- /dev/null +++ b/checkpoints/metadata_000003571712.json @@ -0,0 +1 @@ +{"step": 109, "tokens_seen": 3571712, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.52364022057291} \ No newline at end of file diff --git a/checkpoints/metadata_000003866624.json b/checkpoints/metadata_000003866624.json new file mode 100644 index 0000000000000000000000000000000000000000..3ec82e55453fa8cfaebd1423b3d465897ded2b6e --- /dev/null +++ b/checkpoints/metadata_000003866624.json @@ -0,0 +1 @@ +{"step": 118, "tokens_seen": 3866624, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.340571823774262} \ No newline at end of file diff --git a/checkpoints/metadata_000004161536.json b/checkpoints/metadata_000004161536.json new file mode 100644 index 0000000000000000000000000000000000000000..f7c3a021528d0cfc0e1be1d93e9d0c8e64f23b6a --- /dev/null +++ b/checkpoints/metadata_000004161536.json @@ -0,0 +1 @@ +{"step": 127, "tokens_seen": 4161536, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.161249963618868} \ No newline at end of file diff --git a/checkpoints/metadata_000004489216.json b/checkpoints/metadata_000004489216.json new file mode 100644 index 0000000000000000000000000000000000000000..9c13d2ae3b878fe527e68b4be64bb7d6858c766c --- /dev/null +++ b/checkpoints/metadata_000004489216.json @@ -0,0 +1 @@ +{"step": 137, "tokens_seen": 4489216, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.973033804834454} \ No newline at end of file diff --git a/checkpoints/metadata_000004849664.json b/checkpoints/metadata_000004849664.json new file mode 100644 index 0000000000000000000000000000000000000000..08d81e07640089f5242050d158e8b62657e808e7 --- /dev/null +++ b/checkpoints/metadata_000004849664.json @@ -0,0 +1 @@ +{"step": 148, "tokens_seen": 4849664, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.771731265844688} \ No newline at end of file diff --git a/checkpoints/metadata_000005242880.json b/checkpoints/metadata_000005242880.json new file mode 100644 index 0000000000000000000000000000000000000000..f830e821f8afe3700e243ba7dff9b814a6e379dc --- /dev/null +++ b/checkpoints/metadata_000005242880.json @@ -0,0 +1 @@ +{"step": 160, "tokens_seen": 5242880, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.564690051795699} \ No newline at end of file diff --git a/checkpoints/metadata_000005668864.json b/checkpoints/metadata_000005668864.json new file mode 100644 index 0000000000000000000000000000000000000000..d4edf99fd705bab48ac8df851acd1370c252dea3 --- /dev/null +++ b/checkpoints/metadata_000005668864.json @@ -0,0 +1 @@ +{"step": 173, "tokens_seen": 5668864, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.348267745123467} \ No newline at end of file diff --git a/checkpoints/metadata_000006127616.json b/checkpoints/metadata_000006127616.json new file mode 100644 index 0000000000000000000000000000000000000000..54ec1269792beeeb1e528ee3207e9a39d3e0f5dc --- /dev/null +++ b/checkpoints/metadata_000006127616.json @@ -0,0 +1 @@ +{"step": 187, "tokens_seen": 6127616, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.140781545419552} \ No newline at end of file diff --git a/checkpoints/metadata_000006619136.json b/checkpoints/metadata_000006619136.json new file mode 100644 index 0000000000000000000000000000000000000000..3ecacd7c0db4efa4069a718290d8892252d4235e --- /dev/null +++ b/checkpoints/metadata_000006619136.json @@ -0,0 +1 @@ +{"step": 202, "tokens_seen": 6619136, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.9317240622439735} \ No newline at end of file diff --git a/checkpoints/metadata_000007143424.json b/checkpoints/metadata_000007143424.json new file mode 100644 index 0000000000000000000000000000000000000000..758769bc007b607a9af7990fe2545199305980f1 --- /dev/null +++ b/checkpoints/metadata_000007143424.json @@ -0,0 +1 @@ +{"step": 218, "tokens_seen": 7143424, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.7321983945549615} \ No newline at end of file diff --git a/checkpoints/metadata_000007733248.json b/checkpoints/metadata_000007733248.json new file mode 100644 index 0000000000000000000000000000000000000000..930d22e5f2cd7c7af0c25473804a787094716c74 --- /dev/null +++ b/checkpoints/metadata_000007733248.json @@ -0,0 +1 @@ +{"step": 236, "tokens_seen": 7733248, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.530479688649124} \ No newline at end of file diff --git a/checkpoints/metadata_000008323072.json b/checkpoints/metadata_000008323072.json new file mode 100644 index 0000000000000000000000000000000000000000..f8a78878697dd9550d3e294ea5b1b784a1e29fcc --- /dev/null +++ b/checkpoints/metadata_000008323072.json @@ -0,0 +1 @@ +{"step": 254, "tokens_seen": 8323072, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.352459819658762} \ No newline at end of file diff --git a/checkpoints/metadata_000009011200.json b/checkpoints/metadata_000009011200.json new file mode 100644 index 0000000000000000000000000000000000000000..c4be19ed8f16539a6ca7253d3eb747935f5c28b5 --- /dev/null +++ b/checkpoints/metadata_000009011200.json @@ -0,0 +1 @@ +{"step": 275, "tokens_seen": 9011200, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.16380355688412} \ No newline at end of file diff --git a/checkpoints/metadata_000009732096.json b/checkpoints/metadata_000009732096.json new file mode 100644 index 0000000000000000000000000000000000000000..b00952f758b79fa031e85ad6c7d2328124939f84 --- /dev/null +++ b/checkpoints/metadata_000009732096.json @@ -0,0 +1 @@ +{"step": 297, "tokens_seen": 9732096, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.001504471403502} \ No newline at end of file diff --git a/checkpoints/metadata_000010518528.json b/checkpoints/metadata_000010518528.json new file mode 100644 index 0000000000000000000000000000000000000000..629792e50ed15c10d67073ba51d0a5409c967ec9 --- /dev/null +++ b/checkpoints/metadata_000010518528.json @@ -0,0 +1 @@ +{"step": 321, "tokens_seen": 10518528, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.847585309257486} \ No newline at end of file diff --git a/checkpoints/metadata_000011337728.json b/checkpoints/metadata_000011337728.json new file mode 100644 index 0000000000000000000000000000000000000000..948292440cc8ad6d6d58b652a93b14571de50008 --- /dev/null +++ b/checkpoints/metadata_000011337728.json @@ -0,0 +1 @@ +{"step": 346, "tokens_seen": 11337728, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.714681949977808} \ No newline at end of file diff --git a/checkpoints/metadata_000012255232.json b/checkpoints/metadata_000012255232.json new file mode 100644 index 0000000000000000000000000000000000000000..8e2095b19a6d4c952508829b6aeb39b8c63bf8cb --- /dev/null +++ b/checkpoints/metadata_000012255232.json @@ -0,0 +1 @@ +{"step": 374, "tokens_seen": 12255232, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.579506133846703} \ No newline at end of file diff --git a/checkpoints/metadata_000013238272.json b/checkpoints/metadata_000013238272.json new file mode 100644 index 0000000000000000000000000000000000000000..8a9ca3cf5cea2057828c24a6198b1979735b6122 --- /dev/null +++ b/checkpoints/metadata_000013238272.json @@ -0,0 +1 @@ +{"step": 404, "tokens_seen": 13238272, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.464905847149448} \ No newline at end of file diff --git a/checkpoints/metadata_000014286848.json b/checkpoints/metadata_000014286848.json new file mode 100644 index 0000000000000000000000000000000000000000..6db9a9fdf84ee673e6817be7a309eb25d413ebac --- /dev/null +++ b/checkpoints/metadata_000014286848.json @@ -0,0 +1 @@ +{"step": 436, "tokens_seen": 14286848, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.360627677815071} \ No newline at end of file diff --git a/checkpoints/metadata_000015433728.json b/checkpoints/metadata_000015433728.json new file mode 100644 index 0000000000000000000000000000000000000000..11699851a55a7930f19e663f45d34ba9d6113f5d --- /dev/null +++ b/checkpoints/metadata_000015433728.json @@ -0,0 +1 @@ +{"step": 471, "tokens_seen": 15433728, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.26785436502008} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..5e9b4ed0efc4ad5b4009b70fa42a03c833e3bddc --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.203028927173892} \ No newline at end of file diff --git a/checkpoints/metadata_000016678912.json b/checkpoints/metadata_000016678912.json new file mode 100644 index 0000000000000000000000000000000000000000..f753d406e60b024cebabf8cbecf65707693ca55b --- /dev/null +++ b/checkpoints/metadata_000016678912.json @@ -0,0 +1 @@ +{"step": 509, "tokens_seen": 16678912, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.183952591647396} \ No newline at end of file diff --git a/checkpoints/metadata_000018022400.json b/checkpoints/metadata_000018022400.json new file mode 100644 index 0000000000000000000000000000000000000000..193ca0e3e09d7558d60cc16b062f87184362e2c8 --- /dev/null +++ b/checkpoints/metadata_000018022400.json @@ -0,0 +1 @@ +{"step": 550, "tokens_seen": 18022400, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.1159256665536} \ No newline at end of file diff --git a/checkpoints/metadata_000019464192.json b/checkpoints/metadata_000019464192.json new file mode 100644 index 0000000000000000000000000000000000000000..7f729b7794ff8d1d064af8c359b2bcd3c724d02d --- /dev/null +++ b/checkpoints/metadata_000019464192.json @@ -0,0 +1 @@ +{"step": 594, "tokens_seen": 19464192, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.044892915777454} \ No newline at end of file diff --git a/checkpoints/metadata_000021037056.json b/checkpoints/metadata_000021037056.json new file mode 100644 index 0000000000000000000000000000000000000000..efba1fa32952f61653bdf3b501bd98b9270543d0 --- /dev/null +++ b/checkpoints/metadata_000021037056.json @@ -0,0 +1 @@ +{"step": 642, "tokens_seen": 21037056, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.968713502281799} \ No newline at end of file diff --git a/checkpoints/metadata_000022708224.json b/checkpoints/metadata_000022708224.json new file mode 100644 index 0000000000000000000000000000000000000000..599ee7c7fa3e4c2f1b92906cd6338b735db7521f --- /dev/null +++ b/checkpoints/metadata_000022708224.json @@ -0,0 +1 @@ +{"step": 693, "tokens_seen": 22708224, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.905551129962754} \ No newline at end of file diff --git a/checkpoints/metadata_000024510464.json b/checkpoints/metadata_000024510464.json new file mode 100644 index 0000000000000000000000000000000000000000..0b37b3f7335fba43ada68d085c1d052f1dadee32 --- /dev/null +++ b/checkpoints/metadata_000024510464.json @@ -0,0 +1 @@ +{"step": 748, "tokens_seen": 24510464, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.841984989776493} \ No newline at end of file diff --git a/checkpoints/metadata_000026476544.json b/checkpoints/metadata_000026476544.json new file mode 100644 index 0000000000000000000000000000000000000000..1dbcc9098b4c24464dd5a719d107f6e9f084a1e8 --- /dev/null +++ b/checkpoints/metadata_000026476544.json @@ -0,0 +1 @@ +{"step": 808, "tokens_seen": 26476544, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.788286342516858} \ No newline at end of file diff --git a/checkpoints/metadata_000028606464.json b/checkpoints/metadata_000028606464.json new file mode 100644 index 0000000000000000000000000000000000000000..8ed924b7802ee103b6e724fde035eb596d352bc9 --- /dev/null +++ b/checkpoints/metadata_000028606464.json @@ -0,0 +1 @@ +{"step": 873, "tokens_seen": 28606464, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.721052623936837} \ No newline at end of file diff --git a/checkpoints/metadata_000030900224.json b/checkpoints/metadata_000030900224.json new file mode 100644 index 0000000000000000000000000000000000000000..3d9759310baa6f92c90f470f9f561542a24eabe6 --- /dev/null +++ b/checkpoints/metadata_000030900224.json @@ -0,0 +1 @@ +{"step": 943, "tokens_seen": 30900224, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.6547326306976515} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..d9fd45353d25628ce444b27705994084d9623070 --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.577602666400762} \ No newline at end of file diff --git a/checkpoints/metadata_000033357824.json b/checkpoints/metadata_000033357824.json new file mode 100644 index 0000000000000000000000000000000000000000..ade5c80eb7f649633121b14d628982d0be2d0711 --- /dev/null +++ b/checkpoints/metadata_000033357824.json @@ -0,0 +1 @@ +{"step": 1018, "tokens_seen": 33357824, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.557618199812324} \ No newline at end of file diff --git a/checkpoints/metadata_000036044800.json b/checkpoints/metadata_000036044800.json new file mode 100644 index 0000000000000000000000000000000000000000..cc9535f5b1c1d7a4f1ad421a887447cae5ebe711 --- /dev/null +++ b/checkpoints/metadata_000036044800.json @@ -0,0 +1 @@ +{"step": 1100, "tokens_seen": 36044800, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.466977358698373} \ No newline at end of file diff --git a/checkpoints/metadata_000038928384.json b/checkpoints/metadata_000038928384.json new file mode 100644 index 0000000000000000000000000000000000000000..aa20b8f45aeba2a4d55bf53e435c211c29d9b186 --- /dev/null +++ b/checkpoints/metadata_000038928384.json @@ -0,0 +1 @@ +{"step": 1188, "tokens_seen": 38928384, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.372961213113283} \ No newline at end of file diff --git a/checkpoints/metadata_000042041344.json b/checkpoints/metadata_000042041344.json new file mode 100644 index 0000000000000000000000000000000000000000..e9543486cc6a17ea7d2591ea976658004a242a7a --- /dev/null +++ b/checkpoints/metadata_000042041344.json @@ -0,0 +1 @@ +{"step": 1283, "tokens_seen": 42041344, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.292891597494571} \ No newline at end of file diff --git a/checkpoints/metadata_000045416448.json b/checkpoints/metadata_000045416448.json new file mode 100644 index 0000000000000000000000000000000000000000..1dc169a5017dfe6147dce74e12faee7f361464b8 --- /dev/null +++ b/checkpoints/metadata_000045416448.json @@ -0,0 +1 @@ +{"step": 1386, "tokens_seen": 45416448, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.213082596928136} \ No newline at end of file diff --git a/checkpoints/metadata_000049053696.json b/checkpoints/metadata_000049053696.json new file mode 100644 index 0000000000000000000000000000000000000000..47100344bce6c611ff261908354142838f5f4506 --- /dev/null +++ b/checkpoints/metadata_000049053696.json @@ -0,0 +1 @@ +{"step": 1497, "tokens_seen": 49053696, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.135476106417908} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..b3644a42de6b23f6402584d252d1cf4e703cf6df --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.133520773974897} \ No newline at end of file diff --git a/checkpoints/metadata_000052953088.json b/checkpoints/metadata_000052953088.json new file mode 100644 index 0000000000000000000000000000000000000000..41ca0f95c19fe1805da5347a2d749c3cb4bfd121 --- /dev/null +++ b/checkpoints/metadata_000052953088.json @@ -0,0 +1 @@ +{"step": 1616, "tokens_seen": 52953088, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.0768725365776595} \ No newline at end of file diff --git a/checkpoints/metadata_000057212928.json b/checkpoints/metadata_000057212928.json new file mode 100644 index 0000000000000000000000000000000000000000..5490ece38285a61c88c7b763017b578c0de3213a --- /dev/null +++ b/checkpoints/metadata_000057212928.json @@ -0,0 +1 @@ +{"step": 1746, "tokens_seen": 57212928, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.010452169604542} \ No newline at end of file diff --git a/checkpoints/metadata_000061767680.json b/checkpoints/metadata_000061767680.json new file mode 100644 index 0000000000000000000000000000000000000000..03c71051c24648ac99c5bc9eff66c8d55d8d6f4a --- /dev/null +++ b/checkpoints/metadata_000061767680.json @@ -0,0 +1 @@ +{"step": 1885, "tokens_seen": 61767680, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.931915458912789} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..b4e994ad55934cc854fb4eac846069bec37cd1ee --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.861460926928725} \ No newline at end of file diff --git a/checkpoints/metadata_000066715648.json b/checkpoints/metadata_000066715648.json new file mode 100644 index 0000000000000000000000000000000000000000..94ac30c0e63ba4ce3ebc5cbcc9d60f959e1b1a69 --- /dev/null +++ b/checkpoints/metadata_000066715648.json @@ -0,0 +1 @@ +{"step": 2036, "tokens_seen": 66715648, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.839675179619374} \ No newline at end of file diff --git a/checkpoints/metadata_000072056832.json b/checkpoints/metadata_000072056832.json new file mode 100644 index 0000000000000000000000000000000000000000..7896ac17f47ef3f1f58291f3d1433659d66f9c7b --- /dev/null +++ b/checkpoints/metadata_000072056832.json @@ -0,0 +1 @@ +{"step": 2199, "tokens_seen": 72056832, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.726386224784686} \ No newline at end of file diff --git a/checkpoints/metadata_000077824000.json b/checkpoints/metadata_000077824000.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ad0a7a545ecac5c894bdc249257bf545dbe874 --- /dev/null +++ b/checkpoints/metadata_000077824000.json @@ -0,0 +1 @@ +{"step": 2375, "tokens_seen": 77824000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.636259174386781} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..55cd5a42e25848c5fa0cb2ef039da5e6a962ec06 --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.570245994086186} \ No newline at end of file diff --git a/checkpoints/metadata_000084049920.json b/checkpoints/metadata_000084049920.json new file mode 100644 index 0000000000000000000000000000000000000000..392b18397cb7cb9026e83975d941afd9a8cc06ea --- /dev/null +++ b/checkpoints/metadata_000084049920.json @@ -0,0 +1 @@ +{"step": 2565, "tokens_seen": 84049920, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.544253958978482} \ No newline at end of file diff --git a/checkpoints/metadata_000090800128.json b/checkpoints/metadata_000090800128.json new file mode 100644 index 0000000000000000000000000000000000000000..d36af580650f9ff3f9fed8bd9d39c7b5980db726 --- /dev/null +++ b/checkpoints/metadata_000090800128.json @@ -0,0 +1 @@ +{"step": 2771, "tokens_seen": 90800128, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.4577379594060185} \ No newline at end of file diff --git a/checkpoints/metadata_000098041856.json b/checkpoints/metadata_000098041856.json new file mode 100644 index 0000000000000000000000000000000000000000..15786de74ac807c197df7e57568fa22d0478ca3d --- /dev/null +++ b/checkpoints/metadata_000098041856.json @@ -0,0 +1 @@ +{"step": 2992, "tokens_seen": 98041856, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.4013038728484615} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..3e81699a3079f9493cc4e8dc6fbc35e3a35712cd --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.39735605515085} \ No newline at end of file diff --git a/checkpoints/metadata_000105906176.json b/checkpoints/metadata_000105906176.json new file mode 100644 index 0000000000000000000000000000000000000000..27285c19d4617f33b07350d00d4df947f4488fdf --- /dev/null +++ b/checkpoints/metadata_000105906176.json @@ -0,0 +1 @@ +{"step": 3232, "tokens_seen": 105906176, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.33819010247988} \ No newline at end of file diff --git a/checkpoints/metadata_000114360320.json b/checkpoints/metadata_000114360320.json new file mode 100644 index 0000000000000000000000000000000000000000..feba9c552ed95c309f9d3a60bd3c9d6c951d55f0 --- /dev/null +++ b/checkpoints/metadata_000114360320.json @@ -0,0 +1 @@ +{"step": 3490, "tokens_seen": 114360320, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.279858281483689} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..03361bdde497dec9d7650fb7d2091ab418ae6257 --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.2696151592467775} \ No newline at end of file diff --git a/checkpoints/metadata_000123535360.json b/checkpoints/metadata_000123535360.json new file mode 100644 index 0000000000000000000000000000000000000000..d4399dc1f456a85037c5c047b224cc1dfd48e5cf --- /dev/null +++ b/checkpoints/metadata_000123535360.json @@ -0,0 +1 @@ +{"step": 3770, "tokens_seen": 123535360, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.235345477739887} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..9fd24ca2adf4222caa96c556f41d56a40c99b99b --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.2206030820408795} \ No newline at end of file diff --git a/checkpoints/metadata_000133398528.json b/checkpoints/metadata_000133398528.json new file mode 100644 index 0000000000000000000000000000000000000000..27048cbca6c62f9592c4a282477b6ee1d28884a7 --- /dev/null +++ b/checkpoints/metadata_000133398528.json @@ -0,0 +1 @@ +{"step": 4071, "tokens_seen": 133398528, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.219651159621151} \ No newline at end of file diff --git a/checkpoints/metadata_000144080896.json b/checkpoints/metadata_000144080896.json new file mode 100644 index 0000000000000000000000000000000000000000..826df79e5474e5b6035db7ebd256e39d1671f062 --- /dev/null +++ b/checkpoints/metadata_000144080896.json @@ -0,0 +1 @@ +{"step": 4397, "tokens_seen": 144080896, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.162633973389495} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json new file mode 100644 index 0000000000000000000000000000000000000000..82680713d008b0725b6a5361f99e7de9c3363ddb --- /dev/null +++ b/checkpoints/metadata_000147456000.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.1375600051626975} \ No newline at end of file diff --git a/checkpoints/metadata_000155615232.json b/checkpoints/metadata_000155615232.json new file mode 100644 index 0000000000000000000000000000000000000000..b143990a62a1839ab1da43ed7e4bfd42e6090eb9 --- /dev/null +++ b/checkpoints/metadata_000155615232.json @@ -0,0 +1 @@ +{"step": 4749, "tokens_seen": 155615232, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.1401389388008125} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json new file mode 100644 index 0000000000000000000000000000000000000000..ff30d91639795433338dafa564a2bc21b3f370df --- /dev/null +++ b/checkpoints/metadata_000163840000.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.134116985791741} \ No newline at end of file diff --git a/checkpoints/metadata_000168067072.json b/checkpoints/metadata_000168067072.json new file mode 100644 index 0000000000000000000000000000000000000000..30f52de3fd52113500a81bad6b2042cf8fab2eea --- /dev/null +++ b/checkpoints/metadata_000168067072.json @@ -0,0 +1 @@ +{"step": 5129, "tokens_seen": 168067072, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.121989516558103} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json new file mode 100644 index 0000000000000000000000000000000000000000..21741597cb08fd6970699b23b751e1be0c11cc9b --- /dev/null +++ b/checkpoints/metadata_000180224000.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.089761262097163} \ No newline at end of file diff --git a/checkpoints/metadata_000181501952.json b/checkpoints/metadata_000181501952.json new file mode 100644 index 0000000000000000000000000000000000000000..20444778978c51f9b612258056eff3eb11e0b782 --- /dev/null +++ b/checkpoints/metadata_000181501952.json @@ -0,0 +1 @@ +{"step": 5539, "tokens_seen": 181501952, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.079092198128712} \ No newline at end of file diff --git a/checkpoints/metadata_000196018176.json b/checkpoints/metadata_000196018176.json new file mode 100644 index 0000000000000000000000000000000000000000..ced990c130e3553c22d73b210e067fd68155ee67 --- /dev/null +++ b/checkpoints/metadata_000196018176.json @@ -0,0 +1 @@ +{"step": 5982, "tokens_seen": 196018176, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.059876957371325} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json new file mode 100644 index 0000000000000000000000000000000000000000..9885f969fcf97cae42b47927d2ddbb9d5d70d232 --- /dev/null +++ b/checkpoints/metadata_000196608000.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.054760013092177} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..03511d91e914d121706074672ed3717e4b31658a --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ce67efbdfc3c9279831326e38cbb39268eb87b8627e4d8e3239dbba09c7ac9 +size 225208789 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..018dcdc2ad4e7b1186df7179b088370f040f2505 --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d1547f4d794b9c3e9a3b27c7bee23438512c7212729532c2c55e687b15e9b5 +size 225208789 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..a89a8698e80ca4325167f501a543cea993900de8 --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5bb2f326069c83b76db2c3df3f21a075ae3a8375097af08c6f300189378b20a +size 225208789 diff --git a/checkpoints/model_weights_000000393216.pt b/checkpoints/model_weights_000000393216.pt new file mode 100644 index 0000000000000000000000000000000000000000..693eadec3dd01e209c60ac89825d892ce3dd804f --- /dev/null +++ b/checkpoints/model_weights_000000393216.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8ac8ad9ee707827b002b5fe105d41e307b96b3ea96254639f8991837ac2fc1 +size 225208789 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cdd140e9d54e93ca60c4213a12e51f6b69056e2 --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd196f321053e220b58ed5f25c07eaaaa6a390aa526c4ce05b23bec622956b9a +size 225208789 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..70721911b22f2c9f813de099121f3d2a7b877c2c --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276b134c6db99d0fbcfa6680e7a6cb2f3850fbd46f3b5265839ae15b21924a5e +size 225208789 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..519f6b0d11e89a75dfb072ac5cc24ced1ab71f53 --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40881882d4429168cb8f3bdd055bb6d7b8639b24b1ccb40870bf8eccfa4e9ca +size 225208789 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e12a6865ea9082d1a9b4b9ae9aa0efdc6940816 --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0879054c19f75bf5710c8a767a1ea621d04536757d9cdebbe8be0a54a18cce6f +size 225208789 diff --git a/checkpoints/model_weights_000000589824.pt b/checkpoints/model_weights_000000589824.pt new file mode 100644 index 0000000000000000000000000000000000000000..654987b474426cfa1b306430bf58e92558336fb1 --- /dev/null +++ b/checkpoints/model_weights_000000589824.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f1d3483d44d3b2efb1d67e38cb12bf879ec6e9b82dba74afa6dbbf800147df +size 225208789 diff --git a/checkpoints/model_weights_000000655360.pt b/checkpoints/model_weights_000000655360.pt new file mode 100644 index 0000000000000000000000000000000000000000..9405f01f4009926951cf448e740af8cd321b6de4 --- /dev/null +++ b/checkpoints/model_weights_000000655360.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f14cf384594aa5f800d415bb833cd917252f7a0a269020c57171d9dbe53e8304 +size 225208789 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..2edb38c4b68e71218f93a2521ba2c6951c1f2993 --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e18889c65fbd58b824bbd60e948a5ad0131aad0c1422178c5ff6c149eda5de1 +size 225208789 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6ad676b9f1bf675c854622a7b9bc8b80fc16021 --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c17669e457d6c4044635dab89884222d3b8896afa6f3b45a38d6f08530535a9 +size 225208789 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..c985ae44b374677fa0628555cdf9487afa29940f --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c67282d532965fb21c25a9d474929e9fef75256f58ba6c5a4925b51a25ce067 +size 225208789 diff --git a/checkpoints/model_weights_000000884736.pt b/checkpoints/model_weights_000000884736.pt new file mode 100644 index 0000000000000000000000000000000000000000..b36fd6ce358aad16c3746f2e8b7b5da50e28e80e --- /dev/null +++ b/checkpoints/model_weights_000000884736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:143a216c2f938d103463234aaeac5dda237b294eea7c20fcb1a77bfa94b51ae8 +size 225208789 diff --git a/checkpoints/model_weights_000000950272.pt b/checkpoints/model_weights_000000950272.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3424cbbb0624566baf4fd03fe38b33f01171721 --- /dev/null +++ b/checkpoints/model_weights_000000950272.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae9b9d991e670a2db4022870909c7ea67b313c4ab4a7e91c67c66ff59957c35 +size 225208789 diff --git a/checkpoints/model_weights_000001015808.pt b/checkpoints/model_weights_000001015808.pt new file mode 100644 index 0000000000000000000000000000000000000000..2483709ecc5d59e97615681efefe5950e05dce3a --- /dev/null +++ b/checkpoints/model_weights_000001015808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd79f866f84cb5d0b7ace39b41c14863f45e5a50edf1aaba511e2b1422088ccf +size 225208789 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..50754a0efde1418ecc03543227604eb1f63a0131 --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3cabfeec6ae1e4f875587f792d71e10095c2603fb1eb1ba4c1729be19120c76 +size 225208789 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..40cbf4656b0c3ee4f8af73fa3184bef3da66af35 --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be9661c33478f0a08a76767eb1bcfa6a83728a8e75077fcb13449e59e154d979 +size 225208789 diff --git a/checkpoints/model_weights_000001310720.pt b/checkpoints/model_weights_000001310720.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7a428dc7c090ba2e65e9852bcf957925442af0d --- /dev/null +++ b/checkpoints/model_weights_000001310720.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbcb1a99c9a7a3e40d18e418d059ec377a5d93033e5788a1daa50be4c299cf7f +size 225208789 diff --git a/checkpoints/model_weights_000001409024.pt b/checkpoints/model_weights_000001409024.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7b259e5a6965e4c15486503aa8950697bcf940e --- /dev/null +++ b/checkpoints/model_weights_000001409024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91ebf542aa147f5dd899aa5fade77fcd0a875a48b49a51321605d35c4d7fd387 +size 225208789 diff --git a/checkpoints/model_weights_000001507328.pt b/checkpoints/model_weights_000001507328.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0fbeb11b4a1e5f96b88833dc99588ecab6e094b --- /dev/null +++ b/checkpoints/model_weights_000001507328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b30aea4e96991171ada78c2f9b93727541bba72c135788f7bba5e9c6737d93 +size 225208789 diff --git a/checkpoints/model_weights_000001638400.pt b/checkpoints/model_weights_000001638400.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ce44c28056351116bc9810d30c9463d59c3e762 --- /dev/null +++ b/checkpoints/model_weights_000001638400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49ef19fb3e8c2405b3d250e00ce84ecb404a2a5e70704c91ebc728d68734821b +size 225208789 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..c56c15df836097d5d8a432c5fb14ca02a5ff966c --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4bc81283af99047386727e3c09d2f1da1ec9249691b448abe0c5d83b0bb81d +size 225208789 diff --git a/checkpoints/model_weights_000001933312.pt b/checkpoints/model_weights_000001933312.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4440158207f6c7e1f67cd88270f24cb20fe8a73 --- /dev/null +++ b/checkpoints/model_weights_000001933312.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc00b9da5156792d30ea3a64dfb6df065497d417c12997d7f3bf04997d64afe4 +size 225208789 diff --git a/checkpoints/model_weights_000002064384.pt b/checkpoints/model_weights_000002064384.pt new file mode 100644 index 0000000000000000000000000000000000000000..00aff9ef280663dd197ec20a12c3d6c15189f926 --- /dev/null +++ b/checkpoints/model_weights_000002064384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ab7a61fbe97fe9d6d971aef95254225eec394c64c781297849a6eec2172259 +size 225208789 diff --git a/checkpoints/model_weights_000002228224.pt b/checkpoints/model_weights_000002228224.pt new file mode 100644 index 0000000000000000000000000000000000000000..58921e51be621c3fa675bad06e93f92e69fec679 --- /dev/null +++ b/checkpoints/model_weights_000002228224.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:311708aa7542fdda45f4a65e6b02a00e83657d8ce886319aa9cab39ffe778428 +size 225208789 diff --git a/checkpoints/model_weights_000002424832.pt b/checkpoints/model_weights_000002424832.pt new file mode 100644 index 0000000000000000000000000000000000000000..26e50bea47897296e54550b04ce8f73d10466bbb --- /dev/null +++ b/checkpoints/model_weights_000002424832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6d9f7356193201c69d0133951d8fbd83b58c858cb29d3d4440d41834086a34 +size 225208789 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..68daa035d4d2db5b52b15297440d61ff4f0be66f --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc4243d755f70dcee195e749db3d7194fb540879bd0b6c28b54fff2cb48f04b6 +size 225208789 diff --git a/checkpoints/model_weights_000002818048.pt b/checkpoints/model_weights_000002818048.pt new file mode 100644 index 0000000000000000000000000000000000000000..19c2edd5e298bd40f7bfaec102393089c428df68 --- /dev/null +++ b/checkpoints/model_weights_000002818048.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104fc2850789c0015406270379f8b1a934736856feaf5bcd511acd7c796aca1d +size 225208789 diff --git a/checkpoints/model_weights_000003047424.pt b/checkpoints/model_weights_000003047424.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa74bc7ec218a4cff33535243a1bbc845d58cb70 --- /dev/null +++ b/checkpoints/model_weights_000003047424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54db08e6dc5d65812d61c3adc0190ae5faaf4a94a93de136015a4672c0c945fc +size 225208789 diff --git a/checkpoints/model_weights_000003309568.pt b/checkpoints/model_weights_000003309568.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5ca49f35f3db5c3ab0b355e7996c58a8b1fe0b3 --- /dev/null +++ b/checkpoints/model_weights_000003309568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd9c1eee0cedd7bf88d92695f6c045cecffeee6ec45dd2f14e5481544c4edce6 +size 225208789 diff --git a/checkpoints/model_weights_000003571712.pt b/checkpoints/model_weights_000003571712.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e81d2d10fa08da6aa19752452c5cbaea6e61a1b --- /dev/null +++ b/checkpoints/model_weights_000003571712.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df065ea7569da9343aa6ef6cc43172a7b1b702903a3904b906d40ebbee53370 +size 225208789 diff --git a/checkpoints/model_weights_000003866624.pt b/checkpoints/model_weights_000003866624.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c6929d61337078b4e75465198cfca4649c87904 --- /dev/null +++ b/checkpoints/model_weights_000003866624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3488b002945002487999a9031792c614cd3fefa5417f8a2a25b1430e447d106 +size 225208789 diff --git a/checkpoints/model_weights_000004161536.pt b/checkpoints/model_weights_000004161536.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4ba0fa68e9a8316142052bcc9029323528d893c --- /dev/null +++ b/checkpoints/model_weights_000004161536.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb29fa6c5b80a9467cc19584d50e6b25d43385da35653a62ea0765704ff3173 +size 225208789 diff --git a/checkpoints/model_weights_000004489216.pt b/checkpoints/model_weights_000004489216.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b492f87c8812675381f1e4596429a0dfdf30076 --- /dev/null +++ b/checkpoints/model_weights_000004489216.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fdc56dab0b7193d81604457bf538ca3b6edfe6808c6cb06dc7df270526f0a60 +size 225208789 diff --git a/checkpoints/model_weights_000004849664.pt b/checkpoints/model_weights_000004849664.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f902ca94d38f4bfad40868df1e0db17496421e3 --- /dev/null +++ b/checkpoints/model_weights_000004849664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49af2a52db543c1e43e6acc30bf07bf3d8742ae81eb1e2721a0f72ef9097706 +size 225208789 diff --git a/checkpoints/model_weights_000005242880.pt b/checkpoints/model_weights_000005242880.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb4016439c9094d4b0e1efd4016dbec034da43d7 --- /dev/null +++ b/checkpoints/model_weights_000005242880.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d839db1fcfd76c63c464f15381e93f328ca7069fdc062522b46c41256ac2809 +size 225208789 diff --git a/checkpoints/model_weights_000005668864.pt b/checkpoints/model_weights_000005668864.pt new file mode 100644 index 0000000000000000000000000000000000000000..204376669a94996e43d009d2dd2c33d4c459d417 --- /dev/null +++ b/checkpoints/model_weights_000005668864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2213f38cdf39bdabc2f2664bda66c150132d2d3c99370c10cbc0a1d2a0b0f274 +size 225208789 diff --git a/checkpoints/model_weights_000006127616.pt b/checkpoints/model_weights_000006127616.pt new file mode 100644 index 0000000000000000000000000000000000000000..011adb3a0e8b0a43a8e6bf6150e4400f13632ea1 --- /dev/null +++ b/checkpoints/model_weights_000006127616.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f74b7890e4d908c385d72121f6771da1f93567142c03b023085e01fd379e21b +size 225208789 diff --git a/checkpoints/model_weights_000006619136.pt b/checkpoints/model_weights_000006619136.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6477ba0557ee1524e37d291c295e3ed004ad1c4 --- /dev/null +++ b/checkpoints/model_weights_000006619136.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cd879d6a2a13c8790afabaa36ad03eceae85748683434eef49b9b31839151f +size 225208789 diff --git a/checkpoints/model_weights_000007143424.pt b/checkpoints/model_weights_000007143424.pt new file mode 100644 index 0000000000000000000000000000000000000000..13006c5db0396e01a536e67f28910348e256f054 --- /dev/null +++ b/checkpoints/model_weights_000007143424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa63d0d72efcdec016c8be3022fbe09099b2fd2b7c75efc9eb73696177baab43 +size 225208789 diff --git a/checkpoints/model_weights_000007733248.pt b/checkpoints/model_weights_000007733248.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa49e391e2d573c8198434f46bdd84c04502148b --- /dev/null +++ b/checkpoints/model_weights_000007733248.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37496517fc01984eb68ea694e25019c425ac93813afdb35a83b143b0a84abcfc +size 225208789 diff --git a/checkpoints/model_weights_000008323072.pt b/checkpoints/model_weights_000008323072.pt new file mode 100644 index 0000000000000000000000000000000000000000..04f95e58646583883c84fa5f61c4522b00ca69d1 --- /dev/null +++ b/checkpoints/model_weights_000008323072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170b26ecc27a358f9444a2f68304e0e29e35367024dead7952f275a23e9c6913 +size 225208789 diff --git a/checkpoints/model_weights_000009011200.pt b/checkpoints/model_weights_000009011200.pt new file mode 100644 index 0000000000000000000000000000000000000000..444121ade53fdc944dd1611fe6b6b088941b2de9 --- /dev/null +++ b/checkpoints/model_weights_000009011200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17378f1bcdc5e642c697bf5067b4fd558387126c8413cd995b1131a1da6b3555 +size 225208789 diff --git a/checkpoints/model_weights_000009732096.pt b/checkpoints/model_weights_000009732096.pt new file mode 100644 index 0000000000000000000000000000000000000000..a227c080c8ecc1dfe05394a719d1dd2ec7f27ef8 --- /dev/null +++ b/checkpoints/model_weights_000009732096.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f893ca7973dfe0a18af3c6968fae441b3718494152860e0f48899fe2d87931f +size 225208789 diff --git a/checkpoints/model_weights_000010518528.pt b/checkpoints/model_weights_000010518528.pt new file mode 100644 index 0000000000000000000000000000000000000000..e86588de9baadef66440dfdb249fb41308167463 --- /dev/null +++ b/checkpoints/model_weights_000010518528.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288a803c6da556459bd8e80f752984f09005ccc4f6193280efa1c6b48a1bfe89 +size 225208789 diff --git a/checkpoints/model_weights_000011337728.pt b/checkpoints/model_weights_000011337728.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3eb48bf156dccf2bba215f3d54031525e97cfb1 --- /dev/null +++ b/checkpoints/model_weights_000011337728.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32185c1ee57eb125ad3f92515edd28b14c693e297fb84fc510cf42f2e5a57ff +size 225208789 diff --git a/checkpoints/model_weights_000012255232.pt b/checkpoints/model_weights_000012255232.pt new file mode 100644 index 0000000000000000000000000000000000000000..6250e5b57993d1095125ef8ff3d8873228fd7a03 --- /dev/null +++ b/checkpoints/model_weights_000012255232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bedad5eb46f97bc4e268d6f4270353ff8d2c79b80de11e92a8a99e1adb0f0d7 +size 225208789 diff --git a/checkpoints/model_weights_000013238272.pt b/checkpoints/model_weights_000013238272.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d45c3adf39b660c057f2e8b64f440852229f72e --- /dev/null +++ b/checkpoints/model_weights_000013238272.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52fdacf6cc3e715e563caede7720aa645882294b5b693d955e752635960a5ea8 +size 225208789 diff --git a/checkpoints/model_weights_000014286848.pt b/checkpoints/model_weights_000014286848.pt new file mode 100644 index 0000000000000000000000000000000000000000..7420e93f1a2e51e83f0de4743e42093be36eac8d --- /dev/null +++ b/checkpoints/model_weights_000014286848.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64a70e9e2e8cbc2f081937376d29ec5f509e9a9fe0bcd417dfa771beeede2eff +size 225208789 diff --git a/checkpoints/model_weights_000015433728.pt b/checkpoints/model_weights_000015433728.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3d46e61786e21521deb5032ef32dd18d3b234e3 --- /dev/null +++ b/checkpoints/model_weights_000015433728.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a19b2388029e8f1d8f22612457ad5b2f02ec80b3721af88b8b826d8b744ee0f9 +size 225208789 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0971091d1a959e36cb615e2857f14c6dfdf55a0c --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab8f34579af45e142f3c8cbb26b9bec0ce84abf62a2c6ffc261d98d308db99b2 +size 225208789 diff --git a/checkpoints/model_weights_000016678912.pt b/checkpoints/model_weights_000016678912.pt new file mode 100644 index 0000000000000000000000000000000000000000..41598707e518ad2bcd697d443854ce066aecacca --- /dev/null +++ b/checkpoints/model_weights_000016678912.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df69f1629971a8b53076927c1837e62340bc0a70f4dba27229c7872a71f359f +size 225208789 diff --git a/checkpoints/model_weights_000018022400.pt b/checkpoints/model_weights_000018022400.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa32c13304fe0d824c87c9730d646b794d535d6d --- /dev/null +++ b/checkpoints/model_weights_000018022400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f07c5a979dbb2a8eb9a4091164a9b2ef18d28c9a6cc37da132dfd144d675a91 +size 225208789 diff --git a/checkpoints/model_weights_000019464192.pt b/checkpoints/model_weights_000019464192.pt new file mode 100644 index 0000000000000000000000000000000000000000..50f5ffd2f3fa8939c1bc6926b75cbbfe5b227826 --- /dev/null +++ b/checkpoints/model_weights_000019464192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:782b701321e960199a13f275783efe036a2c8b539ee0990a4e0ee66570a7baf7 +size 225208789 diff --git a/checkpoints/model_weights_000021037056.pt b/checkpoints/model_weights_000021037056.pt new file mode 100644 index 0000000000000000000000000000000000000000..c91ee83f275f760fa7577a61b63e87b32bc8eeab --- /dev/null +++ b/checkpoints/model_weights_000021037056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc13f52c6dc603668081eae865f7528e625495d79388ccdd11024e42b3760e65 +size 225208789 diff --git a/checkpoints/model_weights_000022708224.pt b/checkpoints/model_weights_000022708224.pt new file mode 100644 index 0000000000000000000000000000000000000000..d048a2195fd6055ab3c42cac548af24276d89a02 --- /dev/null +++ b/checkpoints/model_weights_000022708224.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea0e1fe5e3c4fd16af5538fc4325dd39df6df0b33ab897db33eb21e93e4d223 +size 225208789 diff --git a/checkpoints/model_weights_000024510464.pt b/checkpoints/model_weights_000024510464.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9608cf85eb6b4b6074ad1acc9bd4ae2b18d44cb --- /dev/null +++ b/checkpoints/model_weights_000024510464.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44cc01d9693785d5032d4c68263fb4040fd5aa9d661672984ed10e5cf7f3028a +size 225208789 diff --git a/checkpoints/model_weights_000026476544.pt b/checkpoints/model_weights_000026476544.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc5d44fbc914a51c8541146468e18f41aa744f1 --- /dev/null +++ b/checkpoints/model_weights_000026476544.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ab18da69a48037c06de797df33a63ee23ae6dfdd20c2b7858ab6cc305d2876 +size 225208789 diff --git a/checkpoints/model_weights_000028606464.pt b/checkpoints/model_weights_000028606464.pt new file mode 100644 index 0000000000000000000000000000000000000000..39e283d89527dbdf9c6092f70d98ac3673ae01d2 --- /dev/null +++ b/checkpoints/model_weights_000028606464.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cdfb7857adab912c617742b9dbf7669e386a03eac84c8df11aca374f05b0e81 +size 225208789 diff --git a/checkpoints/model_weights_000030900224.pt b/checkpoints/model_weights_000030900224.pt new file mode 100644 index 0000000000000000000000000000000000000000..14c56db63d1d0c1a20543da8a24b8c1ab4c7e4de --- /dev/null +++ b/checkpoints/model_weights_000030900224.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c211e061d664d280905c7e1c4df820ef45f4ada8025ce9aba0d6ec04f0bdfba3 +size 225208789 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca7b4cef5316ced4af99f92c03412a36588a10e2 --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e75a05a110199b35354eba098abd30200f8e78fbb8b97df25ae301991c920d3d +size 225208789 diff --git a/checkpoints/model_weights_000033357824.pt b/checkpoints/model_weights_000033357824.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a4dae44084a2bc0960082acec6a4b2f65fbe9eb --- /dev/null +++ b/checkpoints/model_weights_000033357824.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b41437f8bf61856caba0ad575a3019282e293422b930b6ca54e17741bdf415 +size 225208789 diff --git a/checkpoints/model_weights_000036044800.pt b/checkpoints/model_weights_000036044800.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ed4715dfadc767d52c2bd21a1fa05f63e4807ab --- /dev/null +++ b/checkpoints/model_weights_000036044800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b1e55e03a254f013251465933bc406483558d295d66cee065598930f045efbe +size 225208789 diff --git a/checkpoints/model_weights_000038928384.pt b/checkpoints/model_weights_000038928384.pt new file mode 100644 index 0000000000000000000000000000000000000000..53d68f99fae9361b5af5901b7bf807ccbf796de4 --- /dev/null +++ b/checkpoints/model_weights_000038928384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48962c12c8b9c2fa359cc174b9201968fb2ffbe92f6dd0261f167deffc60b93 +size 225208789 diff --git a/checkpoints/model_weights_000042041344.pt b/checkpoints/model_weights_000042041344.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd69176a90c7895e10552e6806d9a3ebc94eb4fa --- /dev/null +++ b/checkpoints/model_weights_000042041344.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba28fb5305d1eef2701c75feaae3091fa5a5d87118a34c2bd31e8bc1fa59db4 +size 225208789 diff --git a/checkpoints/model_weights_000045416448.pt b/checkpoints/model_weights_000045416448.pt new file mode 100644 index 0000000000000000000000000000000000000000..26f8d09e938799b4e6c37873e0a3f5504ad9583d --- /dev/null +++ b/checkpoints/model_weights_000045416448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dc29e6580e83d30c8f61d8f071586598fdd556fcb62e415f029358e8274e751 +size 225208789 diff --git a/checkpoints/model_weights_000049053696.pt b/checkpoints/model_weights_000049053696.pt new file mode 100644 index 0000000000000000000000000000000000000000..83f7077f36dd908d5533b9b5ad9520cb68c37ec9 --- /dev/null +++ b/checkpoints/model_weights_000049053696.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0d22bacf1b18eff6a6c8445ee2e4d1ef727d98dba90847578be29f172fc6c5 +size 225208789 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..34b25e3e465e5666cf3d27d23de593aeef76f60d --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d732be76d7baa512f3dc0ac98cfd03dcd249453fd91e45a5e5ff705f4fe31203 +size 225208789 diff --git a/checkpoints/model_weights_000052953088.pt b/checkpoints/model_weights_000052953088.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4498fca7aee5643068364898d0b3727813219b9 --- /dev/null +++ b/checkpoints/model_weights_000052953088.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d3401953f5585d8c79ed3e2f7cdf97195155ec1a35996c56006a641f3ad6f3a +size 225208789 diff --git a/checkpoints/model_weights_000057212928.pt b/checkpoints/model_weights_000057212928.pt new file mode 100644 index 0000000000000000000000000000000000000000..4312184b74af51f7285b13972890a39629817a56 --- /dev/null +++ b/checkpoints/model_weights_000057212928.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2100375f97b9bd71f19fda1b02b6fef26035b8773941949ae59477688f9fa396 +size 225208789 diff --git a/checkpoints/model_weights_000061767680.pt b/checkpoints/model_weights_000061767680.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d508d3ab50ecc3fb6eda4e0737f5d2d6e807488 --- /dev/null +++ b/checkpoints/model_weights_000061767680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e2a4580d154401f714c23d6e3907867c00ab8e647a1124881eee0ae4e774b83 +size 225208789 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1735e838996f71d0a33a2a63e5323cb5debe1ef3 --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed37ed82cbeab7454ae615fd876888f9a2aa1bdf0f78ae190e71e147341dde41 +size 225208789 diff --git a/checkpoints/model_weights_000066715648.pt b/checkpoints/model_weights_000066715648.pt new file mode 100644 index 0000000000000000000000000000000000000000..32fcfc14d4db6cf887eeb5770f13aed9dd9ddb4a --- /dev/null +++ b/checkpoints/model_weights_000066715648.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0107ca8d67ffac70a2cc11202b0dd2bbe0bc69dee02bfbff817b86015d72cd7 +size 225208789 diff --git a/checkpoints/model_weights_000072056832.pt b/checkpoints/model_weights_000072056832.pt new file mode 100644 index 0000000000000000000000000000000000000000..f476d9b61430eab2da7962ef0e765e274edc61e0 --- /dev/null +++ b/checkpoints/model_weights_000072056832.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfe52f8a8353b509a41826287c382ff83eac8acc485d76fcb022c84a159aa421 +size 225208789 diff --git a/checkpoints/model_weights_000077824000.pt b/checkpoints/model_weights_000077824000.pt new file mode 100644 index 0000000000000000000000000000000000000000..de15e31a3cfaa1786815130737fc8a2fc6f982fe --- /dev/null +++ b/checkpoints/model_weights_000077824000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5ae17a9b041d9d1e81f5c37c2bcf9ee1a85182db222238bd70a15c731d4cca +size 225208789 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a64f6156ec5e9afdd4941a95672307177ac7c9bc --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09aa98a064ea440b81eb3e5aaf103d8331933e3cca9c1b009a8b1df8138db187 +size 225208789 diff --git a/checkpoints/model_weights_000084049920.pt b/checkpoints/model_weights_000084049920.pt new file mode 100644 index 0000000000000000000000000000000000000000..172330da64b689796ee0aff373fbe31acd7744ea --- /dev/null +++ b/checkpoints/model_weights_000084049920.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a049725b3d26c309404890c9e78f014bd7829b6c197cc46401040950d82887c +size 225208789 diff --git a/checkpoints/model_weights_000090800128.pt b/checkpoints/model_weights_000090800128.pt new file mode 100644 index 0000000000000000000000000000000000000000..601ddf4c18e7aac6adb4496c416ed550b50a5ae6 --- /dev/null +++ b/checkpoints/model_weights_000090800128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05a84ad51bcbd3722dc3e3f087df049dcb2fe93c894b92f24c99f12980217f12 +size 225208789 diff --git a/checkpoints/model_weights_000098041856.pt b/checkpoints/model_weights_000098041856.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e19a24fa5dd338f018e51d3d0add2a37c293b37 --- /dev/null +++ b/checkpoints/model_weights_000098041856.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de0bc8bcefb86e8ed339e069409909ff0ec62fb997093576530ba8c1c530631 +size 225208789 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..68ad25d4e6abfc53632231e6e006d8a94a40404c --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ea1e8f07de02469b93f683504f691860626b640095ca5c6e47815f198208b4a +size 225208789 diff --git a/checkpoints/model_weights_000105906176.pt b/checkpoints/model_weights_000105906176.pt new file mode 100644 index 0000000000000000000000000000000000000000..e66a577a0245174a297ee1b514d20e3468a6fe51 --- /dev/null +++ b/checkpoints/model_weights_000105906176.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b3220f3d1088e66fe47f2c138200af0eb1786b5cf0b2d8ee2b7636ad168900 +size 225208789 diff --git a/checkpoints/model_weights_000114360320.pt b/checkpoints/model_weights_000114360320.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7518aba40ed8a8270dbecb0f1cdf06e20725b29 --- /dev/null +++ b/checkpoints/model_weights_000114360320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a80a189d2def882df41f18804f7eade06e456d126d126b6d6cb1322bde8d0c90 +size 225208789 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd395323619d09152c40afb154d77f9825fed141 --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1340df9e593faca893f4bcf162d8c8b4e7f64554a0e512c1abca279284e5abcf +size 225208789 diff --git a/checkpoints/model_weights_000123535360.pt b/checkpoints/model_weights_000123535360.pt new file mode 100644 index 0000000000000000000000000000000000000000..d49b6c5d2f35bb6d50c5d5bf1b03cacc85c561ce --- /dev/null +++ b/checkpoints/model_weights_000123535360.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d03b7e29670820b3cd061fdd518ea026d12a0d770c91cdbffe4271a2be7f810 +size 225208789 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dd99dcda88e22bc0513fa33df36dec8be037834 --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f6af24d2c64e2099e3f316df3149942350154a14908c6a185b8ea426a484ca +size 225208789 diff --git a/checkpoints/model_weights_000133398528.pt b/checkpoints/model_weights_000133398528.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f10bc882dda90bbc8761f7d916032414efc114 --- /dev/null +++ b/checkpoints/model_weights_000133398528.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e7475699e875051284d6570bb37a1a9eb418b6fd34c0bd9733ce7c23e9d95e +size 225208789 diff --git a/checkpoints/model_weights_000144080896.pt b/checkpoints/model_weights_000144080896.pt new file mode 100644 index 0000000000000000000000000000000000000000..eae6d0cc11542c2ecf4d334abc130458ce69181c --- /dev/null +++ b/checkpoints/model_weights_000144080896.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987e6a2f6b04013b60cacff0dbfd73df69506175e402fa35861dd0dea0cb709c +size 225208789 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9151d3fb5def039effee73b07feeeabe42487d22 --- /dev/null +++ b/checkpoints/model_weights_000147456000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd4c5b9d57dbd2937a0e996e52d4efe277da847c56435f9410ddc942a02e4524 +size 225208789 diff --git a/checkpoints/model_weights_000155615232.pt b/checkpoints/model_weights_000155615232.pt new file mode 100644 index 0000000000000000000000000000000000000000..195944de2a85bf1a3e0a5568fbfd76b291b829be --- /dev/null +++ b/checkpoints/model_weights_000155615232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b68fae77d52923dab2cfe05373d1cb5b06aeaf0be160ce8f699e90c96d19663 +size 225208789 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5178e04847b294685676f05628ef21f1086e11f --- /dev/null +++ b/checkpoints/model_weights_000163840000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c09ab5a56f63e0c138f7a678e4cea8b949be565b51e1a9ed5498f4d3473a4274 +size 225208789 diff --git a/checkpoints/model_weights_000168067072.pt b/checkpoints/model_weights_000168067072.pt new file mode 100644 index 0000000000000000000000000000000000000000..16896ca4277e70c05a77c7e35b4de961266a0280 --- /dev/null +++ b/checkpoints/model_weights_000168067072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b41e18520fcdcb9532ca0088f54684769aee314d490ca028f72a6b79c6437a59 +size 225208789 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt new file mode 100644 index 0000000000000000000000000000000000000000..386079cce76bac6f0a7eaf5d158ee94b1ce5bd3b --- /dev/null +++ b/checkpoints/model_weights_000180224000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d67baa5c6e62e27d8d5888c2cf8172e42ed7e8a3f70faa462076c23cb1f9ef2b +size 225208789 diff --git a/checkpoints/model_weights_000181501952.pt b/checkpoints/model_weights_000181501952.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c9047dabfc094a43e3122aa19d2f7d1d59fb988 --- /dev/null +++ b/checkpoints/model_weights_000181501952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fcbd75704ca55d76aac6b12bed5c993ecda1e64bb5f2b7904b3e29e53386d86 +size 225208789 diff --git a/checkpoints/model_weights_000196018176.pt b/checkpoints/model_weights_000196018176.pt new file mode 100644 index 0000000000000000000000000000000000000000..11a2670bb99e27760b71c23de59e624bb10d9fca --- /dev/null +++ b/checkpoints/model_weights_000196018176.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:823103f4bd2b31d0448425b0e380a3db29f4bb6e2ffb541ec74a3d084a7d7bcd +size 225208789 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c51f32f2fd81c4a1a16a8c0887c5d0ebdcb59f9 --- /dev/null +++ b/checkpoints/model_weights_000196608000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bc31caa418249d2bd21597d8efb8c7a822d3439ce667db5e5855ad55cef4a2d +size 225208789 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..165cc88299d1c0dbf158b9a6be3d43f70773519c --- /dev/null +++ b/config.toml @@ -0,0 +1,32 @@ +model_name = "gelu_2l_v5_random_above_15000" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 8 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 48262 +dataset_name = "eoinf/unprocessed-c4-code-test" +tokenizer_name = "NeelNanda/gpt-neox-tokenizer-digits" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.002 +lr_vector = 0.001 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.08 +save_log_checkpoints = true \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..994972503e813fdf49d41d3b0f54ca43ff4c20e3 --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:021797e83b3e10df857687c018bc65fd63cc774d916975d84df7b1d7967b3c3e +size 225208311 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9885f969fcf97cae42b47927d2ddbb9d5d70d232 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "gelu_2l_v5_random_above_15000", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 48262, "dataset_name": "eoinf/unprocessed-c4-code-test", "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits", "seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.08, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.054760013092177} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ff9d1062dc6d60b82c2128e80d5216a4db00eb9 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6886f5af431577b41a058aee73c473457ffc59238831e80526416db0001709 +size 450423059 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..122ddb5e9329c0fcebd4b9d7df66890effe86817 --- /dev/null +++ b/run.sh @@ -0,0 +1,28 @@ + +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.train.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.train.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi \ No newline at end of file diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c6517d5b70bd65bd3289f3016a891cb424b94b59 --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-08-19T06:36:27.806787676Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-19T06:36:27.983837809Z","level":"INFO","msg":"stream: created new stream","id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.983881866Z","level":"INFO","msg":"stream: started","id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.983924068Z","level":"INFO","msg":"writer: started","stream_id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.98394089Z","level":"INFO","msg":"sender: started","stream_id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.984004621Z","level":"INFO","msg":"handler: started","stream_id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.029996056Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-19T07:45:27.276957993Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-08-19T07:45:27.280737625Z","level":"INFO","msg":"stream: closing","id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.280787969Z","level":"INFO","msg":"handler: closed","stream_id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.280828684Z","level":"INFO","msg":"sender: closed","stream_id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.280836763Z","level":"INFO","msg":"stream: closed","id":"4rs47wj5"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..917cfe47f9f7ab051dc866c88a5060928fec1e27 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,28 @@ +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Configure stats pid to 1898 +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/settings +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063627-4rs47wj5/logs/debug.log +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063627-4rs47wj5/logs/debug-internal.log +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:init():830] calling init triggers +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_v5_random_above_15000', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/unprocessed-c4-code-test', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:init():871] starting backend +2025-08-19 06:36:27,801 INFO MainThread:1898 [wandb_init.py:init():874] sending inform_init request +2025-08-19 06:36:27,804 INFO MainThread:1898 [wandb_init.py:init():882] backend started and connected +2025-08-19 06:36:27,806 INFO MainThread:1898 [wandb_init.py:init():953] updated telemetry +2025-08-19 06:36:27,809 INFO MainThread:1898 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-19 06:36:28,120 INFO MainThread:1898 [wandb_init.py:init():1029] starting run threads in backend +2025-08-19 06:36:28,215 INFO MainThread:1898 [wandb_run.py:_console_start():2494] atexit reg +2025-08-19 06:36:28,215 INFO MainThread:1898 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-19 06:36:28,215 INFO MainThread:1898 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-19 06:36:28,216 INFO MainThread:1898 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-19 06:36:28,218 INFO MainThread:1898 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-19 07:45:26,690 INFO MainThread:1898 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/4rs47wj5 +2025-08-19 07:45:26,697 INFO MainThread:1898 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-08-19 07:45:26,697 INFO MainThread:1898 [wandb_run.py:_restore():2441] restore +2025-08-19 07:45:26,698 INFO MainThread:1898 [wandb_run.py:_restore():2447] restore done +2025-08-19 07:45:27,278 INFO MainThread:1898 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-08-19 07:45:27,278 INFO MainThread:1898 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-08-19 07:45:27,279 INFO MainThread:1898 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20250819_063129-onuk41qn/files/config.yaml b/wandb/run-20250819_063129-onuk41qn/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..157b76fb640c2d5f58299a77578de012c185e536 --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/files/config.yaml @@ -0,0 +1,127 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + 6628sp61xuj5q5mk9k6cv04drt0ib4dt: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "165822877696" + email: efarrel4@tcd.ie + executable: /notebooks/clean_env/bin/python + git: + commit: c3cfb768d471036c37848ff2c6d223b68ad88e82 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-0907b282-5ffc-ff4c-cea5-6f6015ec84ee + host: nww2895qc3 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/models/gelu_2l_v6_200m_subset + startedAt: "2025-08-19T06:31:29.345870Z" + writerId: 6628sp61xuj5q5mk9k6cv04drt0ib4dt + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 49 + - 51 + "2": + - 1 + - 49 + - 51 + "3": + - 13 + - 15 + - 16 + "4": 3.11.7 + "5": 0.21.1 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.08 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 48262 +dataset_name: + value: eoinf/unprocessed-c4-code-test +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: gelu_2l_v5_random_above_15000 +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: NeelNanda/gpt-neox-tokenizer-digits +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20250819_063129-onuk41qn/files/output.log b/wandb/run-20250819_063129-onuk41qn/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f31b8de75f4fde706697b0f023d164d9b03d97c3 --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/files/output.log @@ -0,0 +1,28 @@ +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 +Training Steps: 0%|▏ | 25/6103 [00:34<2:18:11, 1.36s/it, loss=10.0842, ewma_loss=10.7895, lr=5.46e-05, tokens=0.00B] +Traceback (most recent call last): + File "/notebooks/toy_models/toy_models/train/trainer.py", line 379, in train + self.log_step(loss, current_lr) + File "/notebooks/toy_models/toy_models/train/trainer.py", line 335, in log_step + self.log_metrics(metrics) + File "/notebooks/toy_models/toy_models/train/trainer.py", line 314, in log_metrics + wandb.log(metrics, step=self.step) + ^^^^^ +NameError: name 'wandb' is not defined + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "", line 13, in + File "/notebooks/toy_models/toy_models/train/trainer.py", line 471, in train_transformer_from_config + trainer.train() + File "/notebooks/toy_models/toy_models/train/trainer.py", line 392, in train + wandb.finish() + ^^^^^ +NameError: name 'wandb' is not defined diff --git a/wandb/run-20250819_063129-onuk41qn/files/requirements.txt b/wandb/run-20250819_063129-onuk41qn/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbed86cc72e05aec5c78850f9963d0e3471caff0 --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/files/requirements.txt @@ -0,0 +1,185 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +multidict==6.6.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +pyzmq==27.0.1 +jsonschema==4.25.0 +asttokens==3.0.0 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +stack-data==0.6.3 +aiosignal==1.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +setuptools==65.5.0 +mpmath==1.3.0 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +huggingface-hub==0.34.4 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +jupyter-events==0.12.0 +protobuf==6.31.1 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +hf-xet==1.1.7 +jedi==0.19.2 +transformer-lens==2.16.1 +pandas==2.3.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +dotenv==0.9.9 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +transformers==4.55.0 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +narwhals==2.0.1 +torchaudio==2.8.0 +prompt_toolkit==3.0.51 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +httpx==0.28.1 +requests==2.32.4 +fonttools==4.59.0 +argon2-cffi==25.1.0 +executing==2.2.0 +arrow==1.3.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +pip==23.2.1 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +json5==0.12.0 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +sentry-sdk==2.34.1 +Send2Trash==1.8.3 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +nvidia-cufft-cu12==11.3.3.83 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +python-json-logger==3.3.0 +filelock==3.18.0 +types-python-dateutil==2.9.0.20250809 +kiwisolver==1.4.8 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pydantic==2.11.7 +ipython==9.4.0 +charset-normalizer==3.4.2 +fancy-einsum==0.0.3 +datasets==4.0.0 +pillow==11.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +plotly==6.2.0 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +idna==3.10 +jupyterlab==4.4.5 +multiprocess==0.70.16 +dill==0.3.8 +fastjsonschema==2.21.1 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +uri-template==1.3.0 +sentencepiece==0.2.0 +markdown-it-py==3.0.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +traitlets==5.14.3 +jupyter_server==2.16.0 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20250819_063129-onuk41qn/files/wandb-metadata.json b/wandb/run-20250819_063129-onuk41qn/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8231046426216c8718c3f09d71cd0a116912645b --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-08-19T06:31:29.345870Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "c3cfb768d471036c37848ff2c6d223b68ad88e82" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/models/gelu_2l_v6_200m_subset", + "host": "nww2895qc3", + "executable": "/notebooks/clean_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "165822877696" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-0907b282-5ffc-ff4c-cea5-6f6015ec84ee" + } + ], + "cudaVersion": "12.4", + "writerId": "6628sp61xuj5q5mk9k6cv04drt0ib4dt" +} \ No newline at end of file diff --git a/wandb/run-20250819_063129-onuk41qn/files/wandb-summary.json b/wandb/run-20250819_063129-onuk41qn/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c222096ad7d6e8be18209b73a11dff8efb864287 --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":35},"_runtime":35} \ No newline at end of file diff --git a/wandb/run-20250819_063129-onuk41qn/logs/debug-internal.log b/wandb/run-20250819_063129-onuk41qn/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b6465f1f305946d1d4bebf05fc9cd4fc5cc5f68d --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-08-19T06:31:30.039349912Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-19T06:31:30.364502983Z","level":"INFO","msg":"stream: created new stream","id":"onuk41qn"} +{"time":"2025-08-19T06:31:30.36455052Z","level":"INFO","msg":"stream: started","id":"onuk41qn"} +{"time":"2025-08-19T06:31:30.364588434Z","level":"INFO","msg":"writer: started","stream_id":"onuk41qn"} +{"time":"2025-08-19T06:31:30.36458861Z","level":"INFO","msg":"sender: started","stream_id":"onuk41qn"} +{"time":"2025-08-19T06:31:30.364636415Z","level":"INFO","msg":"handler: started","stream_id":"onuk41qn"} +{"time":"2025-08-19T06:32:05.650665596Z","level":"INFO","msg":"stream: closing","id":"onuk41qn"} +{"time":"2025-08-19T06:32:05.940352788Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-19T06:32:06.14585018Z","level":"INFO","msg":"handler: closed","stream_id":"onuk41qn"} +{"time":"2025-08-19T06:32:06.145930486Z","level":"INFO","msg":"sender: closed","stream_id":"onuk41qn"} +{"time":"2025-08-19T06:32:06.145939498Z","level":"INFO","msg":"stream: closed","id":"onuk41qn"} diff --git a/wandb/run-20250819_063129-onuk41qn/logs/debug.log b/wandb/run-20250819_063129-onuk41qn/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d99517d6d866be432ec7c266bd78bd083bee901c --- /dev/null +++ b/wandb/run-20250819_063129-onuk41qn/logs/debug.log @@ -0,0 +1,22 @@ +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_setup.py:_flush():80] Configure stats pid to 1697 +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/settings +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063129-onuk41qn/logs/debug.log +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063129-onuk41qn/logs/debug-internal.log +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_init.py:init():830] calling init triggers +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_v5_random_above_15000', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/unprocessed-c4-code-test', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-08-19 06:31:29,353 INFO MainThread:1697 [wandb_init.py:init():871] starting backend +2025-08-19 06:31:30,023 INFO MainThread:1697 [wandb_init.py:init():874] sending inform_init request +2025-08-19 06:31:30,036 INFO MainThread:1697 [wandb_init.py:init():882] backend started and connected +2025-08-19 06:31:30,037 INFO MainThread:1697 [wandb_init.py:init():953] updated telemetry +2025-08-19 06:31:30,041 INFO MainThread:1697 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-19 06:31:30,549 INFO MainThread:1697 [wandb_init.py:init():1029] starting run threads in backend +2025-08-19 06:31:31,502 INFO MainThread:1697 [wandb_run.py:_console_start():2494] atexit reg +2025-08-19 06:31:31,502 INFO MainThread:1697 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-19 06:31:31,502 INFO MainThread:1697 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-19 06:31:31,502 INFO MainThread:1697 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-19 06:31:31,521 INFO MainThread:1697 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-19 06:32:05,650 INFO MsgRouterThr:1697 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. diff --git a/wandb/run-20250819_063129-onuk41qn/run-onuk41qn.wandb b/wandb/run-20250819_063129-onuk41qn/run-onuk41qn.wandb new file mode 100644 index 0000000000000000000000000000000000000000..6b40e452be2c467a85bc9d6e4eb0d24caf274a0a Binary files /dev/null and b/wandb/run-20250819_063129-onuk41qn/run-onuk41qn.wandb differ diff --git a/wandb/run-20250819_063411-a3i61ot9/files/config.yaml b/wandb/run-20250819_063411-a3i61ot9/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04872c016048971e207f6c3afe8ac46c80d5896c --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/files/config.yaml @@ -0,0 +1,127 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + tethbc24dtoek67ozxk7z1ivqk3i2h5r: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "165822988288" + email: efarrel4@tcd.ie + executable: /notebooks/clean_env/bin/python + git: + commit: c3cfb768d471036c37848ff2c6d223b68ad88e82 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-0907b282-5ffc-ff4c-cea5-6f6015ec84ee + host: nww2895qc3 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/models/gelu_2l_v6_200m_subset + startedAt: "2025-08-19T06:34:11.371141Z" + writerId: tethbc24dtoek67ozxk7z1ivqk3i2h5r + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 49 + - 51 + "2": + - 1 + - 49 + - 51 + "3": + - 13 + - 15 + - 16 + "4": 3.11.7 + "5": 0.21.1 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.08 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 48262 +dataset_name: + value: eoinf/unprocessed-c4-code-test +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: gelu_2l_v5_random_above_15000 +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: NeelNanda/gpt-neox-tokenizer-digits +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20250819_063411-a3i61ot9/files/output.log b/wandb/run-20250819_063411-a3i61ot9/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9a478606e968b1ead02a7a8c95c7ebf524ad1f42 --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/files/output.log @@ -0,0 +1,31 @@ +{'step': 23, 'tokens_seen': 753664, 'config': {'model_name': 'gelu_2l_v5_random_above_15000', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/unprocessed-c4-code-test', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103}, 'train_loss_ewma': 10.803400143292523} +/notebooks/clean_env/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:192: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + warnings.warn( +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 +Training Steps: 0%|▏ | 25/6103 [00:01<1:21:47, 1.24it/s, loss=10.1160, ewma_loss=10.7893, lr=5.46e-05, tokens=0.00B] +Traceback (most recent call last): + File "/notebooks/toy_models/toy_models/train/trainer.py", line 379, in train + self.log_step(loss, current_lr) + File "/notebooks/toy_models/toy_models/train/trainer.py", line 335, in log_step + self.log_metrics(metrics) + File "/notebooks/toy_models/toy_models/train/trainer.py", line 314, in log_metrics + wandb.log(metrics, step=self.step) + ^^^^^ +NameError: name 'wandb' is not defined + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "", line 10, in + File "/notebooks/toy_models/toy_models/train/trainer.py", line 478, in restart_from_checkpoint + trainer.train() + File "/notebooks/toy_models/toy_models/train/trainer.py", line 392, in train + wandb.finish() + ^^^^^ +NameError: name 'wandb' is not defined diff --git a/wandb/run-20250819_063411-a3i61ot9/files/requirements.txt b/wandb/run-20250819_063411-a3i61ot9/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbed86cc72e05aec5c78850f9963d0e3471caff0 --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/files/requirements.txt @@ -0,0 +1,185 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +multidict==6.6.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +pyzmq==27.0.1 +jsonschema==4.25.0 +asttokens==3.0.0 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +stack-data==0.6.3 +aiosignal==1.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +setuptools==65.5.0 +mpmath==1.3.0 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +huggingface-hub==0.34.4 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +jupyter-events==0.12.0 +protobuf==6.31.1 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +hf-xet==1.1.7 +jedi==0.19.2 +transformer-lens==2.16.1 +pandas==2.3.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +dotenv==0.9.9 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +transformers==4.55.0 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +narwhals==2.0.1 +torchaudio==2.8.0 +prompt_toolkit==3.0.51 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +httpx==0.28.1 +requests==2.32.4 +fonttools==4.59.0 +argon2-cffi==25.1.0 +executing==2.2.0 +arrow==1.3.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +pip==23.2.1 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +json5==0.12.0 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +sentry-sdk==2.34.1 +Send2Trash==1.8.3 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +nvidia-cufft-cu12==11.3.3.83 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +python-json-logger==3.3.0 +filelock==3.18.0 +types-python-dateutil==2.9.0.20250809 +kiwisolver==1.4.8 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pydantic==2.11.7 +ipython==9.4.0 +charset-normalizer==3.4.2 +fancy-einsum==0.0.3 +datasets==4.0.0 +pillow==11.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +plotly==6.2.0 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +idna==3.10 +jupyterlab==4.4.5 +multiprocess==0.70.16 +dill==0.3.8 +fastjsonschema==2.21.1 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +uri-template==1.3.0 +sentencepiece==0.2.0 +markdown-it-py==3.0.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +traitlets==5.14.3 +jupyter_server==2.16.0 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20250819_063411-a3i61ot9/files/wandb-metadata.json b/wandb/run-20250819_063411-a3i61ot9/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..29f93a6dbec0bafd7ffde5b7c60d5fc524be1696 --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-08-19T06:34:11.371141Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "c3cfb768d471036c37848ff2c6d223b68ad88e82" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/models/gelu_2l_v6_200m_subset", + "host": "nww2895qc3", + "executable": "/notebooks/clean_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "165822988288" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-0907b282-5ffc-ff4c-cea5-6f6015ec84ee" + } + ], + "cudaVersion": "12.4", + "writerId": "tethbc24dtoek67ozxk7z1ivqk3i2h5r" +} \ No newline at end of file diff --git a/wandb/run-20250819_063411-a3i61ot9/files/wandb-summary.json b/wandb/run-20250819_063411-a3i61ot9/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8afb95f49483c85658a334253ad61c5e4b5851ef --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2},"_runtime":2} \ No newline at end of file diff --git a/wandb/run-20250819_063411-a3i61ot9/logs/debug-internal.log b/wandb/run-20250819_063411-a3i61ot9/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9d1bb5e77079ecc46447c9d18fa24300cfa03953 --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-08-19T06:34:11.589201238Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-19T06:34:13.771140526Z","level":"INFO","msg":"stream: created new stream","id":"a3i61ot9"} +{"time":"2025-08-19T06:34:13.771187437Z","level":"INFO","msg":"stream: started","id":"a3i61ot9"} +{"time":"2025-08-19T06:34:13.771237307Z","level":"INFO","msg":"writer: started","stream_id":"a3i61ot9"} +{"time":"2025-08-19T06:34:13.771282242Z","level":"INFO","msg":"sender: started","stream_id":"a3i61ot9"} +{"time":"2025-08-19T06:34:13.771250699Z","level":"INFO","msg":"handler: started","stream_id":"a3i61ot9"} +{"time":"2025-08-19T06:34:16.322207848Z","level":"INFO","msg":"stream: closing","id":"a3i61ot9"} +{"time":"2025-08-19T06:34:16.631359725Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-19T06:34:16.732412188Z","level":"INFO","msg":"handler: closed","stream_id":"a3i61ot9"} +{"time":"2025-08-19T06:34:16.732466199Z","level":"INFO","msg":"sender: closed","stream_id":"a3i61ot9"} +{"time":"2025-08-19T06:34:16.732489657Z","level":"INFO","msg":"stream: closed","id":"a3i61ot9"} diff --git a/wandb/run-20250819_063411-a3i61ot9/logs/debug.log b/wandb/run-20250819_063411-a3i61ot9/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0c2aef32efefb9d1696c882c19c1b39e5c0ea978 --- /dev/null +++ b/wandb/run-20250819_063411-a3i61ot9/logs/debug.log @@ -0,0 +1,22 @@ +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_setup.py:_flush():80] Configure stats pid to 1821 +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/settings +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063411-a3i61ot9/logs/debug.log +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063411-a3i61ot9/logs/debug-internal.log +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_init.py:init():830] calling init triggers +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_v5_random_above_15000', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/unprocessed-c4-code-test', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-08-19 06:34:11,376 INFO MainThread:1821 [wandb_init.py:init():871] starting backend +2025-08-19 06:34:11,583 INFO MainThread:1821 [wandb_init.py:init():874] sending inform_init request +2025-08-19 06:34:11,586 INFO MainThread:1821 [wandb_init.py:init():882] backend started and connected +2025-08-19 06:34:11,587 INFO MainThread:1821 [wandb_init.py:init():953] updated telemetry +2025-08-19 06:34:11,591 INFO MainThread:1821 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-19 06:34:14,075 INFO MainThread:1821 [wandb_init.py:init():1029] starting run threads in backend +2025-08-19 06:34:14,168 INFO MainThread:1821 [wandb_run.py:_console_start():2494] atexit reg +2025-08-19 06:34:14,168 INFO MainThread:1821 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-19 06:34:14,168 INFO MainThread:1821 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-19 06:34:14,168 INFO MainThread:1821 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-19 06:34:14,170 INFO MainThread:1821 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-19 06:34:16,321 INFO MsgRouterThr:1821 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. diff --git a/wandb/run-20250819_063411-a3i61ot9/run-a3i61ot9.wandb b/wandb/run-20250819_063411-a3i61ot9/run-a3i61ot9.wandb new file mode 100644 index 0000000000000000000000000000000000000000..3cda97d1a0b327435cf6f35aacf1f86daeba63eb Binary files /dev/null and b/wandb/run-20250819_063411-a3i61ot9/run-a3i61ot9.wandb differ diff --git a/wandb/run-20250819_063627-4rs47wj5/files/config.yaml b/wandb/run-20250819_063627-4rs47wj5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35ef5ae1f644c5dd6eaa2680decd05bdeaf4171a --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/files/config.yaml @@ -0,0 +1,129 @@ +_wandb: + value: + cli_version: 0.21.1 + e: + g93xw3liifw4vi0dlgwvsmir8hggg0ll: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "165823086592" + email: efarrel4@tcd.ie + executable: /notebooks/clean_env/bin/python + git: + commit: c3cfb768d471036c37848ff2c6d223b68ad88e82 + remote: git@github.com:jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-0907b282-5ffc-ff4c-cea5-6f6015ec84ee + host: nww2895qc3 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/models/gelu_2l_v6_200m_subset + startedAt: "2025-08-19T06:36:27.586603Z" + writerId: g93xw3liifw4vi0dlgwvsmir8hggg0ll + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 49 + - 51 + "2": + - 1 + - 49 + - 51 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.1 + "12": 0.21.1 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.08 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 48262 +dataset_name: + value: eoinf/unprocessed-c4-code-test +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: gelu_2l_v5_random_above_15000 +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: NeelNanda/gpt-neox-tokenizer-digits +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20250819_063627-4rs47wj5/files/output.log b/wandb/run-20250819_063627-4rs47wj5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a0cecb15dd851c94b2fe646f863c4a98619f3821 --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/files/output.log @@ -0,0 +1,255 @@ +{'step': 23, 'tokens_seen': 753664, 'config': {'model_name': 'gelu_2l_v5_random_above_15000', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/unprocessed-c4-code-test', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103}, 'train_loss_ewma': 10.803400143292523} +/notebooks/clean_env/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:192: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + warnings.warn( +Training on cuda +Model: 2L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.7893 | Learning Rate: 0.000055 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.5538 | Learning Rate: 0.000109 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 10.1897 | Learning Rate: 0.000164 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.7079 | Learning Rate: 0.000219 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 9.2007 | Learning Rate: 0.000273 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.7361 | Learning Rate: 0.000328 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 8.3185 | Learning Rate: 0.000383 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.9580 | Learning Rate: 0.000437 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.6498 | Learning Rate: 0.000492 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 7.3914 | Learning Rate: 0.000546 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 7.1638 | Learning Rate: 0.000601 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.9834 | Learning Rate: 0.000656 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.8242 | Learning Rate: 0.000710 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.6939 | Learning Rate: 0.000765 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 6.5748 | Learning Rate: 0.000820 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 6.4812 | Learning Rate: 0.000874 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 6.3957 | Learning Rate: 0.000929 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 6.3256 | Learning Rate: 0.000984 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 6.2611 | Learning Rate: 0.001038 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 6.2030 | Learning Rate: 0.001093 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 6.1607 | Learning Rate: 0.001148 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 6.1159 | Learning Rate: 0.001202 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 6.0738 | Learning Rate: 0.001257 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 6.0351 | Learning Rate: 0.001311 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.9985 | Learning Rate: 0.001366 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.9592 | Learning Rate: 0.001421 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.9295 | Learning Rate: 0.001475 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.9009 | Learning Rate: 0.001530 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.8706 | Learning Rate: 0.001585 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.8399 | Learning Rate: 0.001639 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 5.8155 | Learning Rate: 0.001694 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 5.7942 | Learning Rate: 0.001749 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 5.7710 | Learning Rate: 0.001803 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 5.7474 | Learning Rate: 0.001858 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 5.7200 | Learning Rate: 0.001913 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 5.6965 | Learning Rate: 0.001967 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 5.6739 | Learning Rate: 0.002000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 5.6473 | Learning Rate: 0.002000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 5.6190 | Learning Rate: 0.002000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 5.5776 | Learning Rate: 0.002000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 5.5507 | Learning Rate: 0.002000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 5.5252 | Learning Rate: 0.002000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 5.4978 | Learning Rate: 0.002000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 5.4670 | Learning Rate: 0.002000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 5.4407 | Learning Rate: 0.002000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 5.4079 | Learning Rate: 0.002000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 5.3833 | Learning Rate: 0.002000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 5.3590 | Learning Rate: 0.002000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 5.3329 | Learning Rate: 0.002000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 5.3127 | Learning Rate: 0.002000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 5.2952 | Learning Rate: 0.002000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 5.2768 | Learning Rate: 0.002000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 5.2575 | Learning Rate: 0.002000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 5.2426 | Learning Rate: 0.002000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 5.2232 | Learning Rate: 0.002000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 5.2044 | Learning Rate: 0.002000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 5.1828 | Learning Rate: 0.002000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 5.1641 | Learning Rate: 0.002000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 5.1553 | Learning Rate: 0.002000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 5.1335 | Learning Rate: 0.002000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 5.1151 | Learning Rate: 0.002000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 5.1005 | Learning Rate: 0.002000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 5.0874 | Learning Rate: 0.002000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 5.0868 | Learning Rate: 0.002000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 5.0720 | Learning Rate: 0.002000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 5.0520 | Learning Rate: 0.002000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 5.0460 | Learning Rate: 0.002000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 5.0319 | Learning Rate: 0.002000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 5.0239 | Learning Rate: 0.002000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 5.0108 | Learning Rate: 0.002000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 5.0031 | Learning Rate: 0.002000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 4.9867 | Learning Rate: 0.002000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 4.9677 | Learning Rate: 0.002000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 4.9527 | Learning Rate: 0.002000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 4.9357 | Learning Rate: 0.002000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 4.9224 | Learning Rate: 0.002000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 4.9004 | Learning Rate: 0.002000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 4.8904 | Learning Rate: 0.002000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 4.8817 | Learning Rate: 0.002000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 4.8615 | Learning Rate: 0.002000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 4.8460 | Learning Rate: 0.002000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 4.8321 | Learning Rate: 0.002000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 4.8082 | Learning Rate: 0.002000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 4.7838 | Learning Rate: 0.002000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 4.7643 | Learning Rate: 0.002000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 4.7503 | Learning Rate: 0.002000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 4.7440 | Learning Rate: 0.002000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 4.7262 | Learning Rate: 0.002000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 4.7151 | Learning Rate: 0.002000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 4.6957 | Learning Rate: 0.002000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 4.6741 | Learning Rate: 0.002000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 4.6735 | Learning Rate: 0.002000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 4.6579 | Learning Rate: 0.002000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 4.6505 | Learning Rate: 0.002000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 4.6363 | Learning Rate: 0.002000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 4.6271 | Learning Rate: 0.002000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 4.6138 | Learning Rate: 0.002000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 4.5936 | Learning Rate: 0.002000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 4.5847 | Learning Rate: 0.002000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 4.5702 | Learning Rate: 0.002000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 4.5551 | Learning Rate: 0.002000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 4.5524 | Learning Rate: 0.002000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 4.5309 | Learning Rate: 0.002000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 4.5191 | Learning Rate: 0.002000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 4.5046 | Learning Rate: 0.002000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 4.5032 | Learning Rate: 0.002000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 4.4917 | Learning Rate: 0.002000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 4.4892 | Learning Rate: 0.002000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 4.4846 | Learning Rate: 0.002000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 4.4693 | Learning Rate: 0.002000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 4.4604 | Learning Rate: 0.002000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 4.4558 | Learning Rate: 0.002000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 4.4487 | Learning Rate: 0.002000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 4.4420 | Learning Rate: 0.002000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 4.4371 | Learning Rate: 0.002000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 4.4301 | Learning Rate: 0.002000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 4.4343 | Learning Rate: 0.002000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 4.4151 | Learning Rate: 0.002000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 4.4048 | Learning Rate: 0.002000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 4.3974 | Learning Rate: 0.002000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 4.3866 | Learning Rate: 0.002000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 4.3766 | Learning Rate: 0.002000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 4.3688 | Learning Rate: 0.002000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 4.3610 | Learning Rate: 0.002000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 4.3639 | Learning Rate: 0.002000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 4.3701 | Learning Rate: 0.002000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 4.3645 | Learning Rate: 0.002000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 4.3448 | Learning Rate: 0.002000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 4.3408 | Learning Rate: 0.002000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 4.3298 | Learning Rate: 0.002000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 4.3199 | Learning Rate: 0.002000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 4.3155 | Learning Rate: 0.002000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 4.3100 | Learning Rate: 0.002000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 4.3041 | Learning Rate: 0.002000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 4.2989 | Learning Rate: 0.002000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 4.2905 | Learning Rate: 0.002000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 4.2890 | Learning Rate: 0.002000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 4.2854 | Learning Rate: 0.002000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 4.2768 | Learning Rate: 0.002000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 4.2696 | Learning Rate: 0.002000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 4.2680 | Learning Rate: 0.002000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 4.2653 | Learning Rate: 0.002000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 4.2654 | Learning Rate: 0.002000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 4.2497 | Learning Rate: 0.002000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 4.2486 | Learning Rate: 0.002000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 4.2350 | Learning Rate: 0.002000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 4.2372 | Learning Rate: 0.002000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 4.2410 | Learning Rate: 0.002000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 4.2390 | Learning Rate: 0.002000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 4.2275 | Learning Rate: 0.002000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 4.2362 | Learning Rate: 0.002000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 4.2352 | Learning Rate: 0.002000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 4.2308 | Learning Rate: 0.002000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 4.2283 | Learning Rate: 0.002000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 4.2207 | Learning Rate: 0.002000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 4.2179 | Learning Rate: 0.002000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 4.2196 | Learning Rate: 0.002000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 4.2207 | Learning Rate: 0.002000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 4.2106 | Learning Rate: 0.002000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 4.2206 | Learning Rate: 0.002000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 4.2135 | Learning Rate: 0.002000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 4.2209 | Learning Rate: 0.002000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 4.2160 | Learning Rate: 0.002000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 4.2162 | Learning Rate: 0.002000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 4.2080 | Learning Rate: 0.002000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 4.2075 | Learning Rate: 0.002000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 4.1962 | Learning Rate: 0.002000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 4.1926 | Learning Rate: 0.002000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 4.1909 | Learning Rate: 0.002000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 4.1942 | Learning Rate: 0.002000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 4.1819 | Learning Rate: 0.002000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 4.1738 | Learning Rate: 0.002000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 4.1775 | Learning Rate: 0.002000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 4.1705 | Learning Rate: 0.002000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 4.1654 | Learning Rate: 0.002000 | Progress: 0.71680 +Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 4.1596 | Learning Rate: 0.002000 | Progress: 0.72090 +Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 4.1496 | Learning Rate: 0.002000 | Progress: 0.72499 +Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 4.1540 | Learning Rate: 0.002000 | Progress: 0.72909 +Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 4.1497 | Learning Rate: 0.002000 | Progress: 0.73318 +Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 4.1376 | Learning Rate: 0.002000 | Progress: 0.73728 +Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 4.1430 | Learning Rate: 0.002000 | Progress: 0.74138 +Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 4.1415 | Learning Rate: 0.002000 | Progress: 0.74547 +Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 4.1423 | Learning Rate: 0.002000 | Progress: 0.74957 +Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 4.1341 | Learning Rate: 0.002000 | Progress: 0.75366 +Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 4.1455 | Learning Rate: 0.002000 | Progress: 0.75776 +Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 4.1427 | Learning Rate: 0.002000 | Progress: 0.76186 +Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 4.1510 | Learning Rate: 0.002000 | Progress: 0.76595 +Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 4.1515 | Learning Rate: 0.002000 | Progress: 0.77005 +Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 4.1430 | Learning Rate: 0.002000 | Progress: 0.77414 +Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 4.1413 | Learning Rate: 0.002000 | Progress: 0.77824 +Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 4.1422 | Learning Rate: 0.002000 | Progress: 0.78234 +Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 4.1442 | Learning Rate: 0.002000 | Progress: 0.78643 +Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 4.1285 | Learning Rate: 0.002000 | Progress: 0.79053 +Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 4.1189 | Learning Rate: 0.002000 | Progress: 0.79462 +Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 4.1334 | Learning Rate: 0.002000 | Progress: 0.79872 +Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 4.1376 | Learning Rate: 0.002000 | Progress: 0.80282 +Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 4.1554 | Learning Rate: 0.002000 | Progress: 0.80691 +Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 4.1438 | Learning Rate: 0.002000 | Progress: 0.81101 +Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 4.1383 | Learning Rate: 0.002000 | Progress: 0.81510 +Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 4.1341 | Learning Rate: 0.002000 | Progress: 0.81920 +Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 4.1229 | Learning Rate: 0.002000 | Progress: 0.82330 +Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 4.1256 | Learning Rate: 0.002000 | Progress: 0.82739 +Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 4.1188 | Learning Rate: 0.002000 | Progress: 0.83149 +Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 4.1242 | Learning Rate: 0.002000 | Progress: 0.83558 +Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 4.1235 | Learning Rate: 0.002000 | Progress: 0.83968 +Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 4.1072 | Learning Rate: 0.002000 | Progress: 0.84378 +Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 4.0977 | Learning Rate: 0.002000 | Progress: 0.84787 +Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 4.0892 | Learning Rate: 0.002000 | Progress: 0.85197 +Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 4.0865 | Learning Rate: 0.002000 | Progress: 0.85606 +Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 4.0848 | Learning Rate: 0.002000 | Progress: 0.86016 +Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 4.0903 | Learning Rate: 0.002000 | Progress: 0.86426 +Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 4.0842 | Learning Rate: 0.002000 | Progress: 0.86835 +Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 4.0868 | Learning Rate: 0.002000 | Progress: 0.87245 +Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 4.0879 | Learning Rate: 0.002000 | Progress: 0.87654 +Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 4.0954 | Learning Rate: 0.002000 | Progress: 0.88064 +Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 4.0946 | Learning Rate: 0.002000 | Progress: 0.88474 +Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 4.0949 | Learning Rate: 0.002000 | Progress: 0.88883 +Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 4.0980 | Learning Rate: 0.002000 | Progress: 0.89293 +Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 4.0932 | Learning Rate: 0.002000 | Progress: 0.89702 +Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 4.0898 | Learning Rate: 0.002000 | Progress: 0.90112 +Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 4.0852 | Learning Rate: 0.002000 | Progress: 0.90522 +Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 4.0863 | Learning Rate: 0.002000 | Progress: 0.90931 +Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 4.0869 | Learning Rate: 0.002000 | Progress: 0.91341 +Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 4.0830 | Learning Rate: 0.002000 | Progress: 0.91750 +Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 4.0868 | Learning Rate: 0.002000 | Progress: 0.92160 +Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 4.0705 | Learning Rate: 0.002000 | Progress: 0.92570 +Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 4.0747 | Learning Rate: 0.002000 | Progress: 0.92979 +Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 4.0757 | Learning Rate: 0.002000 | Progress: 0.93389 +Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 4.0729 | Learning Rate: 0.002000 | Progress: 0.93798 +Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 4.0765 | Learning Rate: 0.002000 | Progress: 0.94208 +Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 4.0685 | Learning Rate: 0.002000 | Progress: 0.94618 +Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 4.0607 | Learning Rate: 0.002000 | Progress: 0.95027 +Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 4.0591 | Learning Rate: 0.002000 | Progress: 0.95437 +Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 4.0590 | Learning Rate: 0.002000 | Progress: 0.95846 +Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 4.0580 | Learning Rate: 0.002000 | Progress: 0.96256 +Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 4.0684 | Learning Rate: 0.002000 | Progress: 0.96666 +Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 4.0680 | Learning Rate: 0.002000 | Progress: 0.97075 +Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 4.0731 | Learning Rate: 0.002000 | Progress: 0.97485 +Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 4.0670 | Learning Rate: 0.002000 | Progress: 0.97894 +Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 4.0548 | Learning Rate: 0.002000 | Progress: 0.98304 +Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 4.0411 | Learning Rate: 0.002000 | Progress: 0.98714 +Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 4.0300 | Learning Rate: 0.002000 | Progress: 0.99123 +Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 4.0327 | Learning Rate: 0.002000 | Progress: 0.99533 +Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 4.0275 | Learning Rate: 0.002000 | Progress: 0.99942 diff --git a/wandb/run-20250819_063627-4rs47wj5/files/requirements.txt b/wandb/run-20250819_063627-4rs47wj5/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbed86cc72e05aec5c78850f9963d0e3471caff0 --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/files/requirements.txt @@ -0,0 +1,185 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +multidict==6.6.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +pyzmq==27.0.1 +jsonschema==4.25.0 +asttokens==3.0.0 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +stack-data==0.6.3 +aiosignal==1.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +setuptools==65.5.0 +mpmath==1.3.0 +wadler_lindig==0.1.7 +typing_extensions==4.14.1 +nbformat==5.10.4 +huggingface-hub==0.34.4 +torchvision==0.23.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +jupyter-events==0.12.0 +protobuf==6.31.1 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +hf-xet==1.1.7 +jedi==0.19.2 +transformer-lens==2.16.1 +pandas==2.3.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +matplotlib==3.10.5 +jupyter==1.1.1 +dotenv==0.9.9 +accelerate==1.10.0 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +mistune==3.1.3 +cffi==1.17.1 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +transformers==4.55.0 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +sniffio==1.3.1 +notebook==7.4.5 +pycparser==2.22 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +toy_models==0.1.0 +narwhals==2.0.1 +torchaudio==2.8.0 +prompt_toolkit==3.0.51 +attrs==25.3.0 +regex==2025.7.34 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +httpx==0.28.1 +requests==2.32.4 +fonttools==4.59.0 +argon2-cffi==25.1.0 +executing==2.2.0 +arrow==1.3.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +tokenizers==0.21.4 +pip==23.2.1 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +json5==0.12.0 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +sentry-sdk==2.34.1 +Send2Trash==1.8.3 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +nvidia-cufft-cu12==11.3.3.83 +typing-inspection==0.4.1 +rpds-py==0.27.0 +nvidia-cufile-cu12==1.13.1.3 +mdurl==0.1.2 +websocket-client==1.8.0 +python-json-logger==3.3.0 +filelock==3.18.0 +types-python-dateutil==2.9.0.20250809 +kiwisolver==1.4.8 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +tabulate==0.9.0 +propcache==0.3.2 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pydantic==2.11.7 +ipython==9.4.0 +charset-normalizer==3.4.2 +fancy-einsum==0.0.3 +datasets==4.0.0 +pillow==11.3.0 +beautifulsoup4==4.13.4 +soupsieve==2.7 +aiohttp==3.12.15 +plotly==6.2.0 +wandb==0.21.1 +tzdata==2025.2 +jupyter-lsp==2.2.6 +triton==3.4.0 +idna==3.10 +jupyterlab==4.4.5 +multiprocess==0.70.16 +dill==0.3.8 +fastjsonschema==2.21.1 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +parso==0.8.4 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +uri-template==1.3.0 +sentencepiece==0.2.0 +markdown-it-py==3.0.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +platformdirs==4.3.8 +traitlets==5.14.3 +jupyter_server==2.16.0 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20250819_063627-4rs47wj5/files/wandb-metadata.json b/wandb/run-20250819_063627-4rs47wj5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1cf637aa3708384085ecb953a05c17cca0f43753 --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2025-08-19T06:36:27.586603Z", + "program": "", + "git": { + "remote": "git@github.com:jgroh3/toy_models.git", + "commit": "c3cfb768d471036c37848ff2c6d223b68ad88e82" + }, + "email": "efarrel4@tcd.ie", + "root": "/notebooks/toy_models/models/gelu_2l_v6_200m_subset", + "host": "nww2895qc3", + "executable": "/notebooks/clean_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "165823086592" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-0907b282-5ffc-ff4c-cea5-6f6015ec84ee" + } + ], + "cudaVersion": "12.4", + "writerId": "g93xw3liifw4vi0dlgwvsmir8hggg0ll" +} \ No newline at end of file diff --git a/wandb/run-20250819_063627-4rs47wj5/files/wandb-summary.json b/wandb/run-20250819_063627-4rs47wj5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8fd63fabff0643d786dbc51d90dbb434f076d8d8 --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/files/wandb-summary.json @@ -0,0 +1 @@ +{"_step":6100,"tokens_seen":199884800,"_runtime":4138.591636379,"_wandb":{"runtime":4138},"train_loss_ewma":4.0275392320695085,"train_loss":4.050985336303711,"_timestamp":1.755589524703568e+09,"learning_rate":0.002,"step":6100,"tokens_per_second":32768,"progress":0.999424} \ No newline at end of file diff --git a/wandb/run-20250819_063627-4rs47wj5/logs/debug-internal.log b/wandb/run-20250819_063627-4rs47wj5/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c6517d5b70bd65bd3289f3016a891cb424b94b59 --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-08-19T06:36:27.806787676Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"} +{"time":"2025-08-19T06:36:27.983837809Z","level":"INFO","msg":"stream: created new stream","id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.983881866Z","level":"INFO","msg":"stream: started","id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.983924068Z","level":"INFO","msg":"writer: started","stream_id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.98394089Z","level":"INFO","msg":"sender: started","stream_id":"4rs47wj5"} +{"time":"2025-08-19T06:36:27.984004621Z","level":"INFO","msg":"handler: started","stream_id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.029996056Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-08-19T07:45:27.276957993Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-08-19T07:45:27.280737625Z","level":"INFO","msg":"stream: closing","id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.280787969Z","level":"INFO","msg":"handler: closed","stream_id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.280828684Z","level":"INFO","msg":"sender: closed","stream_id":"4rs47wj5"} +{"time":"2025-08-19T07:45:27.280836763Z","level":"INFO","msg":"stream: closed","id":"4rs47wj5"} diff --git a/wandb/run-20250819_063627-4rs47wj5/logs/debug.log b/wandb/run-20250819_063627-4rs47wj5/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..917cfe47f9f7ab051dc866c88a5060928fec1e27 --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/logs/debug.log @@ -0,0 +1,28 @@ +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1 +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Configure stats pid to 1898 +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Loading settings from /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/settings +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063627-4rs47wj5/logs/debug.log +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /notebooks/toy_models/models/gelu_2l_v6_200m_subset/wandb/run-20250819_063627-4rs47wj5/logs/debug-internal.log +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:init():830] calling init triggers +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:init():835] wandb.init called with sweep_config: {} +config: {'model_name': 'gelu_2l_v5_random_above_15000', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 48262, 'dataset_name': 'eoinf/unprocessed-c4-code-test', 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits', 'seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.08, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2025-08-19 06:36:27,594 INFO MainThread:1898 [wandb_init.py:init():871] starting backend +2025-08-19 06:36:27,801 INFO MainThread:1898 [wandb_init.py:init():874] sending inform_init request +2025-08-19 06:36:27,804 INFO MainThread:1898 [wandb_init.py:init():882] backend started and connected +2025-08-19 06:36:27,806 INFO MainThread:1898 [wandb_init.py:init():953] updated telemetry +2025-08-19 06:36:27,809 INFO MainThread:1898 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout +2025-08-19 06:36:28,120 INFO MainThread:1898 [wandb_init.py:init():1029] starting run threads in backend +2025-08-19 06:36:28,215 INFO MainThread:1898 [wandb_run.py:_console_start():2494] atexit reg +2025-08-19 06:36:28,215 INFO MainThread:1898 [wandb_run.py:_redirect():2342] redirect: wrap_raw +2025-08-19 06:36:28,215 INFO MainThread:1898 [wandb_run.py:_redirect():2411] Wrapping output streams. +2025-08-19 06:36:28,216 INFO MainThread:1898 [wandb_run.py:_redirect():2434] Redirects installed. +2025-08-19 06:36:28,218 INFO MainThread:1898 [wandb_init.py:init():1075] run started, returning control to user process +2025-08-19 07:45:26,690 INFO MainThread:1898 [wandb_run.py:_finish():2260] finishing run eoin/toy-transformer-replication/4rs47wj5 +2025-08-19 07:45:26,697 INFO MainThread:1898 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0 +2025-08-19 07:45:26,697 INFO MainThread:1898 [wandb_run.py:_restore():2441] restore +2025-08-19 07:45:26,698 INFO MainThread:1898 [wandb_run.py:_restore():2447] restore done +2025-08-19 07:45:27,278 INFO MainThread:1898 [wandb_run.py:_footer_history_summary_info():3895] rendering history +2025-08-19 07:45:27,278 INFO MainThread:1898 [wandb_run.py:_footer_history_summary_info():3927] rendering summary +2025-08-19 07:45:27,279 INFO MainThread:1898 [wandb_run.py:_footer_sync_info():3856] logging synced files diff --git a/wandb/run-20250819_063627-4rs47wj5/run-4rs47wj5.wandb b/wandb/run-20250819_063627-4rs47wj5/run-4rs47wj5.wandb new file mode 100644 index 0000000000000000000000000000000000000000..280e3b7b2be69b76f733356a3177a622319504ee --- /dev/null +++ b/wandb/run-20250819_063627-4rs47wj5/run-4rs47wj5.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ebe2dcf101e9fad087642f8fe22be77ef39dd5b972e3672c5fe65c42702cf1 +size 4164475