diff --git a/.gitattributes b/.gitattributes index d1c7997026325cad4baa5f0b0a95f2ae81223f92..5dd9d48057b35ecb57aedf1dadf40fcfc2e28a57 100644 --- a/.gitattributes +++ b/.gitattributes @@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb filter=lfs diff=lfs merge=lfs -text wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20260319_063518-29lbcxak/run-29lbcxak.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json index ff44a084aafa754e5474df41ea15c374a23fa268..d0e99ce0c6f570a854b2b114aea6a89a7532fce7 100644 --- a/checkpoints/metadata_000000032768.json +++ b/checkpoints/metadata_000000032768.json @@ -1 +1 @@ -{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.47472858428955} \ No newline at end of file +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.462437629699707} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json index 8ce0422abf1ed1343400bcc795303c1f9c616194..20e027cc76f07f371a4058de1c753a16439a42e5 100644 --- a/checkpoints/metadata_000000327680.json +++ b/checkpoints/metadata_000000327680.json @@ -1 +1 @@ -{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.472703523224029} \ No newline at end of file +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.44581323632758} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json index 5f86bf14630ffed3dfd8509fb27841b4158561a2..44a19446c1116d33f61ddf595abc0fbb446109c1 100644 --- a/checkpoints/metadata_000000360448.json +++ b/checkpoints/metadata_000000360448.json @@ -1 +1 @@ -{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.471990140793302} \ No newline at end of file +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.441161565298534} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json index f890a96d7e29d632ed10db4aff1abeeb4da6adaf..28e3c55176e21b7151e870bf8c0d8b69dd85e8fa 100644 --- a/checkpoints/metadata_000000425984.json +++ b/checkpoints/metadata_000000425984.json @@ -1 +1 @@ -{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.469721053741852} \ No newline at end of file +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.429075783774888} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json index dcf880858f9f072a94a3ae49e79d7227ce2d792e..328e5625da713d190d238b090457d5f46289fda4 100644 --- a/checkpoints/metadata_000000458752.json +++ b/checkpoints/metadata_000000458752.json @@ -1 +1 @@ -{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.468519898807461} \ No newline at end of file +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.423034213406622} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json index f71dda81d7464c25452230a045723a1c40ad90b8..f1478ad8c12500a120427dc5441980a09c041a05 100644 --- a/checkpoints/metadata_000000491520.json +++ b/checkpoints/metadata_000000491520.json @@ -1 +1 @@ -{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.467125200460256} \ No newline at end of file +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.416432102321139} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json index 4469dc93cea90e4e284ab879ad82de065e161d52..c75e35d7689d2ca0be22cbcb0ace945ee405ddfc 100644 --- a/checkpoints/metadata_000000557056.json +++ b/checkpoints/metadata_000000557056.json @@ -1 +1 @@ -{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.463455310860898} \ No newline at end of file +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.401797919113} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json index 169f67b8b435103d417869bd206168a0e27f9d85..d279f8237609739ffeae3ed088ed1a18687cef4e 100644 --- a/checkpoints/metadata_000000622592.json +++ b/checkpoints/metadata_000000622592.json @@ -1 +1 @@ -{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.459099738819177} \ No newline at end of file +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.387243368149452} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json index 760d87d6aadad649e919701a79502eb04732aa4e..f3457162b0ed773047d16fa01b73f38e88a98baa 100644 --- a/checkpoints/metadata_000000688128.json +++ b/checkpoints/metadata_000000688128.json @@ -1 +1 @@ -{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.453407187818708} \ No newline at end of file +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.37150742909514} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json index ede06b34062135e04a840100102f55a312046402..6c07b74fc081100e070b3baf2383a199cbc98648 100644 --- a/checkpoints/metadata_000000753664.json +++ b/checkpoints/metadata_000000753664.json @@ -1 +1 @@ -{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.446099322563402} \ No newline at end of file +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.354241956204268} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json index 57f305d5c02a894fd0fb94b03bb1d3bf5fbecd06..4f6d9fd3e1367ca7d97c9adc2a2d9f3dc48bab23 100644 --- a/checkpoints/metadata_000000819200.json +++ b/checkpoints/metadata_000000819200.json @@ -1 +1 @@ -{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.438267091556018} \ No newline at end of file +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.337766838322462} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json index fa6312c3000fdd374487baa1d1c54c8ed70d0730..9c5dfdc0b21d0ae2119aaaa2a01556013a166e69 100644 --- a/checkpoints/metadata_000000917504.json +++ b/checkpoints/metadata_000000917504.json @@ -1 +1 @@ -{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.423273612378466} \ No newline at end of file +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.311147664996973} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json index 7df98d9217407dbaf03e1224946390148fe1a478..f387e3860729f218a742964f5aa838aff338e355 100644 --- a/checkpoints/metadata_000000983040.json +++ b/checkpoints/metadata_000000983040.json @@ -1 +1 @@ -{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.411245689106972} \ No newline at end of file +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.292225877179932} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json index 61cc72f65bfff95fa8815f469a1e2b6e9859595d..57d5c64836e446acfa2457130d2dd4a58e985699 100644 --- a/checkpoints/metadata_000001114112.json +++ b/checkpoints/metadata_000001114112.json @@ -1 +1 @@ -{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.383828797056317} \ No newline at end of file +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.252617267525892} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json index 5cda40b1bb7426ee1f31d21d62c169b32e645a2a..50dd91ce1b207d52dba858f8f25a3e0c61461175 100644 --- a/checkpoints/metadata_000001212416.json +++ b/checkpoints/metadata_000001212416.json @@ -1 +1 @@ -{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.361741983796891} \ No newline at end of file +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.222297308724979} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json index 17654f11fb2396b31d4a563f94dfca2094779a92..11021f316cf9b1227eec13a706b68ec4b60366e1 100644 --- a/checkpoints/metadata_000001343488.json +++ b/checkpoints/metadata_000001343488.json @@ -1 +1 @@ -{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.329621238518811} \ No newline at end of file +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.177976434778357} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json index 57747bb43b744b5cfaddeaa93681e48b47393f9d..6fbe50438c39fbfc959c500325f8c1404342da90 100644 --- a/checkpoints/metadata_000001474560.json +++ b/checkpoints/metadata_000001474560.json @@ -1 +1 @@ -{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.29327811546785} \ No newline at end of file +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.127112486045279} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json index f58443b698730310105307f06dc671c835d7c45e..0410a231bba6404e7269bbc75ac0437c400def4a 100644 --- a/checkpoints/metadata_000001605632.json +++ b/checkpoints/metadata_000001605632.json @@ -1 +1 @@ -{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.253395153119124} \ No newline at end of file +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.069664843206045} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json index fd5b42aa72cb3fbfa104058f7fe0976e0763565e..7e13831a019324010dff72cd450a528109d6fe6a 100644 --- a/checkpoints/metadata_000001769472.json +++ b/checkpoints/metadata_000001769472.json @@ -1 +1 @@ -{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.20219309142512} \ No newline at end of file +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.994979895512557} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json index 833734f418268eb90e2f46c9025c5872770665f3..85f1311fdd63b36cdd23404a555c87cadef8ff86 100644 --- a/checkpoints/metadata_000001966080.json +++ b/checkpoints/metadata_000001966080.json @@ -1 +1 @@ -{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.133052898451657} \ No newline at end of file +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.894455992269652} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json index fc305f8231544b3175065c791cdf3f3dbf671f23..3fa3fb93a6d7e43c21febb540817f72a82e6573c 100644 --- a/checkpoints/metadata_000002162688.json +++ b/checkpoints/metadata_000002162688.json @@ -1 +1 @@ -{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.057631285417521} \ No newline at end of file +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.787456261828652} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json index ec9114785ac35b157295da59260e093b87222ed4..ffbe35846f8e852de42f010f0dfae0d28c259cd0 100644 --- a/checkpoints/metadata_000002359296.json +++ b/checkpoints/metadata_000002359296.json @@ -1 +1 @@ -{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.97402740012332} \ No newline at end of file +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.671371978705555} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json index 9b5f8392cd19f09aa15165606806faf612079ea2..0fe7cb5720a24774ef20332c56721ac37ab2be2c 100644 --- a/checkpoints/metadata_000002621440.json +++ b/checkpoints/metadata_000002621440.json @@ -1 +1 @@ -{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.856754308431446} \ No newline at end of file +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.511887299894651} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json index f3d388f25073232f44e21a2d44bd8672f3342f43..a04181a875ef1f9547766fa23696bb8303abfb5e 100644 --- a/checkpoints/metadata_000002883584.json +++ b/checkpoints/metadata_000002883584.json @@ -1 +1 @@ -{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.726206998293742} \ No newline at end of file +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.339867149283457} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json index 3088c0f9c63292080302b9f7a90b4b6705945a43..0416f651639941889b0b3f890ce2969de7b3df7c 100644 --- a/checkpoints/metadata_000003178496.json +++ b/checkpoints/metadata_000003178496.json @@ -1 +1 @@ -{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.577359730084346} \ No newline at end of file +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.149718302323809} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json index 3bdf20a37639fae37b731f501c37d02494aeaa6d..c011cb594f1e17cacf62b287c3ecce4a1a0fb3ec 100644 --- a/checkpoints/metadata_000003473408.json +++ b/checkpoints/metadata_000003473408.json @@ -1 +1 @@ -{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.417083930648644} \ No newline at end of file +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.954948132966997} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json index 9eaf8a8e7a7d69389e7d5d824fb2e102d45413c2..9c9d9f885dda98ab4af485828c1281546f6053a0 100644 --- a/checkpoints/metadata_000003833856.json +++ b/checkpoints/metadata_000003833856.json @@ -1 +1 @@ -{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.216006205212386} \ No newline at end of file +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.725915986013053} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json index 9ef9c58749ecef4f8489abd78500ce31e9de9a9f..57df490695fcd3033722792a4d504ea6fc3bc84a 100644 --- a/checkpoints/metadata_000004227072.json +++ b/checkpoints/metadata_000004227072.json @@ -1 +1 @@ -{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.982881949139617} \ No newline at end of file +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.479047281676335} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json index f90891326e5bc88212c8e48a74d5753f34d840df..f117a66cc2b9c04b421a30280056659e1210fe03 100644 --- a/checkpoints/metadata_000004653056.json +++ b/checkpoints/metadata_000004653056.json @@ -1 +1 @@ -{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.732197937516098} \ No newline at end of file +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.226625281843564} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json index b30fe773f60284a1ff2d4b1493bb3209f674e24a..4ccabdfbc0bcf547a0b85fb5163cac9b76c6e6a7 100644 --- a/checkpoints/metadata_000005111808.json +++ b/checkpoints/metadata_000005111808.json @@ -1 +1 @@ -{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.482621967493646} \ No newline at end of file +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.982290411331379} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json index 1f631700de350606383e6a10ad869dfc1cf83df7..9e719c1e3966fa2470d517941b15f9ac9cec4a1c 100644 --- a/checkpoints/metadata_000005603328.json +++ b/checkpoints/metadata_000005603328.json @@ -1 +1 @@ -{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.22768931743315} \ No newline at end of file +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.735026211560928} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json index 1115309a4798c9088e10bb0870da72a1bacb08e5..dfb51e4314f076d9937d33253069939ad59bd49e 100644 --- a/checkpoints/metadata_000006193152.json +++ b/checkpoints/metadata_000006193152.json @@ -1 +1 @@ -{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.945035313729822} \ No newline at end of file +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.4613181284979175} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json index b9aa0541e818adabdeb9b428504248b376a9a34f..4df54fd265b791c9b2a4e18a772b7f1c17a0c625 100644 --- a/checkpoints/metadata_000006782976.json +++ b/checkpoints/metadata_000006782976.json @@ -1 +1 @@ -{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.694594853436912} \ No newline at end of file +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.223525674157066} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json index 6a69b982ffb9ae6bf611ca89c34439195f84f0de..1ce7276f9e14acbe631144abe1f5026a13e5ca08 100644 --- a/checkpoints/metadata_000007471104.json +++ b/checkpoints/metadata_000007471104.json @@ -1 +1 @@ -{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.419854967755447} \ No newline at end of file +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.968900871965506} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json index 25de7fad0c012c8e854f51f055664a154b89b218..51bc68555c44296ecff283677dcf260a9966be38 100644 --- a/checkpoints/metadata_000008224768.json +++ b/checkpoints/metadata_000008224768.json @@ -1 +1 @@ -{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.159385434456035} \ No newline at end of file +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.737094952796088} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json index 6ca5b76834dfc2ba5254b212d17b641471af9d28..8b15afa7269f57227453e419d1946bc7889ac5f3 100644 --- a/checkpoints/metadata_000009043968.json +++ b/checkpoints/metadata_000009043968.json @@ -1 +1 @@ -{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.904178105557777} \ No newline at end of file +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.516684829899095} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json index 8cd6a284051989e63d303001e841081e0ad82cc7..2fc18b0894a373ee5ba0448128cb92ea28408fe4 100644 --- a/checkpoints/metadata_000009961472.json +++ b/checkpoints/metadata_000009961472.json @@ -1 +1 @@ -{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.653267985439542} \ No newline at end of file +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.302264189489683} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json index 5cb321ed1a85cbc50fdd3d0e5d22ef606c076820..73382cbab8b993e5e827acc6a06adca240397e80 100644 --- a/checkpoints/metadata_000010944512.json +++ b/checkpoints/metadata_000010944512.json @@ -1 +1 @@ -{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.430630889399794} \ No newline at end of file +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.113615961017287} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json index 2d9db15c0aacbfb2796c469dc615bd71dd158d8e..23aeb35d1c40a7f771a558f83ab7815010913c03 100644 --- a/checkpoints/metadata_000012058624.json +++ b/checkpoints/metadata_000012058624.json @@ -1 +1 @@ -{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.216337437233496} \ No newline at end of file +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.930052665016961} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json index 2a294d851e292b4a5633f1eacd4ae79d6961b240..c7d72a9c4773ed9d10f4456e22bf8c8ec54a61b6 100644 --- a/checkpoints/metadata_000013271040.json +++ b/checkpoints/metadata_000013271040.json @@ -1 +1 @@ -{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.02448773214943} \ No newline at end of file +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.770665921846031} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json index 26ac4ebf8b8a5e7faf04cfa24c95588e902d0349..8a7867c3cc7d1a7ad5a6f996ee85bcf08850af31 100644 --- a/checkpoints/metadata_000014581760.json +++ b/checkpoints/metadata_000014581760.json @@ -1 +1 @@ -{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.847221819414238} \ No newline at end of file +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.6236167282136575} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json index 590173c59916f4cd7b11e47b8cd24bc88dbfc63a..fff05a9a0f5db92153070791c16b6c4055ba3e05 100644 --- a/checkpoints/metadata_000016056320.json +++ b/checkpoints/metadata_000016056320.json @@ -1 +1 @@ -{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.685284731362977} \ No newline at end of file +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.488310843039606} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json index ffe89385a5087cbf9731d347ded5b32427e134d8..0a273985d4c41512db864ec5f11a7e5c0869d76a 100644 --- a/checkpoints/metadata_000016384000.json +++ b/checkpoints/metadata_000016384000.json @@ -1 +1 @@ -{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.659568437263502} \ No newline at end of file +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.468160554157632} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json index e837633165c3119c6c67db088cf8ba2570624478..705a11d50d72f0f369ecc50d1be60c4f3207630d 100644 --- a/checkpoints/metadata_000017661952.json +++ b/checkpoints/metadata_000017661952.json @@ -1 +1 @@ -{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.53900757717826} \ No newline at end of file +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.367383792049406} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json index 02977bc9c72b837edcb76061d0dffca0d3485675..c53b84f83540d25bb279302fe7e55ed1a5255942 100644 --- a/checkpoints/metadata_000019431424.json +++ b/checkpoints/metadata_000019431424.json @@ -1 +1 @@ -{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.394161902688204} \ No newline at end of file +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.244713683265055} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json index 0c6c1fd1641f10c33c9434363114be5f46d675c1..9c8cce0080df86ceedd63bdf5bae60b2e6e761b7 100644 --- a/checkpoints/metadata_000021364736.json +++ b/checkpoints/metadata_000021364736.json @@ -1 +1 @@ -{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.283358080025116} \ No newline at end of file +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.1562503005627045} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json index 5981d85d8e173e9686e2a59840941621345cefd2..25b67647fdf92e0a67d8fc8a3089e13c3ba1d384 100644 --- a/checkpoints/metadata_000023494656.json +++ b/checkpoints/metadata_000023494656.json @@ -1 +1 @@ -{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.175760950696042} \ No newline at end of file +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.064713024898478} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json index c0498817e058407d93aa996aa938260f173b6107..d587c512c2eb34701830f07ff2c98b7b20d2d1d3 100644 --- a/checkpoints/metadata_000025853952.json +++ b/checkpoints/metadata_000025853952.json @@ -1 +1 @@ -{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.0629671796907045} \ No newline at end of file +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.9753058592233055} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json index eaea19c57783d15384c951ebdf687d41fd92a60e..d1acb34432bb13d5e58883e60ff0d79ea1e95926 100644 --- a/checkpoints/metadata_000028442624.json +++ b/checkpoints/metadata_000028442624.json @@ -1 +1 @@ -{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.963105635300831} \ No newline at end of file +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.881099863088409} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json index a18268521149516c134baa8844e4e75bfd6db3b3..d88658860d777eed248a36bdf2c36d5af353b5c7 100644 --- a/checkpoints/metadata_000031293440.json +++ b/checkpoints/metadata_000031293440.json @@ -1 +1 @@ -{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.865391816028884} \ No newline at end of file +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.794771261136894} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json index a4a31d4225bbea49c2eacd5409dbd9c51168d59a..f6af140337b60e6ed11b0468d888e64445a3f7f7 100644 --- a/checkpoints/metadata_000032768000.json +++ b/checkpoints/metadata_000032768000.json @@ -1 +1 @@ -{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.812689049158225} \ No newline at end of file +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.743142605580698} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json index a68b397fdef42ad1922edff35ba07d71336569f7..a8e3799243490be051c79701a4f9acfb66e1a31a 100644 --- a/checkpoints/metadata_000034439168.json +++ b/checkpoints/metadata_000034439168.json @@ -1 +1 @@ -{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.753625057604216} \ No newline at end of file +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.687631394674228} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json index 44634a3befed1723b762b6b24cc20b50185aadba..1c5cc412b5f4989330e74177912095f348fa7dca 100644 --- a/checkpoints/metadata_000037879808.json +++ b/checkpoints/metadata_000037879808.json @@ -1 +1 @@ -{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.641372455953141} \ No newline at end of file +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.584697024859014} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json index 6ebf5d90537b189dbddec1ae5aff4c110c49c2ce..a32ad5d552756103c7607aaeb761686e767b19f6 100644 --- a/checkpoints/metadata_000041648128.json +++ b/checkpoints/metadata_000041648128.json @@ -1 +1 @@ -{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.529437655293409} \ No newline at end of file +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.479492940101269} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json index 561d0ed658b95f638c9dfbefecba10260848f7c4..8c4e52a49c82fe868b9df64d434ea021e89d11ab 100644 --- a/checkpoints/metadata_000045842432.json +++ b/checkpoints/metadata_000045842432.json @@ -1 +1 @@ -{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.429992510229746} \ No newline at end of file +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.359371060873147} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json index f58e0f35ce0d556166c16df04de837d0dc59b62e..cd0558d252830ccd2eebc692f23f807f6b94b1ce 100644 --- a/checkpoints/metadata_000049152000.json +++ b/checkpoints/metadata_000049152000.json @@ -1 +1 @@ -{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3700635583946745} \ No newline at end of file +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.300045652251939} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json index 915e41755cb397f5b943211c4314449951073a04..47a435ae122d752ec16089b4ab3efe1f3218b2af 100644 --- a/checkpoints/metadata_000050397184.json +++ b/checkpoints/metadata_000050397184.json @@ -1 +1 @@ -{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.358773860729051} \ No newline at end of file +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.284214765378866} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json index d0b29dae8e6bf176faf72fc0ddef8e07c4a12e54..8540b414d45531f7f9611373b383f8a0da2d40c9 100644 --- a/checkpoints/metadata_000055443456.json +++ b/checkpoints/metadata_000055443456.json @@ -1 +1 @@ -{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.25258944505606} \ No newline at end of file +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.135964355594053} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json index cf943a25c069d7264540ce4f729d780df1725bc5..45cda348df93d678d78192798e53c542accc4377 100644 --- a/checkpoints/metadata_000061014016.json +++ b/checkpoints/metadata_000061014016.json @@ -1 +1 @@ -{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.183231115945514} \ No newline at end of file +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.9819966090161265} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json index 095034cb30f07d2c539d55632afbe2bbc445e026..03f9872fed2e9157dd116e0dd8a80ff6f90f3c6a 100644 --- a/checkpoints/metadata_000065536000.json +++ b/checkpoints/metadata_000065536000.json @@ -1 +1 @@ -{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.158511824063886} \ No newline at end of file +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.892819235470663} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json index 9e43bbafbeefa64ef788c0aeefa00c58293ecf34..174a8a4c34c5aa7bb990c312446467d2ab662437 100644 --- a/checkpoints/metadata_000067108864.json +++ b/checkpoints/metadata_000067108864.json @@ -1 +1 @@ -{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.13458860322571} \ No newline at end of file +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.852328401139968} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json index 0bd394ca6a17cbbfc2527b46aef66c34741cde2f..9c710b3aa9fc103bf412d359ba2f9ed6a3bbce9c 100644 --- a/checkpoints/metadata_000073826304.json +++ b/checkpoints/metadata_000073826304.json @@ -1 +1 @@ -{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.024738042947748} \ No newline at end of file +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.70811647117009} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json index b698ae94e97926f7a8d1723c0dc6fa28f9d0d06d..3b64cc8dee28aca55271df982d9d5ea90b3c1e4d 100644 --- a/checkpoints/metadata_000081199104.json +++ b/checkpoints/metadata_000081199104.json @@ -1 +1 @@ -{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.927671735758744} \ No newline at end of file +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.616063727550929} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json index 725394c0ba78bb90207199c17cc59388220c1451..bce8604694de5b2f642941bcbcce0e4838c33b1c 100644 --- a/checkpoints/metadata_000081920000.json +++ b/checkpoints/metadata_000081920000.json @@ -1 +1 @@ -{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.9130802265358606} \ No newline at end of file +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6025823046975267} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json index 1bcdd508cbe265fd9f824ee54812d9584252137c..c31353c6d44f39edfe38802d6565a7dbd6ed4822 100644 --- a/checkpoints/metadata_000089325568.json +++ b/checkpoints/metadata_000089325568.json @@ -1 +1 @@ -{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8304603164886952} \ No newline at end of file +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5297980905706705} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json index 9cf92798184b3a206d8942ee6cb0f4bd235e678b..8efc947e08df077ea64c6618c46f57fc06c1e7fe 100644 --- a/checkpoints/metadata_000098271232.json +++ b/checkpoints/metadata_000098271232.json @@ -1 +1 @@ -{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7642715290485924} \ No newline at end of file +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4688791846861817} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json index e21cb3f0b0a4e4144214d84c110a74c9c298cd7a..52f8d0549b89044a9646cf6572937fafdc0797e2 100644 --- a/checkpoints/metadata_000098304000.json +++ b/checkpoints/metadata_000098304000.json @@ -1 +1 @@ -{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7636144447471813} \ No newline at end of file +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.468022187101252} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json index 57c9f28533becf32b762d640094e8681ea43dae5..71baf7c60f0aeebf7499a5b32f43df43aff6faff 100644 --- a/checkpoints/metadata_000108068864.json +++ b/checkpoints/metadata_000108068864.json @@ -1 +1 @@ -{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.694283825208702} \ No newline at end of file +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.402881423257483} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json index 9b478b9812509599b5e76f70f061a8571e8c476c..3fdab2eb102f9b8d6429a3046f93f4e4addb41ee 100644 --- a/checkpoints/metadata_000114688000.json +++ b/checkpoints/metadata_000114688000.json @@ -1 +1 @@ -{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.65164887493004} \ No newline at end of file +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3578212791343707} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json index 15832bff87e29c66cbf794c5943c408098a1ce57..4bb20bd7dcb247af15e3be802decd0ec9d567e95 100644 --- a/checkpoints/metadata_000118882304.json +++ b/checkpoints/metadata_000118882304.json @@ -1 +1 @@ -{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6573982438370076} \ No newline at end of file +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.363343283490335} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json index 0fe5c214dd2fc92cc0df2fd5e2f098bb0370f8c4..80a21213c61b3e6ee6a2123bfa342cdfffeb47f1 100644 --- a/checkpoints/metadata_000130777088.json +++ b/checkpoints/metadata_000130777088.json @@ -1 +1 @@ -{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6080018543413277} \ No newline at end of file +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.315968705398669} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json index a13961bcfcdbf04681adce56586f3d6b7e1ab6d4..984c03867206a22b51058816392d69333c11e40e 100644 --- a/checkpoints/metadata_000131072000.json +++ b/checkpoints/metadata_000131072000.json @@ -1 +1 @@ -{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6143836509798017} \ No newline at end of file +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.3232916026020605} \ No newline at end of file diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json index 289dd038fe4246d0751ffac83e6f51d71ed9b446..5b366520139f62f32557566f19839bcea1067bda 100644 --- a/checkpoints/metadata_000143851520.json +++ b/checkpoints/metadata_000143851520.json @@ -1 +1 @@ -{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5763677759688544} \ No newline at end of file +{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.2712891613103365} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json index f5a0e93b92798fd436c3bde2b3792a23911de7c4..919f6a3d70056e56609f9918ae5debfbf8729e7e 100644 --- a/checkpoints/metadata_000147456000.json +++ b/checkpoints/metadata_000147456000.json @@ -1 +1 @@ -{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5651027425659474} \ No newline at end of file +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.2607066672599236} \ No newline at end of file diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json index a7e806f1fbbf1d467b33aee6e9b1a463a4a84718..920abcd8c3277ee5a50bfa8d98afb6fa40252227 100644 --- a/checkpoints/metadata_000158269440.json +++ b/checkpoints/metadata_000158269440.json @@ -1 +1 @@ -{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5430792461603593} \ No newline at end of file +{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.2387536035281825} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json index 45c60bd6ceb16eb6e7db403ca3d28a38afc42bea..7c93c82aaec4b9e1a3ac7ef6d4849eb5b689c16c 100644 --- a/checkpoints/metadata_000163840000.json +++ b/checkpoints/metadata_000163840000.json @@ -1 +1 @@ -{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.52576845325292} \ No newline at end of file +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.219682038509445} \ No newline at end of file diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json index 5e8ce295d26be0f92744862fa8cf6eb4e4199404..e75608397d95e7ce870a175acbcf45d64df70d7f 100644 --- a/checkpoints/metadata_000174096384.json +++ b/checkpoints/metadata_000174096384.json @@ -1 +1 @@ -{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.518635521589029} \ No newline at end of file +{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.2087791353150585} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json index b7ef04172a16775033215df9cf3f488490da10a0..6ae9979b89dcaeb66bfea8204564188e5859f022 100644 --- a/checkpoints/metadata_000180224000.json +++ b/checkpoints/metadata_000180224000.json @@ -1 +1 @@ -{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.496639097510411} \ No newline at end of file +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.186685546578624} \ No newline at end of file diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json index 15c3593964ade8598a247720fdc328e407d571d4..48e0ea61ef492a89ba4df49cbf9564391262e98e 100644 --- a/checkpoints/metadata_000191496192.json +++ b/checkpoints/metadata_000191496192.json @@ -1 +1 @@ -{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4751529632338487} \ No newline at end of file +{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1639664520238506} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json index f5e73fb9aa1782d04a533279f7a1ca0a05cd5f14..be9927f83f64923f3e9863bedac7bdc20c9fc049 100644 --- a/checkpoints/metadata_000196608000.json +++ b/checkpoints/metadata_000196608000.json @@ -1 +1 @@ -{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4667979776226288} \ No newline at end of file +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1691659182780376} \ No newline at end of file diff --git a/checkpoints/metadata_000196706304.json b/checkpoints/metadata_000196706304.json index 3616d69d0c9f060aec57046856a761ea04ea34f8..091ba727afb589129184a806c1cab1fc05faeb3d 100644 --- a/checkpoints/metadata_000196706304.json +++ b/checkpoints/metadata_000196706304.json @@ -1 +1 @@ -{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.467987821875873} \ No newline at end of file +{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.170731699051464} \ No newline at end of file diff --git a/checkpoints/metadata_000197361664.json b/checkpoints/metadata_000197361664.json index 5a5aa2479a487facafbdfd02cd4bb43a293104f0..4dd976c60671fb365de6d65d072afa3398c65704 100644 --- a/checkpoints/metadata_000197361664.json +++ b/checkpoints/metadata_000197361664.json @@ -1 +1 @@ -{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.469824998532631} \ No newline at end of file +{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.171016553932927} \ No newline at end of file diff --git a/checkpoints/metadata_000198017024.json b/checkpoints/metadata_000198017024.json index 183007c52293fca3689a264336443d8903fa2f7b..38cdb48c0dd05bf9a39895dc6e72069290375bd7 100644 --- a/checkpoints/metadata_000198017024.json +++ b/checkpoints/metadata_000198017024.json @@ -1 +1 @@ -{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4694174847315966} \ No newline at end of file +{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.169122244180558} \ No newline at end of file diff --git a/checkpoints/metadata_000198672384.json b/checkpoints/metadata_000198672384.json index 733d6428c7d8ae7e6af8df55ca487d669e9fb1a3..3c8e8b4c2235fb166c9c118d7b1c647de04111e9 100644 --- a/checkpoints/metadata_000198672384.json +++ b/checkpoints/metadata_000198672384.json @@ -1 +1 @@ -{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.459399634434712} \ No newline at end of file +{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.157218086042496} \ No newline at end of file diff --git a/checkpoints/metadata_000199327744.json b/checkpoints/metadata_000199327744.json index 790fcda4f1140aacdc832a6e198db76a2e7e8762..36a32e7174c63e3f9e67c6ea5a6b75a14a018aa4 100644 --- a/checkpoints/metadata_000199327744.json +++ b/checkpoints/metadata_000199327744.json @@ -1 +1 @@ -{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4660093119605064} \ No newline at end of file +{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.162601982560343} \ No newline at end of file diff --git a/checkpoints/metadata_000199950336.json b/checkpoints/metadata_000199950336.json index c16e16df2be11e6f5e6f06b5472547126445306f..389eb4b3f672e2d2c62d2b59ee77e494577e09db 100644 --- a/checkpoints/metadata_000199950336.json +++ b/checkpoints/metadata_000199950336.json @@ -1 +1 @@ -{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4578413544944855} \ No newline at end of file +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1541889009453166} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt index 376c326ec71dbc6b804601b07905e6e80d816acf..7f6221a777783bc8bde50d7335a40476671f01ca 100644 --- a/checkpoints/model_weights_000000032768.pt +++ b/checkpoints/model_weights_000000032768.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42433f5564c5d2d4c25460186789bd97ad364c1c5c2eef47d74cd86afc616302 -size 151183829 +oid sha256:89ab8b7a75e0611edbb0998131098c13642cfcb21a4b84411f8557a8dfb3f4d3 +size 234226143 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt index 19063561ddd9d93960cc79bb8e2b1a501e1c67b1..f6bd4628bb2bc97fcfd18ccd18b71bcba01da25f 100644 --- a/checkpoints/model_weights_000000327680.pt +++ b/checkpoints/model_weights_000000327680.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db794c3e5d8df814d89a81233ad2ba07778990fd19d2a8c96a920e020b1dd3ab -size 151183829 +oid sha256:e919b6c40b2489584d343c4750166faa04ad1deaa6de0ddd9ed60268d9e1e6ec +size 234226143 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt index bf86fb0902fd26ebc2505e8ceba3e55d6ac82862..902b4bc3b41770af8a4a94670079dc631e46ef4b 100644 --- a/checkpoints/model_weights_000000360448.pt +++ b/checkpoints/model_weights_000000360448.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dd70be592b6eabaa2c7bda67f4cbbfe21775502ca401f451ad5fb7069aeb743 -size 151183829 +oid sha256:542473aff0cb1db66f02bc9bbe4dc366161165ccb834fa51c6d6752ddeb1438f +size 234226143 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt index 15d1f2988371ca4f3924557770796dc95450e3aa..d3d6948415f6beb6751ceaa41f128b86cc602477 100644 --- a/checkpoints/model_weights_000000425984.pt +++ b/checkpoints/model_weights_000000425984.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a7777a3ddb57b350f3177dd63c89ed0f89ecfe7e728a13d0df97b8fbd084327 -size 151183829 +oid sha256:dc8ce9c839e1cfc5ae837e14f9f44230bda4df49491ae6664dd8691f30f9cec1 +size 234226143 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt index dcd79c52f4174c035605924c4153d1de7c789414..5134c42f22ce4bfaa26b5baeae27b8e9b9ec15be 100644 --- a/checkpoints/model_weights_000000458752.pt +++ b/checkpoints/model_weights_000000458752.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95b5f4e7ffef83a9ac3ebf622af21a2e629b0bf80bcabbc4871f976a43b10ec8 -size 151183829 +oid sha256:4ed191da20dfbb2e1ab5ec35861168f2b323a668d5e12e87fba7481cd1c8a6a3 +size 234226143 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt index 8061814c1c3db953c94b6d3ed01861559f726498..03b775da958ab4f5f58213445fa71e86dbe41d6c 100644 --- a/checkpoints/model_weights_000000491520.pt +++ b/checkpoints/model_weights_000000491520.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6f8cbe6dcba81cc86f86208b5ec74a3763697d6f1ddd46f117c9e6e0d87e128 -size 151183829 +oid sha256:5b1ec2fc29759ec5487e365bef4114629736e4ff29a8be4e8b307eecdd9783d7 +size 234226143 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt index f89dfd185635bd77e3a40838d1ac0c158442ee31..0d77ae0210a65259418e216445b195e14a0e2cb9 100644 --- a/checkpoints/model_weights_000000557056.pt +++ b/checkpoints/model_weights_000000557056.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0463774351018f03ae653c12f7463646e654610b88ffd26bc1d128fc075a92e9 -size 151183829 +oid sha256:4dc3b57140362171dd1aa9a0019cae21ae325876a30580ccfa61d73895273ba1 +size 234226143 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt index 2cc46c8ebcbbad73bd7df01d91cde23707731bd5..2c4dcd13b67e2d9bfce6193bc52053049c6f6f5c 100644 --- a/checkpoints/model_weights_000000622592.pt +++ b/checkpoints/model_weights_000000622592.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df016922d93fe069ae76dcf58f3134d1100f759179d50c7badbfdcd3673a1e37 -size 151183829 +oid sha256:d87ec2e8370d2d8b83941c4a2bd0be914b2d3eb75a7b154605958eda3540e1a8 +size 234226143 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt index b02e2242a4d44cb17c70f83fcdc5368b3b58195a..fa1ffaba69669307124a99a6d82895a1fc196c0d 100644 --- a/checkpoints/model_weights_000000688128.pt +++ b/checkpoints/model_weights_000000688128.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d86f0e9be18093e885f7634ad6c8228959c22d5e7a2aa253fb20eb97842c7e2e -size 151183829 +oid sha256:96e2432cbcc9ca950d50b6aef0a6f10b7a3665feea562e207b951bda43dcf06f +size 234226143 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt index b1059a2581bf8566832556116ea34969bde39c35..1f7e79d68d09ffe329f1ee896630e723569880c0 100644 --- a/checkpoints/model_weights_000000753664.pt +++ b/checkpoints/model_weights_000000753664.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:638b19a3867bff80c72488e884cd5f37f8d71bf8d680ca983cf561fce8da6de2 -size 151183829 +oid sha256:ddc97a8e9a1935a85072e4c3311362b6174d98c22ba0d0bc8f5ae94b3e18b419 +size 234226143 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt index 3aec4ef2f9012c88d7e27db751cb174306cbd64b..b90d53654b4c1a75848b339edaf88eb3d18d0a5e 100644 --- a/checkpoints/model_weights_000000819200.pt +++ b/checkpoints/model_weights_000000819200.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd104b519fd0f20ed8987c167ea67f9988d7d7956a7e45e20146e113e87b51f0 -size 151183829 +oid sha256:da3f925a6a633ace6fd69547852513754b71ddc64a6aaecfd8b9caf99975b9e4 +size 234226143 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt index 5d787972ea54fde61897364380d142d36a860c4f..ccfd6d4b61e46b6231541a93822d56bb6eb7735d 100644 --- a/checkpoints/model_weights_000000917504.pt +++ b/checkpoints/model_weights_000000917504.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f599c573d7305d1ba1c7539112de7bb3822588a5824e37168887cbde2f637b8 -size 151183829 +oid sha256:07a8f2fbe1e8fe3ff169c64d240a8d523bca992961447052b98ebf6df9c72ba0 +size 234226143 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt index 375c0c8b6277fc149a9ca346d5145039041ee643..80d79333576ccc2bc86a2bbad3c288736acdfcc0 100644 --- a/checkpoints/model_weights_000000983040.pt +++ b/checkpoints/model_weights_000000983040.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f7a67cef948740f7cac26379149cf0986f704ac0a0c0e6c80a9d17b3ed593c6 -size 151183829 +oid sha256:2fcc3047289920770967eef0c6c14741f74165f6da09765b641c50297f6e0814 +size 234226143 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt index b2e969fa9f3d59331f5d96f3ebb7c0bd71bf7b34..0686facd020770139cd2b38460493b97ec182962 100644 --- a/checkpoints/model_weights_000001114112.pt +++ b/checkpoints/model_weights_000001114112.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0473da235bc7205a424439e0b2a398ef4006194e8fa571874ad81daf6650dbd -size 151183829 +oid sha256:e83b6a781bb4390bc10296470f7482c5cb90f75c80482146d306cccda267af6b +size 234226143 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt index dac017a2edb1164cb692bcc57e1aa219d1541a72..75df8d232d29c64925764e6f77e3d87e94dc3d5a 100644 --- a/checkpoints/model_weights_000001212416.pt +++ b/checkpoints/model_weights_000001212416.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0f3fcb05a9719802a88b31832a835f1a15d84525fbe5bb1a8d1068314658df0 -size 151183829 +oid sha256:37bfab82766373124e39cb3eec839fcd69c312465aa6f5be15be972a9471e1aa +size 234226143 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt index ad72843801e15c4b0712817eabb244b216c85f07..bbaa9b3374c698d41479aa298614a9ab83767d9a 100644 --- a/checkpoints/model_weights_000001343488.pt +++ b/checkpoints/model_weights_000001343488.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39e6b66db637360c2b1abe19aae6c56f147483e32c8c6c55a00f234079557876 -size 151183829 +oid sha256:477bd84800c8b4dafc151d62ccfc7bb07cddfc82d5d778a60cc920ec31b5410c +size 234226143 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt index 32109fc9264949593f4925dc551af42dbddbffb9..8c050fbff9e32c1ecfcad01172e74d326bd79938 100644 --- a/checkpoints/model_weights_000001474560.pt +++ b/checkpoints/model_weights_000001474560.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57ee8c9e66069116e157beac7b5e2decffb05050e27250c971983cbfee1f177f -size 151183829 +oid sha256:4346551f028b9f10c6d4523c4619026ecef9242b1ebcf98d555df150f5aee1cf +size 234226143 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt index 4eaff40abb3948aec1100330017553d7eb5a98e4..68a4dc162c9521a7da9b6f6f1a5911ce21d715e7 100644 --- a/checkpoints/model_weights_000001605632.pt +++ b/checkpoints/model_weights_000001605632.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:204c319617ce9ec5767f169407c8c174baa2af19d3c14a199d01efc65dc491ed -size 151183829 +oid sha256:8c7bb6ff56f7dc2f7a24587ca39650d1d9ceb0ac13a8a6360181b20fffc7bff2 +size 234226143 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt index e9863b1820d8faccaee73b1d525b1942ec5b319c..7feeefb7da097222e3ada9ac704d3707ee804eb7 100644 --- a/checkpoints/model_weights_000001769472.pt +++ b/checkpoints/model_weights_000001769472.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a02724cecf461b8b8200029ae86f6eb24f824bc25e85e69e2590fb9f02d4ee77 -size 151183829 +oid sha256:2d82f842ee0ee91e9cf6514d719c7cdc8126fa137bc66554832dfd8ac60f2748 +size 234226143 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt index 5e6e50635d3b7b04c2f3c0fd88fde3a3a1bdfaa3..430ef6d253666c5acc8375746060c3d809c286c1 100644 --- a/checkpoints/model_weights_000001966080.pt +++ b/checkpoints/model_weights_000001966080.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b904da673c2c2feb4c34afa919c8381b94a9c765ff8ed4b45f1f76f82242cf3b -size 151183829 +oid sha256:596b1ab35a23ff5183b566e1ed0c7cf07c8809894261a007b517a125bfc45d94 +size 234226143 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt index d7e99bb049bacef878c73477d0f0cfb4be1d9409..feb22aeb0517993cd3e73a22a8ed7b445f03ed79 100644 --- a/checkpoints/model_weights_000002162688.pt +++ b/checkpoints/model_weights_000002162688.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5261d7dc82bda08823ad9026a093362eb0a210a5be66b2ad4d979b3650d5a9aa -size 151183829 +oid sha256:beb251cf79705d29cf2da2f32037e82b14ec2c2f2ec1ee88ea37b27effa11fcf +size 234226143 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt index ef3cc0a2ef7d569608c58e45a6f4d27123a4eb9e..698dbb638292ef8338130b0f5d71a02200a3756e 100644 --- a/checkpoints/model_weights_000002359296.pt +++ b/checkpoints/model_weights_000002359296.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4ca07aee0021b68ded9b96d5b193fbf1695019ad8f9a188e32c82c45f00fa20 -size 151183829 +oid sha256:583a1c68a6520f7c6edb53c1da582a829b79921fa43694e848d1532fca7bd9ae +size 234226143 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt index 1746c53738064742a413254bdfcb52b08497dc5d..56a2f7f1419007aeb2dbb531eaf40d7fed93600a 100644 --- a/checkpoints/model_weights_000002621440.pt +++ b/checkpoints/model_weights_000002621440.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07f146338da7afe3e283927b973ccbfaad53d5dba0295e50d65acc9d9b378899 -size 151183829 +oid sha256:f728fc79cda08757ad883858724005cb0e61d52e0efa16249a5a4e1495453b8b +size 234226143 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt index 68d507ec9eb81ca45b4f7a0ad62a99bd9998fed9..89094b78fd732036110269334269ef31207e88c0 100644 --- a/checkpoints/model_weights_000002883584.pt +++ b/checkpoints/model_weights_000002883584.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c2c6a3e3df454ecad9fffb523d27a81e278d55fb32dc5feb29caa946cd85a30 -size 151183829 +oid sha256:cb1f20d25f77bdf93c0307f2ca0aee737dfbccacadb09f7e7e364d7635e46ebe +size 234226143 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt index 75c04f5edb859391602e4aa4132fd4e71170fee0..510b0f7a8d19e05e5b3bea697d098d9e161fd40d 100644 --- a/checkpoints/model_weights_000003178496.pt +++ b/checkpoints/model_weights_000003178496.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a4cf0c0648f20f77d3e22ed014cfa33e4e31c6d4a67f7128bf011b09f768c30 -size 151183829 +oid sha256:424e084a912b5d8d2760b82daa80a209fa37dc42e2c77bae49d5d71c8d993a91 +size 234226143 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt index 0186375b7ab9e55f823845525eea558aeb317917..539ec0de6d2d2163693936929940d982b0b8f82f 100644 --- a/checkpoints/model_weights_000003473408.pt +++ b/checkpoints/model_weights_000003473408.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08e18b6fd6ba0e258c99c5cbe2fdfe45ba24986c2ea1bd20000bdb8bbd5ab4dc -size 151183829 +oid sha256:a12e7df724b5697fff28f280036e9a8b11b9428274e089adab7779ec77e8f574 +size 234226143 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt index 9ba787e7161e0039a0a7e20369f0765a8d93542e..80422291ac54fc201c102ec8b788722ba80136ef 100644 --- a/checkpoints/model_weights_000003833856.pt +++ b/checkpoints/model_weights_000003833856.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23a7729a5ad3cf0fd0ebec77dbc5c3c78420d5404d8eb9b89eabbc68584a851f -size 151183829 +oid sha256:66e1fb23f55d74be5473a2117c33f539f67fa884f9ece2d90ca7e29551868242 +size 234226143 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt index bb5452dc8c775c8ec756d20adc80ad8f10209e1c..ec0043b4860e679e61f4e88b86debc2f4e457237 100644 --- a/checkpoints/model_weights_000004227072.pt +++ b/checkpoints/model_weights_000004227072.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9382f7259d338426cfde10b275afe8f19c00fdb98f47347768d365ec785a0d2 -size 151183829 +oid sha256:77b7b9f348b63e11fdbd62ebe813651e3ebb38cec81a52811c3ba752bc1bbb64 +size 234226143 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt index e37c8f096bafd11940b9cf52f851fe6a9d639763..0ee9ba4fa2d99b5fb0040c67f617ed798ac10c94 100644 --- a/checkpoints/model_weights_000004653056.pt +++ b/checkpoints/model_weights_000004653056.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad5352ada566445b01eee6fc71c36a49c857c8e58626a9bf1f1a1a76296a0412 -size 151183829 +oid sha256:247721eef02d0a40d746e0c970619ab1724baa3e47618f87825d19a68a48961f +size 234226143 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt index aca5974b67baeb89d0f94d704c39c1b2da151bec..a05d4d1b83077517aa0cd3ac461f85559730b0db 100644 --- a/checkpoints/model_weights_000005111808.pt +++ b/checkpoints/model_weights_000005111808.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9ed98aef22cede1dbfcc52f6d25336c81e17f044ace0dcbd84ade7cc2358141 -size 151183829 +oid sha256:6f6608a08aa882a6e995b9ea23f36252831e699cf52a27de6ef99361212222be +size 234226143 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt index 4d53c685f50c644f620491bd70630db581f8c2a9..8abfd7cfeef613562de9a8b0429a769d1b50c9ec 100644 --- a/checkpoints/model_weights_000005603328.pt +++ b/checkpoints/model_weights_000005603328.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a4d5517914298d9aa5f3d9d3e375323932c280d9de39ec7eb3d6b49653c046d -size 151183829 +oid sha256:17e22e7c1a2d5361b4f46bdec1ba8d89aecfd0ea61840de7f41f144338c24ff6 +size 234226143 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt index f02319a5956a28fb949f08adf07ce46342e210dc..3590396df7926ac7d04a83e5f571082d93a132ac 100644 --- a/checkpoints/model_weights_000006193152.pt +++ b/checkpoints/model_weights_000006193152.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba5a26e612e66843500e1b8c5a3a0910c97fc7753b399d8411d3978eb340a07b -size 151183829 +oid sha256:f92c1d9982c936b2b0fb9c14d3a06b86e54375bf29af2306279d5644e2585816 +size 234226143 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt index b11fd169c7d0fe9acae47092eacea131115237a6..51c7baf8894c354b5847cf1735dc1d642e65009f 100644 --- a/checkpoints/model_weights_000006782976.pt +++ b/checkpoints/model_weights_000006782976.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:083f24cdb0e950f84e0f34b4edc4767f7e2348328286ff9deb51d2a910d1c827 -size 151183829 +oid sha256:60f21c9fb16f352e55ae929c3d81eb0c0655cccb602b02a28f496870e4556e19 +size 234226143 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt index 9d24e020d53f28008317a3d29ce0a9471ab3af86..cabc5878d652bee1fbf7c0ca81f6128088126bcb 100644 --- a/checkpoints/model_weights_000007471104.pt +++ b/checkpoints/model_weights_000007471104.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93487c03a02fb1addc8352798ea870e408833a2736dd71dcced0a13dc6eba50b -size 151183829 +oid sha256:34e6101f8430ba83678c0fe59f814137957e3095fcfc8517fc966423ed666b7d +size 234226143 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt index 50df6c2762eea4ffaef63c3df18389ac79367af0..55c57f20068a9959f96b6a7181b55493f7ba343e 100644 --- a/checkpoints/model_weights_000008224768.pt +++ b/checkpoints/model_weights_000008224768.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d526f740034d485dbad918bbe050f19ecc8f3bc0244c9d2064c649f3fd7cf99b -size 151183829 +oid sha256:4298ea42d8fa0210ebd70c8bb5e7e95b522d938528d78c3c4b09991c2ba99814 +size 234226143 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt index 683fd6956c01355d3a0ae433e4e0305d29b482bd..b2507f9b3bce55be93ea093ff0f5283833094ac5 100644 --- a/checkpoints/model_weights_000009043968.pt +++ b/checkpoints/model_weights_000009043968.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bb36f8be7f35222b80b0922e9d28f894af35a06ad8492a4a21acaef5a3feec4 -size 151183829 +oid sha256:854e476019163e43b52d792f911f844c9198cb2373599b65b12092cc12a9d596 +size 234226143 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt index b0e02403a11d6c2518d1df65113e9842d685bde4..6417ec5e289c85e6ad773524946a0d1fbb15fedd 100644 --- a/checkpoints/model_weights_000009961472.pt +++ b/checkpoints/model_weights_000009961472.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3934ad71e84858fc2d50d4fa7cd92cf148efbfe68f6a722bdfbb027b2bb3ccfd -size 151183829 +oid sha256:01cca75622fc160b12682ff7fb7ec3486274a98dd0b49605fcde9e669e080797 +size 234226143 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt index fd214cc9e44e67aaaa6e817b6ce665a42bf6022d..076c4917ab259f0f24c9bc87da48866c135e835a 100644 --- a/checkpoints/model_weights_000010944512.pt +++ b/checkpoints/model_weights_000010944512.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bbc9cfafcecf0273ca6514fe38193714af5d9e7eab4a238d75e68be525de510 -size 151183829 +oid sha256:2c51399248c5ceb6ee13416c1e6c35c4271cc52e83ef931217399e6635bf777d +size 234226143 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt index 0f5c74cbfe2ed2540e64e1bd482ea15a16819fbd..2955fd04fd245ac7e57296c48fe222f45181a7c6 100644 --- a/checkpoints/model_weights_000012058624.pt +++ b/checkpoints/model_weights_000012058624.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5d2e6e5f5db1e53cc0a370e610c0054e49df0e98a5ae830a304dbc357613d5f -size 151183829 +oid sha256:f3a74f634633a278a58209572058ae827ec7e01ee5a96696045bb3f93afab04b +size 234226143 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt index f1462a56348de7811dfbe0b4ad30b4e747ee6964..d9941d4d968286fdbae786d179b8a6cd9965360f 100644 --- a/checkpoints/model_weights_000013271040.pt +++ b/checkpoints/model_weights_000013271040.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46446fb28d810141b2f8e96d991bb8f58069dd3f08a4f2d5e930b3ce71694898 -size 151183829 +oid sha256:53155fb2890011ad150acc43f6003581a314c9bd6959a97ad543617bb1d53647 +size 234226143 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt index 8b5efdc0e85f4efb46ba7920c360fbeac8ed2302..364af4a9a7c7f3eefcce4575da2505caa22ad969 100644 --- a/checkpoints/model_weights_000014581760.pt +++ b/checkpoints/model_weights_000014581760.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e95da01edab15e09f239369497d0f5c7d26e5422a4f1dcb3978eeb39b86171ef -size 151183829 +oid sha256:98682e9f320585ccba1290bf5ef7b2137c49a1798377da25b44f50d6ade4629c +size 234226143 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt index bc03e1c9f4634efba5a8221e0473bcf28c4205bc..0514bdf48148ba8737ceaa9bafd763f0a3a6e426 100644 --- a/checkpoints/model_weights_000016056320.pt +++ b/checkpoints/model_weights_000016056320.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:828ff7704873a2faf8541d9199322f47ba105af7824e2cab720efe44447cfe5c -size 151183829 +oid sha256:83681036defad3242b3cc385e4fbffe11670693e2fdf6fe99a08ccc322eb509b +size 234226143 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt index 5801beb772f24955afe66546518bd4aaf468367a..735ad8aaf85522fe1073f724336a222df762b8dd 100644 --- a/checkpoints/model_weights_000016384000.pt +++ b/checkpoints/model_weights_000016384000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52ccd8304cd0bfa6068838b9ca42704d5d6796c4d6697bc988e49ca7b30a080c -size 151183829 +oid sha256:dee7ce0f45dd723646197a9f196cfad58d74ff5540a6dd950743d4ea0fd08012 +size 234226143 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt index 48d43bd62b74ba5b6733582cb42c629ca6a95e91..06b95902d9fa525f18cd330886f06a47499a3388 100644 --- a/checkpoints/model_weights_000017661952.pt +++ b/checkpoints/model_weights_000017661952.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f37334ac2b0ae22a9a4bca6399f811c5e9d0d2c982e6c8a0f987fc78baba8777 -size 151183829 +oid sha256:f7ba7aa2beb95e2579537b631a8f6ab19053cb730a04a501395f8d3ad5a845aa +size 234226143 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt index a0f9d79c9e108ae30af94c1a803f0e2e634a9f70..1fe831a15d8792add2fafee2459818a5afd0dec9 100644 --- a/checkpoints/model_weights_000019431424.pt +++ b/checkpoints/model_weights_000019431424.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e75c402639aa9c6c399f478431f2c2fea7fc61d00012687b92c8b429237a16dc -size 151183829 +oid sha256:e0c35c25503b15b80089e09854214bb7782ff7366474833fe72e54143e4f172d +size 234226143 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt index 0f23f72e9d3048640eba3b4c871840e1fe521095..2554d52655e083f97470ada9644fa8677fc56dff 100644 --- a/checkpoints/model_weights_000021364736.pt +++ b/checkpoints/model_weights_000021364736.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11567a200db2ec7810ac9940014e175dd5cd959cac8566aab1aca6a83b573364 -size 151183829 +oid sha256:08f53fc8f71b546c0c5be5fa3ef779a7e36d812fe78db08927da8b08c96c8977 +size 234226143 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt index 21ed12d5711a75f95923f79e9c03882287aaa616..4c493a78da3df2b832f4c763c16af9649576ddf7 100644 --- a/checkpoints/model_weights_000023494656.pt +++ b/checkpoints/model_weights_000023494656.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6838fd8b423557094093aed4910cbd6cf4250c9e517c55009409e807a629c2ae -size 151183829 +oid sha256:cd1eb2c66b9bb9020998baf0a974fc68c92ae6bfa98124e172bf8dd3c6b6a411 +size 234226143 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt index 11179895b3ef2ec5f719d421304d7c3765e152b9..3a7e8b204ef3f537974dbaa4c561b329aaa47fbb 100644 --- a/checkpoints/model_weights_000025853952.pt +++ b/checkpoints/model_weights_000025853952.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbfeb2be6a31ea180ef2013ffdaa0f39690441aa9bcd48def933fb74f7e56ed2 -size 151183829 +oid sha256:488368a572ee91b15a723f0f866444e638dc460d0e18ceffc6b4cbf6a4e81c06 +size 234226143 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt index 9f5d4c299ee4f9ef6a45889e5d673e110f70c8ac..3867e39044f12c1fcc6bc0aca23a25d1ca582ac8 100644 --- a/checkpoints/model_weights_000028442624.pt +++ b/checkpoints/model_weights_000028442624.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61ce07417697759084c7e3ff9b31b5aba3095ab0d9b9309c3ebf4aacd5022dd1 -size 151183829 +oid sha256:8a930faffe752cb358c54d987904d347ab7694a35a95f0c5b07c3593686fd436 +size 234226143 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt index 4b69774535a1a089d74eb78f6cba59bc4c03759d..316a024dd5f83352bc2118900a323962b76ab790 100644 --- a/checkpoints/model_weights_000031293440.pt +++ b/checkpoints/model_weights_000031293440.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:042561b3ec35d52a3533fdcb99afdc9f6144e5c5eba960419adbf0103e01456e -size 151183829 +oid sha256:dd17484832cfb8608f78df72c8ac2d11ca9596bf3ff436b6eb2e4e1cad23f5ee +size 234226143 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt index 45a50870ad83f2a62d4ff0af26f3f85e9cc02311..7ba84ead13efa990f176de34d228ea97677eb226 100644 --- a/checkpoints/model_weights_000032768000.pt +++ b/checkpoints/model_weights_000032768000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f614f90bab827faa96b6de3ca2e178ef708f9454b85e24601d5bb54286b9b1f5 -size 151183829 +oid sha256:4caac26e32c45f3785dd5020d02dd1a7161860a82614dc7292138564d6030c93 +size 234226143 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt index 2d32f62fc6edbffe458a092ce16e02f7b8a81658..71820fb741c4709f03503d12fa470dc26355cd1c 100644 --- a/checkpoints/model_weights_000034439168.pt +++ b/checkpoints/model_weights_000034439168.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91f2646a7df66bd7312d63973b4ac1eae7e759f4ad7dae615188adbe7a0d5bf5 -size 151183829 +oid sha256:9b8e201de0aa04157fe850fcbab9bb6300f0a71114b9240aa5387c958c4e4453 +size 234226143 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt index 1eb7259b4cb230c0d0ca277432d33a69b1454236..1966638e70807baed85dacf03e5cfa25f9d1456c 100644 --- a/checkpoints/model_weights_000037879808.pt +++ b/checkpoints/model_weights_000037879808.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f47d553fbfe3f73aedff54fd431f1e2370ba89bbed1cc6482d6c2b42593aeb43 -size 151183829 +oid sha256:8795d7d0636032dea7d7ef81e5111a88940dad649ebad0d91c82a4c28c713adc +size 234226143 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt index 2af16d30909c34e9facf7045a2d34d37760c12c4..6a4888864cc59abfaed17a7ba83d595d888ef3ee 100644 --- a/checkpoints/model_weights_000041648128.pt +++ b/checkpoints/model_weights_000041648128.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80dee0538147c669d85dcf4ce90aec4aea903cae25964555ee2a470de2612018 -size 151183829 +oid sha256:5a8f39e4306d5aa3b47e64148d033e545b834ef35cfa46cf657fe22b058fdef1 +size 234226143 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt index 0343f22bcfc6bb775e739ef906549a0d645b3f42..aaa08f48a4da7ad3929a773ae1d6214701e92f2e 100644 --- a/checkpoints/model_weights_000045842432.pt +++ b/checkpoints/model_weights_000045842432.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a438f8998185c715001a3841ace7e917863da8472b8340cdbf66552f1d043298 -size 151183829 +oid sha256:fe91d7a212fcce8c1dc14df5844fe6c3d2b89633349ab7ea0bcbaf8c0f7def29 +size 234226143 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt index a7db098f9f92e20293f792021ca815bdecb030c1..b4b2e0f91f29e800f9506d014d4c9b66b8a19fa2 100644 --- a/checkpoints/model_weights_000049152000.pt +++ b/checkpoints/model_weights_000049152000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb3fa59b41a4223650318d22e1601ad2a7a46711fef2b2a14bd1c6ed280d69de -size 151183829 +oid sha256:d5b4dd998b47d1fb49d6e2cc31df3988a300a8c14a7b0e148687dd5227370a2d +size 234226143 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt index 9ab3c3dc46a9d492eed5dfd3a346fb8fd289836a..e61d62108fb4ea7aa58967b218f92045cb5b7595 100644 --- a/checkpoints/model_weights_000050397184.pt +++ b/checkpoints/model_weights_000050397184.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bef4f0c18c238515259680784c3634d2d9ba2ac1e7c303684105d6dd3200cd2e -size 151183829 +oid sha256:06db438684ac92e74939d7ecc86aad8558416a405a2ddf361b316e0d768a6b32 +size 234226143 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt index f1b298943e76c59470aa1e5aca29967e4949f97e..a4d09cb08658ad2d42461112d7573dd7917222bc 100644 --- a/checkpoints/model_weights_000055443456.pt +++ b/checkpoints/model_weights_000055443456.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:281776104ff868bda35d31807f41eb483139f1a491ee48e5ca0e9e0d5adbdc8c -size 151183829 +oid sha256:b451b39c0ea8ce9c3a6bec718d46aaf8f7a09cf72cd8169955855ef37379fe8f +size 234226143 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt index 8c6358d272b4d87ae587f4cb014ddeccf9211634..d13e5182f3d30ca82e5ff81fd311d5480487a0df 100644 --- a/checkpoints/model_weights_000061014016.pt +++ b/checkpoints/model_weights_000061014016.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b31d7d39989c7ff62c47ef826577d40a8d3418f2c49cd15e2ee726debfd58ae -size 151183829 +oid sha256:3e0a928edc7cd404af61c89de19eed98e127d97b416b0c47e837acecd6286d10 +size 234226143 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt index ff42a72c2bf3d2e6fddc574a04e3a0dff892f4f8..578588dd1bd7a0f634a00c918487ccfb78e6bd68 100644 --- a/checkpoints/model_weights_000065536000.pt +++ b/checkpoints/model_weights_000065536000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5d01fc8e8cbf1be76dde579e17a4d1a97089c28df6b5b08580563b8dafaf654 -size 151183829 +oid sha256:92e572ed93ae54bdf02f9099dc87b87ce1b810b63e71e3ed67e76cc7d08659ff +size 234226143 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt index 53e9ca243caec38f03f4c2b40298bff8802429ef..2d59299fe2afcef2b9bd28c3c077d67906b2541b 100644 --- a/checkpoints/model_weights_000067108864.pt +++ b/checkpoints/model_weights_000067108864.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a21031ab0598fd69733d09e98d8d3cb931e35b7a4963a9f980e24a584d5e2bcc -size 151183829 +oid sha256:46bcafd243add89b20b23d4097ddc7efdea9e117cfb025edffaaa18550e478f3 +size 234226143 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt index 79822cdc629d1f7e31514d87cd69d3fbb2658052..e53e991516a3389b7603dd4583f5f93d5278fa45 100644 --- a/checkpoints/model_weights_000073826304.pt +++ b/checkpoints/model_weights_000073826304.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:314b1497679e95d116e8de86e2860d1737cc8ecb6ff1abf7e1c59e970fdee058 -size 151183829 +oid sha256:b3de99fa60c89a43bcda5d3d04dcb628de952b530161db6d1619cb01d26fee98 +size 234226143 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt index cbd579f41857871d959be57b3fa25f249bba93d4..590d5df557a6110a963bd0f83b7c4c4fae5ed696 100644 --- a/checkpoints/model_weights_000081199104.pt +++ b/checkpoints/model_weights_000081199104.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74f658f4876476245691bf56fec17a2a395ec0471b7f17a7fb54b68adbab13c3 -size 151183829 +oid sha256:c85e8965bbd8347a7155dd82b37c802daa744efb84745e1782a84081bdb773e7 +size 234226143 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt index 81b0ffc1ac7cec9a04917782a679904dfa133e5e..1c4f87e06e74d1b9dca21eae02f383372d869f72 100644 --- a/checkpoints/model_weights_000081920000.pt +++ b/checkpoints/model_weights_000081920000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9bc4c2470f520c4ed3f4ce01e60e855eddaf23181c0ea4340e07286c0c4c3ce -size 151183829 +oid sha256:2d1162bfd5bd1e15bce09767ece8e48a3a79a2adb81f37e505bfea386cb10927 +size 234226143 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt index 310c9e855a8e039c462a6ec112e19262cde0fcc6..1b56291ea59961c8bc9c04aa6673b821224cbbde 100644 --- a/checkpoints/model_weights_000089325568.pt +++ b/checkpoints/model_weights_000089325568.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c14d7690e178ee57282a4e2efc4204d6baaed9d68ead3dac949607713305ac7 -size 151183829 +oid sha256:9e1db1bbb93b348314618ed80819c031deb8b450dfe4e1f2acc7fbf9e8b779a5 +size 234226143 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt index d88e41e93e4a37f89aba2c990d609d28172d6709..f38d739e2eae6a1603b014c3e78d93ece4c46524 100644 --- a/checkpoints/model_weights_000098271232.pt +++ b/checkpoints/model_weights_000098271232.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41677d899a01ccf663f74a8a9bf3971c5a5e84a90324c60d21300ddba55e758c -size 151183829 +oid sha256:338e71fc44b43448b2d3f3b034e15bd612cb868e35658e83822a3d4e972a3437 +size 234226143 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt index db499af120b15aba8c84bf6b2ba2738698e88a19..627f8009cd1120c0153672ab90a79c47ea0a0e92 100644 --- a/checkpoints/model_weights_000098304000.pt +++ b/checkpoints/model_weights_000098304000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f81a65ba5579eb6b59e17d218983f0f3eab85fccdfd786ba27ccee9e5111a3f3 -size 151183829 +oid sha256:ecb9dc3ca0c5d84882a29242eb11b155f98f06a37697a85a1548e86e237d89bf +size 234226143 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt index a11ae31e330babbc8e6af349a47d664d7f715e5f..bc70b3434a43b2791e21280701dabba2e2263391 100644 --- a/checkpoints/model_weights_000108068864.pt +++ b/checkpoints/model_weights_000108068864.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ec8cac49cb57a18bc9ca472040af42605f6588d1a3c1db216bf4649d5c21602 -size 151183829 +oid sha256:d9ad7b6e22daf4d41497117ce2cac4f3db44182c86c442f53155c80219bbb0c1 +size 234226143 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt index 8370268805a180a2ba93acdfd7be04fa9cd60971..9bfba36ee41af7c3f26d1e1aef2795a7ccbd7c10 100644 --- a/checkpoints/model_weights_000114688000.pt +++ b/checkpoints/model_weights_000114688000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbb4bca3debb8a5f9aeff78b7c6b44adeea5abbcfc494e85d160590208ef338a -size 151183829 +oid sha256:8f21e5848c706c29bb35ef8117b4c4b66921cf45313bbe248fe3c36abd0b761f +size 234226143 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt index 2a161a49cd473c4ae33aea2f1464077e057e87ea..a89c1570145ecdd9b5707e2b76a4f50b6f9ebee0 100644 --- a/checkpoints/model_weights_000118882304.pt +++ b/checkpoints/model_weights_000118882304.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dad525ac75a786b7c48559f73292ce6b0761326ecff0a00abf45fca0c7dbc4d1 -size 151183829 +oid sha256:34b950bf9a41abcfaceae04695a1912b5be2e71b662816a605cbb19008f71e8c +size 234226143 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt index 03615802dd2ac727c5dd12106fc3a1291c357204..c82593bc1c8bf935fcda384fdf5c0da1f3652208 100644 --- a/checkpoints/model_weights_000130777088.pt +++ b/checkpoints/model_weights_000130777088.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dead3853e6408b78c95fd11735281656f8700f9ba0ab45a635b8180b77ea6e8 -size 151183829 +oid sha256:3f52da6c35e1be7d6b6a92686d0111af451f3fdc0a36f6f7f32adb21e344d8c1 +size 234226143 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt index 149912c671981c7d942909ffda7e745655fcbe85..7ef3abed78bf8dcaf1641ee1de82a52ed1f47faf 100644 --- a/checkpoints/model_weights_000131072000.pt +++ b/checkpoints/model_weights_000131072000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bf1f29a7bcfdb09251db1c83484db6d7a1067fe264303822a59479c66b03158 -size 151183829 +oid sha256:5aa791b7e23c29d6b470ee3af51ce73cae658522112be61f4fafe23f728682fe +size 234226143 diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt index 738ca0d08fe94a2fd6b26ed25e11025ffb0d6610..aabd3d12c36f1745ff17cf4a6d7266f9e58171b4 100644 --- a/checkpoints/model_weights_000143851520.pt +++ b/checkpoints/model_weights_000143851520.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:288de1c16d9a08421c961db7352d39b8df472768ac6c703449f7a34650bf7861 -size 151183829 +oid sha256:cd4e9687b4c43d505f4687f04ffc0b051b6020890afb17dc7529c181d7b0c368 +size 234226143 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt index 295e31bafe27774ac3a568b02199d0c55781b737..0f5efe6f5b8e2046c6261ba40b9f5da01b5ef521 100644 --- a/checkpoints/model_weights_000147456000.pt +++ b/checkpoints/model_weights_000147456000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a60dbba192bc2b853332448301d37f9472e007a2b1100deca8c73706a2293b82 -size 151183829 +oid sha256:5d1b493766132ed4e9242e9be2302f4a4e4b0cea52ab5654cb7a7ff5722537b9 +size 234226143 diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt index 235d45a02eae169a8664d155213d44345ef80a30..2d735f4cb9451eb33b82aea4378ce0b214dd7665 100644 --- a/checkpoints/model_weights_000158269440.pt +++ b/checkpoints/model_weights_000158269440.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcf6983f31ea0a6e8dfccfbc518ae4429a50b81a78e96151c4226c6be2dbbc38 -size 151183829 +oid sha256:c65078a8240e0621ba3b12596c8c3f680f0c733841722d11641e20271bb6a845 +size 234226143 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt index 6826a5805895bf1b9d9434cf4f677df435c4220f..9977bd5035510c1a0952adad2faa6332eeb733d9 100644 --- a/checkpoints/model_weights_000163840000.pt +++ b/checkpoints/model_weights_000163840000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b1c766285545374a4628169306238921323892fabc16efe4cac2e030f580e0a -size 151183829 +oid sha256:47c01e1e0df8d086a8081f34224be6188bbb070f0fbd772995382f9768e05150 +size 234226143 diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt index 7b846307b998656161deee1c626c0b5ad135e49c..0145419d784d5739745fb198e5c27a389b289df2 100644 --- a/checkpoints/model_weights_000174096384.pt +++ b/checkpoints/model_weights_000174096384.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6385d8078c4dd13097d6991735ac2a9ff1095e6ef82e2040420907f1984d51d6 -size 151183829 +oid sha256:0de60c4154f8f65b2911c5e5578f57f1e5042d7e9a570c823d2e54f77e46425c +size 234226143 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt index 6e522514bacaa7bcd105b267f9cb1ad8d99e5adb..0891871d0f07c14a1a53bd7b5a45d7452e00d850 100644 --- a/checkpoints/model_weights_000180224000.pt +++ b/checkpoints/model_weights_000180224000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:119c9199dfadb5302077fb1def57e503f39ae4ad5bf75a2a67c8b509259f7aa6 -size 151183829 +oid sha256:689819a3ebd7a0252d9afba99726a081c4547f9e6453768639ed2efa2fa6e6ae +size 234226143 diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt index 8799e849631ed66969ccd21e967ddf034e0139bb..16555d2d1eb388fe7b327d4b74943889c8b7d832 100644 --- a/checkpoints/model_weights_000191496192.pt +++ b/checkpoints/model_weights_000191496192.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a48916d983b57e57dde693012ee713e4ccc9fa8f1754f2707e0e384bfa15e65e -size 151183829 +oid sha256:934ba6c4eaf145cdef1e42b42f134e5cd79cd6d5ba30f96700a1bd4635e21f79 +size 234226143 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt index 1202b4a615d2ec11821b8952a6bab85812339055..ca57ea82e1a430e5005ff49ab5757fd57686f99e 100644 --- a/checkpoints/model_weights_000196608000.pt +++ b/checkpoints/model_weights_000196608000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f25fa26c5946fd7d6d7cbcff540914b9b8b004a8651828cdb4b9cbf9135d1c4f -size 151183829 +oid sha256:1897fb0e3d6a21cf439e83f645f75ac227f95fff8bb408595c02b4791bf3fdf8 +size 234226143 diff --git a/checkpoints/model_weights_000196706304.pt b/checkpoints/model_weights_000196706304.pt index 7ff06a94fa8066aeccd44e42aea2b4564f0b3308..87413e3b81a437dc6a81456324543df0f7efdead 100644 --- a/checkpoints/model_weights_000196706304.pt +++ b/checkpoints/model_weights_000196706304.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38f0208bbe10a6a6672097497731f30b432c88667eedb126c6ac70bc64147270 -size 151183829 +oid sha256:ef39db7ad7139b8a7f4bab77435ce6bd4e6579515b8a135949d2f2b346a13e8d +size 234226143 diff --git a/checkpoints/model_weights_000197361664.pt b/checkpoints/model_weights_000197361664.pt index ca026d08bd4a33b81cf2278bac31121c87068023..58ba66b3f26734457ab9748a0016e433372a3e54 100644 --- a/checkpoints/model_weights_000197361664.pt +++ b/checkpoints/model_weights_000197361664.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7babbfc6a6011af20eff8003ae7b8e86bbcc5648a7c3181bc06b490ba489301 -size 151183829 +oid sha256:a839970ad54d993bb31d3bde0311c0a1dbdffefcd8f2285eb48b5622b509779b +size 234226143 diff --git a/checkpoints/model_weights_000198017024.pt b/checkpoints/model_weights_000198017024.pt index 48415b8b7cd5e365ddd3cbd62e166bba51ff2c71..59a81bf675d6d92f3707246c64b5788b88886f62 100644 --- a/checkpoints/model_weights_000198017024.pt +++ b/checkpoints/model_weights_000198017024.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8cd08fa6349fe63147e470b0a35ea18b734de5e0e3153515e1bbabc33e20923 -size 151183829 +oid sha256:fb31b63f40c200bb434543831190875d1fb736d94c82872ab676bb5b2fee9ba8 +size 234226143 diff --git a/checkpoints/model_weights_000198672384.pt b/checkpoints/model_weights_000198672384.pt index c1a0c1580dc80a3d0deda0e08c4f1179e2d17e76..d8462a5cab81304d3c2fe9e85730b284c4ad4f06 100644 --- a/checkpoints/model_weights_000198672384.pt +++ b/checkpoints/model_weights_000198672384.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afd76f6a6e1b938d423b5f746d1fec3d8aeb9c2c762b8e239395da8d92be3e8d -size 151183829 +oid sha256:12e823b683e61e7cde11b92c1ffe1320d97ecfb0c8bbcfd3cbf8f2a3d0189e92 +size 234226143 diff --git a/checkpoints/model_weights_000199327744.pt b/checkpoints/model_weights_000199327744.pt index 595b1de70a4f6bea0a49a20ce9947b36c948222e..c37b17dd766a426d4b2d2bd4fa83e4ef36166d84 100644 --- a/checkpoints/model_weights_000199327744.pt +++ b/checkpoints/model_weights_000199327744.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c95e9d70f55f1b3f54e0a844bb9dd5b9858d36f03cc47c2570b42bf1d721af62 -size 151183829 +oid sha256:d81f5635d528a252848e3941c25c68eb0bea66c831e541ddf480f54f876a5e3d +size 234226143 diff --git a/checkpoints/model_weights_000199950336.pt b/checkpoints/model_weights_000199950336.pt index 81f824e80dba0868cdfb5d3160572a4907470be5..9f637fdc68dd587bfa714acfb915cc39185f04d9 100644 --- a/checkpoints/model_weights_000199950336.pt +++ b/checkpoints/model_weights_000199950336.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33b2fbc1cefb3a8426289be152254266ef313622ff306cbe21ac0d611b819ae6 -size 151183829 +oid sha256:191a3ab543909a00d9a77cf97897f402074243a69b2e8a05a74dcccf0d90195e +size 234226143 diff --git a/config.toml b/config.toml index e3fc3cc147c7dc2190c1cb6c26c85b7f029103b9..80db26854e37fd5bff8adf0d0bc35e8b9cc82360 100644 --- a/config.toml +++ b/config.toml @@ -1,15 +1,15 @@ -model_name = "pile_llama_H1_L2" -n_layers = 2 +model_name = "pile_llama_replace_17367_new" +dataset_name = "eoinf/PL_Replace17367_L2_alldataset" +n_layers = 8 d_model = 512 d_mlp = 2048 d_head = 64 -n_heads = 1 +n_heads = 8 attn_only = false layer_norm_eps = 1e-05 init_range = 0.02 n_ctx = 1024 d_vocab = 32000 -dataset_name = "eoinf/pile_llama" seed = 10 device = "cuda" use_bfloat16_matmul = false @@ -17,8 +17,8 @@ batch_size_per_device = 32 n_devices = 1 batches_per_step = 1 max_tokens = 200000000 -lr_hidden = 0.001 -lr_vector = 0.0005 +lr_hidden = 0.002 +lr_vector = 0.001 lr_schedule = "constant_with_warmup" warmup_tokens = 30000000 weight_decay = 0.05 diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt index 21a8ed1a06d4824fa6cce070d074ebc21d4d7f1b..6ebad479267640af5984618443d6d88f363c9c5a 100644 --- a/latest_checkpoint.pt +++ b/latest_checkpoint.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a841316540aacca701236202ac8a9224d21e8c06feead0486a3b0b90ace4bc2a -size 151183351 +oid sha256:5afdfec5e54636ba35ba3fe4adcc0c626b47c8fc61b2b5c7ee57f4a1469efeb6 +size 234224683 diff --git a/latest_metadata.json b/latest_metadata.json index c16e16df2be11e6f5e6f06b5472547126445306f..389eb4b3f672e2d2c62d2b59ee77e494577e09db 100644 --- a/latest_metadata.json +++ b/latest_metadata.json @@ -1 +1 @@ -{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4578413544944855} \ No newline at end of file +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_replace_17367_new", "n_layers": 8, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/PL_Replace17367_L2_alldataset", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.1541889009453166} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt index dc3fd016ddf8f7527f07be956e5891a24bcd5bf5..cdd7d061852d1e6939136e551780fccd6f5382a0 100644 --- a/latest_optimizer.pt +++ b/latest_optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b412b0850dd6e899a32a7a77dcf97e5a851bbfa14d20715330f35dcd4888314d -size 302372627 +oid sha256:68d4a23c5ae1feb0c0677d5fc0fff682ffbfff0aada4fbc60104a0057ee50435 +size 468470963 diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log index f48362b4d868d528b4262bd90ae26bc82d13006a..d0ef417f5fa4b559f89e60d07c150e2e3f061f99 100644 --- a/wandb/debug-internal.log +++ b/wandb/debug-internal.log @@ -1,13 +1,12 @@ -{"time":"2026-02-26T15:30:26.517889161Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} -{"time":"2026-02-26T15:30:31.757267268Z","level":"INFO","msg":"stream: created new stream","id":"trcpjlfd"} -{"time":"2026-02-26T15:30:31.757342751Z","level":"INFO","msg":"stream: started","id":"trcpjlfd"} -{"time":"2026-02-26T15:30:31.757375145Z","level":"INFO","msg":"handler: started","stream_id":"trcpjlfd"} -{"time":"2026-02-26T15:30:31.757422001Z","level":"INFO","msg":"sender: started","stream_id":"trcpjlfd"} -{"time":"2026-02-26T15:30:31.757462295Z","level":"INFO","msg":"writer: started","stream_id":"trcpjlfd"} -{"time":"2026-02-26T16:08:47.579624032Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/tzach/toy-transformer-replication/trcpjlfd/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} -{"time":"2026-02-26T16:12:40.720756645Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.00031624}],"total_operations":1}} -{"time":"2026-02-26T16:12:41.198683788Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2026-02-26T16:12:41.385614105Z","level":"INFO","msg":"stream: closing","id":"trcpjlfd"} -{"time":"2026-02-26T16:12:41.385657054Z","level":"INFO","msg":"handler: closed","stream_id":"trcpjlfd"} -{"time":"2026-02-26T16:12:41.385711626Z","level":"INFO","msg":"sender: closed","stream_id":"trcpjlfd"} -{"time":"2026-02-26T16:12:41.385718584Z","level":"INFO","msg":"stream: closed","id":"trcpjlfd"} +{"time":"2026-03-19T06:35:19.214150803Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-03-19T06:35:19.454890439Z","level":"INFO","msg":"stream: created new stream","id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.454977401Z","level":"INFO","msg":"stream: started","id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.455055746Z","level":"INFO","msg":"sender: started","stream_id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.455093161Z","level":"INFO","msg":"writer: started","stream_id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.455093346Z","level":"INFO","msg":"handler: started","stream_id":"29lbcxak"} +{"time":"2026-03-19T08:40:04.994899105Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.001043706}],"total_operations":1}} +{"time":"2026-03-19T08:40:05.630105109Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-03-19T08:40:08.650518881Z","level":"INFO","msg":"stream: closing","id":"29lbcxak"} +{"time":"2026-03-19T08:40:08.650561694Z","level":"INFO","msg":"handler: closed","stream_id":"29lbcxak"} +{"time":"2026-03-19T08:40:08.650605449Z","level":"INFO","msg":"sender: closed","stream_id":"29lbcxak"} +{"time":"2026-03-19T08:40:08.650623595Z","level":"INFO","msg":"stream: closed","id":"29lbcxak"} diff --git a/wandb/debug.log b/wandb/debug.log index 3ee3411527a8785b9507f88e164f4432869d104c..cfed7b25c4e3c86ac39418d92b12214e64089747 100644 --- a/wandb/debug.log +++ b/wandb/debug.log @@ -1,26 +1,26 @@ -2026-02-26 15:30:26,235 INFO MainThread:5904 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Configure stats pid to 5904 -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from environment variables -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_153026-trcpjlfd/logs/debug.log -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_153026-trcpjlfd/logs/debug-internal.log -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():813] calling init triggers -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():818] wandb.init called with sweep_config: {} -config: {'model_name': 'pile_llama_H1_L2', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 1, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} -2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():854] starting backend -2026-02-26 15:30:26,504 INFO MainThread:5904 [wandb_init.py:init():857] sending inform_init request -2026-02-26 15:30:26,514 INFO MainThread:5904 [wandb_init.py:init():865] backend started and connected -2026-02-26 15:30:26,515 INFO MainThread:5904 [wandb_init.py:init():936] updated telemetry -2026-02-26 15:30:26,520 INFO MainThread:5904 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout -2026-02-26 15:30:32,186 INFO MainThread:5904 [wandb_init.py:init():1011] starting run threads in backend -2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_console_start():2506] atexit reg -2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2354] redirect: wrap_raw -2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2423] Wrapping output streams. -2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2446] Redirects installed. -2026-02-26 15:30:33,026 INFO MainThread:5904 [wandb_init.py:init():1049] run started, returning control to user process -2026-02-26 16:12:40,715 INFO MainThread:5904 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/trcpjlfd -2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 -2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_restore():2453] restore -2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_restore():2459] restore done -2026-02-26 16:12:41,384 INFO MainThread:5904 [wandb_run.py:_footer_sync_info():3867] logging synced files +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Configure stats pid to 2015 +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260319_063518-29lbcxak/logs/debug.log +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260319_063518-29lbcxak/logs/debug-internal.log +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:init():813] calling init triggers +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_replace_17367_new', 'n_layers': 8, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/PL_Replace17367_L2_alldataset', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:init():854] starting backend +2026-03-19 06:35:19,200 INFO MainThread:2015 [wandb_init.py:init():857] sending inform_init request +2026-03-19 06:35:19,210 INFO MainThread:2015 [wandb_init.py:init():865] backend started and connected +2026-03-19 06:35:19,211 INFO MainThread:2015 [wandb_init.py:init():936] updated telemetry +2026-03-19 06:35:19,221 INFO MainThread:2015 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-03-19 06:35:20,106 INFO MainThread:2015 [wandb_init.py:init():1011] starting run threads in backend +2026-03-19 06:35:20,927 INFO MainThread:2015 [wandb_run.py:_console_start():2506] atexit reg +2026-03-19 06:35:20,927 INFO MainThread:2015 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-03-19 06:35:20,928 INFO MainThread:2015 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-03-19 06:35:20,928 INFO MainThread:2015 [wandb_run.py:_redirect():2446] Redirects installed. +2026-03-19 06:35:20,941 INFO MainThread:2015 [wandb_init.py:init():1049] run started, returning control to user process +2026-03-19 08:40:04,989 INFO MainThread:2015 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/29lbcxak +2026-03-19 08:40:04,993 INFO MainThread:2015 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-03-19 08:40:04,993 INFO MainThread:2015 [wandb_run.py:_restore():2453] restore +2026-03-19 08:40:04,993 INFO MainThread:2015 [wandb_run.py:_restore():2459] restore done +2026-03-19 08:40:08,649 INFO MainThread:2015 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260319_063518-29lbcxak/files/config.yaml b/wandb/run-20260319_063518-29lbcxak/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6557506261c44d38d14131b1cfdc037cdcec333a --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/files/config.yaml @@ -0,0 +1,140 @@ +_wandb: + value: + cli_version: 0.21.4 + e: + etlry1vz32kjtednysvbuccj86zuz6bd: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "133357727744" + email: tzfof8@gmail.com + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: d722bb952956265d0387df9c35a76703a66824ec + remote: https://github.com/jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-7a4c6671-13c7-c5cf-2024-6b167a982ac9 + host: na3g1nqrym + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/model_training/model + startedAt: "2026-03-19T06:35:18.692601Z" + writerId: etlry1vz32kjtednysvbuccj86zuz6bd + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.4 + "6": 4.56.1 + "12": 0.21.4 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.1 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 32000 +data_seed: + value: 10 +dataset_name: + value: eoinf/PL_Replace17367_L2_alldataset +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.002 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.001 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: pile_llama_replace_17367_new +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 8 +n_layers: + value: 8 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: "" +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20260319_063518-29lbcxak/files/output.log b/wandb/run-20260319_063518-29lbcxak/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fe603d273fa84c3e8b4d4dbfccd652383b28f714 --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 8L, 512d, 8h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.002, Vector: 0.001 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.3378 | Learning Rate: 0.000055 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.0546 | Learning Rate: 0.000109 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 9.6130 | Learning Rate: 0.000164 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.0818 | Learning Rate: 0.000219 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 8.5594 | Learning Rate: 0.000273 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.0830 | Learning Rate: 0.000328 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 7.6730 | Learning Rate: 0.000383 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.3124 | Learning Rate: 0.000437 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.0014 | Learning Rate: 0.000492 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 6.7478 | Learning Rate: 0.000546 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 6.5264 | Learning Rate: 0.000601 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.3331 | Learning Rate: 0.000656 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.1676 | Learning Rate: 0.000710 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.0258 | Learning Rate: 0.000765 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 5.8999 | Learning Rate: 0.000820 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 5.7919 | Learning Rate: 0.000874 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 5.6908 | Learning Rate: 0.000929 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 5.6080 | Learning Rate: 0.000984 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 5.5270 | Learning Rate: 0.001038 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 5.4682 | Learning Rate: 0.001093 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 5.4024 | Learning Rate: 0.001148 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 5.3377 | Learning Rate: 0.001202 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 5.2773 | Learning Rate: 0.001257 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 5.2321 | Learning Rate: 0.001311 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.1874 | Learning Rate: 0.001366 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.1590 | Learning Rate: 0.001421 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.1198 | Learning Rate: 0.001475 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.0864 | Learning Rate: 0.001530 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.0536 | Learning Rate: 0.001585 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.0131 | Learning Rate: 0.001639 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 4.9905 | Learning Rate: 0.001694 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 4.9665 | Learning Rate: 0.001749 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 4.9321 | Learning Rate: 0.001803 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 4.9048 | Learning Rate: 0.001858 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 4.8742 | Learning Rate: 0.001913 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 4.8486 | Learning Rate: 0.001967 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 4.8184 | Learning Rate: 0.002000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 4.7980 | Learning Rate: 0.002000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 4.7749 | Learning Rate: 0.002000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 4.7431 | Learning Rate: 0.002000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 4.7112 | Learning Rate: 0.002000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 4.6885 | Learning Rate: 0.002000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 4.6652 | Learning Rate: 0.002000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 4.6406 | Learning Rate: 0.002000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 4.6121 | Learning Rate: 0.002000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 4.5906 | Learning Rate: 0.002000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 4.5626 | Learning Rate: 0.002000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 4.5467 | Learning Rate: 0.002000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 4.5282 | Learning Rate: 0.002000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 4.5011 | Learning Rate: 0.002000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 4.4776 | Learning Rate: 0.002000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 4.4530 | Learning Rate: 0.002000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 4.4366 | Learning Rate: 0.002000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 4.4041 | Learning Rate: 0.002000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 4.3785 | Learning Rate: 0.002000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 4.3594 | Learning Rate: 0.002000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 4.3443 | Learning Rate: 0.002000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 4.3304 | Learning Rate: 0.002000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 4.3171 | Learning Rate: 0.002000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 4.3000 | Learning Rate: 0.002000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 4.2921 | Learning Rate: 0.002000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 4.2719 | Learning Rate: 0.002000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 4.2488 | Learning Rate: 0.002000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 4.2281 | Learning Rate: 0.002000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 4.2028 | Learning Rate: 0.002000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 4.1845 | Learning Rate: 0.002000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 4.1590 | Learning Rate: 0.002000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 4.1295 | Learning Rate: 0.002000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 4.1115 | Learning Rate: 0.002000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 4.0819 | Learning Rate: 0.002000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 4.0539 | Learning Rate: 0.002000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 4.0431 | Learning Rate: 0.002000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 4.0154 | Learning Rate: 0.002000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 3.9932 | Learning Rate: 0.002000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 3.9726 | Learning Rate: 0.002000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 3.9562 | Learning Rate: 0.002000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 3.9468 | Learning Rate: 0.002000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 3.9317 | Learning Rate: 0.002000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 3.9134 | Learning Rate: 0.002000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 3.8928 | Learning Rate: 0.002000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 3.8700 | Learning Rate: 0.002000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 3.8520 | Learning Rate: 0.002000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 3.8230 | Learning Rate: 0.002000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 3.7972 | Learning Rate: 0.002000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 3.7820 | Learning Rate: 0.002000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 3.7699 | Learning Rate: 0.002000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 3.7566 | Learning Rate: 0.002000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 3.7449 | Learning Rate: 0.002000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 3.7241 | Learning Rate: 0.002000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 3.7098 | Learning Rate: 0.002000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 3.7001 | Learning Rate: 0.002000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 3.6903 | Learning Rate: 0.002000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 3.6793 | Learning Rate: 0.002000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 3.6738 | Learning Rate: 0.002000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 3.6604 | Learning Rate: 0.002000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 3.6461 | Learning Rate: 0.002000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 3.6370 | Learning Rate: 0.002000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 3.6308 | Learning Rate: 0.002000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 3.6153 | Learning Rate: 0.002000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 3.6026 | Learning Rate: 0.002000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 3.5924 | Learning Rate: 0.002000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 3.5840 | Learning Rate: 0.002000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 3.5676 | Learning Rate: 0.002000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 3.5569 | Learning Rate: 0.002000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 3.5555 | Learning Rate: 0.002000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 3.5490 | Learning Rate: 0.002000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 3.5458 | Learning Rate: 0.002000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 3.5316 | Learning Rate: 0.002000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 3.5295 | Learning Rate: 0.002000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 3.5193 | Learning Rate: 0.002000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 3.5156 | Learning Rate: 0.002000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 3.5098 | Learning Rate: 0.002000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 3.5101 | Learning Rate: 0.002000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 3.5041 | Learning Rate: 0.002000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 3.4893 | Learning Rate: 0.002000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 3.4848 | Learning Rate: 0.002000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 3.4756 | Learning Rate: 0.002000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 3.4717 | Learning Rate: 0.002000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 3.4694 | Learning Rate: 0.002000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 3.4680 | Learning Rate: 0.002000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 3.4655 | Learning Rate: 0.002000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 3.4528 | Learning Rate: 0.002000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 3.4496 | Learning Rate: 0.002000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 3.4472 | Learning Rate: 0.002000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 3.4466 | Learning Rate: 0.002000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 3.4393 | Learning Rate: 0.002000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 3.4373 | Learning Rate: 0.002000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 3.4210 | Learning Rate: 0.002000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 3.4112 | Learning Rate: 0.002000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 3.4126 | Learning Rate: 0.002000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 3.4166 | Learning Rate: 0.002000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 3.4013 | Learning Rate: 0.002000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 3.4000 | Learning Rate: 0.002000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 3.3998 | Learning Rate: 0.002000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 3.3816 | Learning Rate: 0.002000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 3.3776 | Learning Rate: 0.002000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 3.3725 | Learning Rate: 0.002000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 3.3699 | Learning Rate: 0.002000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 3.3708 | Learning Rate: 0.002000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 3.3578 | Learning Rate: 0.002000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 3.3589 | Learning Rate: 0.002000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 3.3537 | Learning Rate: 0.002000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 3.3652 | Learning Rate: 0.002000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 3.3576 | Learning Rate: 0.002000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 3.3610 | Learning Rate: 0.002000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 3.3587 | Learning Rate: 0.002000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 3.3589 | Learning Rate: 0.002000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 3.3538 | Learning Rate: 0.002000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 3.3502 | Learning Rate: 0.002000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 3.3429 | Learning Rate: 0.002000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 3.3353 | Learning Rate: 0.002000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 3.3263 | Learning Rate: 0.002000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 3.3278 | Learning Rate: 0.002000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 3.3330 | Learning Rate: 0.002000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 3.3234 | Learning Rate: 0.002000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 3.3295 | Learning Rate: 0.002000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 3.3255 | Learning Rate: 0.002000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 3.3255 | Learning Rate: 0.002000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 3.3158 | Learning Rate: 0.002000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 3.3233 | Learning Rate: 0.002000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 3.3176 | Learning Rate: 0.002000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 3.3000 | Learning Rate: 0.002000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 3.2949 | Learning Rate: 0.002000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 3.2907 | Learning Rate: 0.002000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 3.2926 | Learning Rate: 0.002000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 3.2954 | Learning Rate: 0.002000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 3.2895 | Learning Rate: 0.002000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 3.2985 | Learning Rate: 0.002000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 3.2893 | Learning Rate: 0.002000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 3.2895 | Learning Rate: 0.002000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 3.2877 | Learning Rate: 0.002000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 3.2787 | Learning Rate: 0.002000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 3.2774 | Learning Rate: 0.002000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 3.2726 | Learning Rate: 0.002000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 3.2701 | Learning Rate: 0.002000 | Progress: 0.71680 +Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 3.2740 | Learning Rate: 0.002000 | Progress: 0.72090 +Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 3.2809 | Learning Rate: 0.002000 | Progress: 0.72499 +Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 3.2775 | Learning Rate: 0.002000 | Progress: 0.72909 +Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 3.2770 | Learning Rate: 0.002000 | Progress: 0.73318 +Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 3.2607 | Learning Rate: 0.002000 | Progress: 0.73728 +Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 3.2466 | Learning Rate: 0.002000 | Progress: 0.74138 +Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 3.2466 | Learning Rate: 0.002000 | Progress: 0.74547 +Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 3.2428 | Learning Rate: 0.002000 | Progress: 0.74957 +Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 3.2489 | Learning Rate: 0.002000 | Progress: 0.75366 +Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 3.2437 | Learning Rate: 0.002000 | Progress: 0.75776 +Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 3.2345 | Learning Rate: 0.002000 | Progress: 0.76186 +Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 3.2283 | Learning Rate: 0.002000 | Progress: 0.76595 +Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 3.2352 | Learning Rate: 0.002000 | Progress: 0.77005 +Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 3.2390 | Learning Rate: 0.002000 | Progress: 0.77414 +Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 3.2386 | Learning Rate: 0.002000 | Progress: 0.77824 +Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 3.2404 | Learning Rate: 0.002000 | Progress: 0.78234 +Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 3.2413 | Learning Rate: 0.002000 | Progress: 0.78643 +Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 3.2377 | Learning Rate: 0.002000 | Progress: 0.79053 +Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 3.2370 | Learning Rate: 0.002000 | Progress: 0.79462 +Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 3.2366 | Learning Rate: 0.002000 | Progress: 0.79872 +Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 3.2376 | Learning Rate: 0.002000 | Progress: 0.80282 +Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 3.2245 | Learning Rate: 0.002000 | Progress: 0.80691 +Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 3.2265 | Learning Rate: 0.002000 | Progress: 0.81101 +Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 3.2274 | Learning Rate: 0.002000 | Progress: 0.81510 +Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 3.2197 | Learning Rate: 0.002000 | Progress: 0.81920 +Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 3.2257 | Learning Rate: 0.002000 | Progress: 0.82330 +Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 3.2224 | Learning Rate: 0.002000 | Progress: 0.82739 +Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 3.2227 | Learning Rate: 0.002000 | Progress: 0.83149 +Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 3.2191 | Learning Rate: 0.002000 | Progress: 0.83558 +Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 3.2120 | Learning Rate: 0.002000 | Progress: 0.83968 +Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 3.2110 | Learning Rate: 0.002000 | Progress: 0.84378 +Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 3.2097 | Learning Rate: 0.002000 | Progress: 0.84787 +Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 3.2115 | Learning Rate: 0.002000 | Progress: 0.85197 +Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 3.2113 | Learning Rate: 0.002000 | Progress: 0.85606 +Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 3.2135 | Learning Rate: 0.002000 | Progress: 0.86016 +Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 3.2118 | Learning Rate: 0.002000 | Progress: 0.86426 +Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 3.2082 | Learning Rate: 0.002000 | Progress: 0.86835 +Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 3.2100 | Learning Rate: 0.002000 | Progress: 0.87245 +Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 3.2053 | Learning Rate: 0.002000 | Progress: 0.87654 +Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 3.2066 | Learning Rate: 0.002000 | Progress: 0.88064 +Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 3.1950 | Learning Rate: 0.002000 | Progress: 0.88474 +Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 3.1932 | Learning Rate: 0.002000 | Progress: 0.88883 +Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 3.1884 | Learning Rate: 0.002000 | Progress: 0.89293 +Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 3.1887 | Learning Rate: 0.002000 | Progress: 0.89702 +Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 3.1867 | Learning Rate: 0.002000 | Progress: 0.90112 +Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 3.1845 | Learning Rate: 0.002000 | Progress: 0.90522 +Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 3.1802 | Learning Rate: 0.002000 | Progress: 0.90931 +Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 3.1818 | Learning Rate: 0.002000 | Progress: 0.91341 +Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 3.1839 | Learning Rate: 0.002000 | Progress: 0.91750 +Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 3.1766 | Learning Rate: 0.002000 | Progress: 0.92160 +Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 3.1715 | Learning Rate: 0.002000 | Progress: 0.92570 +Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 3.1704 | Learning Rate: 0.002000 | Progress: 0.92979 +Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 3.1703 | Learning Rate: 0.002000 | Progress: 0.93389 +Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 3.1682 | Learning Rate: 0.002000 | Progress: 0.93798 +Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 3.1728 | Learning Rate: 0.002000 | Progress: 0.94208 +Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 3.1642 | Learning Rate: 0.002000 | Progress: 0.94618 +Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 3.1605 | Learning Rate: 0.002000 | Progress: 0.95027 +Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 3.1612 | Learning Rate: 0.002000 | Progress: 0.95437 +Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 3.1639 | Learning Rate: 0.002000 | Progress: 0.95846 +Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 3.1602 | Learning Rate: 0.002000 | Progress: 0.96256 +Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 3.1603 | Learning Rate: 0.002000 | Progress: 0.96666 +Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 3.1727 | Learning Rate: 0.002000 | Progress: 0.97075 +Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 3.1791 | Learning Rate: 0.002000 | Progress: 0.97485 +Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 3.1783 | Learning Rate: 0.002000 | Progress: 0.97894 +Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 3.1692 | Learning Rate: 0.002000 | Progress: 0.98304 +Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 3.1725 | Learning Rate: 0.002000 | Progress: 0.98714 +Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 3.1633 | Learning Rate: 0.002000 | Progress: 0.99123 +Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 3.1604 | Learning Rate: 0.002000 | Progress: 0.99533 +Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 3.1554 | Learning Rate: 0.002000 | Progress: 0.99942 diff --git a/wandb/run-20260319_063518-29lbcxak/files/requirements.txt b/wandb/run-20260319_063518-29lbcxak/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c32285d10ba18c2e783ff2ead305d5976caef668 --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/files/requirements.txt @@ -0,0 +1,222 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +circuitsvis==1.43.3 +hf-xet==1.1.9 +param==2.2.1 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +asttokens==3.0.0 +filelock==3.19.1 +types-python-dateutil==2.9.0.20250822 +cycler==0.12.1 +stack-data==0.6.3 +jupyter_server==2.17.0 +aiosignal==1.4.0 +xyzservices==2025.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +mpmath==1.3.0 +seaborn==0.13.2 +wadler_lindig==0.1.7 +nbformat==5.10.4 +panel==1.8.0 +accelerate==1.10.1 +plotly==6.3.0 +narwhals==2.4.0 +huggingface-hub==0.34.4 +sentencepiece==0.2.1 +torchvision==0.23.0 +ipython==9.5.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +charset-normalizer==3.4.3 +jupyter-events==0.12.0 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +threadpoolctl==3.6.0 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +requests==2.32.5 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +rpds-py==0.27.1 +wandb==0.21.4 +jedi==0.19.2 +pathspec==0.12.1 +transformer-lens==2.16.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +jupyter==1.1.1 +protobuf==6.32.1 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +cfgv==3.4.0 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +beautifulsoup4==4.13.5 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +regex==2025.9.1 +pycparser==2.23 +soupsieve==2.8 +pytest-cov==7.0.0 +sniffio==1.3.1 +mypy==1.18.1 +notebook==7.4.5 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +zstandard==0.25.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +pyzmq==27.1.0 +jupyterlab==4.4.7 +toy_models==0.1.0 +torchaudio==2.8.0 +cffi==2.0.0 +mypy_extensions==1.1.0 +attrs==25.3.0 +statsmodels==0.14.6 +transformers==4.56.1 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +bokeh==3.8.0 +httpx==0.28.1 +setuptools==80.9.0 +argon2-cffi==25.1.0 +patsy==1.0.2 +multidict==6.6.4 +pyviz_comms==3.0.6 +arrow==1.3.0 +scikit-learn==1.8.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +markdown-it-py==4.0.0 +pandas==2.3.2 +virtualenv==20.34.0 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +nodeenv==1.9.1 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +Send2Trash==1.8.3 +coverage==7.10.6 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +json5==0.12.1 +linkify-it-py==2.0.3 +importlib_metadata==8.7.0 +nvidia-cufft-cu12==11.3.3.83 +distlib==0.4.0 +typing-inspection==0.4.1 +identify==2.6.14 +nvidia-cufile-cu12==1.13.1.3 +scipy==1.17.0 +mdurl==0.1.2 +websocket-client==1.8.0 +jsonschema==4.25.1 +python-json-logger==3.3.0 +typing_extensions==4.15.0 +tokenizers==0.22.0 +ipympl==0.9.7 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +h5py==3.14.0 +tabulate==0.9.0 +propcache==0.3.2 +ruff==0.13.0 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pluggy==1.6.0 +pydantic==2.11.7 +zipp==3.23.0 +fancy-einsum==0.0.3 +fastjsonschema==2.21.2 +datasets==4.0.0 +fonttools==4.59.2 +executing==2.2.1 +pillow==11.3.0 +uc-micro-py==1.0.3 +Markdown==3.9 +pre_commit==4.3.0 +aiohttp==3.12.15 +mistune==3.1.4 +tzdata==2025.2 +parso==0.8.5 +triton==3.4.0 +kiwisolver==1.4.9 +idna==3.10 +multiprocess==0.70.16 +dill==0.3.8 +jupyter-lsp==2.3.0 +platformdirs==4.4.0 +sentry-sdk==2.37.1 +prompt_toolkit==3.0.52 +jsonschema-specifications==2025.9.1 +pytest==8.4.2 +mdit-py-plugins==0.5.0 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +joblib==1.5.3 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +holoviews==1.21.0 +matplotlib==3.10.6 +colorcet==3.1.0 +uri-template==1.3.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +iniconfig==2.1.0 +traitlets==5.14.3 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20260319_063518-29lbcxak/files/wandb-metadata.json b/wandb/run-20260319_063518-29lbcxak/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5488be5289873e17ad9280121bb25211d8a0e0e6 --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2026-03-19T06:35:18.692601Z", + "program": "", + "git": { + "remote": "https://github.com/jgroh3/toy_models.git", + "commit": "d722bb952956265d0387df9c35a76703a66824ec" + }, + "email": "tzfof8@gmail.com", + "root": "/notebooks/toy_models/model_training/model", + "host": "na3g1nqrym", + "executable": "/notebooks/toy_models/.toy_models_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "133357727744" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-7a4c6671-13c7-c5cf-2024-6b167a982ac9" + } + ], + "cudaVersion": "12.4", + "writerId": "etlry1vz32kjtednysvbuccj86zuz6bd" +} \ No newline at end of file diff --git a/wandb/run-20260319_063518-29lbcxak/files/wandb-summary.json b/wandb/run-20260319_063518-29lbcxak/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..90781817d0f1e6d106215e0cf7deadfce78c416c --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/files/wandb-summary.json @@ -0,0 +1 @@ +{"tokens_seen":199884800,"_timestamp":1.7739096001040854e+09,"train_loss_ewma":3.1553501937332564,"learning_rate":0.002,"train_loss":2.9686617851257324,"_runtime":7484.887482683,"progress":0.999424,"_step":6100,"_wandb":{"runtime":7484},"step":6100,"tokens_per_second":32768} \ No newline at end of file diff --git a/wandb/run-20260319_063518-29lbcxak/logs/debug-core.log b/wandb/run-20260319_063518-29lbcxak/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ea408acf3562f4468fe4199f97f51b305a2e20fe --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/logs/debug-core.log @@ -0,0 +1,16 @@ +{"time":"2026-03-19T06:35:19.161304359Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpozb5m68l/port-2015.txt","pid":2015,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-03-19T06:35:19.162022748Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2015-2051-1497119454/socket","Net":"unix"}} +{"time":"2026-03-19T06:35:19.162144524Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":2015} +{"time":"2026-03-19T06:35:19.200134107Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-03-19T06:35:19.214038916Z","level":"INFO","msg":"handleInformInit: received","streamId":"29lbcxak","id":"1(@)"} +{"time":"2026-03-19T06:35:19.454984852Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"29lbcxak","id":"1(@)"} +{"time":"2026-03-19T08:40:08.65048688Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"29lbcxak","id":"1(@)"} +{"time":"2026-03-19T08:40:08.652717133Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"29lbcxak","id":"1(@)"} +{"time":"2026-03-19T08:40:08.65273196Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2026-03-19T08:40:08.652743547Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2026-03-19T08:40:08.652752436Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2026-03-19T08:40:08.652765601Z","level":"INFO","msg":"server is shutting down"} +{"time":"2026-03-19T08:40:08.652780366Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2026-03-19T08:40:08.652785852Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2026-03-19T08:40:08.65285183Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-2015-2051-1497119454/socket","Net":"unix"}} +{"time":"2026-03-19T08:40:08.652877202Z","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20260319_063518-29lbcxak/logs/debug-internal.log b/wandb/run-20260319_063518-29lbcxak/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d0ef417f5fa4b559f89e60d07c150e2e3f061f99 --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-03-19T06:35:19.214150803Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-03-19T06:35:19.454890439Z","level":"INFO","msg":"stream: created new stream","id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.454977401Z","level":"INFO","msg":"stream: started","id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.455055746Z","level":"INFO","msg":"sender: started","stream_id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.455093161Z","level":"INFO","msg":"writer: started","stream_id":"29lbcxak"} +{"time":"2026-03-19T06:35:19.455093346Z","level":"INFO","msg":"handler: started","stream_id":"29lbcxak"} +{"time":"2026-03-19T08:40:04.994899105Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.001043706}],"total_operations":1}} +{"time":"2026-03-19T08:40:05.630105109Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-03-19T08:40:08.650518881Z","level":"INFO","msg":"stream: closing","id":"29lbcxak"} +{"time":"2026-03-19T08:40:08.650561694Z","level":"INFO","msg":"handler: closed","stream_id":"29lbcxak"} +{"time":"2026-03-19T08:40:08.650605449Z","level":"INFO","msg":"sender: closed","stream_id":"29lbcxak"} +{"time":"2026-03-19T08:40:08.650623595Z","level":"INFO","msg":"stream: closed","id":"29lbcxak"} diff --git a/wandb/run-20260319_063518-29lbcxak/logs/debug.log b/wandb/run-20260319_063518-29lbcxak/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..cfed7b25c4e3c86ac39418d92b12214e64089747 --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/logs/debug.log @@ -0,0 +1,26 @@ +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Configure stats pid to 2015 +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260319_063518-29lbcxak/logs/debug.log +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260319_063518-29lbcxak/logs/debug-internal.log +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:init():813] calling init triggers +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_replace_17367_new', 'n_layers': 8, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 8, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/PL_Replace17367_L2_alldataset', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.002, 'lr_vector': 0.001, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-03-19 06:35:18,696 INFO MainThread:2015 [wandb_init.py:init():854] starting backend +2026-03-19 06:35:19,200 INFO MainThread:2015 [wandb_init.py:init():857] sending inform_init request +2026-03-19 06:35:19,210 INFO MainThread:2015 [wandb_init.py:init():865] backend started and connected +2026-03-19 06:35:19,211 INFO MainThread:2015 [wandb_init.py:init():936] updated telemetry +2026-03-19 06:35:19,221 INFO MainThread:2015 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-03-19 06:35:20,106 INFO MainThread:2015 [wandb_init.py:init():1011] starting run threads in backend +2026-03-19 06:35:20,927 INFO MainThread:2015 [wandb_run.py:_console_start():2506] atexit reg +2026-03-19 06:35:20,927 INFO MainThread:2015 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-03-19 06:35:20,928 INFO MainThread:2015 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-03-19 06:35:20,928 INFO MainThread:2015 [wandb_run.py:_redirect():2446] Redirects installed. +2026-03-19 06:35:20,941 INFO MainThread:2015 [wandb_init.py:init():1049] run started, returning control to user process +2026-03-19 08:40:04,989 INFO MainThread:2015 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/29lbcxak +2026-03-19 08:40:04,993 INFO MainThread:2015 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-03-19 08:40:04,993 INFO MainThread:2015 [wandb_run.py:_restore():2453] restore +2026-03-19 08:40:04,993 INFO MainThread:2015 [wandb_run.py:_restore():2459] restore done +2026-03-19 08:40:08,649 INFO MainThread:2015 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260319_063518-29lbcxak/run-29lbcxak.wandb b/wandb/run-20260319_063518-29lbcxak/run-29lbcxak.wandb new file mode 100644 index 0000000000000000000000000000000000000000..59c435352a3ae83b62b61a4ba74ea927ec764e24 --- /dev/null +++ b/wandb/run-20260319_063518-29lbcxak/run-29lbcxak.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d4717fab4dfd9608b4916cf22219b3b800bd864fa4b9d8e15c2d94d9e385ace +size 4214848