diff --git a/.gitattributes b/.gitattributes index fd857501c7ca4b874b61dd810867d733bc2ffeaf..d1c7997026325cad4baa5f0b0a95f2ae81223f92 100644 --- a/.gitattributes +++ b/.gitattributes @@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json index b8a7704ce050180271ca25f2f255fcf595a47b6c..ff44a084aafa754e5474df41ea15c374a23fa268 100644 --- a/checkpoints/metadata_000000032768.json +++ b/checkpoints/metadata_000000032768.json @@ -1 +1 @@ -{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.47806167602539} \ No newline at end of file +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.47472858428955} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json index d42d566ea67f268fa6f57813ee4d541346608b09..8ce0422abf1ed1343400bcc795303c1f9c616194 100644 --- a/checkpoints/metadata_000000327680.json +++ b/checkpoints/metadata_000000327680.json @@ -1 +1 @@ -{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.47627660688796} \ No newline at end of file +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.472703523224029} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json index 3d7caabc185ac1f7af4a964848e301009e0bee87..5f86bf14630ffed3dfd8509fb27841b4158561a2 100644 --- a/checkpoints/metadata_000000360448.json +++ b/checkpoints/metadata_000000360448.json @@ -1 +1 @@ -{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.475470339918813} \ No newline at end of file +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.471990140793302} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json index 8fe398142a24e02b10c494cb6ff60557b8f9f5f4..f890a96d7e29d632ed10db4aff1abeeb4da6adaf 100644 --- a/checkpoints/metadata_000000425984.json +++ b/checkpoints/metadata_000000425984.json @@ -1 +1 @@ -{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.473083299528513} \ No newline at end of file +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.469721053741852} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json index f162026671441be3ff60f893882f1a74bc817645..dcf880858f9f072a94a3ae49e79d7227ce2d792e 100644 --- a/checkpoints/metadata_000000458752.json +++ b/checkpoints/metadata_000000458752.json @@ -1 +1 @@ -{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.471743989894433} \ No newline at end of file +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.468519898807461} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json index 4c79139f1f06a20bc53600249f641d0ccb6435b6..f71dda81d7464c25452230a045723a1c40ad90b8 100644 --- a/checkpoints/metadata_000000491520.json +++ b/checkpoints/metadata_000000491520.json @@ -1 +1 @@ -{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.470174342811651} \ No newline at end of file +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.467125200460256} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json index aaeb3f54a99814ce2d5cd77e2a76e06123246521..4469dc93cea90e4e284ab879ad82de065e161d52 100644 --- a/checkpoints/metadata_000000557056.json +++ b/checkpoints/metadata_000000557056.json @@ -1 +1 @@ -{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.466137204272727} \ No newline at end of file +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.463455310860898} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json index dc64e5070c0c8acda29fae1e838e840583d89fda..169f67b8b435103d417869bd206168a0e27f9d85 100644 --- a/checkpoints/metadata_000000622592.json +++ b/checkpoints/metadata_000000622592.json @@ -1 +1 @@ -{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.461292862352218} \ No newline at end of file +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.459099738819177} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json index ee621bde74f1b70e7745f6d6982ee0ce657f9468..760d87d6aadad649e919701a79502eb04732aa4e 100644 --- a/checkpoints/metadata_000000688128.json +++ b/checkpoints/metadata_000000688128.json @@ -1 +1 @@ -{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.455025942618947} \ No newline at end of file +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.453407187818708} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json index 1baef2ebe8e428cd1b1f752d67724878e2d5fd9f..ede06b34062135e04a840100102f55a312046402 100644 --- a/checkpoints/metadata_000000753664.json +++ b/checkpoints/metadata_000000753664.json @@ -1 +1 @@ -{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.446919057636006} \ No newline at end of file +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.446099322563402} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json index 62bf26e4dcb7882b53b7443ab588061ba613e705..57f305d5c02a894fd0fb94b03bb1d3bf5fbecd06 100644 --- a/checkpoints/metadata_000000819200.json +++ b/checkpoints/metadata_000000819200.json @@ -1 +1 @@ -{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.438280975135724} \ No newline at end of file +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.438267091556018} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json index 2f0e48b7dcf9d3e421a608b288bc8edd356acd51..fa6312c3000fdd374487baa1d1c54c8ed70d0730 100644 --- a/checkpoints/metadata_000000917504.json +++ b/checkpoints/metadata_000000917504.json @@ -1 +1 @@ -{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.422044364253066} \ No newline at end of file +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.423273612378466} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json index 699cee11d0214ac7e5944ecdd04e8cb13a300994..7df98d9217407dbaf03e1224946390148fe1a478 100644 --- a/checkpoints/metadata_000000983040.json +++ b/checkpoints/metadata_000000983040.json @@ -1 +1 @@ -{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.409236852859813} \ No newline at end of file +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.411245689106972} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json index 141574201d643111e2cf18c3c5d36f20e45211a4..61cc72f65bfff95fa8815f469a1e2b6e9859595d 100644 --- a/checkpoints/metadata_000001114112.json +++ b/checkpoints/metadata_000001114112.json @@ -1 +1 @@ -{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.380797570445656} \ No newline at end of file +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.383828797056317} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json index e03c279a8bed8806a93a4b404c61377ac32160c4..5cda40b1bb7426ee1f31d21d62c169b32e645a2a 100644 --- a/checkpoints/metadata_000001212416.json +++ b/checkpoints/metadata_000001212416.json @@ -1 +1 @@ -{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.3582079333492} \ No newline at end of file +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.361741983796891} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json index fcdd216343e123858497be637c6bb7b0e680f4c6..17654f11fb2396b31d4a563f94dfca2094779a92 100644 --- a/checkpoints/metadata_000001343488.json +++ b/checkpoints/metadata_000001343488.json @@ -1 +1 @@ -{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.325794613706586} \ No newline at end of file +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.329621238518811} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json index 0d18f8776a72ed8ccd933da3d907f83603159d46..57747bb43b744b5cfaddeaa93681e48b47393f9d 100644 --- a/checkpoints/metadata_000001474560.json +++ b/checkpoints/metadata_000001474560.json @@ -1 +1 @@ -{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.289813354053809} \ No newline at end of file +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.29327811546785} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json index b36f5e561d2af918ee2cba8a2ea944fffa623b7e..f58443b698730310105307f06dc671c835d7c45e 100644 --- a/checkpoints/metadata_000001605632.json +++ b/checkpoints/metadata_000001605632.json @@ -1 +1 @@ -{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.250201881192115} \ No newline at end of file +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.253395153119124} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json index b3ef822016a8a8f7beb31e27a3229e9be5246628..fd5b42aa72cb3fbfa104058f7fe0976e0763565e 100644 --- a/checkpoints/metadata_000001769472.json +++ b/checkpoints/metadata_000001769472.json @@ -1 +1 @@ -{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.19959723315217} \ No newline at end of file +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.20219309142512} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json index c2a1487afc63bae04e991f52ae073bf2a1a19546..833734f418268eb90e2f46c9025c5872770665f3 100644 --- a/checkpoints/metadata_000001966080.json +++ b/checkpoints/metadata_000001966080.json @@ -1 +1 @@ -{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.13133821097923} \ No newline at end of file +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.133052898451657} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json index 0987d20d4d9e2bed5b80ffa13991890f19111b52..fc305f8231544b3175065c791cdf3f3dbf671f23 100644 --- a/checkpoints/metadata_000002162688.json +++ b/checkpoints/metadata_000002162688.json @@ -1 +1 @@ -{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.0570300679636} \ No newline at end of file +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.057631285417521} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json index e4cc5f37f9cc6cf896d9020ba39870d9c5207eaf..ec9114785ac35b157295da59260e093b87222ed4 100644 --- a/checkpoints/metadata_000002359296.json +++ b/checkpoints/metadata_000002359296.json @@ -1 +1 @@ -{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.974723497158761} \ No newline at end of file +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.97402740012332} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json index 229c04826656632bc75f6db36a8162b0844225db..9b5f8392cd19f09aa15165606806faf612079ea2 100644 --- a/checkpoints/metadata_000002621440.json +++ b/checkpoints/metadata_000002621440.json @@ -1 +1 @@ -{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.858762341449337} \ No newline at end of file +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.856754308431446} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json index 9c16151176a43d6ba48c2cf9d77e7d0842e1cd5b..f3d388f25073232f44e21a2d44bd8672f3342f43 100644 --- a/checkpoints/metadata_000002883584.json +++ b/checkpoints/metadata_000002883584.json @@ -1 +1 @@ -{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.730080927623849} \ No newline at end of file +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.726206998293742} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json index 022fba1fc00a947fd087ddbc78cd0d5f32e0efb1..3088c0f9c63292080302b9f7a90b4b6705945a43 100644 --- a/checkpoints/metadata_000003178496.json +++ b/checkpoints/metadata_000003178496.json @@ -1 +1 @@ -{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.582970532481923} \ No newline at end of file +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.577359730084346} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json index 4c4a6bda07a832da46b9026337d8e10541540c70..3bdf20a37639fae37b731f501c37d02494aeaa6d 100644 --- a/checkpoints/metadata_000003473408.json +++ b/checkpoints/metadata_000003473408.json @@ -1 +1 @@ -{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.424478446215208} \ No newline at end of file +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.417083930648644} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json index 725ce05c5d991e1f3dd8cdb521eaa8f1ff8e82cd..9eaf8a8e7a7d69389e7d5d824fb2e102d45413c2 100644 --- a/checkpoints/metadata_000003833856.json +++ b/checkpoints/metadata_000003833856.json @@ -1 +1 @@ -{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.225284321418476} \ No newline at end of file +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.216006205212386} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json index e281a19b012143bbf7a5e49a68ca79fcd9013594..9ef9c58749ecef4f8489abd78500ce31e9de9a9f 100644 --- a/checkpoints/metadata_000004227072.json +++ b/checkpoints/metadata_000004227072.json @@ -1 +1 @@ -{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.99394007776402} \ No newline at end of file +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.982881949139617} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json index b705fd1804265a2b8c8b9272d36c894ceb86cbf2..f90891326e5bc88212c8e48a74d5753f34d840df 100644 --- a/checkpoints/metadata_000004653056.json +++ b/checkpoints/metadata_000004653056.json @@ -1 +1 @@ -{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.744109641520092} \ No newline at end of file +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.732197937516098} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json index 674db93cfc7bfa57cfbb6bb6a7ea503278c9964f..b30fe773f60284a1ff2d4b1493bb3209f674e24a 100644 --- a/checkpoints/metadata_000005111808.json +++ b/checkpoints/metadata_000005111808.json @@ -1 +1 @@ -{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.493460174429384} \ No newline at end of file +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.482621967493646} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json index 5ec22e9a8d4a96a9fc014de4122cb5fea5df3543..1f631700de350606383e6a10ad869dfc1cf83df7 100644 --- a/checkpoints/metadata_000005603328.json +++ b/checkpoints/metadata_000005603328.json @@ -1 +1 @@ -{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.236969204325344} \ No newline at end of file +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.22768931743315} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json index 1b7d974dcfa9137c070ccd4821c0dd18cf205bbd..1115309a4798c9088e10bb0870da72a1bacb08e5 100644 --- a/checkpoints/metadata_000006193152.json +++ b/checkpoints/metadata_000006193152.json @@ -1 +1 @@ -{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.953434445580491} \ No newline at end of file +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.945035313729822} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json index 522eaa9db8475977b9e6bdc1dd093a2c00068144..b9aa0541e818adabdeb9b428504248b376a9a34f 100644 --- a/checkpoints/metadata_000006782976.json +++ b/checkpoints/metadata_000006782976.json @@ -1 +1 @@ -{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.701950555362339} \ No newline at end of file +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.694594853436912} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json index 99d6b4f594adf36b9c235b7e80a492b6b8559fa2..6a69b982ffb9ae6bf611ca89c34439195f84f0de 100644 --- a/checkpoints/metadata_000007471104.json +++ b/checkpoints/metadata_000007471104.json @@ -1 +1 @@ -{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.42616407882357} \ No newline at end of file +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.419854967755447} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json index 59813b8f5d1563a930c257626e2f884613387b9e..25de7fad0c012c8e854f51f055664a154b89b218 100644 --- a/checkpoints/metadata_000008224768.json +++ b/checkpoints/metadata_000008224768.json @@ -1 +1 @@ -{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.164880585767827} \ No newline at end of file +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.159385434456035} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json index 8a63166dc6e3c2a6edc73070bf7be87c14a84a77..6ca5b76834dfc2ba5254b212d17b641471af9d28 100644 --- a/checkpoints/metadata_000009043968.json +++ b/checkpoints/metadata_000009043968.json @@ -1 +1 @@ -{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.909649069151253} \ No newline at end of file +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.904178105557777} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json index c3517fd833e0f7cadb502a735212b52d2f4107c4..8cd6a284051989e63d303001e841081e0ad82cc7 100644 --- a/checkpoints/metadata_000009961472.json +++ b/checkpoints/metadata_000009961472.json @@ -1 +1 @@ -{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.658180864800952} \ No newline at end of file +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.653267985439542} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json index d4bd723383d6fc38c44747393bddd3cd111d57cd..5cb321ed1a85cbc50fdd3d0e5d22ef606c076820 100644 --- a/checkpoints/metadata_000010944512.json +++ b/checkpoints/metadata_000010944512.json @@ -1 +1 @@ -{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.4353112753232855} \ No newline at end of file +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.430630889399794} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json index 92877621131e2e706f1f6f358c8032d01e012524..2d9db15c0aacbfb2796c469dc615bd71dd158d8e 100644 --- a/checkpoints/metadata_000012058624.json +++ b/checkpoints/metadata_000012058624.json @@ -1 +1 @@ -{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.221434970794934} \ No newline at end of file +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.216337437233496} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json index eae251b83789c3befe5d920139e3123303fd4b10..2a294d851e292b4a5633f1eacd4ae79d6961b240 100644 --- a/checkpoints/metadata_000013271040.json +++ b/checkpoints/metadata_000013271040.json @@ -1 +1 @@ -{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.030378599797957} \ No newline at end of file +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.02448773214943} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json index 16225b102b9c4ba26bfbb61a1e2354b122728295..26ac4ebf8b8a5e7faf04cfa24c95588e902d0349 100644 --- a/checkpoints/metadata_000014581760.json +++ b/checkpoints/metadata_000014581760.json @@ -1 +1 @@ -{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.85191591132156} \ No newline at end of file +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.847221819414238} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json index 7871463ae5e10f6b474f59ed222854d3b52bb8e4..590173c59916f4cd7b11e47b8cd24bc88dbfc63a 100644 --- a/checkpoints/metadata_000016056320.json +++ b/checkpoints/metadata_000016056320.json @@ -1 +1 @@ -{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.68748776856525} \ No newline at end of file +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.685284731362977} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json index 0791b7ee0fc6cd8a0d751ae8eb9047aec7eb5111..ffe89385a5087cbf9731d347ded5b32427e134d8 100644 --- a/checkpoints/metadata_000016384000.json +++ b/checkpoints/metadata_000016384000.json @@ -1 +1 @@ -{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.661260096068413} \ No newline at end of file +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.659568437263502} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json index e4d8a81e6e63ab148e54ccd4a0c64ced8f1c54cb..e837633165c3119c6c67db088cf8ba2570624478 100644 --- a/checkpoints/metadata_000017661952.json +++ b/checkpoints/metadata_000017661952.json @@ -1 +1 @@ -{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.539440078913039} \ No newline at end of file +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.53900757717826} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json index 795a61c7538edf75fadbc573f4c1a6a3efd34000..02977bc9c72b837edcb76061d0dffca0d3485675 100644 --- a/checkpoints/metadata_000019431424.json +++ b/checkpoints/metadata_000019431424.json @@ -1 +1 @@ -{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.392000028286869} \ No newline at end of file +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.394161902688204} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json index c61b790b1e58816b8c6ad37b2c9dd1a7570b25d4..0c6c1fd1641f10c33c9434363114be5f46d675c1 100644 --- a/checkpoints/metadata_000021364736.json +++ b/checkpoints/metadata_000021364736.json @@ -1 +1 @@ -{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.281319973885694} \ No newline at end of file +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.283358080025116} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json index 8b6585cbaf873238467d8c3501ab32ffa66b637d..5981d85d8e173e9686e2a59840941621345cefd2 100644 --- a/checkpoints/metadata_000023494656.json +++ b/checkpoints/metadata_000023494656.json @@ -1 +1 @@ -{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.169868723380111} \ No newline at end of file +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.175760950696042} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json index f33b26f878db5f9b249b6d9af9415a415a9044f4..c0498817e058407d93aa996aa938260f173b6107 100644 --- a/checkpoints/metadata_000025853952.json +++ b/checkpoints/metadata_000025853952.json @@ -1 +1 @@ -{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.055504738591524} \ No newline at end of file +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.0629671796907045} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json index 66b591df2df3b2b40f9b89e19e29f7d9b0bb30f3..eaea19c57783d15384c951ebdf687d41fd92a60e 100644 --- a/checkpoints/metadata_000028442624.json +++ b/checkpoints/metadata_000028442624.json @@ -1 +1 @@ -{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.954886293959172} \ No newline at end of file +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.963105635300831} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json index ac6d4f57c0dcddf36e0e4bee3210521fe8c37c8f..a18268521149516c134baa8844e4e75bfd6db3b3 100644 --- a/checkpoints/metadata_000031293440.json +++ b/checkpoints/metadata_000031293440.json @@ -1 +1 @@ -{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.851076791499597} \ No newline at end of file +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.865391816028884} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json index c41eb409957a149e6a51386411cf35f708a14e5b..a4a31d4225bbea49c2eacd5409dbd9c51168d59a 100644 --- a/checkpoints/metadata_000032768000.json +++ b/checkpoints/metadata_000032768000.json @@ -1 +1 @@ -{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.7911227754693355} \ No newline at end of file +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.812689049158225} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json index 4492c2ab7920610c1664e2bfed985120d5a0a2dd..a68b397fdef42ad1922edff35ba07d71336569f7 100644 --- a/checkpoints/metadata_000034439168.json +++ b/checkpoints/metadata_000034439168.json @@ -1 +1 @@ -{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.724863075960573} \ No newline at end of file +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.753625057604216} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json index 400734ca04f2abd7b190cd3da788988a890f6216..44634a3befed1723b762b6b24cc20b50185aadba 100644 --- a/checkpoints/metadata_000037879808.json +++ b/checkpoints/metadata_000037879808.json @@ -1 +1 @@ -{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.601169516817602} \ No newline at end of file +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.641372455953141} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json index c34c2d7368aa04268987cd74fd7e077b0c7c0897..6ebf5d90537b189dbddec1ae5aff4c110c49c2ce 100644 --- a/checkpoints/metadata_000041648128.json +++ b/checkpoints/metadata_000041648128.json @@ -1 +1 @@ -{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.486551862501883} \ No newline at end of file +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.529437655293409} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json index b61af622c244d21522e59812abacdd7397c77481..561d0ed658b95f638c9dfbefecba10260848f7c4 100644 --- a/checkpoints/metadata_000045842432.json +++ b/checkpoints/metadata_000045842432.json @@ -1 +1 @@ -{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.381150849922109} \ No newline at end of file +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.429992510229746} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json index a245ac5d882cff4d36b700408e920214664a80d8..f58e0f35ce0d556166c16df04de837d0dc59b62e 100644 --- a/checkpoints/metadata_000049152000.json +++ b/checkpoints/metadata_000049152000.json @@ -1 +1 @@ -{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.324639904720941} \ No newline at end of file +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3700635583946745} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json index badddafe6eb760acafe95a7b859e7c742b39c8ea..915e41755cb397f5b943211c4314449951073a04 100644 --- a/checkpoints/metadata_000050397184.json +++ b/checkpoints/metadata_000050397184.json @@ -1 +1 @@ -{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3145797956722785} \ No newline at end of file +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.358773860729051} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json index 8ec28fbc614e9c1aaa1e91ca920322588046cb94..d0b29dae8e6bf176faf72fc0ddef8e07c4a12e54 100644 --- a/checkpoints/metadata_000055443456.json +++ b/checkpoints/metadata_000055443456.json @@ -1 +1 @@ -{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.2051704572488475} \ No newline at end of file +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.25258944505606} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json index 556fdebeccab6bdfe632de548af5ada71be43d8d..cf943a25c069d7264540ce4f729d780df1725bc5 100644 --- a/checkpoints/metadata_000061014016.json +++ b/checkpoints/metadata_000061014016.json @@ -1 +1 @@ -{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.134354490996065} \ No newline at end of file +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.183231115945514} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json index bb529b5b0cd935e4afb3680420849cbf71aa27cb..095034cb30f07d2c539d55632afbe2bbc445e026 100644 --- a/checkpoints/metadata_000065536000.json +++ b/checkpoints/metadata_000065536000.json @@ -1 +1 @@ -{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.110466300872504} \ No newline at end of file +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.158511824063886} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json index 1757eb988ac492c771c269c1acdae12053d66363..9e43bbafbeefa64ef788c0aeefa00c58293ecf34 100644 --- a/checkpoints/metadata_000067108864.json +++ b/checkpoints/metadata_000067108864.json @@ -1 +1 @@ -{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.087188933090007} \ No newline at end of file +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.13458860322571} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json index feae5e8c48e60711ffdbb85611b8ba987fabba14..0bd394ca6a17cbbfc2527b46aef66c34741cde2f 100644 --- a/checkpoints/metadata_000073826304.json +++ b/checkpoints/metadata_000073826304.json @@ -1 +1 @@ -{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.97070491059368} \ No newline at end of file +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.024738042947748} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json index a0c4218d2d588662edcbd50049228cb1fb4ebe0f..b698ae94e97926f7a8d1723c0dc6fa28f9d0d06d 100644 --- a/checkpoints/metadata_000081199104.json +++ b/checkpoints/metadata_000081199104.json @@ -1 +1 @@ -{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8737115629455845} \ No newline at end of file +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.927671735758744} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json index 3a831bc92542c503310faaa6aa9e960341ee8c76..725394c0ba78bb90207199c17cc59388220c1451 100644 --- a/checkpoints/metadata_000081920000.json +++ b/checkpoints/metadata_000081920000.json @@ -1 +1 @@ -{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8585853468518287} \ No newline at end of file +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.9130802265358606} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json index 1900aa4a0762fb0a255dc1645e91ea221d4d7027..1bcdd508cbe265fd9f824ee54812d9584252137c 100644 --- a/checkpoints/metadata_000089325568.json +++ b/checkpoints/metadata_000089325568.json @@ -1 +1 @@ -{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7747637464189885} \ No newline at end of file +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8304603164886952} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json index 69c03cfbca3f59ddc3750ac1161f9fc26fe79a99..9cf92798184b3a206d8942ee6cb0f4bd235e678b 100644 --- a/checkpoints/metadata_000098271232.json +++ b/checkpoints/metadata_000098271232.json @@ -1 +1 @@ -{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7082875029547786} \ No newline at end of file +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7642715290485924} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json index fe05efda3ec61761ebcd661636e651d38f216499..e21cb3f0b0a4e4144214d84c110a74c9c298cd7a 100644 --- a/checkpoints/metadata_000098304000.json +++ b/checkpoints/metadata_000098304000.json @@ -1 +1 @@ -{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7076009311021108} \ No newline at end of file +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7636144447471813} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json index ce2f7c30bb5007b0d289b584e6f89fda470f5258..57c9f28533becf32b762d640094e8681ea43dae5 100644 --- a/checkpoints/metadata_000108068864.json +++ b/checkpoints/metadata_000108068864.json @@ -1 +1 @@ -{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.636359072621687} \ No newline at end of file +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.694283825208702} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json index 227048f0521935ea72250b63a45ae9745aae6a27..9b478b9812509599b5e76f70f061a8571e8c476c 100644 --- a/checkpoints/metadata_000114688000.json +++ b/checkpoints/metadata_000114688000.json @@ -1 +1 @@ -{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.593479082164686} \ No newline at end of file +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.65164887493004} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json index d479fbddbf8e5db81adc95f020df87b47c80b8fb..15832bff87e29c66cbf794c5943c408098a1ce57 100644 --- a/checkpoints/metadata_000118882304.json +++ b/checkpoints/metadata_000118882304.json @@ -1 +1 @@ -{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.601109134096057} \ No newline at end of file +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6573982438370076} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json index 2750ed3e5e1ef1fa5c25cf17093eb16d000e9381..0fe5c214dd2fc92cc0df2fd5e2f098bb0370f8c4 100644 --- a/checkpoints/metadata_000130777088.json +++ b/checkpoints/metadata_000130777088.json @@ -1 +1 @@ -{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.550488793464195} \ No newline at end of file +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6080018543413277} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json index 18849e9ca96ae5b68de14fc0c43a78c7ba26c957..a13961bcfcdbf04681adce56586f3d6b7e1ab6d4 100644 --- a/checkpoints/metadata_000131072000.json +++ b/checkpoints/metadata_000131072000.json @@ -1 +1 @@ -{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5571278406617366} \ No newline at end of file +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.6143836509798017} \ No newline at end of file diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json index 448586e0d1ab30dc4d690a5b53269d25e1601eec..289dd038fe4246d0751ffac83e6f51d71ed9b446 100644 --- a/checkpoints/metadata_000143851520.json +++ b/checkpoints/metadata_000143851520.json @@ -1 +1 @@ -{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.516337038989698} \ No newline at end of file +{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5763677759688544} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json index 17e8a812b87c0d027a0353f3a5529ff5481b4209..f5a0e93b92798fd436c3bde2b3792a23911de7c4 100644 --- a/checkpoints/metadata_000147456000.json +++ b/checkpoints/metadata_000147456000.json @@ -1 +1 @@ -{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5048954366844023} \ No newline at end of file +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5651027425659474} \ No newline at end of file diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json index 1a74e1ce5e1f7800d7e27db85ce7587d4abf0598..a7e806f1fbbf1d467b33aee6e9b1a463a4a84718 100644 --- a/checkpoints/metadata_000158269440.json +++ b/checkpoints/metadata_000158269440.json @@ -1 +1 @@ -{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4822720271960272} \ No newline at end of file +{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5430792461603593} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json index 5a6222b1f86d0e2a9b09aed0e616d860deff098d..45c60bd6ceb16eb6e7db403ca3d28a38afc42bea 100644 --- a/checkpoints/metadata_000163840000.json +++ b/checkpoints/metadata_000163840000.json @@ -1 +1 @@ -{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.464319650781479} \ No newline at end of file +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.52576845325292} \ No newline at end of file diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json index 820bc4775926ef6f018b4d36a759993c6b617299..5e8ce295d26be0f92744862fa8cf6eb4e4199404 100644 --- a/checkpoints/metadata_000174096384.json +++ b/checkpoints/metadata_000174096384.json @@ -1 +1 @@ -{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.457515758075735} \ No newline at end of file +{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.518635521589029} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json index 4917538da5440464bf3d9d54e46618a83d4b6a26..b7ef04172a16775033215df9cf3f488490da10a0 100644 --- a/checkpoints/metadata_000180224000.json +++ b/checkpoints/metadata_000180224000.json @@ -1 +1 @@ -{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.435171628668802} \ No newline at end of file +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.496639097510411} \ No newline at end of file diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json index dc49e0d1b798e38bc1aff80cc8aef63848f28682..15c3593964ade8598a247720fdc328e407d571d4 100644 --- a/checkpoints/metadata_000191496192.json +++ b/checkpoints/metadata_000191496192.json @@ -1 +1 @@ -{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4124552194940834} \ No newline at end of file +{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4751529632338487} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json index cb221ea4c8266a72c3feb632c6e8ce269ce7b76e..f5e73fb9aa1782d04a533279f7a1ca0a05cd5f14 100644 --- a/checkpoints/metadata_000196608000.json +++ b/checkpoints/metadata_000196608000.json @@ -1 +1 @@ -{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4061981319064123} \ No newline at end of file +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4667979776226288} \ No newline at end of file diff --git a/checkpoints/metadata_000196706304.json b/checkpoints/metadata_000196706304.json index ce6fcda663fa048fd81037b1686b7c1d12ade99d..3616d69d0c9f060aec57046856a761ea04ea34f8 100644 --- a/checkpoints/metadata_000196706304.json +++ b/checkpoints/metadata_000196706304.json @@ -1 +1 @@ -{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4074958290792825} \ No newline at end of file +{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.467987821875873} \ No newline at end of file diff --git a/checkpoints/metadata_000197361664.json b/checkpoints/metadata_000197361664.json index f9e7708d8ebc7af127c910c4684b4680751be145..5a5aa2479a487facafbdfd02cd4bb43a293104f0 100644 --- a/checkpoints/metadata_000197361664.json +++ b/checkpoints/metadata_000197361664.json @@ -1 +1 @@ -{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.408092934298367} \ No newline at end of file +{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.469824998532631} \ No newline at end of file diff --git a/checkpoints/metadata_000198017024.json b/checkpoints/metadata_000198017024.json index 6ef2aaeeceec1383b420fc78c804b25e00da04e8..183007c52293fca3689a264336443d8903fa2f7b 100644 --- a/checkpoints/metadata_000198017024.json +++ b/checkpoints/metadata_000198017024.json @@ -1 +1 @@ -{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.407055360049092} \ No newline at end of file +{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4694174847315966} \ No newline at end of file diff --git a/checkpoints/metadata_000198672384.json b/checkpoints/metadata_000198672384.json index e6e02759141141cdd27f8bccb4fda874a7d1fccf..733d6428c7d8ae7e6af8df55ca487d669e9fb1a3 100644 --- a/checkpoints/metadata_000198672384.json +++ b/checkpoints/metadata_000198672384.json @@ -1 +1 @@ -{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.396747844162258} \ No newline at end of file +{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.459399634434712} \ No newline at end of file diff --git a/checkpoints/metadata_000199327744.json b/checkpoints/metadata_000199327744.json index 71ac03b566844b42f50cf08167ada7bacd8830f9..790fcda4f1140aacdc832a6e198db76a2e7e8762 100644 --- a/checkpoints/metadata_000199327744.json +++ b/checkpoints/metadata_000199327744.json @@ -1 +1 @@ -{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.403324384960973} \ No newline at end of file +{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4660093119605064} \ No newline at end of file diff --git a/checkpoints/metadata_000199950336.json b/checkpoints/metadata_000199950336.json index a72f6c7b92a2dcca81dc2215710976a9adf66bb8..c16e16df2be11e6f5e6f06b5472547126445306f 100644 --- a/checkpoints/metadata_000199950336.json +++ b/checkpoints/metadata_000199950336.json @@ -1 +1 @@ -{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.394872257897474} \ No newline at end of file +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4578413544944855} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt index a912b9b69c392865dae7eb20b2190ee48cb78cc5..376c326ec71dbc6b804601b07905e6e80d816acf 100644 --- a/checkpoints/model_weights_000000032768.pt +++ b/checkpoints/model_weights_000000032768.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:126e91c770ee10d33ef0dee8496e4965913669ea9a7354bf884bcdf37e7c5add -size 152233941 +oid sha256:42433f5564c5d2d4c25460186789bd97ad364c1c5c2eef47d74cd86afc616302 +size 151183829 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt index 2b63539146f3825bf9247c2de3baf602b1057e91..19063561ddd9d93960cc79bb8e2b1a501e1c67b1 100644 --- a/checkpoints/model_weights_000000327680.pt +++ b/checkpoints/model_weights_000000327680.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea98c23a24df1202ce83af552102b4277bcac1c2c7c94f9da950eb17405619f0 -size 152233941 +oid sha256:db794c3e5d8df814d89a81233ad2ba07778990fd19d2a8c96a920e020b1dd3ab +size 151183829 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt index faec3e6cc724ac45bdab04fe6d3aa176954f73cc..bf86fb0902fd26ebc2505e8ceba3e55d6ac82862 100644 --- a/checkpoints/model_weights_000000360448.pt +++ b/checkpoints/model_weights_000000360448.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8388e7b2b76bce252ad09d6838b7cd237b21bbd11e43f6f9799306f36773562 -size 152233941 +oid sha256:1dd70be592b6eabaa2c7bda67f4cbbfe21775502ca401f451ad5fb7069aeb743 +size 151183829 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt index 2569ca3c9986d2eebce21a244c571db1d75e763c..15d1f2988371ca4f3924557770796dc95450e3aa 100644 --- a/checkpoints/model_weights_000000425984.pt +++ b/checkpoints/model_weights_000000425984.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3917ef49615df35325dacbccba2940f7925e10f273e3b8a03b809d81fa79f6e0 -size 152233941 +oid sha256:6a7777a3ddb57b350f3177dd63c89ed0f89ecfe7e728a13d0df97b8fbd084327 +size 151183829 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt index fb0bc8d9e232cc1d899c65dc684e6aefd0699c89..dcd79c52f4174c035605924c4153d1de7c789414 100644 --- a/checkpoints/model_weights_000000458752.pt +++ b/checkpoints/model_weights_000000458752.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b291e796976bf61ef32d6b25a318d435fafb9ff544822bb86ad9dc6bdee50c3 -size 152233941 +oid sha256:95b5f4e7ffef83a9ac3ebf622af21a2e629b0bf80bcabbc4871f976a43b10ec8 +size 151183829 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt index bbabfa266688403c0b4bb69b295467b47519330e..8061814c1c3db953c94b6d3ed01861559f726498 100644 --- a/checkpoints/model_weights_000000491520.pt +++ b/checkpoints/model_weights_000000491520.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:642a8a4244c50cc016d27d94755f1b1655d29a49e1c50557a577e296acc9db4f -size 152233941 +oid sha256:a6f8cbe6dcba81cc86f86208b5ec74a3763697d6f1ddd46f117c9e6e0d87e128 +size 151183829 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt index 0cc0d7784a5f8a020f58611c62c85f3b8cc8b19f..f89dfd185635bd77e3a40838d1ac0c158442ee31 100644 --- a/checkpoints/model_weights_000000557056.pt +++ b/checkpoints/model_weights_000000557056.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cb29c4a8239ae521964d645a1c3006b0c19d7aff7878279db43127ae09d749a -size 152233941 +oid sha256:0463774351018f03ae653c12f7463646e654610b88ffd26bc1d128fc075a92e9 +size 151183829 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt index a9dc21f7772aa20c59df58cd5501705d297ba875..2cc46c8ebcbbad73bd7df01d91cde23707731bd5 100644 --- a/checkpoints/model_weights_000000622592.pt +++ b/checkpoints/model_weights_000000622592.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df47ea9321b321ba87765b5756b7629b48706909590a5adacf9823f9ffd37648 -size 152233941 +oid sha256:df016922d93fe069ae76dcf58f3134d1100f759179d50c7badbfdcd3673a1e37 +size 151183829 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt index 5dbe4e3dbfbb135c6790399aa231613adc5dc1d9..b02e2242a4d44cb17c70f83fcdc5368b3b58195a 100644 --- a/checkpoints/model_weights_000000688128.pt +++ b/checkpoints/model_weights_000000688128.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ba296c1826535e86b4097ed1eb5a63d7fb5233dc4fd9345f818fb4653c1bf79 -size 152233941 +oid sha256:d86f0e9be18093e885f7634ad6c8228959c22d5e7a2aa253fb20eb97842c7e2e +size 151183829 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt index 9efda2cc78dee96ba3409f9cc3503fe79a6400fa..b1059a2581bf8566832556116ea34969bde39c35 100644 --- a/checkpoints/model_weights_000000753664.pt +++ b/checkpoints/model_weights_000000753664.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e51f2cb71ae53a36f7e18300cf3bab5a0f1551f9d5eaf49997320b6f693eafa -size 152233941 +oid sha256:638b19a3867bff80c72488e884cd5f37f8d71bf8d680ca983cf561fce8da6de2 +size 151183829 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt index 701c895eabfe7fdd3f0be7baf08f327c2a4f88b0..3aec4ef2f9012c88d7e27db751cb174306cbd64b 100644 --- a/checkpoints/model_weights_000000819200.pt +++ b/checkpoints/model_weights_000000819200.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89e7f4890a08ee3f6cb4eea8ad6adcbea64111882dfdbc3dfe12fae7e1201f90 -size 152233941 +oid sha256:dd104b519fd0f20ed8987c167ea67f9988d7d7956a7e45e20146e113e87b51f0 +size 151183829 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt index eb1e419cbb8e243f32d1c8b5c91a28736bd12a46..5d787972ea54fde61897364380d142d36a860c4f 100644 --- a/checkpoints/model_weights_000000917504.pt +++ b/checkpoints/model_weights_000000917504.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d32e920c21376e8bbc60342aed5bca729a6bb035fb4c31d672c96a8bde9a0f3e -size 152233941 +oid sha256:0f599c573d7305d1ba1c7539112de7bb3822588a5824e37168887cbde2f637b8 +size 151183829 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt index 5d3d1b045c12a235bede5755c3883bf6d32cceb1..375c0c8b6277fc149a9ca346d5145039041ee643 100644 --- a/checkpoints/model_weights_000000983040.pt +++ b/checkpoints/model_weights_000000983040.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:755c587dadfc7dbeab7660e6b1e3871bc67d2591350985da88bfe25cfe3477c6 -size 152233941 +oid sha256:4f7a67cef948740f7cac26379149cf0986f704ac0a0c0e6c80a9d17b3ed593c6 +size 151183829 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt index 475234a9985e5b40008a8f27773cdd44a538ed5f..b2e969fa9f3d59331f5d96f3ebb7c0bd71bf7b34 100644 --- a/checkpoints/model_weights_000001114112.pt +++ b/checkpoints/model_weights_000001114112.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78d249d659833651df00fc1c946ef98ac5d34321290f97b49c08602fc744a07e -size 152233941 +oid sha256:e0473da235bc7205a424439e0b2a398ef4006194e8fa571874ad81daf6650dbd +size 151183829 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt index 14e5f3069811e3fba530846a0d346cf61eced1a8..dac017a2edb1164cb692bcc57e1aa219d1541a72 100644 --- a/checkpoints/model_weights_000001212416.pt +++ b/checkpoints/model_weights_000001212416.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac757ab9ae59a742327542c4010e53789355750c273af57c418922049bd45c13 -size 152233941 +oid sha256:d0f3fcb05a9719802a88b31832a835f1a15d84525fbe5bb1a8d1068314658df0 +size 151183829 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt index a189b3bb83a92186eb877a0d10a42308330ad864..ad72843801e15c4b0712817eabb244b216c85f07 100644 --- a/checkpoints/model_weights_000001343488.pt +++ b/checkpoints/model_weights_000001343488.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c7cb7b849007d479509bcb97749fa8eed22a45e3db82d08883bd7f012383419 -size 152233941 +oid sha256:39e6b66db637360c2b1abe19aae6c56f147483e32c8c6c55a00f234079557876 +size 151183829 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt index 9177c54e697abfea0280d11d05a8c38dbc6dc78a..32109fc9264949593f4925dc551af42dbddbffb9 100644 --- a/checkpoints/model_weights_000001474560.pt +++ b/checkpoints/model_weights_000001474560.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7b4cb1b8f3b9891e35f3517e843d51516924a2fdba36f91bd7f05ae2df17b7d -size 152233941 +oid sha256:57ee8c9e66069116e157beac7b5e2decffb05050e27250c971983cbfee1f177f +size 151183829 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt index 9254f18b0adf59e749eebf81bd43f6579ea9f856..4eaff40abb3948aec1100330017553d7eb5a98e4 100644 --- a/checkpoints/model_weights_000001605632.pt +++ b/checkpoints/model_weights_000001605632.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86e31677717bdc8294fc7eca35b5ef9bfc2cb261186f2de93f3bea134e7c0fb3 -size 152233941 +oid sha256:204c319617ce9ec5767f169407c8c174baa2af19d3c14a199d01efc65dc491ed +size 151183829 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt index a7e84fdba917d3f91f16855ae9426ff8c9fc40d9..e9863b1820d8faccaee73b1d525b1942ec5b319c 100644 --- a/checkpoints/model_weights_000001769472.pt +++ b/checkpoints/model_weights_000001769472.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ec994fd9c622b838312f914b019f46a1f4682b75f0259095aa79447cfdb55cc -size 152233941 +oid sha256:a02724cecf461b8b8200029ae86f6eb24f824bc25e85e69e2590fb9f02d4ee77 +size 151183829 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt index d8a780005510d1e5e91d02939415fc3c123544e1..5e6e50635d3b7b04c2f3c0fd88fde3a3a1bdfaa3 100644 --- a/checkpoints/model_weights_000001966080.pt +++ b/checkpoints/model_weights_000001966080.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:012058c0a6d2f457ddc997b74be0ef3b9006aedeef426bce61ab9eba832b89df -size 152233941 +oid sha256:b904da673c2c2feb4c34afa919c8381b94a9c765ff8ed4b45f1f76f82242cf3b +size 151183829 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt index a7fa17e0b366d57d2961a9958de90d03480f666f..d7e99bb049bacef878c73477d0f0cfb4be1d9409 100644 --- a/checkpoints/model_weights_000002162688.pt +++ b/checkpoints/model_weights_000002162688.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d109259b8927d0bee42e2999c5fdf0eeca64a2a484f875db58e752418946d34 -size 152233941 +oid sha256:5261d7dc82bda08823ad9026a093362eb0a210a5be66b2ad4d979b3650d5a9aa +size 151183829 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt index 05e69b9fd548cd6bf68ab2fc8be4ba0e3db1e0db..ef3cc0a2ef7d569608c58e45a6f4d27123a4eb9e 100644 --- a/checkpoints/model_weights_000002359296.pt +++ b/checkpoints/model_weights_000002359296.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a919bccd79ef2a7249045e9c1f7ff08bc1075a19b5a66f455e37f71181044ad4 -size 152233941 +oid sha256:f4ca07aee0021b68ded9b96d5b193fbf1695019ad8f9a188e32c82c45f00fa20 +size 151183829 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt index 8fb540f0e0e30638a3a1c1827747f8085952ba30..1746c53738064742a413254bdfcb52b08497dc5d 100644 --- a/checkpoints/model_weights_000002621440.pt +++ b/checkpoints/model_weights_000002621440.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c0ff6d6283609474fc694d39fd180e288929b5194e822e612720e7358958676 -size 152233941 +oid sha256:07f146338da7afe3e283927b973ccbfaad53d5dba0295e50d65acc9d9b378899 +size 151183829 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt index c3fc48d8073ba80c2e3ef5757efbb27887ac89ae..68d507ec9eb81ca45b4f7a0ad62a99bd9998fed9 100644 --- a/checkpoints/model_weights_000002883584.pt +++ b/checkpoints/model_weights_000002883584.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b25bed5c785fc1204d1e4281ff82ee1da20b90af02488fc91b9ba9c4a8c60c4 -size 152233941 +oid sha256:5c2c6a3e3df454ecad9fffb523d27a81e278d55fb32dc5feb29caa946cd85a30 +size 151183829 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt index 8064f00ad6dc98bc1165129919f986f70bbbfcc3..75c04f5edb859391602e4aa4132fd4e71170fee0 100644 --- a/checkpoints/model_weights_000003178496.pt +++ b/checkpoints/model_weights_000003178496.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a24228699d57eb3fd0117d453029b057a0b2bb3802ecafc6c2338d7a9f3e78d -size 152233941 +oid sha256:2a4cf0c0648f20f77d3e22ed014cfa33e4e31c6d4a67f7128bf011b09f768c30 +size 151183829 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt index acfbf29ae253cec0fbfb4240bc084eafe92a3d8b..0186375b7ab9e55f823845525eea558aeb317917 100644 --- a/checkpoints/model_weights_000003473408.pt +++ b/checkpoints/model_weights_000003473408.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:423c2886095a6fccb4fd402f82ad314907f99d3045ef5a99f809022aa8b6bbbc -size 152233941 +oid sha256:08e18b6fd6ba0e258c99c5cbe2fdfe45ba24986c2ea1bd20000bdb8bbd5ab4dc +size 151183829 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt index 16915b15238bb94c964d8cdcfb35fdd1c75f2385..9ba787e7161e0039a0a7e20369f0765a8d93542e 100644 --- a/checkpoints/model_weights_000003833856.pt +++ b/checkpoints/model_weights_000003833856.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afc1ddf5ef6c825ced0502a0df7489cd6974991022036a49b6ff0173a7919e87 -size 152233941 +oid sha256:23a7729a5ad3cf0fd0ebec77dbc5c3c78420d5404d8eb9b89eabbc68584a851f +size 151183829 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt index 7891151d758869db673486fe6a4686d902e7a5b5..bb5452dc8c775c8ec756d20adc80ad8f10209e1c 100644 --- a/checkpoints/model_weights_000004227072.pt +++ b/checkpoints/model_weights_000004227072.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faae6d34dda55becb2508088ca57995ca954837911326285acf14db1cc52206c -size 152233941 +oid sha256:c9382f7259d338426cfde10b275afe8f19c00fdb98f47347768d365ec785a0d2 +size 151183829 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt index c8704ea794d7582b9225c9af22fc8382b547751a..e37c8f096bafd11940b9cf52f851fe6a9d639763 100644 --- a/checkpoints/model_weights_000004653056.pt +++ b/checkpoints/model_weights_000004653056.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d9f889bf05c772c064e57ff19b43395f79187ed9c3ac3ebecea22b7b7ef46e -size 152233941 +oid sha256:ad5352ada566445b01eee6fc71c36a49c857c8e58626a9bf1f1a1a76296a0412 +size 151183829 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt index 9b40b0979bb9cb2771e89a3883d9c934983f2cd6..aca5974b67baeb89d0f94d704c39c1b2da151bec 100644 --- a/checkpoints/model_weights_000005111808.pt +++ b/checkpoints/model_weights_000005111808.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea9793e864030102e412c47a7026f175f7b2beb65e1c3c4f1191db17faacf838 -size 152233941 +oid sha256:f9ed98aef22cede1dbfcc52f6d25336c81e17f044ace0dcbd84ade7cc2358141 +size 151183829 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt index 21fbef56f8733adac9cda032444c0be8572a84a1..4d53c685f50c644f620491bd70630db581f8c2a9 100644 --- a/checkpoints/model_weights_000005603328.pt +++ b/checkpoints/model_weights_000005603328.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d698e8eeed127a4bdcca424e0a7415778b814bc3d5a6d9b9d8439708ee33b51 -size 152233941 +oid sha256:1a4d5517914298d9aa5f3d9d3e375323932c280d9de39ec7eb3d6b49653c046d +size 151183829 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt index 47c0c86d113acd749a5720bd0db8cc863a113a0c..f02319a5956a28fb949f08adf07ce46342e210dc 100644 --- a/checkpoints/model_weights_000006193152.pt +++ b/checkpoints/model_weights_000006193152.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b873b008c2cc4cbf0943b3fb4f16f2c2bb93dc0a52415cfeaf7ed7b1b3ad78e3 -size 152233941 +oid sha256:ba5a26e612e66843500e1b8c5a3a0910c97fc7753b399d8411d3978eb340a07b +size 151183829 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt index 5b1dc2622f69343033fc20fa8ebc43276de69184..b11fd169c7d0fe9acae47092eacea131115237a6 100644 --- a/checkpoints/model_weights_000006782976.pt +++ b/checkpoints/model_weights_000006782976.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3feb8d2736812ca3272ec7f9e044097cb48874a2053cf71d11d80e8c1e9f8f23 -size 152233941 +oid sha256:083f24cdb0e950f84e0f34b4edc4767f7e2348328286ff9deb51d2a910d1c827 +size 151183829 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt index e8e575374c9c7a9e8aaf2735872218351105a955..9d24e020d53f28008317a3d29ce0a9471ab3af86 100644 --- a/checkpoints/model_weights_000007471104.pt +++ b/checkpoints/model_weights_000007471104.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01102ee0a169f6142736d385d72fe2c79192848731d9cc4143048b887176d423 -size 152233941 +oid sha256:93487c03a02fb1addc8352798ea870e408833a2736dd71dcced0a13dc6eba50b +size 151183829 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt index 3d43ef91d7d0de07b30a6612a40b50d05d3e9efb..50df6c2762eea4ffaef63c3df18389ac79367af0 100644 --- a/checkpoints/model_weights_000008224768.pt +++ b/checkpoints/model_weights_000008224768.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af5c9aed40ecd638baf207e10fad27acec54e665bd912b210a6405e847680ca0 -size 152233941 +oid sha256:d526f740034d485dbad918bbe050f19ecc8f3bc0244c9d2064c649f3fd7cf99b +size 151183829 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt index 82e2d5f550a33e18e65660e5041fe52bfa6b443e..683fd6956c01355d3a0ae433e4e0305d29b482bd 100644 --- a/checkpoints/model_weights_000009043968.pt +++ b/checkpoints/model_weights_000009043968.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ec1c3a070a125c58a8152f9e7aa56a763037cc123e36e1ca323ded182a76b28 -size 152233941 +oid sha256:3bb36f8be7f35222b80b0922e9d28f894af35a06ad8492a4a21acaef5a3feec4 +size 151183829 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt index 6b1732733e6b3e13c4bb850f49630acffb1dfce4..b0e02403a11d6c2518d1df65113e9842d685bde4 100644 --- a/checkpoints/model_weights_000009961472.pt +++ b/checkpoints/model_weights_000009961472.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adc6b64d722aeb3d5891411326c3ebd46c6ed896e1a91613d26e141e58e07f0f -size 152233941 +oid sha256:3934ad71e84858fc2d50d4fa7cd92cf148efbfe68f6a722bdfbb027b2bb3ccfd +size 151183829 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt index 1d8fc60df1d8db055ede761be609c87ce24d9efc..fd214cc9e44e67aaaa6e817b6ce665a42bf6022d 100644 --- a/checkpoints/model_weights_000010944512.pt +++ b/checkpoints/model_weights_000010944512.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bf00dde065e4d6d87753ab5a3a5078d0e8dabfbf2f2a1af2b58e028d02f2f8e -size 152233941 +oid sha256:8bbc9cfafcecf0273ca6514fe38193714af5d9e7eab4a238d75e68be525de510 +size 151183829 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt index d1886e335bf71b91c53a8087f00c6daaece8cdf0..0f5c74cbfe2ed2540e64e1bd482ea15a16819fbd 100644 --- a/checkpoints/model_weights_000012058624.pt +++ b/checkpoints/model_weights_000012058624.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf7cec5bed9521d96c13b93dc544f2c2d308eab2d5c7e12560d04c47f86bc176 -size 152233941 +oid sha256:b5d2e6e5f5db1e53cc0a370e610c0054e49df0e98a5ae830a304dbc357613d5f +size 151183829 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt index 255b0372e30c9286f214df77c975ed208d0d9dad..f1462a56348de7811dfbe0b4ad30b4e747ee6964 100644 --- a/checkpoints/model_weights_000013271040.pt +++ b/checkpoints/model_weights_000013271040.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2332cb6ff3d3a184384af69b52bc9c1c7cac3b40022b009920334144a0296d24 -size 152233941 +oid sha256:46446fb28d810141b2f8e96d991bb8f58069dd3f08a4f2d5e930b3ce71694898 +size 151183829 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt index bc8176574d9bacad2342276f023efa58d1833c78..8b5efdc0e85f4efb46ba7920c360fbeac8ed2302 100644 --- a/checkpoints/model_weights_000014581760.pt +++ b/checkpoints/model_weights_000014581760.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42dc3d4700cec02974c3720c719319c7fea5bbb1a11fd62efa367aad310f7abd -size 152233941 +oid sha256:e95da01edab15e09f239369497d0f5c7d26e5422a4f1dcb3978eeb39b86171ef +size 151183829 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt index 2ec9e33eca945b9bb8cf8d38f544d5cacc67daf5..bc03e1c9f4634efba5a8221e0473bcf28c4205bc 100644 --- a/checkpoints/model_weights_000016056320.pt +++ b/checkpoints/model_weights_000016056320.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09f58d2c1ccffae75a9a52a63da53a6f98329e0e836abfd6a9ec0a7450045e15 -size 152233941 +oid sha256:828ff7704873a2faf8541d9199322f47ba105af7824e2cab720efe44447cfe5c +size 151183829 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt index 0408b044443c73da974f321fb42ca97d79b14fdb..5801beb772f24955afe66546518bd4aaf468367a 100644 --- a/checkpoints/model_weights_000016384000.pt +++ b/checkpoints/model_weights_000016384000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d625a5b9816d03f4cb433038eb4107e4aa0cadf0c8704548cc2dd5be1e9d1fb1 -size 152233941 +oid sha256:52ccd8304cd0bfa6068838b9ca42704d5d6796c4d6697bc988e49ca7b30a080c +size 151183829 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt index 52ab525700d8dee59df2f3ba28e171f09a255a65..48d43bd62b74ba5b6733582cb42c629ca6a95e91 100644 --- a/checkpoints/model_weights_000017661952.pt +++ b/checkpoints/model_weights_000017661952.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aecad5609a506e4bfdfd454fa850b361a8c05279eaf105d553943b63061e32fe -size 152233941 +oid sha256:f37334ac2b0ae22a9a4bca6399f811c5e9d0d2c982e6c8a0f987fc78baba8777 +size 151183829 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt index f17342914969257159a68a350520580bb51c3768..a0f9d79c9e108ae30af94c1a803f0e2e634a9f70 100644 --- a/checkpoints/model_weights_000019431424.pt +++ b/checkpoints/model_weights_000019431424.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:093f6cd231419ec95b430623b2b141b603d309d0862e9f163b4d892d2b8f1786 -size 152233941 +oid sha256:e75c402639aa9c6c399f478431f2c2fea7fc61d00012687b92c8b429237a16dc +size 151183829 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt index f94240825b86174f244ded1928209dd4d2cf76a1..0f23f72e9d3048640eba3b4c871840e1fe521095 100644 --- a/checkpoints/model_weights_000021364736.pt +++ b/checkpoints/model_weights_000021364736.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:164c1b90a703d53adfe900f7dee5bc994e42c6e2563f17cef932c570943a362a -size 152233941 +oid sha256:11567a200db2ec7810ac9940014e175dd5cd959cac8566aab1aca6a83b573364 +size 151183829 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt index 7e46eaa709fe7eb5e5af3c5788334df71f018186..21ed12d5711a75f95923f79e9c03882287aaa616 100644 --- a/checkpoints/model_weights_000023494656.pt +++ b/checkpoints/model_weights_000023494656.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c528494b42970ead192e2adda45d77e8600f8a2856f6858eb7f696e726b80b7 -size 152233941 +oid sha256:6838fd8b423557094093aed4910cbd6cf4250c9e517c55009409e807a629c2ae +size 151183829 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt index 5a6c66612a50d420dd428ef4ce0bbe42381f0213..11179895b3ef2ec5f719d421304d7c3765e152b9 100644 --- a/checkpoints/model_weights_000025853952.pt +++ b/checkpoints/model_weights_000025853952.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c8102377bb9f3a4bccf97b7a7cef5299b1f5e49c052e9922cc3f802096393d5 -size 152233941 +oid sha256:cbfeb2be6a31ea180ef2013ffdaa0f39690441aa9bcd48def933fb74f7e56ed2 +size 151183829 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt index b07eafba18c566fe9b92b7a3aae5dde623159988..9f5d4c299ee4f9ef6a45889e5d673e110f70c8ac 100644 --- a/checkpoints/model_weights_000028442624.pt +++ b/checkpoints/model_weights_000028442624.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1baaffe2770f4f6e4e4ca88348ee82a2c812ae062ad8644f22c0fc8ca6c46632 -size 152233941 +oid sha256:61ce07417697759084c7e3ff9b31b5aba3095ab0d9b9309c3ebf4aacd5022dd1 +size 151183829 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt index ce334b1210b8c5d20edad3e3c8e89e7f3c08b212..4b69774535a1a089d74eb78f6cba59bc4c03759d 100644 --- a/checkpoints/model_weights_000031293440.pt +++ b/checkpoints/model_weights_000031293440.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:503f8a171c1e370ea78dad681fdd7b89e9d9ce2d3509bff4aac6ff60e2e0f337 -size 152233941 +oid sha256:042561b3ec35d52a3533fdcb99afdc9f6144e5c5eba960419adbf0103e01456e +size 151183829 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt index 226023dc2caf5968a6fb6c1059bb88a07d71ea81..45a50870ad83f2a62d4ff0af26f3f85e9cc02311 100644 --- a/checkpoints/model_weights_000032768000.pt +++ b/checkpoints/model_weights_000032768000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acbcfdb221584d350a91b208e3b95a8774a0eae4fd255c455a8ce3fcb210d671 -size 152233941 +oid sha256:f614f90bab827faa96b6de3ca2e178ef708f9454b85e24601d5bb54286b9b1f5 +size 151183829 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt index 0eb974dfb4223bb07bf9a149710cc80ec1f1d877..2d32f62fc6edbffe458a092ce16e02f7b8a81658 100644 --- a/checkpoints/model_weights_000034439168.pt +++ b/checkpoints/model_weights_000034439168.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fdd604445a5aa3e34356d6652a6fe689d68bc8604be7e5fa562e1d8529d295b -size 152233941 +oid sha256:91f2646a7df66bd7312d63973b4ac1eae7e759f4ad7dae615188adbe7a0d5bf5 +size 151183829 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt index c18014f3af41a222c8b4bac0163971461a52fb6e..1eb7259b4cb230c0d0ca277432d33a69b1454236 100644 --- a/checkpoints/model_weights_000037879808.pt +++ b/checkpoints/model_weights_000037879808.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ac997ac32245bab013ef3f95ea9d087bdc4047eaa14aedb88b0d056e8aab576 -size 152233941 +oid sha256:f47d553fbfe3f73aedff54fd431f1e2370ba89bbed1cc6482d6c2b42593aeb43 +size 151183829 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt index c765fc096e0f2c4b902e9311870a2b92d9a886bb..2af16d30909c34e9facf7045a2d34d37760c12c4 100644 --- a/checkpoints/model_weights_000041648128.pt +++ b/checkpoints/model_weights_000041648128.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e28e7ac643111e0d6623d362311f8f93488d0fef7f10fe8c7e50c48be1f9051 -size 152233941 +oid sha256:80dee0538147c669d85dcf4ce90aec4aea903cae25964555ee2a470de2612018 +size 151183829 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt index 824c2220548e5b00121e7f6e410de407ec0b9ecd..0343f22bcfc6bb775e739ef906549a0d645b3f42 100644 --- a/checkpoints/model_weights_000045842432.pt +++ b/checkpoints/model_weights_000045842432.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:145dae94ded6da1cac5c684072745603739c8be20f7a98ddd293bfec9786fb6d -size 152233941 +oid sha256:a438f8998185c715001a3841ace7e917863da8472b8340cdbf66552f1d043298 +size 151183829 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt index 2a0725b5dda375953e52465991f20b3806bdbfd5..a7db098f9f92e20293f792021ca815bdecb030c1 100644 --- a/checkpoints/model_weights_000049152000.pt +++ b/checkpoints/model_weights_000049152000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d17b4f03b730b1f4fc93ca8e5b1b3b261a2277dc8137a70dd41a16757bd02c56 -size 152233941 +oid sha256:eb3fa59b41a4223650318d22e1601ad2a7a46711fef2b2a14bd1c6ed280d69de +size 151183829 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt index ceddcb316c05d99837ad7f68f78d701fbfbc844b..9ab3c3dc46a9d492eed5dfd3a346fb8fd289836a 100644 --- a/checkpoints/model_weights_000050397184.pt +++ b/checkpoints/model_weights_000050397184.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:550774647b6b21f542e14c314fcd670e19b56819a734c09060450d93abd5c29c -size 152233941 +oid sha256:bef4f0c18c238515259680784c3634d2d9ba2ac1e7c303684105d6dd3200cd2e +size 151183829 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt index cd73743fed5d3cc74de48590e11767d265a210be..f1b298943e76c59470aa1e5aca29967e4949f97e 100644 --- a/checkpoints/model_weights_000055443456.pt +++ b/checkpoints/model_weights_000055443456.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1714506a833c289bcf2c715ed5257bcd5284b097d4f6453ca8192887897fe01 -size 152233941 +oid sha256:281776104ff868bda35d31807f41eb483139f1a491ee48e5ca0e9e0d5adbdc8c +size 151183829 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt index 7deb9759f973bd0666b38baf101714ea5d6b4f64..8c6358d272b4d87ae587f4cb014ddeccf9211634 100644 --- a/checkpoints/model_weights_000061014016.pt +++ b/checkpoints/model_weights_000061014016.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd9f1ea57c9011456c6c76f6fee2ea9798a8ab3c2f19e09231561e0c03823827 -size 152233941 +oid sha256:2b31d7d39989c7ff62c47ef826577d40a8d3418f2c49cd15e2ee726debfd58ae +size 151183829 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt index 3b410aba36229e58a5462945242d20c3254242af..ff42a72c2bf3d2e6fddc574a04e3a0dff892f4f8 100644 --- a/checkpoints/model_weights_000065536000.pt +++ b/checkpoints/model_weights_000065536000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cad227f3d000126119966f6d62a26b065fc793b02a42a92ef40bc00dcff35e1 -size 152233941 +oid sha256:d5d01fc8e8cbf1be76dde579e17a4d1a97089c28df6b5b08580563b8dafaf654 +size 151183829 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt index 6f82840deff98192c06e6394a950bb45135c5844..53e9ca243caec38f03f4c2b40298bff8802429ef 100644 --- a/checkpoints/model_weights_000067108864.pt +++ b/checkpoints/model_weights_000067108864.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b91b60d58242c335595ec8c6f8c90bb871a777743f37015f6614e73fbf81b679 -size 152233941 +oid sha256:a21031ab0598fd69733d09e98d8d3cb931e35b7a4963a9f980e24a584d5e2bcc +size 151183829 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt index ec285c3fd1c53bfb798f0c4cb8e3f4cf1d2fa38e..79822cdc629d1f7e31514d87cd69d3fbb2658052 100644 --- a/checkpoints/model_weights_000073826304.pt +++ b/checkpoints/model_weights_000073826304.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:894dd15c2ae9d0889fdeae587be200f108853520f5df96705ff80dcee9c0a123 -size 152233941 +oid sha256:314b1497679e95d116e8de86e2860d1737cc8ecb6ff1abf7e1c59e970fdee058 +size 151183829 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt index 4e09a6f650c83426334a15c3d1eda01ab2cea4a8..cbd579f41857871d959be57b3fa25f249bba93d4 100644 --- a/checkpoints/model_weights_000081199104.pt +++ b/checkpoints/model_weights_000081199104.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35a217e9b909d9230421a822b1eaa20bd7b9e496fda0621b7628bfbbef0fd170 -size 152233941 +oid sha256:74f658f4876476245691bf56fec17a2a395ec0471b7f17a7fb54b68adbab13c3 +size 151183829 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt index 0c9523bf3db1af112af2dfafc5671bb5dbfbc5b9..81b0ffc1ac7cec9a04917782a679904dfa133e5e 100644 --- a/checkpoints/model_weights_000081920000.pt +++ b/checkpoints/model_weights_000081920000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28d461ede60245a11fe7aef30a2e8ce56906383c039be6a4ac5c0c319eab5573 -size 152233941 +oid sha256:c9bc4c2470f520c4ed3f4ce01e60e855eddaf23181c0ea4340e07286c0c4c3ce +size 151183829 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt index 8553d028d8f2b39302ab9a3dd06ad5d95a7c0eef..310c9e855a8e039c462a6ec112e19262cde0fcc6 100644 --- a/checkpoints/model_weights_000089325568.pt +++ b/checkpoints/model_weights_000089325568.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:014156fcd4ba5cbc68d73713da1cd95344e30180c7b1c47cd64e8cab53d441b8 -size 152233941 +oid sha256:9c14d7690e178ee57282a4e2efc4204d6baaed9d68ead3dac949607713305ac7 +size 151183829 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt index 816778f4c887d11a7c6dbacfc53e4f30fd142c3a..d88e41e93e4a37f89aba2c990d609d28172d6709 100644 --- a/checkpoints/model_weights_000098271232.pt +++ b/checkpoints/model_weights_000098271232.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab0baa7b27d2e5a28c30520e51d756c85a6e6bd3d15e46e969713cb1cc06cc0f -size 152233941 +oid sha256:41677d899a01ccf663f74a8a9bf3971c5a5e84a90324c60d21300ddba55e758c +size 151183829 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt index 07c62a9a7ccb00073765d38ee743654875b8d2e4..db499af120b15aba8c84bf6b2ba2738698e88a19 100644 --- a/checkpoints/model_weights_000098304000.pt +++ b/checkpoints/model_weights_000098304000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14ca2b202fafbeb89ebcc905d7c537b3c2f00a633ce131432374d62323bc430f -size 152233941 +oid sha256:f81a65ba5579eb6b59e17d218983f0f3eab85fccdfd786ba27ccee9e5111a3f3 +size 151183829 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt index 3691ba6d872ec388fd1aed5444ff74ab57cf31ae..a11ae31e330babbc8e6af349a47d664d7f715e5f 100644 --- a/checkpoints/model_weights_000108068864.pt +++ b/checkpoints/model_weights_000108068864.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ba49386eaa354e23e0332e0633091419e30097b6e330f9a7e425c4d2e69f335 -size 152233941 +oid sha256:4ec8cac49cb57a18bc9ca472040af42605f6588d1a3c1db216bf4649d5c21602 +size 151183829 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt index fc2333b0f59f7eb7a6c2d7b1b01db81e527649b1..8370268805a180a2ba93acdfd7be04fa9cd60971 100644 --- a/checkpoints/model_weights_000114688000.pt +++ b/checkpoints/model_weights_000114688000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:453c809cb1ec24a354c679af03f27159aa586df502e648fdec1e445020f27513 -size 152233941 +oid sha256:bbb4bca3debb8a5f9aeff78b7c6b44adeea5abbcfc494e85d160590208ef338a +size 151183829 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt index 3578bb3a9828e97260ec1148d2d52aadc21eb7dc..2a161a49cd473c4ae33aea2f1464077e057e87ea 100644 --- a/checkpoints/model_weights_000118882304.pt +++ b/checkpoints/model_weights_000118882304.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cbab3d452ea3a9409d030e5776c662fe950d0d79f8784f1817231d7fa083aea -size 152233941 +oid sha256:dad525ac75a786b7c48559f73292ce6b0761326ecff0a00abf45fca0c7dbc4d1 +size 151183829 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt index 5eaaa96134a158c178dc3d70dc72940e0bd3f369..03615802dd2ac727c5dd12106fc3a1291c357204 100644 --- a/checkpoints/model_weights_000130777088.pt +++ b/checkpoints/model_weights_000130777088.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d487b622a2cc5a65570831502e60128c9cb56ae6fdc1e26d43e573a6ca812d54 -size 152233941 +oid sha256:5dead3853e6408b78c95fd11735281656f8700f9ba0ab45a635b8180b77ea6e8 +size 151183829 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt index 45f0441040c212b11c8be2a6ea782ec5f767e719..149912c671981c7d942909ffda7e745655fcbe85 100644 --- a/checkpoints/model_weights_000131072000.pt +++ b/checkpoints/model_weights_000131072000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a3679d177c2f603ed401d074578d3cf0551e07025298db66df3f7d42226a5d2 -size 152233941 +oid sha256:0bf1f29a7bcfdb09251db1c83484db6d7a1067fe264303822a59479c66b03158 +size 151183829 diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt index c3883e06ab684415faaa441be7be34e303646414..738ca0d08fe94a2fd6b26ed25e11025ffb0d6610 100644 --- a/checkpoints/model_weights_000143851520.pt +++ b/checkpoints/model_weights_000143851520.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a011307b957b05c47eef2a28fbddb10dfd636099bbe9821d2cccadd68d6d943a -size 152233941 +oid sha256:288de1c16d9a08421c961db7352d39b8df472768ac6c703449f7a34650bf7861 +size 151183829 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt index 514a9e898b663415c8af25f04a0a5c88a645c8c6..295e31bafe27774ac3a568b02199d0c55781b737 100644 --- a/checkpoints/model_weights_000147456000.pt +++ b/checkpoints/model_weights_000147456000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b6b44f0256a3e937b44d98ba5db43827beca6b2afff730e982c589845da5b18 -size 152233941 +oid sha256:a60dbba192bc2b853332448301d37f9472e007a2b1100deca8c73706a2293b82 +size 151183829 diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt index be0ee6790c96a2cca0e9132b3166a5f39a0979b8..235d45a02eae169a8664d155213d44345ef80a30 100644 --- a/checkpoints/model_weights_000158269440.pt +++ b/checkpoints/model_weights_000158269440.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7847986f977020865a7d834725deb30709d5886a4f48f2925e21990b2dc258b0 -size 152233941 +oid sha256:fcf6983f31ea0a6e8dfccfbc518ae4429a50b81a78e96151c4226c6be2dbbc38 +size 151183829 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt index 0d557680859f05efe278c880365fd3959ddf75c6..6826a5805895bf1b9d9434cf4f677df435c4220f 100644 --- a/checkpoints/model_weights_000163840000.pt +++ b/checkpoints/model_weights_000163840000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d43f4d6cd56278c9b1fb28ca91bc4b22b098e445a3af3568b41d2d6c3b0dc90 -size 152233941 +oid sha256:4b1c766285545374a4628169306238921323892fabc16efe4cac2e030f580e0a +size 151183829 diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt index f741f45f2edb2762dbda3c2e15d637324d538fee..7b846307b998656161deee1c626c0b5ad135e49c 100644 --- a/checkpoints/model_weights_000174096384.pt +++ b/checkpoints/model_weights_000174096384.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84d64815d19e8664b4c3ca40b7c0b491f2e83847ac4fd2bd94bbc3359226efb3 -size 152233941 +oid sha256:6385d8078c4dd13097d6991735ac2a9ff1095e6ef82e2040420907f1984d51d6 +size 151183829 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt index 6cdb48395c43e01cee8e2eda42e0c85c301a74fc..6e522514bacaa7bcd105b267f9cb1ad8d99e5adb 100644 --- a/checkpoints/model_weights_000180224000.pt +++ b/checkpoints/model_weights_000180224000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76a3ff11eaf9031309562053ae7288d5b0f74743d7a7499fd90ffa74001603ae -size 152233941 +oid sha256:119c9199dfadb5302077fb1def57e503f39ae4ad5bf75a2a67c8b509259f7aa6 +size 151183829 diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt index a3d0946db81a3269f0c8e9b2b96cc9b9c574cd16..8799e849631ed66969ccd21e967ddf034e0139bb 100644 --- a/checkpoints/model_weights_000191496192.pt +++ b/checkpoints/model_weights_000191496192.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8812454f499055a1f98385091e53f48070cc66e842541d10aabf65f6cfcd7222 -size 152233941 +oid sha256:a48916d983b57e57dde693012ee713e4ccc9fa8f1754f2707e0e384bfa15e65e +size 151183829 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt index ff59d7535bee5594443ea13785e8a0bc7577d762..1202b4a615d2ec11821b8952a6bab85812339055 100644 --- a/checkpoints/model_weights_000196608000.pt +++ b/checkpoints/model_weights_000196608000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cddba86502e17ef2a84aa5c7b449b3cff6ee0e15ff5aa85b80379dcecd396627 -size 152233941 +oid sha256:f25fa26c5946fd7d6d7cbcff540914b9b8b004a8651828cdb4b9cbf9135d1c4f +size 151183829 diff --git a/checkpoints/model_weights_000196706304.pt b/checkpoints/model_weights_000196706304.pt index 433d4aec0afab5d0567e48a9de8ca0c25419a7a1..7ff06a94fa8066aeccd44e42aea2b4564f0b3308 100644 --- a/checkpoints/model_weights_000196706304.pt +++ b/checkpoints/model_weights_000196706304.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5d5ca4396fe845458580e8d547be53c1e1f59bce7b5c3d0227b3f4305449f9d -size 152233941 +oid sha256:38f0208bbe10a6a6672097497731f30b432c88667eedb126c6ac70bc64147270 +size 151183829 diff --git a/checkpoints/model_weights_000197361664.pt b/checkpoints/model_weights_000197361664.pt index 4f727b30685a1937c350b899d2726ee317e64baa..ca026d08bd4a33b81cf2278bac31121c87068023 100644 --- a/checkpoints/model_weights_000197361664.pt +++ b/checkpoints/model_weights_000197361664.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a794cd481aae45b3763d1f0d1ef884f639409209a25daeb80de6e094b999cb7f -size 152233941 +oid sha256:c7babbfc6a6011af20eff8003ae7b8e86bbcc5648a7c3181bc06b490ba489301 +size 151183829 diff --git a/checkpoints/model_weights_000198017024.pt b/checkpoints/model_weights_000198017024.pt index bedfa0050d95e80d8772a3a15676b8c00ae81bb5..48415b8b7cd5e365ddd3cbd62e166bba51ff2c71 100644 --- a/checkpoints/model_weights_000198017024.pt +++ b/checkpoints/model_weights_000198017024.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59e123b98191719e7dfcf691c195733edcca35e3946152f15c9073d0c2f40eb4 -size 152233941 +oid sha256:a8cd08fa6349fe63147e470b0a35ea18b734de5e0e3153515e1bbabc33e20923 +size 151183829 diff --git a/checkpoints/model_weights_000198672384.pt b/checkpoints/model_weights_000198672384.pt index c3ae3d9ac7a52d0e0d81915053abfeb29c689206..c1a0c1580dc80a3d0deda0e08c4f1179e2d17e76 100644 --- a/checkpoints/model_weights_000198672384.pt +++ b/checkpoints/model_weights_000198672384.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:828b4f79307f851edf450f962649cb453a5a47274c95f8d7918c5f80129fa19b -size 152233941 +oid sha256:afd76f6a6e1b938d423b5f746d1fec3d8aeb9c2c762b8e239395da8d92be3e8d +size 151183829 diff --git a/checkpoints/model_weights_000199327744.pt b/checkpoints/model_weights_000199327744.pt index d18137a221d8b5bd6c9b13a98bba2c0ba541be86..595b1de70a4f6bea0a49a20ce9947b36c948222e 100644 --- a/checkpoints/model_weights_000199327744.pt +++ b/checkpoints/model_weights_000199327744.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2e53dfe4aaab1cfd0f9061a8bf68f9d4cab62315efd02eb216c9185dce67fa2 -size 152233941 +oid sha256:c95e9d70f55f1b3f54e0a844bb9dd5b9858d36f03cc47c2570b42bf1d721af62 +size 151183829 diff --git a/checkpoints/model_weights_000199950336.pt b/checkpoints/model_weights_000199950336.pt index 3253e3e498e8acf07593ff1324499978d00551b2..81f824e80dba0868cdfb5d3160572a4907470be5 100644 --- a/checkpoints/model_weights_000199950336.pt +++ b/checkpoints/model_weights_000199950336.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc589a5603f394a6da0d93b34d49b5e45a908d01546454c928de9510ca9e2393 -size 152233941 +oid sha256:33b2fbc1cefb3a8426289be152254266ef313622ff306cbe21ac0d611b819ae6 +size 151183829 diff --git a/config.toml b/config.toml index 713efbdc25c69e02626f4cbfab1de46ce1d330ee..e3fc3cc147c7dc2190c1cb6c26c85b7f029103b9 100644 --- a/config.toml +++ b/config.toml @@ -1,9 +1,9 @@ -model_name = "pile_llama_2H_2L" +model_name = "pile_llama_H1_L2" n_layers = 2 d_model = 512 d_mlp = 2048 d_head = 64 -n_heads = 2 +n_heads = 1 attn_only = false layer_norm_eps = 1e-05 init_range = 0.02 diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt index 12c1f37f5205a55bcaa313370202c732b4033408..21a8ed1a06d4824fa6cce070d074ebc21d4d7f1b 100644 --- a/latest_checkpoint.pt +++ b/latest_checkpoint.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbf4a3701ff768aefc7f1a06a33f399b56def10807110a8e2f7cb5139f6b89ec -size 152233463 +oid sha256:a841316540aacca701236202ac8a9224d21e8c06feead0486a3b0b90ace4bc2a +size 151183351 diff --git a/latest_metadata.json b/latest_metadata.json index a72f6c7b92a2dcca81dc2215710976a9adf66bb8..c16e16df2be11e6f5e6f06b5472547126445306f 100644 --- a/latest_metadata.json +++ b/latest_metadata.json @@ -1 +1 @@ -{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.394872257897474} \ No newline at end of file +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_H1_L2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 1, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4578413544944855} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt index 8da33203ecaad91a8827342a257f37a8ca03d174..dc3fd016ddf8f7527f07be956e5891a24bcd5bf5 100644 --- a/latest_optimizer.pt +++ b/latest_optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45f29280cd5d337e2f07c52c96e8f786c41131009adff2f4450a76255889657c -size 304472851 +oid sha256:b412b0850dd6e899a32a7a77dcf97e5a851bbfa14d20715330f35dcd4888314d +size 302372627 diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log index 8a8b762c57499dae83b531207444b547fc77a22f..f48362b4d868d528b4262bd90ae26bc82d13006a 100644 --- a/wandb/debug-internal.log +++ b/wandb/debug-internal.log @@ -1,12 +1,13 @@ -{"time":"2026-02-26T13:56:02.310641183Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} -{"time":"2026-02-26T13:56:02.505385385Z","level":"INFO","msg":"stream: created new stream","id":"696nxyfr"} -{"time":"2026-02-26T13:56:02.505474889Z","level":"INFO","msg":"stream: started","id":"696nxyfr"} -{"time":"2026-02-26T13:56:02.505547204Z","level":"INFO","msg":"writer: started","stream_id":"696nxyfr"} -{"time":"2026-02-26T13:56:02.505558638Z","level":"INFO","msg":"handler: started","stream_id":"696nxyfr"} -{"time":"2026-02-26T13:56:02.5055985Z","level":"INFO","msg":"sender: started","stream_id":"696nxyfr"} -{"time":"2026-02-26T14:39:41.476737152Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.003364817}],"total_operations":1}} -{"time":"2026-02-26T14:39:41.82022831Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2026-02-26T14:39:42.35788107Z","level":"INFO","msg":"stream: closing","id":"696nxyfr"} -{"time":"2026-02-26T14:39:42.357919856Z","level":"INFO","msg":"handler: closed","stream_id":"696nxyfr"} -{"time":"2026-02-26T14:39:42.357981703Z","level":"INFO","msg":"sender: closed","stream_id":"696nxyfr"} -{"time":"2026-02-26T14:39:42.357990898Z","level":"INFO","msg":"stream: closed","id":"696nxyfr"} +{"time":"2026-02-26T15:30:26.517889161Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-02-26T15:30:31.757267268Z","level":"INFO","msg":"stream: created new stream","id":"trcpjlfd"} +{"time":"2026-02-26T15:30:31.757342751Z","level":"INFO","msg":"stream: started","id":"trcpjlfd"} +{"time":"2026-02-26T15:30:31.757375145Z","level":"INFO","msg":"handler: started","stream_id":"trcpjlfd"} +{"time":"2026-02-26T15:30:31.757422001Z","level":"INFO","msg":"sender: started","stream_id":"trcpjlfd"} +{"time":"2026-02-26T15:30:31.757462295Z","level":"INFO","msg":"writer: started","stream_id":"trcpjlfd"} +{"time":"2026-02-26T16:08:47.579624032Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/tzach/toy-transformer-replication/trcpjlfd/file_stream","body":"\n
\n\nPlease try again in 30 seconds.\n
\n\n"} +{"time":"2026-02-26T16:12:40.720756645Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.00031624}],"total_operations":1}} +{"time":"2026-02-26T16:12:41.198683788Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-26T16:12:41.385614105Z","level":"INFO","msg":"stream: closing","id":"trcpjlfd"} +{"time":"2026-02-26T16:12:41.385657054Z","level":"INFO","msg":"handler: closed","stream_id":"trcpjlfd"} +{"time":"2026-02-26T16:12:41.385711626Z","level":"INFO","msg":"sender: closed","stream_id":"trcpjlfd"} +{"time":"2026-02-26T16:12:41.385718584Z","level":"INFO","msg":"stream: closed","id":"trcpjlfd"} diff --git a/wandb/debug.log b/wandb/debug.log index d628c19744d0df790b8f77df43df800da59c34ce..3ee3411527a8785b9507f88e164f4432869d104c 100644 --- a/wandb/debug.log +++ b/wandb/debug.log @@ -1,26 +1,26 @@ -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Configure stats pid to 4744 -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from environment variables -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_135602-696nxyfr/logs/debug.log -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_135602-696nxyfr/logs/debug-internal.log -2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:init():813] calling init triggers -2026-02-26 13:56:02,098 INFO MainThread:4744 [wandb_init.py:init():818] wandb.init called with sweep_config: {} -config: {'model_name': 'pile_llama_2H_2L', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 2, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} -2026-02-26 13:56:02,098 INFO MainThread:4744 [wandb_init.py:init():854] starting backend -2026-02-26 13:56:02,302 INFO MainThread:4744 [wandb_init.py:init():857] sending inform_init request -2026-02-26 13:56:02,306 INFO MainThread:4744 [wandb_init.py:init():865] backend started and connected -2026-02-26 13:56:02,308 INFO MainThread:4744 [wandb_init.py:init():936] updated telemetry -2026-02-26 13:56:02,312 INFO MainThread:4744 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout -2026-02-26 13:56:02,787 INFO MainThread:4744 [wandb_init.py:init():1011] starting run threads in backend -2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_console_start():2506] atexit reg -2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2354] redirect: wrap_raw -2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2423] Wrapping output streams. -2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2446] Redirects installed. -2026-02-26 13:56:02,905 INFO MainThread:4744 [wandb_init.py:init():1049] run started, returning control to user process -2026-02-26 14:39:41,470 INFO MainThread:4744 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/696nxyfr -2026-02-26 14:39:41,472 INFO MainThread:4744 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 -2026-02-26 14:39:41,473 INFO MainThread:4744 [wandb_run.py:_restore():2453] restore -2026-02-26 14:39:41,473 INFO MainThread:4744 [wandb_run.py:_restore():2459] restore done -2026-02-26 14:39:42,357 INFO MainThread:4744 [wandb_run.py:_footer_sync_info():3867] logging synced files +2026-02-26 15:30:26,235 INFO MainThread:5904 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Configure stats pid to 5904 +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_153026-trcpjlfd/logs/debug.log +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_153026-trcpjlfd/logs/debug-internal.log +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():813] calling init triggers +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_H1_L2', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 1, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():854] starting backend +2026-02-26 15:30:26,504 INFO MainThread:5904 [wandb_init.py:init():857] sending inform_init request +2026-02-26 15:30:26,514 INFO MainThread:5904 [wandb_init.py:init():865] backend started and connected +2026-02-26 15:30:26,515 INFO MainThread:5904 [wandb_init.py:init():936] updated telemetry +2026-02-26 15:30:26,520 INFO MainThread:5904 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-02-26 15:30:32,186 INFO MainThread:5904 [wandb_init.py:init():1011] starting run threads in backend +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_console_start():2506] atexit reg +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2446] Redirects installed. +2026-02-26 15:30:33,026 INFO MainThread:5904 [wandb_init.py:init():1049] run started, returning control to user process +2026-02-26 16:12:40,715 INFO MainThread:5904 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/trcpjlfd +2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_restore():2453] restore +2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_restore():2459] restore done +2026-02-26 16:12:41,384 INFO MainThread:5904 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260226_153026-trcpjlfd/files/config.yaml b/wandb/run-20260226_153026-trcpjlfd/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d21fb675869d2249a2b12d4d5f02578cae2b4e4 --- /dev/null +++ b/wandb/run-20260226_153026-trcpjlfd/files/config.yaml @@ -0,0 +1,140 @@ +_wandb: + value: + cli_version: 0.21.4 + e: + 1azdhqr66s9127dpz7denwtgp0pkf39b: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "179147603968" + email: tzfof8@gmail.com + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: 4a8df1a57a456227acca0b4948ad48c9f03a929a + remote: https://github.com/jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-fa82911d-b891-1e23-11c8-f31149299e8c + host: nb5tce73r3 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program:Please try again in 30 seconds.\n
\n\n"} +{"time":"2026-02-26T16:12:40.720756645Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.00031624}],"total_operations":1}} +{"time":"2026-02-26T16:12:41.198683788Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-26T16:12:41.385614105Z","level":"INFO","msg":"stream: closing","id":"trcpjlfd"} +{"time":"2026-02-26T16:12:41.385657054Z","level":"INFO","msg":"handler: closed","stream_id":"trcpjlfd"} +{"time":"2026-02-26T16:12:41.385711626Z","level":"INFO","msg":"sender: closed","stream_id":"trcpjlfd"} +{"time":"2026-02-26T16:12:41.385718584Z","level":"INFO","msg":"stream: closed","id":"trcpjlfd"} diff --git a/wandb/run-20260226_153026-trcpjlfd/logs/debug.log b/wandb/run-20260226_153026-trcpjlfd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3ee3411527a8785b9507f88e164f4432869d104c --- /dev/null +++ b/wandb/run-20260226_153026-trcpjlfd/logs/debug.log @@ -0,0 +1,26 @@ +2026-02-26 15:30:26,235 INFO MainThread:5904 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Configure stats pid to 5904 +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_153026-trcpjlfd/logs/debug.log +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_153026-trcpjlfd/logs/debug-internal.log +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():813] calling init triggers +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_H1_L2', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 1, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-02-26 15:30:26,236 INFO MainThread:5904 [wandb_init.py:init():854] starting backend +2026-02-26 15:30:26,504 INFO MainThread:5904 [wandb_init.py:init():857] sending inform_init request +2026-02-26 15:30:26,514 INFO MainThread:5904 [wandb_init.py:init():865] backend started and connected +2026-02-26 15:30:26,515 INFO MainThread:5904 [wandb_init.py:init():936] updated telemetry +2026-02-26 15:30:26,520 INFO MainThread:5904 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-02-26 15:30:32,186 INFO MainThread:5904 [wandb_init.py:init():1011] starting run threads in backend +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_console_start():2506] atexit reg +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-02-26 15:30:33,011 INFO MainThread:5904 [wandb_run.py:_redirect():2446] Redirects installed. +2026-02-26 15:30:33,026 INFO MainThread:5904 [wandb_init.py:init():1049] run started, returning control to user process +2026-02-26 16:12:40,715 INFO MainThread:5904 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/trcpjlfd +2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_restore():2453] restore +2026-02-26 16:12:40,719 INFO MainThread:5904 [wandb_run.py:_restore():2459] restore done +2026-02-26 16:12:41,384 INFO MainThread:5904 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb b/wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e4ca3845d1c41e91a4d67d218e262317c59ac3c2 --- /dev/null +++ b/wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63948b1b9244b00019c110f3957fac0b53dc62ad14a2ef91eeb719b4e1244b34 +size 3893836