diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..fd857501c7ca4b874b61dd810867d733bc2ffeaf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/metadata_000000032768.json b/checkpoints/metadata_000000032768.json new file mode 100644 index 0000000000000000000000000000000000000000..b8a7704ce050180271ca25f2f255fcf595a47b6c --- /dev/null +++ b/checkpoints/metadata_000000032768.json @@ -0,0 +1 @@ +{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.47806167602539} \ No newline at end of file diff --git a/checkpoints/metadata_000000327680.json b/checkpoints/metadata_000000327680.json new file mode 100644 index 0000000000000000000000000000000000000000..d42d566ea67f268fa6f57813ee4d541346608b09 --- /dev/null +++ b/checkpoints/metadata_000000327680.json @@ -0,0 +1 @@ +{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.47627660688796} \ No newline at end of file diff --git a/checkpoints/metadata_000000360448.json b/checkpoints/metadata_000000360448.json new file mode 100644 index 0000000000000000000000000000000000000000..3d7caabc185ac1f7af4a964848e301009e0bee87 --- /dev/null +++ b/checkpoints/metadata_000000360448.json @@ -0,0 +1 @@ +{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.475470339918813} \ No newline at end of file diff --git a/checkpoints/metadata_000000425984.json b/checkpoints/metadata_000000425984.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe398142a24e02b10c494cb6ff60557b8f9f5f4 --- /dev/null +++ b/checkpoints/metadata_000000425984.json @@ -0,0 +1 @@ +{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.473083299528513} \ No newline at end of file diff --git a/checkpoints/metadata_000000458752.json b/checkpoints/metadata_000000458752.json new file mode 100644 index 0000000000000000000000000000000000000000..f162026671441be3ff60f893882f1a74bc817645 --- /dev/null +++ b/checkpoints/metadata_000000458752.json @@ -0,0 +1 @@ +{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.471743989894433} \ No newline at end of file diff --git a/checkpoints/metadata_000000491520.json b/checkpoints/metadata_000000491520.json new file mode 100644 index 0000000000000000000000000000000000000000..4c79139f1f06a20bc53600249f641d0ccb6435b6 --- /dev/null +++ b/checkpoints/metadata_000000491520.json @@ -0,0 +1 @@ +{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.470174342811651} \ No newline at end of file diff --git a/checkpoints/metadata_000000557056.json b/checkpoints/metadata_000000557056.json new file mode 100644 index 0000000000000000000000000000000000000000..aaeb3f54a99814ce2d5cd77e2a76e06123246521 --- /dev/null +++ b/checkpoints/metadata_000000557056.json @@ -0,0 +1 @@ +{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.466137204272727} \ No newline at end of file diff --git a/checkpoints/metadata_000000622592.json b/checkpoints/metadata_000000622592.json new file mode 100644 index 0000000000000000000000000000000000000000..dc64e5070c0c8acda29fae1e838e840583d89fda --- /dev/null +++ b/checkpoints/metadata_000000622592.json @@ -0,0 +1 @@ +{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.461292862352218} \ No newline at end of file diff --git a/checkpoints/metadata_000000688128.json b/checkpoints/metadata_000000688128.json new file mode 100644 index 0000000000000000000000000000000000000000..ee621bde74f1b70e7745f6d6982ee0ce657f9468 --- /dev/null +++ b/checkpoints/metadata_000000688128.json @@ -0,0 +1 @@ +{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.455025942618947} \ No newline at end of file diff --git a/checkpoints/metadata_000000753664.json b/checkpoints/metadata_000000753664.json new file mode 100644 index 0000000000000000000000000000000000000000..1baef2ebe8e428cd1b1f752d67724878e2d5fd9f --- /dev/null +++ b/checkpoints/metadata_000000753664.json @@ -0,0 +1 @@ +{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.446919057636006} \ No newline at end of file diff --git a/checkpoints/metadata_000000819200.json b/checkpoints/metadata_000000819200.json new file mode 100644 index 0000000000000000000000000000000000000000..62bf26e4dcb7882b53b7443ab588061ba613e705 --- /dev/null +++ b/checkpoints/metadata_000000819200.json @@ -0,0 +1 @@ +{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.438280975135724} \ No newline at end of file diff --git a/checkpoints/metadata_000000917504.json b/checkpoints/metadata_000000917504.json new file mode 100644 index 0000000000000000000000000000000000000000..2f0e48b7dcf9d3e421a608b288bc8edd356acd51 --- /dev/null +++ b/checkpoints/metadata_000000917504.json @@ -0,0 +1 @@ +{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.422044364253066} \ No newline at end of file diff --git a/checkpoints/metadata_000000983040.json b/checkpoints/metadata_000000983040.json new file mode 100644 index 0000000000000000000000000000000000000000..699cee11d0214ac7e5944ecdd04e8cb13a300994 --- /dev/null +++ b/checkpoints/metadata_000000983040.json @@ -0,0 +1 @@ +{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.409236852859813} \ No newline at end of file diff --git a/checkpoints/metadata_000001114112.json b/checkpoints/metadata_000001114112.json new file mode 100644 index 0000000000000000000000000000000000000000..141574201d643111e2cf18c3c5d36f20e45211a4 --- /dev/null +++ b/checkpoints/metadata_000001114112.json @@ -0,0 +1 @@ +{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.380797570445656} \ No newline at end of file diff --git a/checkpoints/metadata_000001212416.json b/checkpoints/metadata_000001212416.json new file mode 100644 index 0000000000000000000000000000000000000000..e03c279a8bed8806a93a4b404c61377ac32160c4 --- /dev/null +++ b/checkpoints/metadata_000001212416.json @@ -0,0 +1 @@ +{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.3582079333492} \ No newline at end of file diff --git a/checkpoints/metadata_000001343488.json b/checkpoints/metadata_000001343488.json new file mode 100644 index 0000000000000000000000000000000000000000..fcdd216343e123858497be637c6bb7b0e680f4c6 --- /dev/null +++ b/checkpoints/metadata_000001343488.json @@ -0,0 +1 @@ +{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.325794613706586} \ No newline at end of file diff --git a/checkpoints/metadata_000001474560.json b/checkpoints/metadata_000001474560.json new file mode 100644 index 0000000000000000000000000000000000000000..0d18f8776a72ed8ccd933da3d907f83603159d46 --- /dev/null +++ b/checkpoints/metadata_000001474560.json @@ -0,0 +1 @@ +{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.289813354053809} \ No newline at end of file diff --git a/checkpoints/metadata_000001605632.json b/checkpoints/metadata_000001605632.json new file mode 100644 index 0000000000000000000000000000000000000000..b36f5e561d2af918ee2cba8a2ea944fffa623b7e --- /dev/null +++ b/checkpoints/metadata_000001605632.json @@ -0,0 +1 @@ +{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.250201881192115} \ No newline at end of file diff --git a/checkpoints/metadata_000001769472.json b/checkpoints/metadata_000001769472.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ef822016a8a8f7beb31e27a3229e9be5246628 --- /dev/null +++ b/checkpoints/metadata_000001769472.json @@ -0,0 +1 @@ +{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.19959723315217} \ No newline at end of file diff --git a/checkpoints/metadata_000001966080.json b/checkpoints/metadata_000001966080.json new file mode 100644 index 0000000000000000000000000000000000000000..c2a1487afc63bae04e991f52ae073bf2a1a19546 --- /dev/null +++ b/checkpoints/metadata_000001966080.json @@ -0,0 +1 @@ +{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.13133821097923} \ No newline at end of file diff --git a/checkpoints/metadata_000002162688.json b/checkpoints/metadata_000002162688.json new file mode 100644 index 0000000000000000000000000000000000000000..0987d20d4d9e2bed5b80ffa13991890f19111b52 --- /dev/null +++ b/checkpoints/metadata_000002162688.json @@ -0,0 +1 @@ +{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.0570300679636} \ No newline at end of file diff --git a/checkpoints/metadata_000002359296.json b/checkpoints/metadata_000002359296.json new file mode 100644 index 0000000000000000000000000000000000000000..e4cc5f37f9cc6cf896d9020ba39870d9c5207eaf --- /dev/null +++ b/checkpoints/metadata_000002359296.json @@ -0,0 +1 @@ +{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.974723497158761} \ No newline at end of file diff --git a/checkpoints/metadata_000002621440.json b/checkpoints/metadata_000002621440.json new file mode 100644 index 0000000000000000000000000000000000000000..229c04826656632bc75f6db36a8162b0844225db --- /dev/null +++ b/checkpoints/metadata_000002621440.json @@ -0,0 +1 @@ +{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.858762341449337} \ No newline at end of file diff --git a/checkpoints/metadata_000002883584.json b/checkpoints/metadata_000002883584.json new file mode 100644 index 0000000000000000000000000000000000000000..9c16151176a43d6ba48c2cf9d77e7d0842e1cd5b --- /dev/null +++ b/checkpoints/metadata_000002883584.json @@ -0,0 +1 @@ +{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.730080927623849} \ No newline at end of file diff --git a/checkpoints/metadata_000003178496.json b/checkpoints/metadata_000003178496.json new file mode 100644 index 0000000000000000000000000000000000000000..022fba1fc00a947fd087ddbc78cd0d5f32e0efb1 --- /dev/null +++ b/checkpoints/metadata_000003178496.json @@ -0,0 +1 @@ +{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.582970532481923} \ No newline at end of file diff --git a/checkpoints/metadata_000003473408.json b/checkpoints/metadata_000003473408.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4a6bda07a832da46b9026337d8e10541540c70 --- /dev/null +++ b/checkpoints/metadata_000003473408.json @@ -0,0 +1 @@ +{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.424478446215208} \ No newline at end of file diff --git a/checkpoints/metadata_000003833856.json b/checkpoints/metadata_000003833856.json new file mode 100644 index 0000000000000000000000000000000000000000..725ce05c5d991e1f3dd8cdb521eaa8f1ff8e82cd --- /dev/null +++ b/checkpoints/metadata_000003833856.json @@ -0,0 +1 @@ +{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.225284321418476} \ No newline at end of file diff --git a/checkpoints/metadata_000004227072.json b/checkpoints/metadata_000004227072.json new file mode 100644 index 0000000000000000000000000000000000000000..e281a19b012143bbf7a5e49a68ca79fcd9013594 --- /dev/null +++ b/checkpoints/metadata_000004227072.json @@ -0,0 +1 @@ +{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.99394007776402} \ No newline at end of file diff --git a/checkpoints/metadata_000004653056.json b/checkpoints/metadata_000004653056.json new file mode 100644 index 0000000000000000000000000000000000000000..b705fd1804265a2b8c8b9272d36c894ceb86cbf2 --- /dev/null +++ b/checkpoints/metadata_000004653056.json @@ -0,0 +1 @@ +{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.744109641520092} \ No newline at end of file diff --git a/checkpoints/metadata_000005111808.json b/checkpoints/metadata_000005111808.json new file mode 100644 index 0000000000000000000000000000000000000000..674db93cfc7bfa57cfbb6bb6a7ea503278c9964f --- /dev/null +++ b/checkpoints/metadata_000005111808.json @@ -0,0 +1 @@ +{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.493460174429384} \ No newline at end of file diff --git a/checkpoints/metadata_000005603328.json b/checkpoints/metadata_000005603328.json new file mode 100644 index 0000000000000000000000000000000000000000..5ec22e9a8d4a96a9fc014de4122cb5fea5df3543 --- /dev/null +++ b/checkpoints/metadata_000005603328.json @@ -0,0 +1 @@ +{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.236969204325344} \ No newline at end of file diff --git a/checkpoints/metadata_000006193152.json b/checkpoints/metadata_000006193152.json new file mode 100644 index 0000000000000000000000000000000000000000..1b7d974dcfa9137c070ccd4821c0dd18cf205bbd --- /dev/null +++ b/checkpoints/metadata_000006193152.json @@ -0,0 +1 @@ +{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.953434445580491} \ No newline at end of file diff --git a/checkpoints/metadata_000006782976.json b/checkpoints/metadata_000006782976.json new file mode 100644 index 0000000000000000000000000000000000000000..522eaa9db8475977b9e6bdc1dd093a2c00068144 --- /dev/null +++ b/checkpoints/metadata_000006782976.json @@ -0,0 +1 @@ +{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.701950555362339} \ No newline at end of file diff --git a/checkpoints/metadata_000007471104.json b/checkpoints/metadata_000007471104.json new file mode 100644 index 0000000000000000000000000000000000000000..99d6b4f594adf36b9c235b7e80a492b6b8559fa2 --- /dev/null +++ b/checkpoints/metadata_000007471104.json @@ -0,0 +1 @@ +{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.42616407882357} \ No newline at end of file diff --git a/checkpoints/metadata_000008224768.json b/checkpoints/metadata_000008224768.json new file mode 100644 index 0000000000000000000000000000000000000000..59813b8f5d1563a930c257626e2f884613387b9e --- /dev/null +++ b/checkpoints/metadata_000008224768.json @@ -0,0 +1 @@ +{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.164880585767827} \ No newline at end of file diff --git a/checkpoints/metadata_000009043968.json b/checkpoints/metadata_000009043968.json new file mode 100644 index 0000000000000000000000000000000000000000..8a63166dc6e3c2a6edc73070bf7be87c14a84a77 --- /dev/null +++ b/checkpoints/metadata_000009043968.json @@ -0,0 +1 @@ +{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.909649069151253} \ No newline at end of file diff --git a/checkpoints/metadata_000009961472.json b/checkpoints/metadata_000009961472.json new file mode 100644 index 0000000000000000000000000000000000000000..c3517fd833e0f7cadb502a735212b52d2f4107c4 --- /dev/null +++ b/checkpoints/metadata_000009961472.json @@ -0,0 +1 @@ +{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.658180864800952} \ No newline at end of file diff --git a/checkpoints/metadata_000010944512.json b/checkpoints/metadata_000010944512.json new file mode 100644 index 0000000000000000000000000000000000000000..d4bd723383d6fc38c44747393bddd3cd111d57cd --- /dev/null +++ b/checkpoints/metadata_000010944512.json @@ -0,0 +1 @@ +{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.4353112753232855} \ No newline at end of file diff --git a/checkpoints/metadata_000012058624.json b/checkpoints/metadata_000012058624.json new file mode 100644 index 0000000000000000000000000000000000000000..92877621131e2e706f1f6f358c8032d01e012524 --- /dev/null +++ b/checkpoints/metadata_000012058624.json @@ -0,0 +1 @@ +{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.221434970794934} \ No newline at end of file diff --git a/checkpoints/metadata_000013271040.json b/checkpoints/metadata_000013271040.json new file mode 100644 index 0000000000000000000000000000000000000000..eae251b83789c3befe5d920139e3123303fd4b10 --- /dev/null +++ b/checkpoints/metadata_000013271040.json @@ -0,0 +1 @@ +{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.030378599797957} \ No newline at end of file diff --git a/checkpoints/metadata_000014581760.json b/checkpoints/metadata_000014581760.json new file mode 100644 index 0000000000000000000000000000000000000000..16225b102b9c4ba26bfbb61a1e2354b122728295 --- /dev/null +++ b/checkpoints/metadata_000014581760.json @@ -0,0 +1 @@ +{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.85191591132156} \ No newline at end of file diff --git a/checkpoints/metadata_000016056320.json b/checkpoints/metadata_000016056320.json new file mode 100644 index 0000000000000000000000000000000000000000..7871463ae5e10f6b474f59ed222854d3b52bb8e4 --- /dev/null +++ b/checkpoints/metadata_000016056320.json @@ -0,0 +1 @@ +{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.68748776856525} \ No newline at end of file diff --git a/checkpoints/metadata_000016384000.json b/checkpoints/metadata_000016384000.json new file mode 100644 index 0000000000000000000000000000000000000000..0791b7ee0fc6cd8a0d751ae8eb9047aec7eb5111 --- /dev/null +++ b/checkpoints/metadata_000016384000.json @@ -0,0 +1 @@ +{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.661260096068413} \ No newline at end of file diff --git a/checkpoints/metadata_000017661952.json b/checkpoints/metadata_000017661952.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d8a81e6e63ab148e54ccd4a0c64ced8f1c54cb --- /dev/null +++ b/checkpoints/metadata_000017661952.json @@ -0,0 +1 @@ +{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.539440078913039} \ No newline at end of file diff --git a/checkpoints/metadata_000019431424.json b/checkpoints/metadata_000019431424.json new file mode 100644 index 0000000000000000000000000000000000000000..795a61c7538edf75fadbc573f4c1a6a3efd34000 --- /dev/null +++ b/checkpoints/metadata_000019431424.json @@ -0,0 +1 @@ +{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.392000028286869} \ No newline at end of file diff --git a/checkpoints/metadata_000021364736.json b/checkpoints/metadata_000021364736.json new file mode 100644 index 0000000000000000000000000000000000000000..c61b790b1e58816b8c6ad37b2c9dd1a7570b25d4 --- /dev/null +++ b/checkpoints/metadata_000021364736.json @@ -0,0 +1 @@ +{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.281319973885694} \ No newline at end of file diff --git a/checkpoints/metadata_000023494656.json b/checkpoints/metadata_000023494656.json new file mode 100644 index 0000000000000000000000000000000000000000..8b6585cbaf873238467d8c3501ab32ffa66b637d --- /dev/null +++ b/checkpoints/metadata_000023494656.json @@ -0,0 +1 @@ +{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.169868723380111} \ No newline at end of file diff --git a/checkpoints/metadata_000025853952.json b/checkpoints/metadata_000025853952.json new file mode 100644 index 0000000000000000000000000000000000000000..f33b26f878db5f9b249b6d9af9415a415a9044f4 --- /dev/null +++ b/checkpoints/metadata_000025853952.json @@ -0,0 +1 @@ +{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 5.055504738591524} \ No newline at end of file diff --git a/checkpoints/metadata_000028442624.json b/checkpoints/metadata_000028442624.json new file mode 100644 index 0000000000000000000000000000000000000000..66b591df2df3b2b40f9b89e19e29f7d9b0bb30f3 --- /dev/null +++ b/checkpoints/metadata_000028442624.json @@ -0,0 +1 @@ +{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.954886293959172} \ No newline at end of file diff --git a/checkpoints/metadata_000031293440.json b/checkpoints/metadata_000031293440.json new file mode 100644 index 0000000000000000000000000000000000000000..ac6d4f57c0dcddf36e0e4bee3210521fe8c37c8f --- /dev/null +++ b/checkpoints/metadata_000031293440.json @@ -0,0 +1 @@ +{"step": 955, "tokens_seen": 31293440, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.851076791499597} \ No newline at end of file diff --git a/checkpoints/metadata_000032768000.json b/checkpoints/metadata_000032768000.json new file mode 100644 index 0000000000000000000000000000000000000000..c41eb409957a149e6a51386411cf35f708a14e5b --- /dev/null +++ b/checkpoints/metadata_000032768000.json @@ -0,0 +1 @@ +{"step": 1000, "tokens_seen": 32768000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.7911227754693355} \ No newline at end of file diff --git a/checkpoints/metadata_000034439168.json b/checkpoints/metadata_000034439168.json new file mode 100644 index 0000000000000000000000000000000000000000..4492c2ab7920610c1664e2bfed985120d5a0a2dd --- /dev/null +++ b/checkpoints/metadata_000034439168.json @@ -0,0 +1 @@ +{"step": 1051, "tokens_seen": 34439168, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.724863075960573} \ No newline at end of file diff --git a/checkpoints/metadata_000037879808.json b/checkpoints/metadata_000037879808.json new file mode 100644 index 0000000000000000000000000000000000000000..400734ca04f2abd7b190cd3da788988a890f6216 --- /dev/null +++ b/checkpoints/metadata_000037879808.json @@ -0,0 +1 @@ +{"step": 1156, "tokens_seen": 37879808, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.601169516817602} \ No newline at end of file diff --git a/checkpoints/metadata_000041648128.json b/checkpoints/metadata_000041648128.json new file mode 100644 index 0000000000000000000000000000000000000000..c34c2d7368aa04268987cd74fd7e077b0c7c0897 --- /dev/null +++ b/checkpoints/metadata_000041648128.json @@ -0,0 +1 @@ +{"step": 1271, "tokens_seen": 41648128, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.486551862501883} \ No newline at end of file diff --git a/checkpoints/metadata_000045842432.json b/checkpoints/metadata_000045842432.json new file mode 100644 index 0000000000000000000000000000000000000000..b61af622c244d21522e59812abacdd7397c77481 --- /dev/null +++ b/checkpoints/metadata_000045842432.json @@ -0,0 +1 @@ +{"step": 1399, "tokens_seen": 45842432, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.381150849922109} \ No newline at end of file diff --git a/checkpoints/metadata_000049152000.json b/checkpoints/metadata_000049152000.json new file mode 100644 index 0000000000000000000000000000000000000000..a245ac5d882cff4d36b700408e920214664a80d8 --- /dev/null +++ b/checkpoints/metadata_000049152000.json @@ -0,0 +1 @@ +{"step": 1500, "tokens_seen": 49152000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.324639904720941} \ No newline at end of file diff --git a/checkpoints/metadata_000050397184.json b/checkpoints/metadata_000050397184.json new file mode 100644 index 0000000000000000000000000000000000000000..badddafe6eb760acafe95a7b859e7c742b39c8ea --- /dev/null +++ b/checkpoints/metadata_000050397184.json @@ -0,0 +1 @@ +{"step": 1538, "tokens_seen": 50397184, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.3145797956722785} \ No newline at end of file diff --git a/checkpoints/metadata_000055443456.json b/checkpoints/metadata_000055443456.json new file mode 100644 index 0000000000000000000000000000000000000000..8ec28fbc614e9c1aaa1e91ca920322588046cb94 --- /dev/null +++ b/checkpoints/metadata_000055443456.json @@ -0,0 +1 @@ +{"step": 1692, "tokens_seen": 55443456, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.2051704572488475} \ No newline at end of file diff --git a/checkpoints/metadata_000061014016.json b/checkpoints/metadata_000061014016.json new file mode 100644 index 0000000000000000000000000000000000000000..556fdebeccab6bdfe632de548af5ada71be43d8d --- /dev/null +++ b/checkpoints/metadata_000061014016.json @@ -0,0 +1 @@ +{"step": 1862, "tokens_seen": 61014016, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.134354490996065} \ No newline at end of file diff --git a/checkpoints/metadata_000065536000.json b/checkpoints/metadata_000065536000.json new file mode 100644 index 0000000000000000000000000000000000000000..bb529b5b0cd935e4afb3680420849cbf71aa27cb --- /dev/null +++ b/checkpoints/metadata_000065536000.json @@ -0,0 +1 @@ +{"step": 2000, "tokens_seen": 65536000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.110466300872504} \ No newline at end of file diff --git a/checkpoints/metadata_000067108864.json b/checkpoints/metadata_000067108864.json new file mode 100644 index 0000000000000000000000000000000000000000..1757eb988ac492c771c269c1acdae12053d66363 --- /dev/null +++ b/checkpoints/metadata_000067108864.json @@ -0,0 +1 @@ +{"step": 2048, "tokens_seen": 67108864, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 4.087188933090007} \ No newline at end of file diff --git a/checkpoints/metadata_000073826304.json b/checkpoints/metadata_000073826304.json new file mode 100644 index 0000000000000000000000000000000000000000..feae5e8c48e60711ffdbb85611b8ba987fabba14 --- /dev/null +++ b/checkpoints/metadata_000073826304.json @@ -0,0 +1 @@ +{"step": 2253, "tokens_seen": 73826304, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.97070491059368} \ No newline at end of file diff --git a/checkpoints/metadata_000081199104.json b/checkpoints/metadata_000081199104.json new file mode 100644 index 0000000000000000000000000000000000000000..a0c4218d2d588662edcbd50049228cb1fb4ebe0f --- /dev/null +++ b/checkpoints/metadata_000081199104.json @@ -0,0 +1 @@ +{"step": 2478, "tokens_seen": 81199104, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8737115629455845} \ No newline at end of file diff --git a/checkpoints/metadata_000081920000.json b/checkpoints/metadata_000081920000.json new file mode 100644 index 0000000000000000000000000000000000000000..3a831bc92542c503310faaa6aa9e960341ee8c76 --- /dev/null +++ b/checkpoints/metadata_000081920000.json @@ -0,0 +1 @@ +{"step": 2500, "tokens_seen": 81920000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.8585853468518287} \ No newline at end of file diff --git a/checkpoints/metadata_000089325568.json b/checkpoints/metadata_000089325568.json new file mode 100644 index 0000000000000000000000000000000000000000..1900aa4a0762fb0a255dc1645e91ea221d4d7027 --- /dev/null +++ b/checkpoints/metadata_000089325568.json @@ -0,0 +1 @@ +{"step": 2726, "tokens_seen": 89325568, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7747637464189885} \ No newline at end of file diff --git a/checkpoints/metadata_000098271232.json b/checkpoints/metadata_000098271232.json new file mode 100644 index 0000000000000000000000000000000000000000..69c03cfbca3f59ddc3750ac1161f9fc26fe79a99 --- /dev/null +++ b/checkpoints/metadata_000098271232.json @@ -0,0 +1 @@ +{"step": 2999, "tokens_seen": 98271232, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7082875029547786} \ No newline at end of file diff --git a/checkpoints/metadata_000098304000.json b/checkpoints/metadata_000098304000.json new file mode 100644 index 0000000000000000000000000000000000000000..fe05efda3ec61761ebcd661636e651d38f216499 --- /dev/null +++ b/checkpoints/metadata_000098304000.json @@ -0,0 +1 @@ +{"step": 3000, "tokens_seen": 98304000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.7076009311021108} \ No newline at end of file diff --git a/checkpoints/metadata_000108068864.json b/checkpoints/metadata_000108068864.json new file mode 100644 index 0000000000000000000000000000000000000000..ce2f7c30bb5007b0d289b584e6f89fda470f5258 --- /dev/null +++ b/checkpoints/metadata_000108068864.json @@ -0,0 +1 @@ +{"step": 3298, "tokens_seen": 108068864, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.636359072621687} \ No newline at end of file diff --git a/checkpoints/metadata_000114688000.json b/checkpoints/metadata_000114688000.json new file mode 100644 index 0000000000000000000000000000000000000000..227048f0521935ea72250b63a45ae9745aae6a27 --- /dev/null +++ b/checkpoints/metadata_000114688000.json @@ -0,0 +1 @@ +{"step": 3500, "tokens_seen": 114688000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.593479082164686} \ No newline at end of file diff --git a/checkpoints/metadata_000118882304.json b/checkpoints/metadata_000118882304.json new file mode 100644 index 0000000000000000000000000000000000000000..d479fbddbf8e5db81adc95f020df87b47c80b8fb --- /dev/null +++ b/checkpoints/metadata_000118882304.json @@ -0,0 +1 @@ +{"step": 3628, "tokens_seen": 118882304, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.601109134096057} \ No newline at end of file diff --git a/checkpoints/metadata_000130777088.json b/checkpoints/metadata_000130777088.json new file mode 100644 index 0000000000000000000000000000000000000000..2750ed3e5e1ef1fa5c25cf17093eb16d000e9381 --- /dev/null +++ b/checkpoints/metadata_000130777088.json @@ -0,0 +1 @@ +{"step": 3991, "tokens_seen": 130777088, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.550488793464195} \ No newline at end of file diff --git a/checkpoints/metadata_000131072000.json b/checkpoints/metadata_000131072000.json new file mode 100644 index 0000000000000000000000000000000000000000..18849e9ca96ae5b68de14fc0c43a78c7ba26c957 --- /dev/null +++ b/checkpoints/metadata_000131072000.json @@ -0,0 +1 @@ +{"step": 4000, "tokens_seen": 131072000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5571278406617366} \ No newline at end of file diff --git a/checkpoints/metadata_000143851520.json b/checkpoints/metadata_000143851520.json new file mode 100644 index 0000000000000000000000000000000000000000..448586e0d1ab30dc4d690a5b53269d25e1601eec --- /dev/null +++ b/checkpoints/metadata_000143851520.json @@ -0,0 +1 @@ +{"step": 4390, "tokens_seen": 143851520, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.516337038989698} \ No newline at end of file diff --git a/checkpoints/metadata_000147456000.json b/checkpoints/metadata_000147456000.json new file mode 100644 index 0000000000000000000000000000000000000000..17e8a812b87c0d027a0353f3a5529ff5481b4209 --- /dev/null +++ b/checkpoints/metadata_000147456000.json @@ -0,0 +1 @@ +{"step": 4500, "tokens_seen": 147456000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.5048954366844023} \ No newline at end of file diff --git a/checkpoints/metadata_000158269440.json b/checkpoints/metadata_000158269440.json new file mode 100644 index 0000000000000000000000000000000000000000..1a74e1ce5e1f7800d7e27db85ce7587d4abf0598 --- /dev/null +++ b/checkpoints/metadata_000158269440.json @@ -0,0 +1 @@ +{"step": 4830, "tokens_seen": 158269440, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4822720271960272} \ No newline at end of file diff --git a/checkpoints/metadata_000163840000.json b/checkpoints/metadata_000163840000.json new file mode 100644 index 0000000000000000000000000000000000000000..5a6222b1f86d0e2a9b09aed0e616d860deff098d --- /dev/null +++ b/checkpoints/metadata_000163840000.json @@ -0,0 +1 @@ +{"step": 5000, "tokens_seen": 163840000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.464319650781479} \ No newline at end of file diff --git a/checkpoints/metadata_000174096384.json b/checkpoints/metadata_000174096384.json new file mode 100644 index 0000000000000000000000000000000000000000..820bc4775926ef6f018b4d36a759993c6b617299 --- /dev/null +++ b/checkpoints/metadata_000174096384.json @@ -0,0 +1 @@ +{"step": 5313, "tokens_seen": 174096384, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.457515758075735} \ No newline at end of file diff --git a/checkpoints/metadata_000180224000.json b/checkpoints/metadata_000180224000.json new file mode 100644 index 0000000000000000000000000000000000000000..4917538da5440464bf3d9d54e46618a83d4b6a26 --- /dev/null +++ b/checkpoints/metadata_000180224000.json @@ -0,0 +1 @@ +{"step": 5500, "tokens_seen": 180224000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.435171628668802} \ No newline at end of file diff --git a/checkpoints/metadata_000191496192.json b/checkpoints/metadata_000191496192.json new file mode 100644 index 0000000000000000000000000000000000000000..dc49e0d1b798e38bc1aff80cc8aef63848f28682 --- /dev/null +++ b/checkpoints/metadata_000191496192.json @@ -0,0 +1 @@ +{"step": 5844, "tokens_seen": 191496192, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4124552194940834} \ No newline at end of file diff --git a/checkpoints/metadata_000196608000.json b/checkpoints/metadata_000196608000.json new file mode 100644 index 0000000000000000000000000000000000000000..cb221ea4c8266a72c3feb632c6e8ce269ce7b76e --- /dev/null +++ b/checkpoints/metadata_000196608000.json @@ -0,0 +1 @@ +{"step": 6000, "tokens_seen": 196608000, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4061981319064123} \ No newline at end of file diff --git a/checkpoints/metadata_000196706304.json b/checkpoints/metadata_000196706304.json new file mode 100644 index 0000000000000000000000000000000000000000..ce6fcda663fa048fd81037b1686b7c1d12ade99d --- /dev/null +++ b/checkpoints/metadata_000196706304.json @@ -0,0 +1 @@ +{"step": 6003, "tokens_seen": 196706304, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.4074958290792825} \ No newline at end of file diff --git a/checkpoints/metadata_000197361664.json b/checkpoints/metadata_000197361664.json new file mode 100644 index 0000000000000000000000000000000000000000..f9e7708d8ebc7af127c910c4684b4680751be145 --- /dev/null +++ b/checkpoints/metadata_000197361664.json @@ -0,0 +1 @@ +{"step": 6023, "tokens_seen": 197361664, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.408092934298367} \ No newline at end of file diff --git a/checkpoints/metadata_000198017024.json b/checkpoints/metadata_000198017024.json new file mode 100644 index 0000000000000000000000000000000000000000..6ef2aaeeceec1383b420fc78c804b25e00da04e8 --- /dev/null +++ b/checkpoints/metadata_000198017024.json @@ -0,0 +1 @@ +{"step": 6043, "tokens_seen": 198017024, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.407055360049092} \ No newline at end of file diff --git a/checkpoints/metadata_000198672384.json b/checkpoints/metadata_000198672384.json new file mode 100644 index 0000000000000000000000000000000000000000..e6e02759141141cdd27f8bccb4fda874a7d1fccf --- /dev/null +++ b/checkpoints/metadata_000198672384.json @@ -0,0 +1 @@ +{"step": 6063, "tokens_seen": 198672384, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.396747844162258} \ No newline at end of file diff --git a/checkpoints/metadata_000199327744.json b/checkpoints/metadata_000199327744.json new file mode 100644 index 0000000000000000000000000000000000000000..71ac03b566844b42f50cf08167ada7bacd8830f9 --- /dev/null +++ b/checkpoints/metadata_000199327744.json @@ -0,0 +1 @@ +{"step": 6083, "tokens_seen": 199327744, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.403324384960973} \ No newline at end of file diff --git a/checkpoints/metadata_000199950336.json b/checkpoints/metadata_000199950336.json new file mode 100644 index 0000000000000000000000000000000000000000..a72f6c7b92a2dcca81dc2215710976a9adf66bb8 --- /dev/null +++ b/checkpoints/metadata_000199950336.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.394872257897474} \ No newline at end of file diff --git a/checkpoints/model_weights_000000032768.pt b/checkpoints/model_weights_000000032768.pt new file mode 100644 index 0000000000000000000000000000000000000000..a912b9b69c392865dae7eb20b2190ee48cb78cc5 --- /dev/null +++ b/checkpoints/model_weights_000000032768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:126e91c770ee10d33ef0dee8496e4965913669ea9a7354bf884bcdf37e7c5add +size 152233941 diff --git a/checkpoints/model_weights_000000327680.pt b/checkpoints/model_weights_000000327680.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b63539146f3825bf9247c2de3baf602b1057e91 --- /dev/null +++ b/checkpoints/model_weights_000000327680.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea98c23a24df1202ce83af552102b4277bcac1c2c7c94f9da950eb17405619f0 +size 152233941 diff --git a/checkpoints/model_weights_000000360448.pt b/checkpoints/model_weights_000000360448.pt new file mode 100644 index 0000000000000000000000000000000000000000..faec3e6cc724ac45bdab04fe6d3aa176954f73cc --- /dev/null +++ b/checkpoints/model_weights_000000360448.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8388e7b2b76bce252ad09d6838b7cd237b21bbd11e43f6f9799306f36773562 +size 152233941 diff --git a/checkpoints/model_weights_000000425984.pt b/checkpoints/model_weights_000000425984.pt new file mode 100644 index 0000000000000000000000000000000000000000..2569ca3c9986d2eebce21a244c571db1d75e763c --- /dev/null +++ b/checkpoints/model_weights_000000425984.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3917ef49615df35325dacbccba2940f7925e10f273e3b8a03b809d81fa79f6e0 +size 152233941 diff --git a/checkpoints/model_weights_000000458752.pt b/checkpoints/model_weights_000000458752.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb0bc8d9e232cc1d899c65dc684e6aefd0699c89 --- /dev/null +++ b/checkpoints/model_weights_000000458752.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b291e796976bf61ef32d6b25a318d435fafb9ff544822bb86ad9dc6bdee50c3 +size 152233941 diff --git a/checkpoints/model_weights_000000491520.pt b/checkpoints/model_weights_000000491520.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbabfa266688403c0b4bb69b295467b47519330e --- /dev/null +++ b/checkpoints/model_weights_000000491520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642a8a4244c50cc016d27d94755f1b1655d29a49e1c50557a577e296acc9db4f +size 152233941 diff --git a/checkpoints/model_weights_000000557056.pt b/checkpoints/model_weights_000000557056.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc0d7784a5f8a020f58611c62c85f3b8cc8b19f --- /dev/null +++ b/checkpoints/model_weights_000000557056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cb29c4a8239ae521964d645a1c3006b0c19d7aff7878279db43127ae09d749a +size 152233941 diff --git a/checkpoints/model_weights_000000622592.pt b/checkpoints/model_weights_000000622592.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9dc21f7772aa20c59df58cd5501705d297ba875 --- /dev/null +++ b/checkpoints/model_weights_000000622592.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df47ea9321b321ba87765b5756b7629b48706909590a5adacf9823f9ffd37648 +size 152233941 diff --git a/checkpoints/model_weights_000000688128.pt b/checkpoints/model_weights_000000688128.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dbe4e3dbfbb135c6790399aa231613adc5dc1d9 --- /dev/null +++ b/checkpoints/model_weights_000000688128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ba296c1826535e86b4097ed1eb5a63d7fb5233dc4fd9345f818fb4653c1bf79 +size 152233941 diff --git a/checkpoints/model_weights_000000753664.pt b/checkpoints/model_weights_000000753664.pt new file mode 100644 index 0000000000000000000000000000000000000000..9efda2cc78dee96ba3409f9cc3503fe79a6400fa --- /dev/null +++ b/checkpoints/model_weights_000000753664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e51f2cb71ae53a36f7e18300cf3bab5a0f1551f9d5eaf49997320b6f693eafa +size 152233941 diff --git a/checkpoints/model_weights_000000819200.pt b/checkpoints/model_weights_000000819200.pt new file mode 100644 index 0000000000000000000000000000000000000000..701c895eabfe7fdd3f0be7baf08f327c2a4f88b0 --- /dev/null +++ b/checkpoints/model_weights_000000819200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89e7f4890a08ee3f6cb4eea8ad6adcbea64111882dfdbc3dfe12fae7e1201f90 +size 152233941 diff --git a/checkpoints/model_weights_000000917504.pt b/checkpoints/model_weights_000000917504.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb1e419cbb8e243f32d1c8b5c91a28736bd12a46 --- /dev/null +++ b/checkpoints/model_weights_000000917504.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32e920c21376e8bbc60342aed5bca729a6bb035fb4c31d672c96a8bde9a0f3e +size 152233941 diff --git a/checkpoints/model_weights_000000983040.pt b/checkpoints/model_weights_000000983040.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d3d1b045c12a235bede5755c3883bf6d32cceb1 --- /dev/null +++ b/checkpoints/model_weights_000000983040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:755c587dadfc7dbeab7660e6b1e3871bc67d2591350985da88bfe25cfe3477c6 +size 152233941 diff --git a/checkpoints/model_weights_000001114112.pt b/checkpoints/model_weights_000001114112.pt new file mode 100644 index 0000000000000000000000000000000000000000..475234a9985e5b40008a8f27773cdd44a538ed5f --- /dev/null +++ b/checkpoints/model_weights_000001114112.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d249d659833651df00fc1c946ef98ac5d34321290f97b49c08602fc744a07e +size 152233941 diff --git a/checkpoints/model_weights_000001212416.pt b/checkpoints/model_weights_000001212416.pt new file mode 100644 index 0000000000000000000000000000000000000000..14e5f3069811e3fba530846a0d346cf61eced1a8 --- /dev/null +++ b/checkpoints/model_weights_000001212416.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac757ab9ae59a742327542c4010e53789355750c273af57c418922049bd45c13 +size 152233941 diff --git a/checkpoints/model_weights_000001343488.pt b/checkpoints/model_weights_000001343488.pt new file mode 100644 index 0000000000000000000000000000000000000000..a189b3bb83a92186eb877a0d10a42308330ad864 --- /dev/null +++ b/checkpoints/model_weights_000001343488.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c7cb7b849007d479509bcb97749fa8eed22a45e3db82d08883bd7f012383419 +size 152233941 diff --git a/checkpoints/model_weights_000001474560.pt b/checkpoints/model_weights_000001474560.pt new file mode 100644 index 0000000000000000000000000000000000000000..9177c54e697abfea0280d11d05a8c38dbc6dc78a --- /dev/null +++ b/checkpoints/model_weights_000001474560.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b4cb1b8f3b9891e35f3517e843d51516924a2fdba36f91bd7f05ae2df17b7d +size 152233941 diff --git a/checkpoints/model_weights_000001605632.pt b/checkpoints/model_weights_000001605632.pt new file mode 100644 index 0000000000000000000000000000000000000000..9254f18b0adf59e749eebf81bd43f6579ea9f856 --- /dev/null +++ b/checkpoints/model_weights_000001605632.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e31677717bdc8294fc7eca35b5ef9bfc2cb261186f2de93f3bea134e7c0fb3 +size 152233941 diff --git a/checkpoints/model_weights_000001769472.pt b/checkpoints/model_weights_000001769472.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7e84fdba917d3f91f16855ae9426ff8c9fc40d9 --- /dev/null +++ b/checkpoints/model_weights_000001769472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ec994fd9c622b838312f914b019f46a1f4682b75f0259095aa79447cfdb55cc +size 152233941 diff --git a/checkpoints/model_weights_000001966080.pt b/checkpoints/model_weights_000001966080.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8a780005510d1e5e91d02939415fc3c123544e1 --- /dev/null +++ b/checkpoints/model_weights_000001966080.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012058c0a6d2f457ddc997b74be0ef3b9006aedeef426bce61ab9eba832b89df +size 152233941 diff --git a/checkpoints/model_weights_000002162688.pt b/checkpoints/model_weights_000002162688.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7fa17e0b366d57d2961a9958de90d03480f666f --- /dev/null +++ b/checkpoints/model_weights_000002162688.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d109259b8927d0bee42e2999c5fdf0eeca64a2a484f875db58e752418946d34 +size 152233941 diff --git a/checkpoints/model_weights_000002359296.pt b/checkpoints/model_weights_000002359296.pt new file mode 100644 index 0000000000000000000000000000000000000000..05e69b9fd548cd6bf68ab2fc8be4ba0e3db1e0db --- /dev/null +++ b/checkpoints/model_weights_000002359296.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a919bccd79ef2a7249045e9c1f7ff08bc1075a19b5a66f455e37f71181044ad4 +size 152233941 diff --git a/checkpoints/model_weights_000002621440.pt b/checkpoints/model_weights_000002621440.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fb540f0e0e30638a3a1c1827747f8085952ba30 --- /dev/null +++ b/checkpoints/model_weights_000002621440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0ff6d6283609474fc694d39fd180e288929b5194e822e612720e7358958676 +size 152233941 diff --git a/checkpoints/model_weights_000002883584.pt b/checkpoints/model_weights_000002883584.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3fc48d8073ba80c2e3ef5757efbb27887ac89ae --- /dev/null +++ b/checkpoints/model_weights_000002883584.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b25bed5c785fc1204d1e4281ff82ee1da20b90af02488fc91b9ba9c4a8c60c4 +size 152233941 diff --git a/checkpoints/model_weights_000003178496.pt b/checkpoints/model_weights_000003178496.pt new file mode 100644 index 0000000000000000000000000000000000000000..8064f00ad6dc98bc1165129919f986f70bbbfcc3 --- /dev/null +++ b/checkpoints/model_weights_000003178496.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a24228699d57eb3fd0117d453029b057a0b2bb3802ecafc6c2338d7a9f3e78d +size 152233941 diff --git a/checkpoints/model_weights_000003473408.pt b/checkpoints/model_weights_000003473408.pt new file mode 100644 index 0000000000000000000000000000000000000000..acfbf29ae253cec0fbfb4240bc084eafe92a3d8b --- /dev/null +++ b/checkpoints/model_weights_000003473408.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423c2886095a6fccb4fd402f82ad314907f99d3045ef5a99f809022aa8b6bbbc +size 152233941 diff --git a/checkpoints/model_weights_000003833856.pt b/checkpoints/model_weights_000003833856.pt new file mode 100644 index 0000000000000000000000000000000000000000..16915b15238bb94c964d8cdcfb35fdd1c75f2385 --- /dev/null +++ b/checkpoints/model_weights_000003833856.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc1ddf5ef6c825ced0502a0df7489cd6974991022036a49b6ff0173a7919e87 +size 152233941 diff --git a/checkpoints/model_weights_000004227072.pt b/checkpoints/model_weights_000004227072.pt new file mode 100644 index 0000000000000000000000000000000000000000..7891151d758869db673486fe6a4686d902e7a5b5 --- /dev/null +++ b/checkpoints/model_weights_000004227072.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faae6d34dda55becb2508088ca57995ca954837911326285acf14db1cc52206c +size 152233941 diff --git a/checkpoints/model_weights_000004653056.pt b/checkpoints/model_weights_000004653056.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8704ea794d7582b9225c9af22fc8382b547751a --- /dev/null +++ b/checkpoints/model_weights_000004653056.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d9f889bf05c772c064e57ff19b43395f79187ed9c3ac3ebecea22b7b7ef46e +size 152233941 diff --git a/checkpoints/model_weights_000005111808.pt b/checkpoints/model_weights_000005111808.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b40b0979bb9cb2771e89a3883d9c934983f2cd6 --- /dev/null +++ b/checkpoints/model_weights_000005111808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9793e864030102e412c47a7026f175f7b2beb65e1c3c4f1191db17faacf838 +size 152233941 diff --git a/checkpoints/model_weights_000005603328.pt b/checkpoints/model_weights_000005603328.pt new file mode 100644 index 0000000000000000000000000000000000000000..21fbef56f8733adac9cda032444c0be8572a84a1 --- /dev/null +++ b/checkpoints/model_weights_000005603328.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d698e8eeed127a4bdcca424e0a7415778b814bc3d5a6d9b9d8439708ee33b51 +size 152233941 diff --git a/checkpoints/model_weights_000006193152.pt b/checkpoints/model_weights_000006193152.pt new file mode 100644 index 0000000000000000000000000000000000000000..47c0c86d113acd749a5720bd0db8cc863a113a0c --- /dev/null +++ b/checkpoints/model_weights_000006193152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b873b008c2cc4cbf0943b3fb4f16f2c2bb93dc0a52415cfeaf7ed7b1b3ad78e3 +size 152233941 diff --git a/checkpoints/model_weights_000006782976.pt b/checkpoints/model_weights_000006782976.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b1dc2622f69343033fc20fa8ebc43276de69184 --- /dev/null +++ b/checkpoints/model_weights_000006782976.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3feb8d2736812ca3272ec7f9e044097cb48874a2053cf71d11d80e8c1e9f8f23 +size 152233941 diff --git a/checkpoints/model_weights_000007471104.pt b/checkpoints/model_weights_000007471104.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8e575374c9c7a9e8aaf2735872218351105a955 --- /dev/null +++ b/checkpoints/model_weights_000007471104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01102ee0a169f6142736d385d72fe2c79192848731d9cc4143048b887176d423 +size 152233941 diff --git a/checkpoints/model_weights_000008224768.pt b/checkpoints/model_weights_000008224768.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d43ef91d7d0de07b30a6612a40b50d05d3e9efb --- /dev/null +++ b/checkpoints/model_weights_000008224768.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5c9aed40ecd638baf207e10fad27acec54e665bd912b210a6405e847680ca0 +size 152233941 diff --git a/checkpoints/model_weights_000009043968.pt b/checkpoints/model_weights_000009043968.pt new file mode 100644 index 0000000000000000000000000000000000000000..82e2d5f550a33e18e65660e5041fe52bfa6b443e --- /dev/null +++ b/checkpoints/model_weights_000009043968.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ec1c3a070a125c58a8152f9e7aa56a763037cc123e36e1ca323ded182a76b28 +size 152233941 diff --git a/checkpoints/model_weights_000009961472.pt b/checkpoints/model_weights_000009961472.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b1732733e6b3e13c4bb850f49630acffb1dfce4 --- /dev/null +++ b/checkpoints/model_weights_000009961472.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc6b64d722aeb3d5891411326c3ebd46c6ed896e1a91613d26e141e58e07f0f +size 152233941 diff --git a/checkpoints/model_weights_000010944512.pt b/checkpoints/model_weights_000010944512.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d8fc60df1d8db055ede761be609c87ce24d9efc --- /dev/null +++ b/checkpoints/model_weights_000010944512.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf00dde065e4d6d87753ab5a3a5078d0e8dabfbf2f2a1af2b58e028d02f2f8e +size 152233941 diff --git a/checkpoints/model_weights_000012058624.pt b/checkpoints/model_weights_000012058624.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1886e335bf71b91c53a8087f00c6daaece8cdf0 --- /dev/null +++ b/checkpoints/model_weights_000012058624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7cec5bed9521d96c13b93dc544f2c2d308eab2d5c7e12560d04c47f86bc176 +size 152233941 diff --git a/checkpoints/model_weights_000013271040.pt b/checkpoints/model_weights_000013271040.pt new file mode 100644 index 0000000000000000000000000000000000000000..255b0372e30c9286f214df77c975ed208d0d9dad --- /dev/null +++ b/checkpoints/model_weights_000013271040.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2332cb6ff3d3a184384af69b52bc9c1c7cac3b40022b009920334144a0296d24 +size 152233941 diff --git a/checkpoints/model_weights_000014581760.pt b/checkpoints/model_weights_000014581760.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc8176574d9bacad2342276f023efa58d1833c78 --- /dev/null +++ b/checkpoints/model_weights_000014581760.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42dc3d4700cec02974c3720c719319c7fea5bbb1a11fd62efa367aad310f7abd +size 152233941 diff --git a/checkpoints/model_weights_000016056320.pt b/checkpoints/model_weights_000016056320.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ec9e33eca945b9bb8cf8d38f544d5cacc67daf5 --- /dev/null +++ b/checkpoints/model_weights_000016056320.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09f58d2c1ccffae75a9a52a63da53a6f98329e0e836abfd6a9ec0a7450045e15 +size 152233941 diff --git a/checkpoints/model_weights_000016384000.pt b/checkpoints/model_weights_000016384000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0408b044443c73da974f321fb42ca97d79b14fdb --- /dev/null +++ b/checkpoints/model_weights_000016384000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d625a5b9816d03f4cb433038eb4107e4aa0cadf0c8704548cc2dd5be1e9d1fb1 +size 152233941 diff --git a/checkpoints/model_weights_000017661952.pt b/checkpoints/model_weights_000017661952.pt new file mode 100644 index 0000000000000000000000000000000000000000..52ab525700d8dee59df2f3ba28e171f09a255a65 --- /dev/null +++ b/checkpoints/model_weights_000017661952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aecad5609a506e4bfdfd454fa850b361a8c05279eaf105d553943b63061e32fe +size 152233941 diff --git a/checkpoints/model_weights_000019431424.pt b/checkpoints/model_weights_000019431424.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17342914969257159a68a350520580bb51c3768 --- /dev/null +++ b/checkpoints/model_weights_000019431424.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:093f6cd231419ec95b430623b2b141b603d309d0862e9f163b4d892d2b8f1786 +size 152233941 diff --git a/checkpoints/model_weights_000021364736.pt b/checkpoints/model_weights_000021364736.pt new file mode 100644 index 0000000000000000000000000000000000000000..f94240825b86174f244ded1928209dd4d2cf76a1 --- /dev/null +++ b/checkpoints/model_weights_000021364736.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164c1b90a703d53adfe900f7dee5bc994e42c6e2563f17cef932c570943a362a +size 152233941 diff --git a/checkpoints/model_weights_000023494656.pt b/checkpoints/model_weights_000023494656.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e46eaa709fe7eb5e5af3c5788334df71f018186 --- /dev/null +++ b/checkpoints/model_weights_000023494656.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c528494b42970ead192e2adda45d77e8600f8a2856f6858eb7f696e726b80b7 +size 152233941 diff --git a/checkpoints/model_weights_000025853952.pt b/checkpoints/model_weights_000025853952.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a6c66612a50d420dd428ef4ce0bbe42381f0213 --- /dev/null +++ b/checkpoints/model_weights_000025853952.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8102377bb9f3a4bccf97b7a7cef5299b1f5e49c052e9922cc3f802096393d5 +size 152233941 diff --git a/checkpoints/model_weights_000028442624.pt b/checkpoints/model_weights_000028442624.pt new file mode 100644 index 0000000000000000000000000000000000000000..b07eafba18c566fe9b92b7a3aae5dde623159988 --- /dev/null +++ b/checkpoints/model_weights_000028442624.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1baaffe2770f4f6e4e4ca88348ee82a2c812ae062ad8644f22c0fc8ca6c46632 +size 152233941 diff --git a/checkpoints/model_weights_000031293440.pt b/checkpoints/model_weights_000031293440.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce334b1210b8c5d20edad3e3c8e89e7f3c08b212 --- /dev/null +++ b/checkpoints/model_weights_000031293440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:503f8a171c1e370ea78dad681fdd7b89e9d9ce2d3509bff4aac6ff60e2e0f337 +size 152233941 diff --git a/checkpoints/model_weights_000032768000.pt b/checkpoints/model_weights_000032768000.pt new file mode 100644 index 0000000000000000000000000000000000000000..226023dc2caf5968a6fb6c1059bb88a07d71ea81 --- /dev/null +++ b/checkpoints/model_weights_000032768000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbcfdb221584d350a91b208e3b95a8774a0eae4fd255c455a8ce3fcb210d671 +size 152233941 diff --git a/checkpoints/model_weights_000034439168.pt b/checkpoints/model_weights_000034439168.pt new file mode 100644 index 0000000000000000000000000000000000000000..0eb974dfb4223bb07bf9a149710cc80ec1f1d877 --- /dev/null +++ b/checkpoints/model_weights_000034439168.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fdd604445a5aa3e34356d6652a6fe689d68bc8604be7e5fa562e1d8529d295b +size 152233941 diff --git a/checkpoints/model_weights_000037879808.pt b/checkpoints/model_weights_000037879808.pt new file mode 100644 index 0000000000000000000000000000000000000000..c18014f3af41a222c8b4bac0163971461a52fb6e --- /dev/null +++ b/checkpoints/model_weights_000037879808.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac997ac32245bab013ef3f95ea9d087bdc4047eaa14aedb88b0d056e8aab576 +size 152233941 diff --git a/checkpoints/model_weights_000041648128.pt b/checkpoints/model_weights_000041648128.pt new file mode 100644 index 0000000000000000000000000000000000000000..c765fc096e0f2c4b902e9311870a2b92d9a886bb --- /dev/null +++ b/checkpoints/model_weights_000041648128.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e28e7ac643111e0d6623d362311f8f93488d0fef7f10fe8c7e50c48be1f9051 +size 152233941 diff --git a/checkpoints/model_weights_000045842432.pt b/checkpoints/model_weights_000045842432.pt new file mode 100644 index 0000000000000000000000000000000000000000..824c2220548e5b00121e7f6e410de407ec0b9ecd --- /dev/null +++ b/checkpoints/model_weights_000045842432.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:145dae94ded6da1cac5c684072745603739c8be20f7a98ddd293bfec9786fb6d +size 152233941 diff --git a/checkpoints/model_weights_000049152000.pt b/checkpoints/model_weights_000049152000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a0725b5dda375953e52465991f20b3806bdbfd5 --- /dev/null +++ b/checkpoints/model_weights_000049152000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d17b4f03b730b1f4fc93ca8e5b1b3b261a2277dc8137a70dd41a16757bd02c56 +size 152233941 diff --git a/checkpoints/model_weights_000050397184.pt b/checkpoints/model_weights_000050397184.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceddcb316c05d99837ad7f68f78d701fbfbc844b --- /dev/null +++ b/checkpoints/model_weights_000050397184.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550774647b6b21f542e14c314fcd670e19b56819a734c09060450d93abd5c29c +size 152233941 diff --git a/checkpoints/model_weights_000055443456.pt b/checkpoints/model_weights_000055443456.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd73743fed5d3cc74de48590e11767d265a210be --- /dev/null +++ b/checkpoints/model_weights_000055443456.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1714506a833c289bcf2c715ed5257bcd5284b097d4f6453ca8192887897fe01 +size 152233941 diff --git a/checkpoints/model_weights_000061014016.pt b/checkpoints/model_weights_000061014016.pt new file mode 100644 index 0000000000000000000000000000000000000000..7deb9759f973bd0666b38baf101714ea5d6b4f64 --- /dev/null +++ b/checkpoints/model_weights_000061014016.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd9f1ea57c9011456c6c76f6fee2ea9798a8ab3c2f19e09231561e0c03823827 +size 152233941 diff --git a/checkpoints/model_weights_000065536000.pt b/checkpoints/model_weights_000065536000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b410aba36229e58a5462945242d20c3254242af --- /dev/null +++ b/checkpoints/model_weights_000065536000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cad227f3d000126119966f6d62a26b065fc793b02a42a92ef40bc00dcff35e1 +size 152233941 diff --git a/checkpoints/model_weights_000067108864.pt b/checkpoints/model_weights_000067108864.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f82840deff98192c06e6394a950bb45135c5844 --- /dev/null +++ b/checkpoints/model_weights_000067108864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b91b60d58242c335595ec8c6f8c90bb871a777743f37015f6614e73fbf81b679 +size 152233941 diff --git a/checkpoints/model_weights_000073826304.pt b/checkpoints/model_weights_000073826304.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec285c3fd1c53bfb798f0c4cb8e3f4cf1d2fa38e --- /dev/null +++ b/checkpoints/model_weights_000073826304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:894dd15c2ae9d0889fdeae587be200f108853520f5df96705ff80dcee9c0a123 +size 152233941 diff --git a/checkpoints/model_weights_000081199104.pt b/checkpoints/model_weights_000081199104.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e09a6f650c83426334a15c3d1eda01ab2cea4a8 --- /dev/null +++ b/checkpoints/model_weights_000081199104.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a217e9b909d9230421a822b1eaa20bd7b9e496fda0621b7628bfbbef0fd170 +size 152233941 diff --git a/checkpoints/model_weights_000081920000.pt b/checkpoints/model_weights_000081920000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c9523bf3db1af112af2dfafc5671bb5dbfbc5b9 --- /dev/null +++ b/checkpoints/model_weights_000081920000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d461ede60245a11fe7aef30a2e8ce56906383c039be6a4ac5c0c319eab5573 +size 152233941 diff --git a/checkpoints/model_weights_000089325568.pt b/checkpoints/model_weights_000089325568.pt new file mode 100644 index 0000000000000000000000000000000000000000..8553d028d8f2b39302ab9a3dd06ad5d95a7c0eef --- /dev/null +++ b/checkpoints/model_weights_000089325568.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014156fcd4ba5cbc68d73713da1cd95344e30180c7b1c47cd64e8cab53d441b8 +size 152233941 diff --git a/checkpoints/model_weights_000098271232.pt b/checkpoints/model_weights_000098271232.pt new file mode 100644 index 0000000000000000000000000000000000000000..816778f4c887d11a7c6dbacfc53e4f30fd142c3a --- /dev/null +++ b/checkpoints/model_weights_000098271232.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0baa7b27d2e5a28c30520e51d756c85a6e6bd3d15e46e969713cb1cc06cc0f +size 152233941 diff --git a/checkpoints/model_weights_000098304000.pt b/checkpoints/model_weights_000098304000.pt new file mode 100644 index 0000000000000000000000000000000000000000..07c62a9a7ccb00073765d38ee743654875b8d2e4 --- /dev/null +++ b/checkpoints/model_weights_000098304000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14ca2b202fafbeb89ebcc905d7c537b3c2f00a633ce131432374d62323bc430f +size 152233941 diff --git a/checkpoints/model_weights_000108068864.pt b/checkpoints/model_weights_000108068864.pt new file mode 100644 index 0000000000000000000000000000000000000000..3691ba6d872ec388fd1aed5444ff74ab57cf31ae --- /dev/null +++ b/checkpoints/model_weights_000108068864.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba49386eaa354e23e0332e0633091419e30097b6e330f9a7e425c4d2e69f335 +size 152233941 diff --git a/checkpoints/model_weights_000114688000.pt b/checkpoints/model_weights_000114688000.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc2333b0f59f7eb7a6c2d7b1b01db81e527649b1 --- /dev/null +++ b/checkpoints/model_weights_000114688000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:453c809cb1ec24a354c679af03f27159aa586df502e648fdec1e445020f27513 +size 152233941 diff --git a/checkpoints/model_weights_000118882304.pt b/checkpoints/model_weights_000118882304.pt new file mode 100644 index 0000000000000000000000000000000000000000..3578bb3a9828e97260ec1148d2d52aadc21eb7dc --- /dev/null +++ b/checkpoints/model_weights_000118882304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cbab3d452ea3a9409d030e5776c662fe950d0d79f8784f1817231d7fa083aea +size 152233941 diff --git a/checkpoints/model_weights_000130777088.pt b/checkpoints/model_weights_000130777088.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eaaa96134a158c178dc3d70dc72940e0bd3f369 --- /dev/null +++ b/checkpoints/model_weights_000130777088.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d487b622a2cc5a65570831502e60128c9cb56ae6fdc1e26d43e573a6ca812d54 +size 152233941 diff --git a/checkpoints/model_weights_000131072000.pt b/checkpoints/model_weights_000131072000.pt new file mode 100644 index 0000000000000000000000000000000000000000..45f0441040c212b11c8be2a6ea782ec5f767e719 --- /dev/null +++ b/checkpoints/model_weights_000131072000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a3679d177c2f603ed401d074578d3cf0551e07025298db66df3f7d42226a5d2 +size 152233941 diff --git a/checkpoints/model_weights_000143851520.pt b/checkpoints/model_weights_000143851520.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3883e06ab684415faaa441be7be34e303646414 --- /dev/null +++ b/checkpoints/model_weights_000143851520.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a011307b957b05c47eef2a28fbddb10dfd636099bbe9821d2cccadd68d6d943a +size 152233941 diff --git a/checkpoints/model_weights_000147456000.pt b/checkpoints/model_weights_000147456000.pt new file mode 100644 index 0000000000000000000000000000000000000000..514a9e898b663415c8af25f04a0a5c88a645c8c6 --- /dev/null +++ b/checkpoints/model_weights_000147456000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6b44f0256a3e937b44d98ba5db43827beca6b2afff730e982c589845da5b18 +size 152233941 diff --git a/checkpoints/model_weights_000158269440.pt b/checkpoints/model_weights_000158269440.pt new file mode 100644 index 0000000000000000000000000000000000000000..be0ee6790c96a2cca0e9132b3166a5f39a0979b8 --- /dev/null +++ b/checkpoints/model_weights_000158269440.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7847986f977020865a7d834725deb30709d5886a4f48f2925e21990b2dc258b0 +size 152233941 diff --git a/checkpoints/model_weights_000163840000.pt b/checkpoints/model_weights_000163840000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d557680859f05efe278c880365fd3959ddf75c6 --- /dev/null +++ b/checkpoints/model_weights_000163840000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d43f4d6cd56278c9b1fb28ca91bc4b22b098e445a3af3568b41d2d6c3b0dc90 +size 152233941 diff --git a/checkpoints/model_weights_000174096384.pt b/checkpoints/model_weights_000174096384.pt new file mode 100644 index 0000000000000000000000000000000000000000..f741f45f2edb2762dbda3c2e15d637324d538fee --- /dev/null +++ b/checkpoints/model_weights_000174096384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d64815d19e8664b4c3ca40b7c0b491f2e83847ac4fd2bd94bbc3359226efb3 +size 152233941 diff --git a/checkpoints/model_weights_000180224000.pt b/checkpoints/model_weights_000180224000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cdb48395c43e01cee8e2eda42e0c85c301a74fc --- /dev/null +++ b/checkpoints/model_weights_000180224000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a3ff11eaf9031309562053ae7288d5b0f74743d7a7499fd90ffa74001603ae +size 152233941 diff --git a/checkpoints/model_weights_000191496192.pt b/checkpoints/model_weights_000191496192.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3d0946db81a3269f0c8e9b2b96cc9b9c574cd16 --- /dev/null +++ b/checkpoints/model_weights_000191496192.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8812454f499055a1f98385091e53f48070cc66e842541d10aabf65f6cfcd7222 +size 152233941 diff --git a/checkpoints/model_weights_000196608000.pt b/checkpoints/model_weights_000196608000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff59d7535bee5594443ea13785e8a0bc7577d762 --- /dev/null +++ b/checkpoints/model_weights_000196608000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cddba86502e17ef2a84aa5c7b449b3cff6ee0e15ff5aa85b80379dcecd396627 +size 152233941 diff --git a/checkpoints/model_weights_000196706304.pt b/checkpoints/model_weights_000196706304.pt new file mode 100644 index 0000000000000000000000000000000000000000..433d4aec0afab5d0567e48a9de8ca0c25419a7a1 --- /dev/null +++ b/checkpoints/model_weights_000196706304.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d5ca4396fe845458580e8d547be53c1e1f59bce7b5c3d0227b3f4305449f9d +size 152233941 diff --git a/checkpoints/model_weights_000197361664.pt b/checkpoints/model_weights_000197361664.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f727b30685a1937c350b899d2726ee317e64baa --- /dev/null +++ b/checkpoints/model_weights_000197361664.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a794cd481aae45b3763d1f0d1ef884f639409209a25daeb80de6e094b999cb7f +size 152233941 diff --git a/checkpoints/model_weights_000198017024.pt b/checkpoints/model_weights_000198017024.pt new file mode 100644 index 0000000000000000000000000000000000000000..bedfa0050d95e80d8772a3a15676b8c00ae81bb5 --- /dev/null +++ b/checkpoints/model_weights_000198017024.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e123b98191719e7dfcf691c195733edcca35e3946152f15c9073d0c2f40eb4 +size 152233941 diff --git a/checkpoints/model_weights_000198672384.pt b/checkpoints/model_weights_000198672384.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3ae3d9ac7a52d0e0d81915053abfeb29c689206 --- /dev/null +++ b/checkpoints/model_weights_000198672384.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:828b4f79307f851edf450f962649cb453a5a47274c95f8d7918c5f80129fa19b +size 152233941 diff --git a/checkpoints/model_weights_000199327744.pt b/checkpoints/model_weights_000199327744.pt new file mode 100644 index 0000000000000000000000000000000000000000..d18137a221d8b5bd6c9b13a98bba2c0ba541be86 --- /dev/null +++ b/checkpoints/model_weights_000199327744.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e53dfe4aaab1cfd0f9061a8bf68f9d4cab62315efd02eb216c9185dce67fa2 +size 152233941 diff --git a/checkpoints/model_weights_000199950336.pt b/checkpoints/model_weights_000199950336.pt new file mode 100644 index 0000000000000000000000000000000000000000..3253e3e498e8acf07593ff1324499978d00551b2 --- /dev/null +++ b/checkpoints/model_weights_000199950336.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc589a5603f394a6da0d93b34d49b5e45a908d01546454c928de9510ca9e2393 +size 152233941 diff --git a/config.toml b/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..713efbdc25c69e02626f4cbfab1de46ce1d330ee --- /dev/null +++ b/config.toml @@ -0,0 +1,31 @@ +model_name = "pile_llama_2H_2L" +n_layers = 2 +d_model = 512 +d_mlp = 2048 +d_head = 64 +n_heads = 2 +attn_only = false +layer_norm_eps = 1e-05 +init_range = 0.02 +n_ctx = 1024 +d_vocab = 32000 +dataset_name = "eoinf/pile_llama" +seed = 10 +device = "cuda" +use_bfloat16_matmul = false +batch_size_per_device = 32 +n_devices = 1 +batches_per_step = 1 +max_tokens = 200000000 +lr_hidden = 0.001 +lr_vector = 0.0005 +lr_schedule = "constant_with_warmup" +warmup_tokens = 30000000 +weight_decay = 0.05 +grad_norm_clip = 1.0 +train_loss_moving_average_beta = 0.99 +log_interval = 25 +save_checkpoints = true +checkpoint_interval = 500 +checkpoint_interval_ratio = 1.1 +save_log_checkpoints = true \ No newline at end of file diff --git a/latest_checkpoint.pt b/latest_checkpoint.pt new file mode 100644 index 0000000000000000000000000000000000000000..12c1f37f5205a55bcaa313370202c732b4033408 --- /dev/null +++ b/latest_checkpoint.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf4a3701ff768aefc7f1a06a33f399b56def10807110a8e2f7cb5139f6b89ec +size 152233463 diff --git a/latest_metadata.json b/latest_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a72f6c7b92a2dcca81dc2215710976a9adf66bb8 --- /dev/null +++ b/latest_metadata.json @@ -0,0 +1 @@ +{"step": 6102, "tokens_seen": 199950336, "config": {"model_name": "pile_llama_2H_2L", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 2, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 3.394872257897474} \ No newline at end of file diff --git a/latest_optimizer.pt b/latest_optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8da33203ecaad91a8827342a257f37a8ca03d174 --- /dev/null +++ b/latest_optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45f29280cd5d337e2f07c52c96e8f786c41131009adff2f4450a76255889657c +size 304472851 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f267f430e5b2294c4e308dc61c1c813beb7e718 --- /dev/null +++ b/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Check if "restart" argument is passed to force normal training +if [ "$1" = "restart" ]; then + echo "Force restart: Running normal training ..." + python -c " +import os +from toy_models.models.trainer import train_transformer_from_config +current_dir = os.getcwd() +train_transformer_from_config('config.toml', current_dir) +" +else + # Check for checkpoints and run appropriate training + python -c " +import os +from pathlib import Path +from toy_models.models.trainer import train_transformer_from_config, restart_from_checkpoint +current_dir = os.getcwd() +# Check if checkpoints directory exists and has .pt files +latest_checkpoint = Path('latest_checkpoint.pt') +if latest_checkpoint.exists(): + print(f'Found checkpoint: {latest_checkpoint}. Restarting from checkpoint...') + restart_from_checkpoint(current_dir) +else: + print('Starting training from beginning ...') + train_transformer_from_config(current_dir) +" +fi diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8a8b762c57499dae83b531207444b547fc77a22f --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-26T13:56:02.310641183Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-02-26T13:56:02.505385385Z","level":"INFO","msg":"stream: created new stream","id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.505474889Z","level":"INFO","msg":"stream: started","id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.505547204Z","level":"INFO","msg":"writer: started","stream_id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.505558638Z","level":"INFO","msg":"handler: started","stream_id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.5055985Z","level":"INFO","msg":"sender: started","stream_id":"696nxyfr"} +{"time":"2026-02-26T14:39:41.476737152Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.003364817}],"total_operations":1}} +{"time":"2026-02-26T14:39:41.82022831Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-26T14:39:42.35788107Z","level":"INFO","msg":"stream: closing","id":"696nxyfr"} +{"time":"2026-02-26T14:39:42.357919856Z","level":"INFO","msg":"handler: closed","stream_id":"696nxyfr"} +{"time":"2026-02-26T14:39:42.357981703Z","level":"INFO","msg":"sender: closed","stream_id":"696nxyfr"} +{"time":"2026-02-26T14:39:42.357990898Z","level":"INFO","msg":"stream: closed","id":"696nxyfr"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d628c19744d0df790b8f77df43df800da59c34ce --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,26 @@ +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Configure stats pid to 4744 +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_135602-696nxyfr/logs/debug.log +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_135602-696nxyfr/logs/debug-internal.log +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:init():813] calling init triggers +2026-02-26 13:56:02,098 INFO MainThread:4744 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_2H_2L', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 2, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-02-26 13:56:02,098 INFO MainThread:4744 [wandb_init.py:init():854] starting backend +2026-02-26 13:56:02,302 INFO MainThread:4744 [wandb_init.py:init():857] sending inform_init request +2026-02-26 13:56:02,306 INFO MainThread:4744 [wandb_init.py:init():865] backend started and connected +2026-02-26 13:56:02,308 INFO MainThread:4744 [wandb_init.py:init():936] updated telemetry +2026-02-26 13:56:02,312 INFO MainThread:4744 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-02-26 13:56:02,787 INFO MainThread:4744 [wandb_init.py:init():1011] starting run threads in backend +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_console_start():2506] atexit reg +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2446] Redirects installed. +2026-02-26 13:56:02,905 INFO MainThread:4744 [wandb_init.py:init():1049] run started, returning control to user process +2026-02-26 14:39:41,470 INFO MainThread:4744 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/696nxyfr +2026-02-26 14:39:41,472 INFO MainThread:4744 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-02-26 14:39:41,473 INFO MainThread:4744 [wandb_run.py:_restore():2453] restore +2026-02-26 14:39:41,473 INFO MainThread:4744 [wandb_run.py:_restore():2459] restore done +2026-02-26 14:39:42,357 INFO MainThread:4744 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260226_135602-696nxyfr/files/config.yaml b/wandb/run-20260226_135602-696nxyfr/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2afe665e873d0ab9e82a52cf7a3f3a2aa3697f42 --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/files/config.yaml @@ -0,0 +1,140 @@ +_wandb: + value: + cli_version: 0.21.4 + e: + t10b7xwxume5c3as2o24by9uohwatl53: + cpu_count: 8 + cpu_count_logical: 8 + cudaVersion: "12.4" + disk: + /: + total: "262240792576" + used: "165972180992" + email: tzfof8@gmail.com + executable: /notebooks/toy_models/.toy_models_env/bin/python + git: + commit: 4a8df1a57a456227acca0b4948ad48c9f03a929a + remote: https://github.com/jgroh3/toy_models.git + gpu: NVIDIA RTX A6000 + gpu_count: 1 + gpu_nvidia: + - architecture: Ampere + cudaCores: 10752 + memoryTotal: "51527024640" + name: NVIDIA RTX A6000 + uuid: GPU-fa82911d-b891-1e23-11c8-f31149299e8c + host: nb5tce73r3 + memory: + total: "47332843520" + os: Linux-5.19.0-45-generic-x86_64-with-glibc2.35 + program: + python: CPython 3.11.7 + root: /notebooks/toy_models/model_training/model + startedAt: "2026-02-26T13:56:02.093247Z" + writerId: t10b7xwxume5c3as2o24by9uohwatl53 + m: [] + python_version: 3.11.7 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 71 + "3": + - 2 + - 13 + - 15 + - 16 + - 61 + "4": 3.11.7 + "5": 0.21.4 + "6": 4.56.1 + "12": 0.21.4 + "13": linux-x86_64 +attn_only: + value: false +batch_size: + value: 32 +batch_size_per_device: + value: 32 +batches_per_step: + value: 1 +checkpoint_interval: + value: 500 +checkpoint_interval_ratio: + value: 1.1 +d_head: + value: 64 +d_mlp: + value: 2048 +d_model: + value: 512 +d_vocab: + value: 32000 +data_seed: + value: 10 +dataset_name: + value: eoinf/pile_llama +device: + value: cuda +grad_norm_clip: + value: 1 +init_range: + value: 0.02 +layer_norm_eps: + value: 1e-05 +log_interval: + value: 25 +lr_hidden: + value: 0.001 +lr_schedule: + value: constant_with_warmup +lr_vector: + value: 0.0005 +max_steps: + value: 6103 +max_tokens: + value: 200000000 +model_name: + value: pile_llama_2H_2L +n_ctx: + value: 1024 +n_devices: + value: 1 +n_heads: + value: 2 +n_layers: + value: 2 +save_checkpoints: + value: true +save_log_checkpoints: + value: true +seed: + value: 10 +tokenizer_name: + value: "" +tokens_per_step: + value: 32768 +train_loss_moving_average_beta: + value: 0.99 +use_bfloat16_matmul: + value: false +use_wandb: + value: true +warmup_steps: + value: 915 +warmup_tokens: + value: 30000000 +weight_decay: + value: 0.05 diff --git a/wandb/run-20260226_135602-696nxyfr/files/output.log b/wandb/run-20260226_135602-696nxyfr/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7bd49a9d4276e2a2685f40a7f8c6b696a042820b --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/files/output.log @@ -0,0 +1,252 @@ +Training on cuda +Model: 2L, 512d, 2h +Max steps: 6,103, Max tokens: 200,000,000 +Warmup steps: 915, Warmup tokens: 30,000,000 +Batch size per device: 32 +Context length: 1024 +Learning rates - Hidden: 0.001, Vector: 0.0005 + +Step 25 | Tokens: 819,200 | Train Loss EWMA: 10.4383 | Learning Rate: 0.000027 | Progress: 0.00410 +Step 50 | Tokens: 1,638,400 | Train Loss EWMA: 10.2399 | Learning Rate: 0.000055 | Progress: 0.00819 +Step 75 | Tokens: 2,457,600 | Train Loss EWMA: 9.9327 | Learning Rate: 0.000082 | Progress: 0.01229 +Step 100 | Tokens: 3,276,800 | Train Loss EWMA: 9.5286 | Learning Rate: 0.000109 | Progress: 0.01638 +Step 125 | Tokens: 4,096,000 | Train Loss EWMA: 9.0705 | Learning Rate: 0.000137 | Progress: 0.02048 +Step 150 | Tokens: 4,915,200 | Train Loss EWMA: 8.5979 | Learning Rate: 0.000164 | Progress: 0.02458 +Step 175 | Tokens: 5,734,400 | Train Loss EWMA: 8.1730 | Learning Rate: 0.000191 | Progress: 0.02867 +Step 200 | Tokens: 6,553,600 | Train Loss EWMA: 7.7965 | Learning Rate: 0.000219 | Progress: 0.03277 +Step 225 | Tokens: 7,372,800 | Train Loss EWMA: 7.4617 | Learning Rate: 0.000246 | Progress: 0.03686 +Step 250 | Tokens: 8,192,000 | Train Loss EWMA: 7.1771 | Learning Rate: 0.000273 | Progress: 0.04096 +Step 275 | Tokens: 9,011,200 | Train Loss EWMA: 6.9205 | Learning Rate: 0.000301 | Progress: 0.04506 +Step 300 | Tokens: 9,830,400 | Train Loss EWMA: 6.6937 | Learning Rate: 0.000328 | Progress: 0.04915 +Step 325 | Tokens: 10,649,600 | Train Loss EWMA: 6.4997 | Learning Rate: 0.000355 | Progress: 0.05325 +Step 350 | Tokens: 11,468,800 | Train Loss EWMA: 6.3320 | Learning Rate: 0.000383 | Progress: 0.05734 +Step 375 | Tokens: 12,288,000 | Train Loss EWMA: 6.1851 | Learning Rate: 0.000410 | Progress: 0.06144 +Step 400 | Tokens: 13,107,200 | Train Loss EWMA: 6.0555 | Learning Rate: 0.000437 | Progress: 0.06554 +Step 425 | Tokens: 13,926,400 | Train Loss EWMA: 5.9345 | Learning Rate: 0.000464 | Progress: 0.06963 +Step 450 | Tokens: 14,745,600 | Train Loss EWMA: 5.8331 | Learning Rate: 0.000492 | Progress: 0.07373 +Step 475 | Tokens: 15,564,800 | Train Loss EWMA: 5.7357 | Learning Rate: 0.000519 | Progress: 0.07782 +Step 500 | Tokens: 16,384,000 | Train Loss EWMA: 5.6613 | Learning Rate: 0.000546 | Progress: 0.08192 +Step 525 | Tokens: 17,203,200 | Train Loss EWMA: 5.5814 | Learning Rate: 0.000574 | Progress: 0.08602 +Step 550 | Tokens: 18,022,400 | Train Loss EWMA: 5.5058 | Learning Rate: 0.000601 | Progress: 0.09011 +Step 575 | Tokens: 18,841,600 | Train Loss EWMA: 5.4335 | Learning Rate: 0.000628 | Progress: 0.09421 +Step 600 | Tokens: 19,660,800 | Train Loss EWMA: 5.3756 | Learning Rate: 0.000656 | Progress: 0.09830 +Step 625 | Tokens: 20,480,000 | Train Loss EWMA: 5.3204 | Learning Rate: 0.000683 | Progress: 0.10240 +Step 650 | Tokens: 21,299,200 | Train Loss EWMA: 5.2848 | Learning Rate: 0.000710 | Progress: 0.10650 +Step 675 | Tokens: 22,118,400 | Train Loss EWMA: 5.2389 | Learning Rate: 0.000738 | Progress: 0.11059 +Step 700 | Tokens: 22,937,600 | Train Loss EWMA: 5.1989 | Learning Rate: 0.000765 | Progress: 0.11469 +Step 725 | Tokens: 23,756,800 | Train Loss EWMA: 5.1545 | Learning Rate: 0.000792 | Progress: 0.11878 +Step 750 | Tokens: 24,576,000 | Train Loss EWMA: 5.1071 | Learning Rate: 0.000820 | Progress: 0.12288 +Step 775 | Tokens: 25,395,200 | Train Loss EWMA: 5.0744 | Learning Rate: 0.000847 | Progress: 0.12698 +Step 800 | Tokens: 26,214,400 | Train Loss EWMA: 5.0443 | Learning Rate: 0.000874 | Progress: 0.13107 +Step 825 | Tokens: 27,033,600 | Train Loss EWMA: 5.0069 | Learning Rate: 0.000902 | Progress: 0.13517 +Step 850 | Tokens: 27,852,800 | Train Loss EWMA: 4.9799 | Learning Rate: 0.000929 | Progress: 0.13926 +Step 875 | Tokens: 28,672,000 | Train Loss EWMA: 4.9471 | Learning Rate: 0.000956 | Progress: 0.14336 +Step 900 | Tokens: 29,491,200 | Train Loss EWMA: 4.9174 | Learning Rate: 0.000984 | Progress: 0.14746 +Step 925 | Tokens: 30,310,400 | Train Loss EWMA: 4.8822 | Learning Rate: 0.001000 | Progress: 0.15155 +Step 950 | Tokens: 31,129,600 | Train Loss EWMA: 4.8552 | Learning Rate: 0.001000 | Progress: 0.15565 +Step 975 | Tokens: 31,948,800 | Train Loss EWMA: 4.8276 | Learning Rate: 0.001000 | Progress: 0.15974 +Step 1,000 | Tokens: 32,768,000 | Train Loss EWMA: 4.7911 | Learning Rate: 0.001000 | Progress: 0.16384 +Step 1,025 | Tokens: 33,587,200 | Train Loss EWMA: 4.7542 | Learning Rate: 0.001000 | Progress: 0.16794 +Step 1,050 | Tokens: 34,406,400 | Train Loss EWMA: 4.7262 | Learning Rate: 0.001000 | Progress: 0.17203 +Step 1,075 | Tokens: 35,225,600 | Train Loss EWMA: 4.6966 | Learning Rate: 0.001000 | Progress: 0.17613 +Step 1,100 | Tokens: 36,044,800 | Train Loss EWMA: 4.6649 | Learning Rate: 0.001000 | Progress: 0.18022 +Step 1,125 | Tokens: 36,864,000 | Train Loss EWMA: 4.6326 | Learning Rate: 0.001000 | Progress: 0.18432 +Step 1,150 | Tokens: 37,683,200 | Train Loss EWMA: 4.6079 | Learning Rate: 0.001000 | Progress: 0.18842 +Step 1,175 | Tokens: 38,502,400 | Train Loss EWMA: 4.5771 | Learning Rate: 0.001000 | Progress: 0.19251 +Step 1,200 | Tokens: 39,321,600 | Train Loss EWMA: 4.5582 | Learning Rate: 0.001000 | Progress: 0.19661 +Step 1,225 | Tokens: 40,140,800 | Train Loss EWMA: 4.5359 | Learning Rate: 0.001000 | Progress: 0.20070 +Step 1,250 | Tokens: 40,960,000 | Train Loss EWMA: 4.5072 | Learning Rate: 0.001000 | Progress: 0.20480 +Step 1,275 | Tokens: 41,779,200 | Train Loss EWMA: 4.4853 | Learning Rate: 0.001000 | Progress: 0.20890 +Step 1,300 | Tokens: 42,598,400 | Train Loss EWMA: 4.4637 | Learning Rate: 0.001000 | Progress: 0.21299 +Step 1,325 | Tokens: 43,417,600 | Train Loss EWMA: 4.4504 | Learning Rate: 0.001000 | Progress: 0.21709 +Step 1,350 | Tokens: 44,236,800 | Train Loss EWMA: 4.4213 | Learning Rate: 0.001000 | Progress: 0.22118 +Step 1,375 | Tokens: 45,056,000 | Train Loss EWMA: 4.3989 | Learning Rate: 0.001000 | Progress: 0.22528 +Step 1,400 | Tokens: 45,875,200 | Train Loss EWMA: 4.3813 | Learning Rate: 0.001000 | Progress: 0.22938 +Step 1,425 | Tokens: 46,694,400 | Train Loss EWMA: 4.3693 | Learning Rate: 0.001000 | Progress: 0.23347 +Step 1,450 | Tokens: 47,513,600 | Train Loss EWMA: 4.3565 | Learning Rate: 0.001000 | Progress: 0.23757 +Step 1,475 | Tokens: 48,332,800 | Train Loss EWMA: 4.3384 | Learning Rate: 0.001000 | Progress: 0.24166 +Step 1,500 | Tokens: 49,152,000 | Train Loss EWMA: 4.3246 | Learning Rate: 0.001000 | Progress: 0.24576 +Step 1,525 | Tokens: 49,971,200 | Train Loss EWMA: 4.3208 | Learning Rate: 0.001000 | Progress: 0.24986 +Step 1,550 | Tokens: 50,790,400 | Train Loss EWMA: 4.3031 | Learning Rate: 0.001000 | Progress: 0.25395 +Step 1,575 | Tokens: 51,609,600 | Train Loss EWMA: 4.2852 | Learning Rate: 0.001000 | Progress: 0.25805 +Step 1,600 | Tokens: 52,428,800 | Train Loss EWMA: 4.2704 | Learning Rate: 0.001000 | Progress: 0.26214 +Step 1,625 | Tokens: 53,248,000 | Train Loss EWMA: 4.2530 | Learning Rate: 0.001000 | Progress: 0.26624 +Step 1,650 | Tokens: 54,067,200 | Train Loss EWMA: 4.2422 | Learning Rate: 0.001000 | Progress: 0.27034 +Step 1,675 | Tokens: 54,886,400 | Train Loss EWMA: 4.2240 | Learning Rate: 0.001000 | Progress: 0.27443 +Step 1,700 | Tokens: 55,705,600 | Train Loss EWMA: 4.2018 | Learning Rate: 0.001000 | Progress: 0.27853 +Step 1,725 | Tokens: 56,524,800 | Train Loss EWMA: 4.1949 | Learning Rate: 0.001000 | Progress: 0.28262 +Step 1,750 | Tokens: 57,344,000 | Train Loss EWMA: 4.1788 | Learning Rate: 0.001000 | Progress: 0.28672 +Step 1,775 | Tokens: 58,163,200 | Train Loss EWMA: 4.1660 | Learning Rate: 0.001000 | Progress: 0.29082 +Step 1,800 | Tokens: 58,982,400 | Train Loss EWMA: 4.1678 | Learning Rate: 0.001000 | Progress: 0.29491 +Step 1,825 | Tokens: 59,801,600 | Train Loss EWMA: 4.1517 | Learning Rate: 0.001000 | Progress: 0.29901 +Step 1,850 | Tokens: 60,620,800 | Train Loss EWMA: 4.1388 | Learning Rate: 0.001000 | Progress: 0.30310 +Step 1,875 | Tokens: 61,440,000 | Train Loss EWMA: 4.1323 | Learning Rate: 0.001000 | Progress: 0.30720 +Step 1,900 | Tokens: 62,259,200 | Train Loss EWMA: 4.1275 | Learning Rate: 0.001000 | Progress: 0.31130 +Step 1,925 | Tokens: 63,078,400 | Train Loss EWMA: 4.1302 | Learning Rate: 0.001000 | Progress: 0.31539 +Step 1,950 | Tokens: 63,897,600 | Train Loss EWMA: 4.1248 | Learning Rate: 0.001000 | Progress: 0.31949 +Step 1,975 | Tokens: 64,716,800 | Train Loss EWMA: 4.1182 | Learning Rate: 0.001000 | Progress: 0.32358 +Step 2,000 | Tokens: 65,536,000 | Train Loss EWMA: 4.1105 | Learning Rate: 0.001000 | Progress: 0.32768 +Step 2,025 | Tokens: 66,355,200 | Train Loss EWMA: 4.0973 | Learning Rate: 0.001000 | Progress: 0.33178 +Step 2,050 | Tokens: 67,174,400 | Train Loss EWMA: 4.0878 | Learning Rate: 0.001000 | Progress: 0.33587 +Step 2,075 | Tokens: 67,993,600 | Train Loss EWMA: 4.0646 | Learning Rate: 0.001000 | Progress: 0.33997 +Step 2,100 | Tokens: 68,812,800 | Train Loss EWMA: 4.0451 | Learning Rate: 0.001000 | Progress: 0.34406 +Step 2,125 | Tokens: 69,632,000 | Train Loss EWMA: 4.0348 | Learning Rate: 0.001000 | Progress: 0.34816 +Step 2,150 | Tokens: 70,451,200 | Train Loss EWMA: 4.0273 | Learning Rate: 0.001000 | Progress: 0.35226 +Step 2,175 | Tokens: 71,270,400 | Train Loss EWMA: 4.0140 | Learning Rate: 0.001000 | Progress: 0.35635 +Step 2,200 | Tokens: 72,089,600 | Train Loss EWMA: 4.0052 | Learning Rate: 0.001000 | Progress: 0.36045 +Step 2,225 | Tokens: 72,908,800 | Train Loss EWMA: 3.9860 | Learning Rate: 0.001000 | Progress: 0.36454 +Step 2,250 | Tokens: 73,728,000 | Train Loss EWMA: 3.9721 | Learning Rate: 0.001000 | Progress: 0.36864 +Step 2,275 | Tokens: 74,547,200 | Train Loss EWMA: 3.9630 | Learning Rate: 0.001000 | Progress: 0.37274 +Step 2,300 | Tokens: 75,366,400 | Train Loss EWMA: 3.9500 | Learning Rate: 0.001000 | Progress: 0.37683 +Step 2,325 | Tokens: 76,185,600 | Train Loss EWMA: 3.9401 | Learning Rate: 0.001000 | Progress: 0.38093 +Step 2,350 | Tokens: 77,004,800 | Train Loss EWMA: 3.9329 | Learning Rate: 0.001000 | Progress: 0.38502 +Step 2,375 | Tokens: 77,824,000 | Train Loss EWMA: 3.9190 | Learning Rate: 0.001000 | Progress: 0.38912 +Step 2,400 | Tokens: 78,643,200 | Train Loss EWMA: 3.9036 | Learning Rate: 0.001000 | Progress: 0.39322 +Step 2,425 | Tokens: 79,462,400 | Train Loss EWMA: 3.8935 | Learning Rate: 0.001000 | Progress: 0.39731 +Step 2,450 | Tokens: 80,281,600 | Train Loss EWMA: 3.8901 | Learning Rate: 0.001000 | Progress: 0.40141 +Step 2,475 | Tokens: 81,100,800 | Train Loss EWMA: 3.8736 | Learning Rate: 0.001000 | Progress: 0.40550 +Step 2,500 | Tokens: 81,920,000 | Train Loss EWMA: 3.8586 | Learning Rate: 0.001000 | Progress: 0.40960 +Step 2,525 | Tokens: 82,739,200 | Train Loss EWMA: 3.8475 | Learning Rate: 0.001000 | Progress: 0.41370 +Step 2,550 | Tokens: 83,558,400 | Train Loss EWMA: 3.8380 | Learning Rate: 0.001000 | Progress: 0.41779 +Step 2,575 | Tokens: 84,377,600 | Train Loss EWMA: 3.8198 | Learning Rate: 0.001000 | Progress: 0.42189 +Step 2,600 | Tokens: 85,196,800 | Train Loss EWMA: 3.8070 | Learning Rate: 0.001000 | Progress: 0.42598 +Step 2,625 | Tokens: 86,016,000 | Train Loss EWMA: 3.8056 | Learning Rate: 0.001000 | Progress: 0.43008 +Step 2,650 | Tokens: 86,835,200 | Train Loss EWMA: 3.7988 | Learning Rate: 0.001000 | Progress: 0.43418 +Step 2,675 | Tokens: 87,654,400 | Train Loss EWMA: 3.7947 | Learning Rate: 0.001000 | Progress: 0.43827 +Step 2,700 | Tokens: 88,473,600 | Train Loss EWMA: 3.7780 | Learning Rate: 0.001000 | Progress: 0.44237 +Step 2,725 | Tokens: 89,292,800 | Train Loss EWMA: 3.7745 | Learning Rate: 0.001000 | Progress: 0.44646 +Step 2,750 | Tokens: 90,112,000 | Train Loss EWMA: 3.7625 | Learning Rate: 0.001000 | Progress: 0.45056 +Step 2,775 | Tokens: 90,931,200 | Train Loss EWMA: 3.7560 | Learning Rate: 0.001000 | Progress: 0.45466 +Step 2,800 | Tokens: 91,750,400 | Train Loss EWMA: 3.7477 | Learning Rate: 0.001000 | Progress: 0.45875 +Step 2,825 | Tokens: 92,569,600 | Train Loss EWMA: 3.7464 | Learning Rate: 0.001000 | Progress: 0.46285 +Step 2,850 | Tokens: 93,388,800 | Train Loss EWMA: 3.7398 | Learning Rate: 0.001000 | Progress: 0.46694 +Step 2,875 | Tokens: 94,208,000 | Train Loss EWMA: 3.7265 | Learning Rate: 0.001000 | Progress: 0.47104 +Step 2,900 | Tokens: 95,027,200 | Train Loss EWMA: 3.7226 | Learning Rate: 0.001000 | Progress: 0.47514 +Step 2,925 | Tokens: 95,846,400 | Train Loss EWMA: 3.7131 | Learning Rate: 0.001000 | Progress: 0.47923 +Step 2,950 | Tokens: 96,665,600 | Train Loss EWMA: 3.7091 | Learning Rate: 0.001000 | Progress: 0.48333 +Step 2,975 | Tokens: 97,484,800 | Train Loss EWMA: 3.7080 | Learning Rate: 0.001000 | Progress: 0.48742 +Step 3,000 | Tokens: 98,304,000 | Train Loss EWMA: 3.7076 | Learning Rate: 0.001000 | Progress: 0.49152 +Step 3,025 | Tokens: 99,123,200 | Train Loss EWMA: 3.7050 | Learning Rate: 0.001000 | Progress: 0.49562 +Step 3,050 | Tokens: 99,942,400 | Train Loss EWMA: 3.6943 | Learning Rate: 0.001000 | Progress: 0.49971 +Step 3,075 | Tokens: 100,761,600 | Train Loss EWMA: 3.6899 | Learning Rate: 0.001000 | Progress: 0.50381 +Step 3,100 | Tokens: 101,580,800 | Train Loss EWMA: 3.6864 | Learning Rate: 0.001000 | Progress: 0.50790 +Step 3,125 | Tokens: 102,400,000 | Train Loss EWMA: 3.6852 | Learning Rate: 0.001000 | Progress: 0.51200 +Step 3,150 | Tokens: 103,219,200 | Train Loss EWMA: 3.6783 | Learning Rate: 0.001000 | Progress: 0.51610 +Step 3,175 | Tokens: 104,038,400 | Train Loss EWMA: 3.6752 | Learning Rate: 0.001000 | Progress: 0.52019 +Step 3,200 | Tokens: 104,857,600 | Train Loss EWMA: 3.6593 | Learning Rate: 0.001000 | Progress: 0.52429 +Step 3,225 | Tokens: 105,676,800 | Train Loss EWMA: 3.6488 | Learning Rate: 0.001000 | Progress: 0.52838 +Step 3,250 | Tokens: 106,496,000 | Train Loss EWMA: 3.6477 | Learning Rate: 0.001000 | Progress: 0.53248 +Step 3,275 | Tokens: 107,315,200 | Train Loss EWMA: 3.6496 | Learning Rate: 0.001000 | Progress: 0.53658 +Step 3,300 | Tokens: 108,134,400 | Train Loss EWMA: 3.6346 | Learning Rate: 0.001000 | Progress: 0.54067 +Step 3,325 | Tokens: 108,953,600 | Train Loss EWMA: 3.6332 | Learning Rate: 0.001000 | Progress: 0.54477 +Step 3,350 | Tokens: 109,772,800 | Train Loss EWMA: 3.6332 | Learning Rate: 0.001000 | Progress: 0.54886 +Step 3,375 | Tokens: 110,592,000 | Train Loss EWMA: 3.6164 | Learning Rate: 0.001000 | Progress: 0.55296 +Step 3,400 | Tokens: 111,411,200 | Train Loss EWMA: 3.6130 | Learning Rate: 0.001000 | Progress: 0.55706 +Step 3,425 | Tokens: 112,230,400 | Train Loss EWMA: 3.6068 | Learning Rate: 0.001000 | Progress: 0.56115 +Step 3,450 | Tokens: 113,049,600 | Train Loss EWMA: 3.6038 | Learning Rate: 0.001000 | Progress: 0.56525 +Step 3,475 | Tokens: 113,868,800 | Train Loss EWMA: 3.6061 | Learning Rate: 0.001000 | Progress: 0.56934 +Step 3,500 | Tokens: 114,688,000 | Train Loss EWMA: 3.5935 | Learning Rate: 0.001000 | Progress: 0.57344 +Step 3,525 | Tokens: 115,507,200 | Train Loss EWMA: 3.5949 | Learning Rate: 0.001000 | Progress: 0.57754 +Step 3,550 | Tokens: 116,326,400 | Train Loss EWMA: 3.5897 | Learning Rate: 0.001000 | Progress: 0.58163 +Step 3,575 | Tokens: 117,145,600 | Train Loss EWMA: 3.6016 | Learning Rate: 0.001000 | Progress: 0.58573 +Step 3,600 | Tokens: 117,964,800 | Train Loss EWMA: 3.5951 | Learning Rate: 0.001000 | Progress: 0.58982 +Step 3,625 | Tokens: 118,784,000 | Train Loss EWMA: 3.5985 | Learning Rate: 0.001000 | Progress: 0.59392 +Step 3,650 | Tokens: 119,603,200 | Train Loss EWMA: 3.5961 | Learning Rate: 0.001000 | Progress: 0.59802 +Step 3,675 | Tokens: 120,422,400 | Train Loss EWMA: 3.5969 | Learning Rate: 0.001000 | Progress: 0.60211 +Step 3,700 | Tokens: 121,241,600 | Train Loss EWMA: 3.5917 | Learning Rate: 0.001000 | Progress: 0.60621 +Step 3,725 | Tokens: 122,060,800 | Train Loss EWMA: 3.5878 | Learning Rate: 0.001000 | Progress: 0.61030 +Step 3,750 | Tokens: 122,880,000 | Train Loss EWMA: 3.5800 | Learning Rate: 0.001000 | Progress: 0.61440 +Step 3,775 | Tokens: 123,699,200 | Train Loss EWMA: 3.5714 | Learning Rate: 0.001000 | Progress: 0.61850 +Step 3,800 | Tokens: 124,518,400 | Train Loss EWMA: 3.5627 | Learning Rate: 0.001000 | Progress: 0.62259 +Step 3,825 | Tokens: 125,337,600 | Train Loss EWMA: 3.5643 | Learning Rate: 0.001000 | Progress: 0.62669 +Step 3,850 | Tokens: 126,156,800 | Train Loss EWMA: 3.5685 | Learning Rate: 0.001000 | Progress: 0.63078 +Step 3,875 | Tokens: 126,976,000 | Train Loss EWMA: 3.5589 | Learning Rate: 0.001000 | Progress: 0.63488 +Step 3,900 | Tokens: 127,795,200 | Train Loss EWMA: 3.5660 | Learning Rate: 0.001000 | Progress: 0.63898 +Step 3,925 | Tokens: 128,614,400 | Train Loss EWMA: 3.5619 | Learning Rate: 0.001000 | Progress: 0.64307 +Step 3,950 | Tokens: 129,433,600 | Train Loss EWMA: 3.5617 | Learning Rate: 0.001000 | Progress: 0.64717 +Step 3,975 | Tokens: 130,252,800 | Train Loss EWMA: 3.5500 | Learning Rate: 0.001000 | Progress: 0.65126 +Step 4,000 | Tokens: 131,072,000 | Train Loss EWMA: 3.5571 | Learning Rate: 0.001000 | Progress: 0.65536 +Step 4,025 | Tokens: 131,891,200 | Train Loss EWMA: 3.5515 | Learning Rate: 0.001000 | Progress: 0.65946 +Step 4,050 | Tokens: 132,710,400 | Train Loss EWMA: 3.5347 | Learning Rate: 0.001000 | Progress: 0.66355 +Step 4,075 | Tokens: 133,529,600 | Train Loss EWMA: 3.5300 | Learning Rate: 0.001000 | Progress: 0.66765 +Step 4,100 | Tokens: 134,348,800 | Train Loss EWMA: 3.5266 | Learning Rate: 0.001000 | Progress: 0.67174 +Step 4,125 | Tokens: 135,168,000 | Train Loss EWMA: 3.5305 | Learning Rate: 0.001000 | Progress: 0.67584 +Step 4,150 | Tokens: 135,987,200 | Train Loss EWMA: 3.5329 | Learning Rate: 0.001000 | Progress: 0.67994 +Step 4,175 | Tokens: 136,806,400 | Train Loss EWMA: 3.5271 | Learning Rate: 0.001000 | Progress: 0.68403 +Step 4,200 | Tokens: 137,625,600 | Train Loss EWMA: 3.5368 | Learning Rate: 0.001000 | Progress: 0.68813 +Step 4,225 | Tokens: 138,444,800 | Train Loss EWMA: 3.5271 | Learning Rate: 0.001000 | Progress: 0.69222 +Step 4,250 | Tokens: 139,264,000 | Train Loss EWMA: 3.5294 | Learning Rate: 0.001000 | Progress: 0.69632 +Step 4,275 | Tokens: 140,083,200 | Train Loss EWMA: 3.5299 | Learning Rate: 0.001000 | Progress: 0.70042 +Step 4,300 | Tokens: 140,902,400 | Train Loss EWMA: 3.5220 | Learning Rate: 0.001000 | Progress: 0.70451 +Step 4,325 | Tokens: 141,721,600 | Train Loss EWMA: 3.5216 | Learning Rate: 0.001000 | Progress: 0.70861 +Step 4,350 | Tokens: 142,540,800 | Train Loss EWMA: 3.5166 | Learning Rate: 0.001000 | Progress: 0.71270 +Step 4,375 | Tokens: 143,360,000 | Train Loss EWMA: 3.5151 | Learning Rate: 0.001000 | Progress: 0.71680 +Step 4,400 | Tokens: 144,179,200 | Train Loss EWMA: 3.5194 | Learning Rate: 0.001000 | Progress: 0.72090 +Step 4,425 | Tokens: 144,998,400 | Train Loss EWMA: 3.5254 | Learning Rate: 0.001000 | Progress: 0.72499 +Step 4,450 | Tokens: 145,817,600 | Train Loss EWMA: 3.5223 | Learning Rate: 0.001000 | Progress: 0.72909 +Step 4,475 | Tokens: 146,636,800 | Train Loss EWMA: 3.5208 | Learning Rate: 0.001000 | Progress: 0.73318 +Step 4,500 | Tokens: 147,456,000 | Train Loss EWMA: 3.5049 | Learning Rate: 0.001000 | Progress: 0.73728 +Step 4,525 | Tokens: 148,275,200 | Train Loss EWMA: 3.4897 | Learning Rate: 0.001000 | Progress: 0.74138 +Step 4,550 | Tokens: 149,094,400 | Train Loss EWMA: 3.4910 | Learning Rate: 0.001000 | Progress: 0.74547 +Step 4,575 | Tokens: 149,913,600 | Train Loss EWMA: 3.4869 | Learning Rate: 0.001000 | Progress: 0.74957 +Step 4,600 | Tokens: 150,732,800 | Train Loss EWMA: 3.4934 | Learning Rate: 0.001000 | Progress: 0.75366 +Step 4,625 | Tokens: 151,552,000 | Train Loss EWMA: 3.4865 | Learning Rate: 0.001000 | Progress: 0.75776 +Step 4,650 | Tokens: 152,371,200 | Train Loss EWMA: 3.4767 | Learning Rate: 0.001000 | Progress: 0.76186 +Step 4,675 | Tokens: 153,190,400 | Train Loss EWMA: 3.4701 | Learning Rate: 0.001000 | Progress: 0.76595 +Step 4,700 | Tokens: 154,009,600 | Train Loss EWMA: 3.4767 | Learning Rate: 0.001000 | Progress: 0.77005 +Step 4,725 | Tokens: 154,828,800 | Train Loss EWMA: 3.4821 | Learning Rate: 0.001000 | Progress: 0.77414 +Step 4,750 | Tokens: 155,648,000 | Train Loss EWMA: 3.4814 | Learning Rate: 0.001000 | Progress: 0.77824 +Step 4,775 | Tokens: 156,467,200 | Train Loss EWMA: 3.4823 | Learning Rate: 0.001000 | Progress: 0.78234 +Step 4,800 | Tokens: 157,286,400 | Train Loss EWMA: 3.4844 | Learning Rate: 0.001000 | Progress: 0.78643 +Step 4,825 | Tokens: 158,105,600 | Train Loss EWMA: 3.4816 | Learning Rate: 0.001000 | Progress: 0.79053 +Step 4,850 | Tokens: 158,924,800 | Train Loss EWMA: 3.4813 | Learning Rate: 0.001000 | Progress: 0.79462 +Step 4,875 | Tokens: 159,744,000 | Train Loss EWMA: 3.4812 | Learning Rate: 0.001000 | Progress: 0.79872 +Step 4,900 | Tokens: 160,563,200 | Train Loss EWMA: 3.4815 | Learning Rate: 0.001000 | Progress: 0.80282 +Step 4,925 | Tokens: 161,382,400 | Train Loss EWMA: 3.4681 | Learning Rate: 0.001000 | Progress: 0.80691 +Step 4,950 | Tokens: 162,201,600 | Train Loss EWMA: 3.4704 | Learning Rate: 0.001000 | Progress: 0.81101 +Step 4,975 | Tokens: 163,020,800 | Train Loss EWMA: 3.4717 | Learning Rate: 0.001000 | Progress: 0.81510 +Step 5,000 | Tokens: 163,840,000 | Train Loss EWMA: 3.4643 | Learning Rate: 0.001000 | Progress: 0.81920 +Step 5,025 | Tokens: 164,659,200 | Train Loss EWMA: 3.4700 | Learning Rate: 0.001000 | Progress: 0.82330 +Step 5,050 | Tokens: 165,478,400 | Train Loss EWMA: 3.4659 | Learning Rate: 0.001000 | Progress: 0.82739 +Step 5,075 | Tokens: 166,297,600 | Train Loss EWMA: 3.4664 | Learning Rate: 0.001000 | Progress: 0.83149 +Step 5,100 | Tokens: 167,116,800 | Train Loss EWMA: 3.4626 | Learning Rate: 0.001000 | Progress: 0.83558 +Step 5,125 | Tokens: 167,936,000 | Train Loss EWMA: 3.4556 | Learning Rate: 0.001000 | Progress: 0.83968 +Step 5,150 | Tokens: 168,755,200 | Train Loss EWMA: 3.4562 | Learning Rate: 0.001000 | Progress: 0.84378 +Step 5,175 | Tokens: 169,574,400 | Train Loss EWMA: 3.4560 | Learning Rate: 0.001000 | Progress: 0.84787 +Step 5,200 | Tokens: 170,393,600 | Train Loss EWMA: 3.4589 | Learning Rate: 0.001000 | Progress: 0.85197 +Step 5,225 | Tokens: 171,212,800 | Train Loss EWMA: 3.4578 | Learning Rate: 0.001000 | Progress: 0.85606 +Step 5,250 | Tokens: 172,032,000 | Train Loss EWMA: 3.4614 | Learning Rate: 0.001000 | Progress: 0.86016 +Step 5,275 | Tokens: 172,851,200 | Train Loss EWMA: 3.4597 | Learning Rate: 0.001000 | Progress: 0.86426 +Step 5,300 | Tokens: 173,670,400 | Train Loss EWMA: 3.4567 | Learning Rate: 0.001000 | Progress: 0.86835 +Step 5,325 | Tokens: 174,489,600 | Train Loss EWMA: 3.4587 | Learning Rate: 0.001000 | Progress: 0.87245 +Step 5,350 | Tokens: 175,308,800 | Train Loss EWMA: 3.4547 | Learning Rate: 0.001000 | Progress: 0.87654 +Step 5,375 | Tokens: 176,128,000 | Train Loss EWMA: 3.4551 | Learning Rate: 0.001000 | Progress: 0.88064 +Step 5,400 | Tokens: 176,947,200 | Train Loss EWMA: 3.4431 | Learning Rate: 0.001000 | Progress: 0.88474 +Step 5,425 | Tokens: 177,766,400 | Train Loss EWMA: 3.4419 | Learning Rate: 0.001000 | Progress: 0.88883 +Step 5,450 | Tokens: 178,585,600 | Train Loss EWMA: 3.4359 | Learning Rate: 0.001000 | Progress: 0.89293 +Step 5,475 | Tokens: 179,404,800 | Train Loss EWMA: 3.4364 | Learning Rate: 0.001000 | Progress: 0.89702 +Step 5,500 | Tokens: 180,224,000 | Train Loss EWMA: 3.4352 | Learning Rate: 0.001000 | Progress: 0.90112 +Step 5,525 | Tokens: 181,043,200 | Train Loss EWMA: 3.4334 | Learning Rate: 0.001000 | Progress: 0.90522 +Step 5,550 | Tokens: 181,862,400 | Train Loss EWMA: 3.4315 | Learning Rate: 0.001000 | Progress: 0.90931 +Step 5,575 | Tokens: 182,681,600 | Train Loss EWMA: 3.4341 | Learning Rate: 0.001000 | Progress: 0.91341 +Step 5,600 | Tokens: 183,500,800 | Train Loss EWMA: 3.4366 | Learning Rate: 0.001000 | Progress: 0.91750 +Step 5,625 | Tokens: 184,320,000 | Train Loss EWMA: 3.4294 | Learning Rate: 0.001000 | Progress: 0.92160 +Step 5,650 | Tokens: 185,139,200 | Train Loss EWMA: 3.4246 | Learning Rate: 0.001000 | Progress: 0.92570 +Step 5,675 | Tokens: 185,958,400 | Train Loss EWMA: 3.4210 | Learning Rate: 0.001000 | Progress: 0.92979 +Step 5,700 | Tokens: 186,777,600 | Train Loss EWMA: 3.4197 | Learning Rate: 0.001000 | Progress: 0.93389 +Step 5,725 | Tokens: 187,596,800 | Train Loss EWMA: 3.4173 | Learning Rate: 0.001000 | Progress: 0.93798 +Step 5,750 | Tokens: 188,416,000 | Train Loss EWMA: 3.4225 | Learning Rate: 0.001000 | Progress: 0.94208 +Step 5,775 | Tokens: 189,235,200 | Train Loss EWMA: 3.4131 | Learning Rate: 0.001000 | Progress: 0.94618 +Step 5,800 | Tokens: 190,054,400 | Train Loss EWMA: 3.4089 | Learning Rate: 0.001000 | Progress: 0.95027 +Step 5,825 | Tokens: 190,873,600 | Train Loss EWMA: 3.4100 | Learning Rate: 0.001000 | Progress: 0.95437 +Step 5,850 | Tokens: 191,692,800 | Train Loss EWMA: 3.4125 | Learning Rate: 0.001000 | Progress: 0.95846 +Step 5,875 | Tokens: 192,512,000 | Train Loss EWMA: 3.4081 | Learning Rate: 0.001000 | Progress: 0.96256 +Step 5,900 | Tokens: 193,331,200 | Train Loss EWMA: 3.4072 | Learning Rate: 0.001000 | Progress: 0.96666 +Step 5,925 | Tokens: 194,150,400 | Train Loss EWMA: 3.4107 | Learning Rate: 0.001000 | Progress: 0.97075 +Step 5,950 | Tokens: 194,969,600 | Train Loss EWMA: 3.4141 | Learning Rate: 0.001000 | Progress: 0.97485 +Step 5,975 | Tokens: 195,788,800 | Train Loss EWMA: 3.4136 | Learning Rate: 0.001000 | Progress: 0.97894 +Step 6,000 | Tokens: 196,608,000 | Train Loss EWMA: 3.4062 | Learning Rate: 0.001000 | Progress: 0.98304 +Step 6,025 | Tokens: 197,427,200 | Train Loss EWMA: 3.4095 | Learning Rate: 0.001000 | Progress: 0.98714 +Step 6,050 | Tokens: 198,246,400 | Train Loss EWMA: 3.4018 | Learning Rate: 0.001000 | Progress: 0.99123 +Step 6,075 | Tokens: 199,065,600 | Train Loss EWMA: 3.4004 | Learning Rate: 0.001000 | Progress: 0.99533 +Step 6,100 | Tokens: 199,884,800 | Train Loss EWMA: 3.3959 | Learning Rate: 0.001000 | Progress: 0.99942 diff --git a/wandb/run-20260226_135602-696nxyfr/files/requirements.txt b/wandb/run-20260226_135602-696nxyfr/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c32285d10ba18c2e783ff2ead305d5976caef668 --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/files/requirements.txt @@ -0,0 +1,222 @@ +fsspec==2025.3.0 +PyYAML==6.0.2 +certifi==2025.8.3 +comm==0.2.3 +widgetsnbextension==4.0.14 +Jinja2==3.1.6 +rich==14.1.0 +circuitsvis==1.43.3 +hf-xet==1.1.9 +param==2.2.1 +httpcore==1.0.9 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +asttokens==3.0.0 +filelock==3.19.1 +types-python-dateutil==2.9.0.20250822 +cycler==0.12.1 +stack-data==0.6.3 +jupyter_server==2.17.0 +aiosignal==1.4.0 +xyzservices==2025.4.0 +lark==1.2.2 +ptyprocess==0.7.0 +xxhash==3.5.0 +mpmath==1.3.0 +seaborn==0.13.2 +wadler_lindig==0.1.7 +nbformat==5.10.4 +panel==1.8.0 +accelerate==1.10.1 +plotly==6.3.0 +narwhals==2.4.0 +huggingface-hub==0.34.4 +sentencepiece==0.2.1 +torchvision==0.23.0 +ipython==9.5.0 +tqdm==4.67.1 +contourpy==1.3.3 +nvidia-nvtx-cu12==12.8.90 +nvidia-cuda-runtime-cu12==12.8.90 +yarl==1.20.1 +charset-normalizer==3.4.3 +jupyter-events==0.12.0 +nbclient==0.10.2 +numpy==1.26.4 +decorator==5.2.1 +threadpoolctl==3.6.0 +networkx==3.5 +smmap==5.0.2 +nbconvert==7.16.6 +pytz==2025.2 +aiohappyeyeballs==2.6.1 +requests==2.32.5 +tinycss2==1.4.0 +defusedxml==0.7.1 +matplotlib-inline==0.1.7 +rpds-py==0.27.1 +wandb==0.21.4 +jedi==0.19.2 +pathspec==0.12.1 +transformer-lens==2.16.1 +sympy==1.14.0 +jupyterlab_pygments==0.3.0 +overrides==7.7.0 +notebook_shim==0.2.4 +jupyter==1.1.1 +protobuf==6.32.1 +better-abc==0.0.3 +jsonpointer==3.0.0 +terminado==0.18.1 +cfgv==3.4.0 +rfc3987-syntax==1.1.0 +annotated-types==0.7.0 +pyarrow==21.0.0 +webencodings==0.5.1 +wcwidth==0.2.13 +jupyterlab_server==2.27.3 +argon2-cffi-bindings==25.1.0 +nvidia-nvjitlink-cu12==12.8.93 +jaxtyping==0.3.2 +Pygments==2.19.2 +torch==2.8.0 +rfc3339-validator==0.1.4 +urllib3==2.5.0 +jupyterlab_widgets==3.0.15 +ipykernel==6.30.1 +nvidia-cudnn-cu12==9.10.2.21 +beautifulsoup4==4.13.5 +babel==2.17.0 +pure_eval==0.2.3 +pyparsing==3.2.3 +nvidia-cublas-cu12==12.8.4.1 +regex==2025.9.1 +pycparser==2.23 +soupsieve==2.8 +pytest-cov==7.0.0 +sniffio==1.3.1 +mypy==1.18.1 +notebook==7.4.5 +packaging==25.0 +h11==0.16.0 +psutil==7.0.0 +pexpect==4.9.0 +zstandard==0.25.0 +gitdb==4.0.12 +rfc3986-validator==0.1.1 +pyzmq==27.1.0 +jupyterlab==4.4.7 +toy_models==0.1.0 +torchaudio==2.8.0 +cffi==2.0.0 +mypy_extensions==1.1.0 +attrs==25.3.0 +statsmodels==0.14.6 +transformers==4.56.1 +jupyter_core==5.8.1 +bleach==6.2.0 +fqdn==1.5.1 +async-lru==2.0.5 +nvidia-nccl-cu12==2.27.3 +GitPython==3.1.45 +referencing==0.36.2 +click==8.2.1 +prometheus_client==0.22.1 +bokeh==3.8.0 +httpx==0.28.1 +setuptools==80.9.0 +argon2-cffi==25.1.0 +patsy==1.0.2 +multidict==6.6.4 +pyviz_comms==3.0.6 +arrow==1.3.0 +scikit-learn==1.8.0 +beartype==0.14.1 +ipywidgets==8.1.7 +pydantic_core==2.33.2 +markdown-it-py==4.0.0 +pandas==2.3.2 +virtualenv==20.34.0 +python-dotenv==1.1.1 +isoduration==20.11.0 +python-dateutil==2.9.0.post0 +nodeenv==1.9.1 +nvidia-curand-cu12==10.3.9.90 +webcolors==24.11.1 +MarkupSafe==3.0.2 +nvidia-cusolver-cu12==11.7.3.90 +Send2Trash==1.8.3 +coverage==7.10.6 +jupyter_server_terminals==0.5.3 +debugpy==1.8.16 +json5==0.12.1 +linkify-it-py==2.0.3 +importlib_metadata==8.7.0 +nvidia-cufft-cu12==11.3.3.83 +distlib==0.4.0 +typing-inspection==0.4.1 +identify==2.6.14 +nvidia-cufile-cu12==1.13.1.3 +scipy==1.17.0 +mdurl==0.1.2 +websocket-client==1.8.0 +jsonschema==4.25.1 +python-json-logger==3.3.0 +typing_extensions==4.15.0 +tokenizers==0.22.0 +ipympl==0.9.7 +einops==0.8.1 +jupyter_client==8.6.3 +ipython_pygments_lexers==1.1.1 +h5py==3.14.0 +tabulate==0.9.0 +propcache==0.3.2 +ruff==0.13.0 +tornado==6.5.2 +typeguard==4.4.4 +tomlkit==0.13.2 +pluggy==1.6.0 +pydantic==2.11.7 +zipp==3.23.0 +fancy-einsum==0.0.3 +fastjsonschema==2.21.2 +datasets==4.0.0 +fonttools==4.59.2 +executing==2.2.1 +pillow==11.3.0 +uc-micro-py==1.0.3 +Markdown==3.9 +pre_commit==4.3.0 +aiohttp==3.12.15 +mistune==3.1.4 +tzdata==2025.2 +parso==0.8.5 +triton==3.4.0 +kiwisolver==1.4.9 +idna==3.10 +multiprocess==0.70.16 +dill==0.3.8 +jupyter-lsp==2.3.0 +platformdirs==4.4.0 +sentry-sdk==2.37.1 +prompt_toolkit==3.0.52 +jsonschema-specifications==2025.9.1 +pytest==8.4.2 +mdit-py-plugins==0.5.0 +transformers-stream-generator==0.0.5 +nvidia-cusparselt-cu12==0.7.1 +joblib==1.5.3 +pandocfilters==1.5.1 +jupyter-console==6.6.3 +anyio==4.10.0 +six==1.17.0 +holoviews==1.21.0 +matplotlib==3.10.6 +colorcet==3.1.0 +uri-template==1.3.0 +nest-asyncio==1.6.0 +nvidia-cusparse-cu12==12.5.8.93 +iniconfig==2.1.0 +traitlets==5.14.3 +safetensors==0.6.2 +frozenlist==1.7.0 diff --git a/wandb/run-20260226_135602-696nxyfr/files/wandb-metadata.json b/wandb/run-20260226_135602-696nxyfr/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e94c48e852a1ff6b4059fd8fc358801e216100bd --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/files/wandb-metadata.json @@ -0,0 +1,38 @@ +{ + "os": "Linux-5.19.0-45-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.7", + "startedAt": "2026-02-26T13:56:02.093247Z", + "program": "", + "git": { + "remote": "https://github.com/jgroh3/toy_models.git", + "commit": "4a8df1a57a456227acca0b4948ad48c9f03a929a" + }, + "email": "tzfof8@gmail.com", + "root": "/notebooks/toy_models/model_training/model", + "host": "nb5tce73r3", + "executable": "/notebooks/toy_models/.toy_models_env/bin/python", + "cpu_count": 8, + "cpu_count_logical": 8, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 1, + "disk": { + "/": { + "total": "262240792576", + "used": "165972180992" + } + }, + "memory": { + "total": "47332843520" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere", + "uuid": "GPU-fa82911d-b891-1e23-11c8-f31149299e8c" + } + ], + "cudaVersion": "12.4", + "writerId": "t10b7xwxume5c3as2o24by9uohwatl53" +} \ No newline at end of file diff --git a/wandb/run-20260226_135602-696nxyfr/files/wandb-summary.json b/wandb/run-20260226_135602-696nxyfr/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5e2abe25e0a196240e6d05fe6c58e18612ed5c78 --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":2618.685094549,"train_loss_ewma":3.3958888326073158,"learning_rate":0.001,"_step":6100,"train_loss":3.2086336612701416,"tokens_seen":199884800,"_wandb":{"runtime":2618},"step":6100,"tokens_per_second":32768,"progress":0.999424,"_timestamp":1.772116779369793e+09} \ No newline at end of file diff --git a/wandb/run-20260226_135602-696nxyfr/logs/debug-core.log b/wandb/run-20260226_135602-696nxyfr/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..9c0376529638e86ec88e230cc7869172f8f34d14 --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/logs/debug-core.log @@ -0,0 +1,16 @@ +{"time":"2026-02-26T13:56:02.117974369Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpn_w795cs/port-4744.txt","pid":4744,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-26T13:56:02.118646589Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":4744} +{"time":"2026-02-26T13:56:02.118622691Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-4744-4804-2768624779/socket","Net":"unix"}} +{"time":"2026-02-26T13:56:02.302844153Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-26T13:56:02.310431426Z","level":"INFO","msg":"handleInformInit: received","streamId":"696nxyfr","id":"1(@)"} +{"time":"2026-02-26T13:56:02.505483993Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"696nxyfr","id":"1(@)"} +{"time":"2026-02-26T14:39:42.357833594Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"696nxyfr","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365347058Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"696nxyfr","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365381809Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365401102Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365412035Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365433798Z","level":"INFO","msg":"server is shutting down"} +{"time":"2026-02-26T14:39:42.365451526Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365535693Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2026-02-26T14:39:42.365594824Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-4744-4804-2768624779/socket","Net":"unix"}} +{"time":"2026-02-26T14:39:42.365636969Z","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20260226_135602-696nxyfr/logs/debug-internal.log b/wandb/run-20260226_135602-696nxyfr/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8a8b762c57499dae83b531207444b547fc77a22f --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-26T13:56:02.310641183Z","level":"INFO","msg":"stream: starting","core version":"0.21.4"} +{"time":"2026-02-26T13:56:02.505385385Z","level":"INFO","msg":"stream: created new stream","id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.505474889Z","level":"INFO","msg":"stream: started","id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.505547204Z","level":"INFO","msg":"writer: started","stream_id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.505558638Z","level":"INFO","msg":"handler: started","stream_id":"696nxyfr"} +{"time":"2026-02-26T13:56:02.5055985Z","level":"INFO","msg":"sender: started","stream_id":"696nxyfr"} +{"time":"2026-02-26T14:39:41.476737152Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run metadata","runtime_seconds":0.003364817}],"total_operations":1}} +{"time":"2026-02-26T14:39:41.82022831Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-26T14:39:42.35788107Z","level":"INFO","msg":"stream: closing","id":"696nxyfr"} +{"time":"2026-02-26T14:39:42.357919856Z","level":"INFO","msg":"handler: closed","stream_id":"696nxyfr"} +{"time":"2026-02-26T14:39:42.357981703Z","level":"INFO","msg":"sender: closed","stream_id":"696nxyfr"} +{"time":"2026-02-26T14:39:42.357990898Z","level":"INFO","msg":"stream: closed","id":"696nxyfr"} diff --git a/wandb/run-20260226_135602-696nxyfr/logs/debug.log b/wandb/run-20260226_135602-696nxyfr/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d628c19744d0df790b8f77df43df800da59c34ce --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/logs/debug.log @@ -0,0 +1,26 @@ +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Current SDK version is 0.21.4 +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Configure stats pid to 4744 +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from /notebooks/toy_models/model_training/model/wandb/settings +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:setup_run_log_directory():686] Logging user logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_135602-696nxyfr/logs/debug.log +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to /notebooks/toy_models/model_training/model/wandb/run-20260226_135602-696nxyfr/logs/debug-internal.log +2026-02-26 13:56:02,097 INFO MainThread:4744 [wandb_init.py:init():813] calling init triggers +2026-02-26 13:56:02,098 INFO MainThread:4744 [wandb_init.py:init():818] wandb.init called with sweep_config: {} +config: {'model_name': 'pile_llama_2H_2L', 'n_layers': 2, 'd_model': 512, 'd_mlp': 2048, 'd_head': 64, 'n_heads': 2, 'attn_only': False, 'layer_norm_eps': 1e-05, 'init_range': 0.02, 'n_ctx': 1024, 'd_vocab': 32000, 'dataset_name': 'eoinf/pile_llama', 'tokenizer_name': '', 'seed': 10, 'data_seed': 10, 'device': 'cuda', 'use_bfloat16_matmul': False, 'batch_size_per_device': 32, 'n_devices': 1, 'batches_per_step': 1, 'max_tokens': 200000000, 'lr_hidden': 0.001, 'lr_vector': 0.0005, 'lr_schedule': 'constant_with_warmup', 'warmup_tokens': 30000000, 'weight_decay': 0.05, 'grad_norm_clip': 1.0, 'train_loss_moving_average_beta': 0.99, 'log_interval': 25, 'save_checkpoints': True, 'checkpoint_interval': 500, 'checkpoint_interval_ratio': 1.1, 'save_log_checkpoints': True, 'use_wandb': True, 'batch_size': 32, 'tokens_per_step': 32768, 'warmup_steps': 915, 'max_steps': 6103, '_wandb': {}} +2026-02-26 13:56:02,098 INFO MainThread:4744 [wandb_init.py:init():854] starting backend +2026-02-26 13:56:02,302 INFO MainThread:4744 [wandb_init.py:init():857] sending inform_init request +2026-02-26 13:56:02,306 INFO MainThread:4744 [wandb_init.py:init():865] backend started and connected +2026-02-26 13:56:02,308 INFO MainThread:4744 [wandb_init.py:init():936] updated telemetry +2026-02-26 13:56:02,312 INFO MainThread:4744 [wandb_init.py:init():960] communicating run to backend with 90.0 second timeout +2026-02-26 13:56:02,787 INFO MainThread:4744 [wandb_init.py:init():1011] starting run threads in backend +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_console_start():2506] atexit reg +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2354] redirect: wrap_raw +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2423] Wrapping output streams. +2026-02-26 13:56:02,902 INFO MainThread:4744 [wandb_run.py:_redirect():2446] Redirects installed. +2026-02-26 13:56:02,905 INFO MainThread:4744 [wandb_init.py:init():1049] run started, returning control to user process +2026-02-26 14:39:41,470 INFO MainThread:4744 [wandb_run.py:_finish():2272] finishing run tzach/toy-transformer-replication/696nxyfr +2026-02-26 14:39:41,472 INFO MainThread:4744 [wandb_run.py:_atexit_cleanup():2471] got exitcode: 0 +2026-02-26 14:39:41,473 INFO MainThread:4744 [wandb_run.py:_restore():2453] restore +2026-02-26 14:39:41,473 INFO MainThread:4744 [wandb_run.py:_restore():2459] restore done +2026-02-26 14:39:42,357 INFO MainThread:4744 [wandb_run.py:_footer_sync_info():3867] logging synced files diff --git a/wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb b/wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb new file mode 100644 index 0000000000000000000000000000000000000000..6bd3537a9fb2e589eafbf7ce7d5a3edd5a07b96a --- /dev/null +++ b/wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9acd81e4603f662c572bb3ba87da16f817d5997d55fa49e530466c399990d2f2 +size 3900422