diff --git a/base_emb_use/final_model/config.json b/base_emb_use/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_1000/config.json b/base_emb_use/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_1000/training_state.json b/base_emb_use/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..15c7b7c170b2bdd2a6709fbddff5ee62c80158f3 --- /dev/null +++ b/base_emb_use/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 2.865692138671875, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_10000/config.json b/base_emb_use/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_10000/training_state.json b/base_emb_use/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cba3d8b207a518e1a9ce5757ab13133ccb971265 --- /dev/null +++ b/base_emb_use/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 2.8687188625335693, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_11000/config.json b/base_emb_use/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_11000/training_state.json b/base_emb_use/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..245e9888611613adb64a4c1f9c3367ab47df92a0 --- /dev/null +++ b/base_emb_use/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 2.8690357208251953, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_12000/config.json b/base_emb_use/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_12000/training_state.json b/base_emb_use/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ed97d1836c7d1e0816c6ef949191bdf6c5477fba --- /dev/null +++ b/base_emb_use/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 2.8693509101867676, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_13000/config.json b/base_emb_use/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_13000/training_state.json b/base_emb_use/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea8b73b4cab0607cb84ef410d31b44b910080640 --- /dev/null +++ b/base_emb_use/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.8668320178985596, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_14000/config.json b/base_emb_use/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_14000/training_state.json b/base_emb_use/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3dc989bd461d1054a7e60e7c27feaf1b061c90bc --- /dev/null +++ b/base_emb_use/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 2.8690578937530518, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_15000/config.json b/base_emb_use/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_15000/training_state.json b/base_emb_use/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bca0f28ea6a223fbddcb90c9abc772d5d73fd5b8 --- /dev/null +++ b/base_emb_use/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 2.8692784309387207, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_2000/config.json b/base_emb_use/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_2000/training_state.json b/base_emb_use/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..963645bc4bf5d27b819434636e40d61a39d0937e --- /dev/null +++ b/base_emb_use/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.8680050373077393, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_3000/config.json b/base_emb_use/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_3000/training_state.json b/base_emb_use/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..089346950a4012713b060b1c9bd1fdf5d9393650 --- /dev/null +++ b/base_emb_use/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.8667635917663574, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_4000/config.json b/base_emb_use/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_4000/training_state.json b/base_emb_use/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..18d2b1417390c02178c2691612078763ace81a12 --- /dev/null +++ b/base_emb_use/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.869117259979248, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_5000/config.json b/base_emb_use/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_5000/training_state.json b/base_emb_use/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f1c981ee81450e83ac28926a47d0cdbc111161b --- /dev/null +++ b/base_emb_use/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 2.867243528366089, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_6000/config.json b/base_emb_use/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_6000/training_state.json b/base_emb_use/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f30faa6326964eb2260502458c29fe37d47c0cea --- /dev/null +++ b/base_emb_use/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 2.87217116355896, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_7000/config.json b/base_emb_use/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_7000/training_state.json b/base_emb_use/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..690748dff4b6c99476814b14cebab8c76e2d5db3 --- /dev/null +++ b/base_emb_use/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 2.8670828342437744, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_8000/config.json b/base_emb_use/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_8000/training_state.json b/base_emb_use/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d154d6c0c0fd2f7293e6abd8d8371d23d9811245 --- /dev/null +++ b/base_emb_use/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 2.869532823562622, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/model_9000/config.json b/base_emb_use/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/base_emb_use/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/base_emb_use/model_9000/training_state.json b/base_emb_use/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00bc3c2501fd9fda837d803b51db6878c348c3dd --- /dev/null +++ b/base_emb_use/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 2.8697056770324707, + "wandb_id": "vtdicubq" +} \ No newline at end of file diff --git a/base_emb_use/training_config.yaml b/base_emb_use/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14cb1ac941fde0cdefb8974b52f9bbf15640a7c2 --- /dev/null +++ b/base_emb_use/training_config.yaml @@ -0,0 +1,47 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: false +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: base_emb_use +save_dir: checkpoints/base_emb_use +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_base_emb_use.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/bigram_1/final_model/config.json b/bigram_1/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_1000/model_config.json b/bigram_1/model_1000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_1000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_1000/training_state.json b/bigram_1/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6a8527c06538b94e9ded4610cdd2eb7582dc955f --- /dev/null +++ b/bigram_1/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 1.4602692127227783, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_10000/model_config.json b/bigram_1/model_10000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_10000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_10000/training_state.json b/bigram_1/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..851a4c2483a767a43bfad91a458492280ea1ca6d --- /dev/null +++ b/bigram_1/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 1.4594264030456543, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_11000/model_config.json b/bigram_1/model_11000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_11000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_11000/training_state.json b/bigram_1/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33a5854ea9ba43e8479e3648261eeb18a7a26198 --- /dev/null +++ b/bigram_1/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 1.4602222442626953, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_12000/model_config.json b/bigram_1/model_12000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_12000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_12000/training_state.json b/bigram_1/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a277b09cb4c01895aac7da3f098d262b8c1f88a8 --- /dev/null +++ b/bigram_1/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 1.4616353511810303, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_13000/model_config.json b/bigram_1/model_13000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_13000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_13000/training_state.json b/bigram_1/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9ef205b5e7a0016e140a63303f8eaf44973ab23c --- /dev/null +++ b/bigram_1/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 1.4623231887817383, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_14000/model_config.json b/bigram_1/model_14000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_14000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_14000/training_state.json b/bigram_1/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9be1d5be549704d4584754ace88f696b06c0e205 --- /dev/null +++ b/bigram_1/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 1.4596848487854004, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_15000/model_config.json b/bigram_1/model_15000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_15000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_15000/training_state.json b/bigram_1/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8092b1ff2c39b42f7b5743de61a85bd93d8b6e5e --- /dev/null +++ b/bigram_1/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 1.460613489151001, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_2000/model_config.json b/bigram_1/model_2000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_2000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_2000/training_state.json b/bigram_1/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef31dc3840f71925977be593df1df10c0f08fb22 --- /dev/null +++ b/bigram_1/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 1.460418701171875, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_3000/model_config.json b/bigram_1/model_3000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_3000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_3000/training_state.json b/bigram_1/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..752ae3b10c4c25cc4dddfca8ea800add533221c6 --- /dev/null +++ b/bigram_1/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 1.4594852924346924, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_4000/model_config.json b/bigram_1/model_4000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_4000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_4000/training_state.json b/bigram_1/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c10efc393f7f3b23ecf2e74ef5c3696c21112c5c --- /dev/null +++ b/bigram_1/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 1.4598579406738281, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_5000/model_config.json b/bigram_1/model_5000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_5000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_5000/training_state.json b/bigram_1/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bab9be1f637448f4bbd6f07b1123bf464188b8c5 --- /dev/null +++ b/bigram_1/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 1.4597187042236328, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_6000/model_config.json b/bigram_1/model_6000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_6000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_6000/training_state.json b/bigram_1/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2bba4037e97c425fb7c9c4bfe56034ed88d0b8e9 --- /dev/null +++ b/bigram_1/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 1.4601149559020996, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_7000/model_config.json b/bigram_1/model_7000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_7000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_7000/training_state.json b/bigram_1/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3338b1f7574e213dba55cfb986a495185ca4d13 --- /dev/null +++ b/bigram_1/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 1.4616649150848389, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_8000/model_config.json b/bigram_1/model_8000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_8000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_8000/training_state.json b/bigram_1/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..874020c08ca3578a0cb6c450ad6654480ce0aa2a --- /dev/null +++ b/bigram_1/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 1.4601621627807617, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/model_9000/model_config.json b/bigram_1/model_9000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_1/model_9000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_1/model_9000/training_state.json b/bigram_1/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dd511bb734c6009d8e358f157c570bbe7843376d --- /dev/null +++ b/bigram_1/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 1.459611415863037, + "wandb_id": "9rdftyoq" +} \ No newline at end of file diff --git a/bigram_1/training_config.yaml b/bigram_1/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfa550f853af011e724cac942f21074b788478a3 --- /dev/null +++ b/bigram_1/training_config.yaml @@ -0,0 +1,44 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: true +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: tough-snowflake-18 +save_dir: checkpoints/tough-snowflake-18 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/478m_emb.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/bigram_2/model_13000/model_config.json b/bigram_2/model_13000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2/model_13000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2/model_13000/training_state.json b/bigram_2/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..78a8b95b4483b403a7456daa2eb92589b6f439b0 --- /dev/null +++ b/bigram_2/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.8910369873046875, + "wandb_id": "walilhuz" +} \ No newline at end of file diff --git a/bigram_2/model_2000/model_config.json b/bigram_2/model_2000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2/model_2000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2/model_2000/training_state.json b/bigram_2/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..638a58cb0f19936cf4fb000a23ad0483fb5a586a --- /dev/null +++ b/bigram_2/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.8903284072875977, + "wandb_id": "walilhuz" +} \ No newline at end of file diff --git a/bigram_2/model_3000/model_config.json b/bigram_2/model_3000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2/model_3000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2/model_3000/training_state.json b/bigram_2/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6f674c8fadf58cf21661a3b31f1c05a81de8745c --- /dev/null +++ b/bigram_2/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.8923609256744385, + "wandb_id": "walilhuz" +} \ No newline at end of file diff --git a/bigram_2/model_4000/training_state.json b/bigram_2/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a78d59b93837e7419ddd6ab6378123c36d25835 --- /dev/null +++ b/bigram_2/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.890476942062378, + "wandb_id": "walilhuz" +} \ No newline at end of file diff --git a/bigram_2/training_config.yaml b/bigram_2/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9710c9024bed12ebc70c9169a2ebc75dbb9f6f8 --- /dev/null +++ b/bigram_2/training_config.yaml @@ -0,0 +1,44 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: true +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: easy-pyramid-21 +save_dir: checkpoints/easy-pyramid-21 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_resume.yaml +wandb_watch: true +warmed_up_model: /lee_embedding/checkpoints/tough-snowflake-18/final_model/ +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_layer_1/model_12000/config.json b/first_layer_1/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_12000/training_state.json b/first_layer_1/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e699f8ea442474ed874ec31f026c4465ee0e48d9 --- /dev/null +++ b/first_layer_1/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 0.9768052101135254, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_6000/training_state.json b/first_layer_1/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8314347de47392a155876dde4a891399d05d9f47 --- /dev/null +++ b/first_layer_1/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 0.977527379989624, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_2/model_9000/pytorch_model.bin b/first_layer_2/model_9000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d04e5f9dc34987738cf39e4c383552c37c3fff94 --- /dev/null +++ b/first_layer_2/model_9000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6745c348c0cdb36a848e7e72f188458016717fed5ab15ffa0565d69987a3202b +size 2533545094 diff --git a/silver-butterfly-62/model_1000/optimizer.pt b/silver-butterfly-62/model_1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..543b5a4608f64e67e0532ffc6d324012824cf0b0 --- /dev/null +++ b/silver-butterfly-62/model_1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b1e3e63c0256105b14fa9aeb5a04bffa4e5952d54786b6c75fb9de5621ba71 +size 1235135766 diff --git a/silver-butterfly-62/model_10000/optimizer.pt b/silver-butterfly-62/model_10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..11ea4ec8144ddf84a5001ecad2d22353e968a6c0 --- /dev/null +++ b/silver-butterfly-62/model_10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca0d85c7ef287a84a161a03981d3360ba8cf7f41d144b26a49c9373501173f55 +size 1235135766 diff --git a/silver-butterfly-62/model_11000/optimizer.pt b/silver-butterfly-62/model_11000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..15feebcfe0bfbd82a86ebbaaf041707c25a2f455 --- /dev/null +++ b/silver-butterfly-62/model_11000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b4145f1aef02cae173c97b1b93ddc668481350c8903f68fa14aa7f54fa2605 +size 1235135766 diff --git a/silver-butterfly-62/model_12000/optimizer.pt b/silver-butterfly-62/model_12000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e43810ef1ed52dbd7ffa6928d6a6a7c7b9c215b4 --- /dev/null +++ b/silver-butterfly-62/model_12000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7455000e9f6168aef86412f6fe0f862598edef592d2000218e345a2e3d1e45b +size 1235135766 diff --git a/silver-butterfly-62/model_13000/optimizer.pt b/silver-butterfly-62/model_13000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8860b5b2e77a899a1218217be770be4e870f3b2c --- /dev/null +++ b/silver-butterfly-62/model_13000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7402d975169b0e5b54adffb83235261bbf4c9063b32e053b8fd0accb4528795a +size 1235135766 diff --git a/silver-butterfly-62/model_14000/optimizer.pt b/silver-butterfly-62/model_14000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..31914b3b6d35436342a17f2d8d15dff6895d5fee --- /dev/null +++ b/silver-butterfly-62/model_14000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89aec104c5a49fad7e4bf4a44e6dc279af6c9dbe585064b0f001af2e3268e992 +size 1235135766 diff --git a/silver-butterfly-62/model_15000/optimizer.pt b/silver-butterfly-62/model_15000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c8cd8f0f73b7ef33c57a11ea509667dfe1106c0 --- /dev/null +++ b/silver-butterfly-62/model_15000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd8003a615e3ec327175d0ed6ed7fd20a7010b90e9c8db88072970d17a17f5b +size 1235135766 diff --git a/silver-butterfly-62/model_2000/optimizer.pt b/silver-butterfly-62/model_2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eee231f43d1f169472670d4485bca03822f56ad4 --- /dev/null +++ b/silver-butterfly-62/model_2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f5a29b9f94d945cf4afde35c7711ed424c5cfb24e940f8d43fb553e2b266573 +size 1235135766 diff --git a/silver-butterfly-62/model_3000/optimizer.pt b/silver-butterfly-62/model_3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cdecbcc565a7311d150305c387aad554fada14a --- /dev/null +++ b/silver-butterfly-62/model_3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fa44e905ef6b4c72c56389899c83c9f0c5a5029f3ec6de260b70502782f32c +size 1235135766 diff --git a/silver-butterfly-62/model_4000/optimizer.pt b/silver-butterfly-62/model_4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b178b3d6d393c655c1fdf66b7d9a8f0e7a0dad55 --- /dev/null +++ b/silver-butterfly-62/model_4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ae0091d1e41e9eca96f11184f42abb83415a9d70da73d4c67b88e725de33641 +size 1235135766 diff --git a/silver-butterfly-62/model_5000/optimizer.pt b/silver-butterfly-62/model_5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..029cd62cd241df7594294515134be01f8aba0b47 --- /dev/null +++ b/silver-butterfly-62/model_5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2d0584676c47e68513f45f920a523bf5c8c75094f983477aef36cb4cae5c64 +size 1235135766 diff --git a/silver-butterfly-62/model_6000/optimizer.pt b/silver-butterfly-62/model_6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4976954cd7b8241ec05b68ebff5cff5fadffb8e7 --- /dev/null +++ b/silver-butterfly-62/model_6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ab30af3a81974a3871c7c41c49992493fe8c78ef91cede2d61d0d9dfa189008 +size 1235135766 diff --git a/silver-butterfly-62/model_7000/optimizer.pt b/silver-butterfly-62/model_7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..343a403523126d79b869b7fa973527d4d1c58fe9 --- /dev/null +++ b/silver-butterfly-62/model_7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:689eb545a9b45e7997f23e5082e0b1e15ee9a92e71d8cd3f945306647c1dd7f9 +size 1235135766 diff --git a/silver-butterfly-62/model_8000/optimizer.pt b/silver-butterfly-62/model_8000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2fd6dd36e68379819e6e780ecda437e33ef4e00 --- /dev/null +++ b/silver-butterfly-62/model_8000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb3fd05cf43b7e954af23d09238cdf72c41d81f01856fcbaf495c7f0154bc7af +size 1235135766 diff --git a/silver-butterfly-62/model_9000/optimizer.pt b/silver-butterfly-62/model_9000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..12b620b357ca3a49bf86ef2ae82774a8289a9752 --- /dev/null +++ b/silver-butterfly-62/model_9000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26fb5237797af3db1168c7cbe4dac1f10cbc24016038ca6e363f9c1a8b8fead2 +size 1235135766