diff --git a/.gitattributes b/.gitattributes index 2fbec69e5b0fadbb74bf4603539a76d50ecf6b82..86931a25dd8dba37b1afad8f3c7d0e749c47240b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -43,3 +43,9 @@ checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7242/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/adapter_config.json b/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/adapter_config.json +++ b/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/adapter_model.safetensors b/adapter_model.safetensors index e19bdd6237caeb3d30847a6ca892f3bdb4ab3920..3accea21d4d6eecc8f6c4447d66d5ad0c1978920 100644 --- a/adapter_model.safetensors +++ b/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8737189ec50534340f940487b7bbcfbb3c0341cdc991f458aa11988b0dcf614e +oid sha256:6ed806adeae688d7c41407f6645cccc7ce2b13d73c5c283a964e550db5cccdfd size 54560368 diff --git a/amiya_training_config.json b/amiya_training_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fcc6a3fa5b1d6a40df274f0d30f7799b947e96d2 --- /dev/null +++ b/amiya_training_config.json @@ -0,0 +1,30 @@ +{ + "task": "AMIYA - Palestinian Dialect Generation & Translation", + "base_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model_name": "llama3.1-8b-amiya-palestinian", + "lora_config": { + "r": 16, + "alpha": 32, + "dropout": 0.1, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj" + ] + }, + "training_config": { + "learning_rate": 0.0002, + "batch_size": 4, + "gradient_accumulation_steps": 4, + "num_epochs": 3, + "max_seq_length": 512 + }, + "data_info": { + "train_examples": 38610, + "val_examples": 4826, + "task_distribution": { + "generation": 38610 + } + } +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-1000/adapter_config.json +++ b/checkpoint-1000/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors index 4e0399e17ba8923b90c761d79f361bbcc40d70bd..28527035a3a13d62b09d7f4c4a5c46c74ba2c589 100644 --- a/checkpoint-1000/adapter_model.safetensors +++ b/checkpoint-1000/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a337408970398c2e9a24e688bf7ae27f447fa36418d8f264d28e4a21f2f49314 +oid sha256:e0e9afa212c03f271afa6d36c899544f890afc05a2c223d980daf1a6e15ef57c size 54560368 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt index 9d00181334a0de26cff60292421e98fb0227d05f..d852e406e24810960c22ad22e6feb80cd6621a61 100644 --- a/checkpoint-1000/optimizer.pt +++ b/checkpoint-1000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:836a20d29ed19417deb4b6ed2fc4b4569861de82c4c55fabfe49fbe87f5fb08d +oid sha256:d5f323c02c1896e68421c3a31c11a7088016245c869c50a4426821b3cd7a3b19 size 109267450 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth index 314959754555d154df7b6188ddb285d974f18981..93dcf8865ea5c71b6ef56fe00a9baf33b7fbdcfb 100644 --- a/checkpoint-1000/rng_state.pth +++ b/checkpoint-1000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd8c7fc2d07824f068e75323719839356ff5fdee8fb7889a50120d59de9dba54 +oid sha256:0848c22229788451a8855f4ad6b26100cfddc951d37153298ef3edaa793e835b size 14244 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt index 630ea081dc7126548a789051b970f22659ee7c8d..787312e21965cfcdb38bf356a3e13a1d716c17dc 100644 --- a/checkpoint-1000/scheduler.pt +++ b/checkpoint-1000/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1e1088243bd7a7c628a47a6bf4ac054b65997c9a7e139b848ea5fe3e7d04eb2 +oid sha256:b1136245b007779a968f37d1aeab3ab161c76720f4fca73eee284d9fc931f26e size 1064 diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json index e7d0cf6987a1d65a9c24cd365f85ff731493e373..8803d66f98b7c8dcd94b9f43806e5856981323d0 100644 --- a/checkpoint-1000/trainer_state.json +++ b/checkpoint-1000/trainer_state.json @@ -1,8 +1,8 @@ { "best_global_step": 1000, - "best_metric": 0.5663638710975647, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-1000", - "epoch": 0.6666666666666666, + "best_metric": 0.7030432820320129, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-1000", + "epoch": 0.4143789495493629, "eval_steps": 250, "global_step": 1000, "is_hyper_param_search": false, @@ -10,180 +10,180 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -199,7 +199,7 @@ "attributes": {} } }, - "total_flos": 1.2237370421673984e+17, + "total_flos": 6.287293343858688e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-1000/training_args.bin +++ b/checkpoint-1000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-1500/adapter_config.json +++ b/checkpoint-1500/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors index a02e30688fb6400c86b7a9e973e8fe407d23f4cb..a07aea44f27e926a20044196c03b37a4639906a3 100644 --- a/checkpoint-1500/adapter_model.safetensors +++ b/checkpoint-1500/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fffc39fa6d134c8e1c7fb75eb4b8bba5ec8ab6c346da3a5eb4d76438cd39ae0c +oid sha256:4ea46f8890e14af515a490916d64d71a2431e0fc2dcd93524c7fc01129a8a616 size 54560368 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt index 6ad8e07f50676532db8d8027daa4800025626d00..cafea69b2bdd6e2d239bff420492adfa2adea6f0 100644 --- a/checkpoint-1500/optimizer.pt +++ b/checkpoint-1500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:717a1347063e278eeee0f830ff534ecc2eac4a766bbc7c3f20365db80f97a61c +oid sha256:354f1db55f58b61b25e0b64b403e6dde75e6311037b61694806cdbc4c95a72ff size 109267450 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth index f370d5f7c5194c43b51af64efdab6e28ef1f4a07..07d241c48b935a9bfba103b86cb9dc31a49379dc 100644 --- a/checkpoint-1500/rng_state.pth +++ b/checkpoint-1500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81714e5e2c84586d42b5d5f07880ce07b947cdccdb018347e47dd6d73d8228e1 +oid sha256:dc994964dd77b4b17f41bf873360fc1a0838df4b2f5359ed8062b49a57ca0441 size 14244 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt index dc2e4882f8c20f8e78ca0df51ff29a28bf30b6d4..7e9fb13ee789ac95bd67513d49d32562689036ed 100644 --- a/checkpoint-1500/scheduler.pt +++ b/checkpoint-1500/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00b75657512bf4d369b2b5bae16105c8cc283d42aacd01df4e2a83091d439a73 +oid sha256:e226332b6c4b4510f2c3b1022f832e7e6d32594e02d1e8b882e79ae8cbda6044 size 1064 diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json index 34a9048a7caf82ec38383ed97ca66a0bf5ecae13..3e8aab1a84138db9bee07e761d2a1572b15e78fc 100644 --- a/checkpoint-1500/trainer_state.json +++ b/checkpoint-1500/trainer_state.json @@ -1,8 +1,8 @@ { "best_global_step": 1500, - "best_metric": 0.5581239461898804, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-1500", - "epoch": 1.0, + "best_metric": 0.6915447115898132, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-1500", + "epoch": 0.6215684243240444, "eval_steps": 250, "global_step": 1500, "is_hyper_param_search": false, @@ -10,266 +10,266 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -285,7 +285,7 @@ "attributes": {} } }, - "total_flos": 1.834623940558848e+17, + "total_flos": 9.496080786358272e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-1500/training_args.bin +++ b/checkpoint-1500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-2000/adapter_config.json +++ b/checkpoint-2000/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors index 587dee70c29dd1389f0073786bc7ed23156c60cf..4e59c3f5c7c65dde1ba47fc3ec3673e7771fe844 100644 --- a/checkpoint-2000/adapter_model.safetensors +++ b/checkpoint-2000/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c3a88ad5e47e99721d27a7cc47580f0dd445458c5b5d383d9746ac5150752b3 +oid sha256:0bc2c38a4feb292b5953adbca7e889a9849d3aae23bebf65624fa6ef2a12e814 size 54560368 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt index ddde109b39cf462dde298a8504bacf97c3bf1481..c8317ee4d4fabbf614fa6367b7740169daea3fea 100644 --- a/checkpoint-2000/optimizer.pt +++ b/checkpoint-2000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1aa882db7f1e97aecd9e23deab24ec52c1966c084ef84d480101777cb20b2b38 +oid sha256:02591c829ebc2e9c023c77020f2505b8055865e72743aebf570d809d20a5bf01 size 109267450 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth index ab527a2f8c7128a9ea3895fe3f6a3f850827de11..20fb8ddaecfc743315142a802c234e74c2b773e8 100644 --- a/checkpoint-2000/rng_state.pth +++ b/checkpoint-2000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e86b19998400264e99e08275eb288ef36b233377938a1f173b2ecb9fa75ffacc +oid sha256:b69ce190b07f928f0db402b171b4a32695620a6cc7680ee0294e1d3ca9955e84 size 14244 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt index d5074d96eeb2bbac4652594a3348b487fe78c64a..f3a3e82ab0cfb25d9c6a8575da17be323561ae80 100644 --- a/checkpoint-2000/scheduler.pt +++ b/checkpoint-2000/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de481359f7708f1c509bebbc539f8384d41101271a19491884a2ffc4b1dd3c44 +oid sha256:aff236ec96fe456a1d48a1c988fd9dbf62d3fbd22f57121ebf0b02e7d4ca2c27 size 1064 diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json index b10882df0a690f4a17e797f2ab5d40744a4d1e22..7fa49bc1d1cef0d5f9b1d637c00ae7b7b5c39771 100644 --- a/checkpoint-2000/trainer_state.json +++ b/checkpoint-2000/trainer_state.json @@ -1,8 +1,8 @@ { "best_global_step": 2000, - "best_metric": 0.5535863637924194, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-2000", - "epoch": 1.3333333333333333, + "best_metric": 0.6823315024375916, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-2000", + "epoch": 0.8287578990987258, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, @@ -10,352 +10,352 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 }, { - "epoch": 1.0333333333333334, - "grad_norm": 0.3405410051345825, - "learning_rate": 0.00013413636363636365, - "loss": 0.5571, + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, "step": 1550 }, { - "epoch": 1.0666666666666667, - "grad_norm": 0.4485073983669281, - "learning_rate": 0.00013186363636363637, - "loss": 0.5674, + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, "step": 1600 }, { - "epoch": 1.1, - "grad_norm": 0.34938374161720276, - "learning_rate": 0.0001295909090909091, - "loss": 0.5354, + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, "step": 1650 }, { - "epoch": 1.1333333333333333, - "grad_norm": 0.33084195852279663, - "learning_rate": 0.00012731818181818184, - "loss": 0.5765, + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, "step": 1700 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.3667336404323578, - "learning_rate": 0.00012504545454545456, - "loss": 0.5486, + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, "step": 1750 }, { - "epoch": 1.1666666666666667, - "eval_loss": 0.5557209253311157, - "eval_runtime": 80.8386, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, "step": 1750 }, { - "epoch": 1.2, - "grad_norm": 0.33248019218444824, - "learning_rate": 0.00012277272727272728, - "loss": 0.5617, + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, "step": 1800 }, { - "epoch": 1.2333333333333334, - "grad_norm": 0.4447474479675293, - "learning_rate": 0.00012050000000000002, - "loss": 0.567, + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, "step": 1850 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.42134660482406616, - "learning_rate": 0.00011822727272727274, - "loss": 0.5319, + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, "step": 1900 }, { - "epoch": 1.3, - "grad_norm": 0.3942984640598297, - "learning_rate": 0.00011595454545454544, - "loss": 0.5325, + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, "step": 1950 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.4929428696632385, - "learning_rate": 0.00011368181818181818, - "loss": 0.5565, + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, "step": 2000 }, { - "epoch": 1.3333333333333333, - "eval_loss": 0.5535863637924194, - "eval_runtime": 80.8279, - "eval_samples_per_second": 37.116, - "eval_steps_per_second": 9.279, + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, "step": 2000 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -371,7 +371,7 @@ "attributes": {} } }, - "total_flos": 2.4518949953568768e+17, + "total_flos": 1.262946011799552e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-2000/training_args.bin +++ b/checkpoint-2000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-2500/adapter_config.json b/checkpoint-2500/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-2500/adapter_config.json +++ b/checkpoint-2500/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-2500/adapter_model.safetensors b/checkpoint-2500/adapter_model.safetensors index 821a5ef959824a508385c0cd35d7281b903ed3b2..42c6110f5628731afbf421110428c60cfccca1a3 100644 --- a/checkpoint-2500/adapter_model.safetensors +++ b/checkpoint-2500/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5207bd2a71d6e74489cb5103f4173305575b69a2798a53c970da2f8e42cfd1b +oid sha256:49efc21966de98bf9994a157e2e4dabb68153133adf2745eefe182249b3f3197 size 54560368 diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt index c34380d43b6ce5c8ec85b981d422f5de4ab6c3e4..5cff3a5305cffff7ccd86cfd6bc47bf03fac15ea 100644 --- a/checkpoint-2500/optimizer.pt +++ b/checkpoint-2500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:848bae18dd111819cc86ee93c9822b1baf9b23656a8810d3bbb4140c26fa04d8 +oid sha256:fcd2efd5a1e4f00990e049fca5bee5c43d977c21ae7fd093d6c7fdff6fe068b8 size 109267450 diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth index b80f470d0f1096503c0fbc50efd48abdfd141ec9..32b83538638a37593e60a7125d8b457bc574cd2a 100644 --- a/checkpoint-2500/rng_state.pth +++ b/checkpoint-2500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02297c389f0848a1a674f64fd3230c94f24e9dbabcb192a80189b95e9b26ab11 +oid sha256:24368a441ca3fc9abe92a436629bb258dee7296cd6e160cb97d6948bbd91695b size 14244 diff --git a/checkpoint-2500/scaler.pt b/checkpoint-2500/scaler.pt index 4b9902c4e2dee2972ff0943d57b3678364435cdf..6ec6228f0fc99c44465dc002d310253c1778bc62 100644 --- a/checkpoint-2500/scaler.pt +++ b/checkpoint-2500/scaler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48e2d97f563bb838328076a1666504681962151a3975a2f064be3a03e6500740 +oid sha256:7ab3b49628f2ae2ec7cdbb0bc103569c008e8a11af2787309237ce369c80d7b9 size 988 diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt index b161ffe5ee9d1ee5cb98674cb838e3900dc6af2d..f5ffd2230daaace0434333f3c5d6e9dd734c7fdd 100644 --- a/checkpoint-2500/scheduler.pt +++ b/checkpoint-2500/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23892cead62882c0c408b409776e78c7487ed4ce0dfaca891fbc6687acaa712e +oid sha256:abac4e3ce09d884de31337ea91e7059472492d528353bcdadc9c18f2f41cdb86 size 1064 diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json index 45cf33a89e085270280596f83c475b3382c5cb4c..f898e7fdaf0894a4ba7947a28c2ff1d118fc8926 100644 --- a/checkpoint-2500/trainer_state.json +++ b/checkpoint-2500/trainer_state.json @@ -1,8 +1,8 @@ { "best_global_step": 2500, - "best_metric": 0.5476261377334595, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-2500", - "epoch": 1.6666666666666665, + "best_metric": 0.675748348236084, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-2500", + "epoch": 1.0356365896612452, "eval_steps": 250, "global_step": 2500, "is_hyper_param_search": false, @@ -10,438 +10,438 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 }, { - "epoch": 1.0333333333333334, - "grad_norm": 0.3405410051345825, - "learning_rate": 0.00013413636363636365, - "loss": 0.5571, + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, "step": 1550 }, { - "epoch": 1.0666666666666667, - "grad_norm": 0.4485073983669281, - "learning_rate": 0.00013186363636363637, - "loss": 0.5674, + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, "step": 1600 }, { - "epoch": 1.1, - "grad_norm": 0.34938374161720276, - "learning_rate": 0.0001295909090909091, - "loss": 0.5354, + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, "step": 1650 }, { - "epoch": 1.1333333333333333, - "grad_norm": 0.33084195852279663, - "learning_rate": 0.00012731818181818184, - "loss": 0.5765, + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, "step": 1700 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.3667336404323578, - "learning_rate": 0.00012504545454545456, - "loss": 0.5486, + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, "step": 1750 }, { - "epoch": 1.1666666666666667, - "eval_loss": 0.5557209253311157, - "eval_runtime": 80.8386, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, "step": 1750 }, { - "epoch": 1.2, - "grad_norm": 0.33248019218444824, - "learning_rate": 0.00012277272727272728, - "loss": 0.5617, + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, "step": 1800 }, { - "epoch": 1.2333333333333334, - "grad_norm": 0.4447474479675293, - "learning_rate": 0.00012050000000000002, - "loss": 0.567, + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, "step": 1850 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.42134660482406616, - "learning_rate": 0.00011822727272727274, - "loss": 0.5319, + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, "step": 1900 }, { - "epoch": 1.3, - "grad_norm": 0.3942984640598297, - "learning_rate": 0.00011595454545454544, - "loss": 0.5325, + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, "step": 1950 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.4929428696632385, - "learning_rate": 0.00011368181818181818, - "loss": 0.5565, + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, "step": 2000 }, { - "epoch": 1.3333333333333333, - "eval_loss": 0.5535863637924194, - "eval_runtime": 80.8279, - "eval_samples_per_second": 37.116, - "eval_steps_per_second": 9.279, + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, "step": 2000 }, { - "epoch": 1.3666666666666667, - "grad_norm": 0.4141586720943451, - "learning_rate": 0.00011140909090909091, - "loss": 0.5801, + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, "step": 2050 }, { - "epoch": 1.4, - "grad_norm": 0.45937269926071167, - "learning_rate": 0.00010913636363636364, - "loss": 0.5439, + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, "step": 2100 }, { - "epoch": 1.4333333333333333, - "grad_norm": 0.47830042243003845, - "learning_rate": 0.00010686363636363637, - "loss": 0.547, + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, "step": 2150 }, { - "epoch": 1.4666666666666668, - "grad_norm": 0.40260276198387146, - "learning_rate": 0.00010459090909090909, - "loss": 0.5229, + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, "step": 2200 }, { - "epoch": 1.5, - "grad_norm": 0.5281402468681335, - "learning_rate": 0.00010231818181818183, - "loss": 0.5475, + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, "step": 2250 }, { - "epoch": 1.5, - "eval_loss": 0.5505018830299377, - "eval_runtime": 80.8409, - "eval_samples_per_second": 37.11, - "eval_steps_per_second": 9.277, + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, "step": 2250 }, { - "epoch": 1.5333333333333332, - "grad_norm": 0.3721947968006134, - "learning_rate": 0.00010004545454545455, - "loss": 0.5466, + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, "step": 2300 }, { - "epoch": 1.5666666666666667, - "grad_norm": 0.3462945818901062, - "learning_rate": 9.777272727272728e-05, - "loss": 0.5209, + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, "step": 2350 }, { - "epoch": 1.6, - "grad_norm": 0.4027090072631836, - "learning_rate": 9.55e-05, - "loss": 0.5307, + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, "step": 2400 }, { - "epoch": 1.6333333333333333, - "grad_norm": 0.3684265613555908, - "learning_rate": 9.322727272727273e-05, - "loss": 0.5118, + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, "step": 2450 }, { - "epoch": 1.6666666666666665, - "grad_norm": 0.4819887578487396, - "learning_rate": 9.095454545454546e-05, - "loss": 0.561, + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, "step": 2500 }, { - "epoch": 1.6666666666666665, - "eval_loss": 0.5476261377334595, - "eval_runtime": 80.8288, - "eval_samples_per_second": 37.115, - "eval_steps_per_second": 9.279, + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, "step": 2500 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -457,7 +457,7 @@ "attributes": {} } }, - "total_flos": 3.061836360125645e+17, + "total_flos": 1.5737739501748224e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-2500/training_args.bin +++ b/checkpoint-2500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-3000/adapter_config.json +++ b/checkpoint-3000/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-3000/adapter_model.safetensors b/checkpoint-3000/adapter_model.safetensors index e19bdd6237caeb3d30847a6ca892f3bdb4ab3920..50b2897cebedd344776fc40c1f683d2e56eb44f8 100644 --- a/checkpoint-3000/adapter_model.safetensors +++ b/checkpoint-3000/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8737189ec50534340f940487b7bbcfbb3c0341cdc991f458aa11988b0dcf614e +oid sha256:6cea713a82dfa53e4225af27dadf62a79d6d173e0e322110ef4080d4150c823b size 54560368 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt index 3e7f81ba7421779289f968cca149d76616febd84..191c673619885724d1a2115c15046a932bc472a8 100644 --- a/checkpoint-3000/optimizer.pt +++ b/checkpoint-3000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39f0f99c70b766de881416221876b07b78545e8c0e5a126b92f0fa687a983694 +oid sha256:3acc6fa6243a9ec00f9b0e375b237bf0f64023ebb64d44703bbfd65f25a2f895 size 109267450 diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth index c2d66276953e0ab116f9c683042c6c0114d4d453..7ba8308274f8b7f158539194c4dfcc38ca92ae37 100644 --- a/checkpoint-3000/rng_state.pth +++ b/checkpoint-3000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08b64b36dce5f25b027b7d960504594585ac14a5c1168ea02281c808e279d651 +oid sha256:fa346dc61aa799e0160013066342f483bcb52c5551441757ad69edfbabf48bb0 size 14244 diff --git a/checkpoint-3000/scaler.pt b/checkpoint-3000/scaler.pt index 6c13b5448a0852139c8a78d48ae5f4dc71f8a023..06e645c088d222fcf5457c78f9b7f167c02aa06d 100644 --- a/checkpoint-3000/scaler.pt +++ b/checkpoint-3000/scaler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21aba8ed0f38ed1c04994c10a9ca7e9925e55ef2ed51283c43ff8e2cce78585f +oid sha256:fab881b6261b7765de00aaece9d42aeb004a99a034f6ff76b068724f6121a7ec size 988 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt index 0b0cb2b1081a6f164375cbd360fc50a26f83087d..fcf7183326b53c7c1cc917fb08e8a546f27f1414 100644 --- a/checkpoint-3000/scheduler.pt +++ b/checkpoint-3000/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6b22153a6004ee7569e1ad90f415ae5727df20ad97a541ace0b82f7edb0c83a +oid sha256:06a8d96703998223bf2cf655698a26277cad9e4925693c4c21a22c01308a5a11 size 1064 diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json index 31d9c276f00736e6aa169ab7d56c806f1101954e..53cd834a3e23463f88e852fb77c3beea057ad08e 100644 --- a/checkpoint-3000/trainer_state.json +++ b/checkpoint-3000/trainer_state.json @@ -1,8 +1,8 @@ { "best_global_step": 3000, - "best_metric": 0.5436099171638489, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-3000", - "epoch": 2.0, + "best_metric": 0.6727278828620911, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-3000", + "epoch": 1.2428260644359266, "eval_steps": 250, "global_step": 3000, "is_hyper_param_search": false, @@ -10,524 +10,524 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 }, { - "epoch": 1.0333333333333334, - "grad_norm": 0.3405410051345825, - "learning_rate": 0.00013413636363636365, - "loss": 0.5571, + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, "step": 1550 }, { - "epoch": 1.0666666666666667, - "grad_norm": 0.4485073983669281, - "learning_rate": 0.00013186363636363637, - "loss": 0.5674, + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, "step": 1600 }, { - "epoch": 1.1, - "grad_norm": 0.34938374161720276, - "learning_rate": 0.0001295909090909091, - "loss": 0.5354, + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, "step": 1650 }, { - "epoch": 1.1333333333333333, - "grad_norm": 0.33084195852279663, - "learning_rate": 0.00012731818181818184, - "loss": 0.5765, + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, "step": 1700 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.3667336404323578, - "learning_rate": 0.00012504545454545456, - "loss": 0.5486, + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, "step": 1750 }, { - "epoch": 1.1666666666666667, - "eval_loss": 0.5557209253311157, - "eval_runtime": 80.8386, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, "step": 1750 }, { - "epoch": 1.2, - "grad_norm": 0.33248019218444824, - "learning_rate": 0.00012277272727272728, - "loss": 0.5617, + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, "step": 1800 }, { - "epoch": 1.2333333333333334, - "grad_norm": 0.4447474479675293, - "learning_rate": 0.00012050000000000002, - "loss": 0.567, + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, "step": 1850 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.42134660482406616, - "learning_rate": 0.00011822727272727274, - "loss": 0.5319, + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, "step": 1900 }, { - "epoch": 1.3, - "grad_norm": 0.3942984640598297, - "learning_rate": 0.00011595454545454544, - "loss": 0.5325, + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, "step": 1950 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.4929428696632385, - "learning_rate": 0.00011368181818181818, - "loss": 0.5565, + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, "step": 2000 }, { - "epoch": 1.3333333333333333, - "eval_loss": 0.5535863637924194, - "eval_runtime": 80.8279, - "eval_samples_per_second": 37.116, - "eval_steps_per_second": 9.279, + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, "step": 2000 }, { - "epoch": 1.3666666666666667, - "grad_norm": 0.4141586720943451, - "learning_rate": 0.00011140909090909091, - "loss": 0.5801, + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, "step": 2050 }, { - "epoch": 1.4, - "grad_norm": 0.45937269926071167, - "learning_rate": 0.00010913636363636364, - "loss": 0.5439, + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, "step": 2100 }, { - "epoch": 1.4333333333333333, - "grad_norm": 0.47830042243003845, - "learning_rate": 0.00010686363636363637, - "loss": 0.547, + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, "step": 2150 }, { - "epoch": 1.4666666666666668, - "grad_norm": 0.40260276198387146, - "learning_rate": 0.00010459090909090909, - "loss": 0.5229, + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, "step": 2200 }, { - "epoch": 1.5, - "grad_norm": 0.5281402468681335, - "learning_rate": 0.00010231818181818183, - "loss": 0.5475, + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, "step": 2250 }, { - "epoch": 1.5, - "eval_loss": 0.5505018830299377, - "eval_runtime": 80.8409, - "eval_samples_per_second": 37.11, - "eval_steps_per_second": 9.277, + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, "step": 2250 }, { - "epoch": 1.5333333333333332, - "grad_norm": 0.3721947968006134, - "learning_rate": 0.00010004545454545455, - "loss": 0.5466, + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, "step": 2300 }, { - "epoch": 1.5666666666666667, - "grad_norm": 0.3462945818901062, - "learning_rate": 9.777272727272728e-05, - "loss": 0.5209, + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, "step": 2350 }, { - "epoch": 1.6, - "grad_norm": 0.4027090072631836, - "learning_rate": 9.55e-05, - "loss": 0.5307, + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, "step": 2400 }, { - "epoch": 1.6333333333333333, - "grad_norm": 0.3684265613555908, - "learning_rate": 9.322727272727273e-05, - "loss": 0.5118, + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, "step": 2450 }, { - "epoch": 1.6666666666666665, - "grad_norm": 0.4819887578487396, - "learning_rate": 9.095454545454546e-05, - "loss": 0.561, + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, "step": 2500 }, { - "epoch": 1.6666666666666665, - "eval_loss": 0.5476261377334595, - "eval_runtime": 80.8288, - "eval_samples_per_second": 37.115, - "eval_steps_per_second": 9.279, + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, "step": 2500 }, { - "epoch": 1.7, - "grad_norm": 0.3161783218383789, - "learning_rate": 8.86818181818182e-05, - "loss": 0.5413, + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, "step": 2550 }, { - "epoch": 1.7333333333333334, - "grad_norm": 0.34697386622428894, - "learning_rate": 8.640909090909092e-05, - "loss": 0.5366, + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, "step": 2600 }, { - "epoch": 1.7666666666666666, - "grad_norm": 0.4084527790546417, - "learning_rate": 8.413636363636364e-05, - "loss": 0.5426, + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, "step": 2650 }, { - "epoch": 1.8, - "grad_norm": 0.4053308963775635, - "learning_rate": 8.186363636363636e-05, - "loss": 0.532, + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, "step": 2700 }, { - "epoch": 1.8333333333333335, - "grad_norm": 0.3551884591579437, - "learning_rate": 7.95909090909091e-05, - "loss": 0.5399, + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, "step": 2750 }, { - "epoch": 1.8333333333333335, - "eval_loss": 0.546008288860321, - "eval_runtime": 80.8186, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, "step": 2750 }, { - "epoch": 1.8666666666666667, - "grad_norm": 0.40072572231292725, - "learning_rate": 7.731818181818183e-05, - "loss": 0.5332, + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, "step": 2800 }, { - "epoch": 1.9, - "grad_norm": 0.3773200213909149, - "learning_rate": 7.504545454545455e-05, - "loss": 0.5296, + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, "step": 2850 }, { - "epoch": 1.9333333333333333, - "grad_norm": 0.45379436016082764, - "learning_rate": 7.277272727272728e-05, - "loss": 0.5356, + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, "step": 2900 }, { - "epoch": 1.9666666666666668, - "grad_norm": 0.36246028542518616, - "learning_rate": 7.05e-05, - "loss": 0.5112, + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, "step": 2950 }, { - "epoch": 2.0, - "grad_norm": 0.40895622968673706, - "learning_rate": 6.822727272727273e-05, - "loss": 0.5358, + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, "step": 3000 }, { - "epoch": 2.0, - "eval_loss": 0.5436099171638489, - "eval_runtime": 80.8207, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, "step": 3000 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -543,7 +543,7 @@ "attributes": {} } }, - "total_flos": 3.6691738985250816e+17, + "total_flos": 1.8877939667533824e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-3000/training_args.bin +++ b/checkpoint-3000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-3500/adapter_config.json b/checkpoint-3500/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-3500/adapter_config.json +++ b/checkpoint-3500/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-3500/adapter_model.safetensors b/checkpoint-3500/adapter_model.safetensors index 2b7013272812e2ef9f5b6eb125cee2d47707dbb2..945c9bc46fce9846709f9958290300ba200fe4ed 100644 --- a/checkpoint-3500/adapter_model.safetensors +++ b/checkpoint-3500/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f0b3a531cc4b85c51c3d19fd29fa9f2ff0a0aaebca23605d724770413d49dd +oid sha256:efd6ac84524dc525109e0cf3984e4fb4afaa59e8f7de0dc6109c2b12c586afc5 size 54560368 diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt index 94e1c21a2b04267b13310c3f9c699e6f92a3a62f..b934a98c5481fca9513da906dfeca9bef8558842 100644 --- a/checkpoint-3500/optimizer.pt +++ b/checkpoint-3500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a4eef6562f7b0c26ac35c5f7d087ce2a7559b2e6cdf2884cceaa3c0ee6e1b36 +oid sha256:98d7c59285360613eaeb682746b1d5e816d8c270b8349c722a70600a8d9d6ddb size 109267450 diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth index f544b59c9dc082ef0071ff9e821d5251896611b2..3cf759251e37f6951dbcf470480c4680143b38fa 100644 --- a/checkpoint-3500/rng_state.pth +++ b/checkpoint-3500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b57187731297d3d34a8d707e0d59c7b35e51c65106b068986ec8c8627963b5d +oid sha256:ed5252c8fed9a2f3c650896ede719a729d89d5457a6b7b888d47da3cf1064c08 size 14244 diff --git a/checkpoint-3500/scaler.pt b/checkpoint-3500/scaler.pt index 642d9ddc499e11af6d65baf9612222717fa34b7d..2e074cb3607cf5355b53e3b8420ecddd0ee2824f 100644 --- a/checkpoint-3500/scaler.pt +++ b/checkpoint-3500/scaler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d25c9e5c384ba91142c829ef5432ebc4ae7d8c71f3de723046dd3aa202e08a2 +oid sha256:a0ba488383c7d42e68fdfa7344fb6e0324b381de27f6504d975f79101124ff3a size 988 diff --git a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt index 99e329db059bb463c93d1e34f84f037ef278a0e4..a64b2252e75d1cd9579ead2e1c6732d0d537a6a5 100644 --- a/checkpoint-3500/scheduler.pt +++ b/checkpoint-3500/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19eea76bd539d1067fbb6c0af0bc3feabf4a4fcc75b4afa719255b0d413e8ced +oid sha256:71c9cdb357928829533126660dc9acec503bb0b54ce6ea94dffebd2dc851fd2c size 1064 diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json index b599f4be44fd800479be0aa6c2066bd071656357..5c7dbd82b80a4f70bafd3cd13ae3e8be564d029a 100644 --- a/checkpoint-3500/trainer_state.json +++ b/checkpoint-3500/trainer_state.json @@ -1,8 +1,8 @@ { - "best_global_step": 3000, - "best_metric": 0.5436099171638489, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-3000", - "epoch": 2.3333333333333335, + "best_global_step": 3500, + "best_metric": 0.6663665175437927, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-3500", + "epoch": 1.4500155392106082, "eval_steps": 250, "global_step": 3500, "is_hyper_param_search": false, @@ -10,610 +10,610 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 }, { - "epoch": 1.0333333333333334, - "grad_norm": 0.3405410051345825, - "learning_rate": 0.00013413636363636365, - "loss": 0.5571, + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, "step": 1550 }, { - "epoch": 1.0666666666666667, - "grad_norm": 0.4485073983669281, - "learning_rate": 0.00013186363636363637, - "loss": 0.5674, + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, "step": 1600 }, { - "epoch": 1.1, - "grad_norm": 0.34938374161720276, - "learning_rate": 0.0001295909090909091, - "loss": 0.5354, + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, "step": 1650 }, { - "epoch": 1.1333333333333333, - "grad_norm": 0.33084195852279663, - "learning_rate": 0.00012731818181818184, - "loss": 0.5765, + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, "step": 1700 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.3667336404323578, - "learning_rate": 0.00012504545454545456, - "loss": 0.5486, + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, "step": 1750 }, { - "epoch": 1.1666666666666667, - "eval_loss": 0.5557209253311157, - "eval_runtime": 80.8386, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, "step": 1750 }, { - "epoch": 1.2, - "grad_norm": 0.33248019218444824, - "learning_rate": 0.00012277272727272728, - "loss": 0.5617, + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, "step": 1800 }, { - "epoch": 1.2333333333333334, - "grad_norm": 0.4447474479675293, - "learning_rate": 0.00012050000000000002, - "loss": 0.567, + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, "step": 1850 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.42134660482406616, - "learning_rate": 0.00011822727272727274, - "loss": 0.5319, + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, "step": 1900 }, { - "epoch": 1.3, - "grad_norm": 0.3942984640598297, - "learning_rate": 0.00011595454545454544, - "loss": 0.5325, + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, "step": 1950 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.4929428696632385, - "learning_rate": 0.00011368181818181818, - "loss": 0.5565, + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, "step": 2000 }, { - "epoch": 1.3333333333333333, - "eval_loss": 0.5535863637924194, - "eval_runtime": 80.8279, - "eval_samples_per_second": 37.116, - "eval_steps_per_second": 9.279, + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, "step": 2000 }, { - "epoch": 1.3666666666666667, - "grad_norm": 0.4141586720943451, - "learning_rate": 0.00011140909090909091, - "loss": 0.5801, + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, "step": 2050 }, { - "epoch": 1.4, - "grad_norm": 0.45937269926071167, - "learning_rate": 0.00010913636363636364, - "loss": 0.5439, + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, "step": 2100 }, { - "epoch": 1.4333333333333333, - "grad_norm": 0.47830042243003845, - "learning_rate": 0.00010686363636363637, - "loss": 0.547, + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, "step": 2150 }, { - "epoch": 1.4666666666666668, - "grad_norm": 0.40260276198387146, - "learning_rate": 0.00010459090909090909, - "loss": 0.5229, + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, "step": 2200 }, { - "epoch": 1.5, - "grad_norm": 0.5281402468681335, - "learning_rate": 0.00010231818181818183, - "loss": 0.5475, + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, "step": 2250 }, { - "epoch": 1.5, - "eval_loss": 0.5505018830299377, - "eval_runtime": 80.8409, - "eval_samples_per_second": 37.11, - "eval_steps_per_second": 9.277, + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, "step": 2250 }, { - "epoch": 1.5333333333333332, - "grad_norm": 0.3721947968006134, - "learning_rate": 0.00010004545454545455, - "loss": 0.5466, + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, "step": 2300 }, { - "epoch": 1.5666666666666667, - "grad_norm": 0.3462945818901062, - "learning_rate": 9.777272727272728e-05, - "loss": 0.5209, + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, "step": 2350 }, { - "epoch": 1.6, - "grad_norm": 0.4027090072631836, - "learning_rate": 9.55e-05, - "loss": 0.5307, + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, "step": 2400 }, { - "epoch": 1.6333333333333333, - "grad_norm": 0.3684265613555908, - "learning_rate": 9.322727272727273e-05, - "loss": 0.5118, + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, "step": 2450 }, { - "epoch": 1.6666666666666665, - "grad_norm": 0.4819887578487396, - "learning_rate": 9.095454545454546e-05, - "loss": 0.561, + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, "step": 2500 }, { - "epoch": 1.6666666666666665, - "eval_loss": 0.5476261377334595, - "eval_runtime": 80.8288, - "eval_samples_per_second": 37.115, - "eval_steps_per_second": 9.279, + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, "step": 2500 }, { - "epoch": 1.7, - "grad_norm": 0.3161783218383789, - "learning_rate": 8.86818181818182e-05, - "loss": 0.5413, + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, "step": 2550 }, { - "epoch": 1.7333333333333334, - "grad_norm": 0.34697386622428894, - "learning_rate": 8.640909090909092e-05, - "loss": 0.5366, + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, "step": 2600 }, { - "epoch": 1.7666666666666666, - "grad_norm": 0.4084527790546417, - "learning_rate": 8.413636363636364e-05, - "loss": 0.5426, + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, "step": 2650 }, { - "epoch": 1.8, - "grad_norm": 0.4053308963775635, - "learning_rate": 8.186363636363636e-05, - "loss": 0.532, + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, "step": 2700 }, { - "epoch": 1.8333333333333335, - "grad_norm": 0.3551884591579437, - "learning_rate": 7.95909090909091e-05, - "loss": 0.5399, + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, "step": 2750 }, { - "epoch": 1.8333333333333335, - "eval_loss": 0.546008288860321, - "eval_runtime": 80.8186, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, "step": 2750 }, { - "epoch": 1.8666666666666667, - "grad_norm": 0.40072572231292725, - "learning_rate": 7.731818181818183e-05, - "loss": 0.5332, + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, "step": 2800 }, { - "epoch": 1.9, - "grad_norm": 0.3773200213909149, - "learning_rate": 7.504545454545455e-05, - "loss": 0.5296, + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, "step": 2850 }, { - "epoch": 1.9333333333333333, - "grad_norm": 0.45379436016082764, - "learning_rate": 7.277272727272728e-05, - "loss": 0.5356, + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, "step": 2900 }, { - "epoch": 1.9666666666666668, - "grad_norm": 0.36246028542518616, - "learning_rate": 7.05e-05, - "loss": 0.5112, + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, "step": 2950 }, { - "epoch": 2.0, - "grad_norm": 0.40895622968673706, - "learning_rate": 6.822727272727273e-05, - "loss": 0.5358, + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, "step": 3000 }, { - "epoch": 2.0, - "eval_loss": 0.5436099171638489, - "eval_runtime": 80.8207, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, "step": 3000 }, { - "epoch": 2.033333333333333, - "grad_norm": 0.4935952425003052, - "learning_rate": 6.595454545454546e-05, - "loss": 0.5074, + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, "step": 3050 }, { - "epoch": 2.066666666666667, - "grad_norm": 0.505511999130249, - "learning_rate": 6.368181818181818e-05, - "loss": 0.4716, + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, "step": 3100 }, { - "epoch": 2.1, - "grad_norm": 0.47748756408691406, - "learning_rate": 6.140909090909092e-05, - "loss": 0.4909, + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, "step": 3150 }, { - "epoch": 2.1333333333333333, - "grad_norm": 0.3205774426460266, - "learning_rate": 5.913636363636363e-05, - "loss": 0.5009, + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, "step": 3200 }, { - "epoch": 2.1666666666666665, - "grad_norm": 0.437486469745636, - "learning_rate": 5.686363636363636e-05, - "loss": 0.5224, + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, "step": 3250 }, { - "epoch": 2.1666666666666665, - "eval_loss": 0.5484762787818909, - "eval_runtime": 80.8314, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.279, + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, "step": 3250 }, { - "epoch": 2.2, - "grad_norm": 0.49795669317245483, - "learning_rate": 5.4590909090909096e-05, - "loss": 0.516, + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, "step": 3300 }, { - "epoch": 2.2333333333333334, - "grad_norm": 0.40953299403190613, - "learning_rate": 5.2318181818181824e-05, - "loss": 0.5025, + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, "step": 3350 }, { - "epoch": 2.2666666666666666, - "grad_norm": 0.5090060830116272, - "learning_rate": 5.004545454545455e-05, - "loss": 0.5064, + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, "step": 3400 }, { - "epoch": 2.3, - "grad_norm": 0.4385254979133606, - "learning_rate": 4.777272727272727e-05, - "loss": 0.497, + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, "step": 3450 }, { - "epoch": 2.3333333333333335, - "grad_norm": 0.4746367037296295, - "learning_rate": 4.55e-05, - "loss": 0.4696, + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, "step": 3500 }, { - "epoch": 2.3333333333333335, - "eval_loss": 0.5463398098945618, - "eval_runtime": 80.8383, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, "step": 3500 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -629,7 +629,7 @@ "attributes": {} } }, - "total_flos": 4.2819969754988544e+17, + "total_flos": 2.2030969497550848e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-3500/training_args.bin +++ b/checkpoint-3500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-4000/adapter_config.json +++ b/checkpoint-4000/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors index 3e4fff0f96a7719276735388fb024227ddb741bd..ce88cf3f181c943f206a022582e25a81f93931df 100644 --- a/checkpoint-4000/adapter_model.safetensors +++ b/checkpoint-4000/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:118a950f4ea975de60cc994e2c611b155a822d34f66663ef6d1f6e807eefaa5f +oid sha256:85b005b4e83f32c2196d8624bed8bb92b31607881be607237b6307b299603a5b size 54560368 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt index 34e0a24a157d6f27ba859a66a8fdc09991b37c6d..16ee80d014d43274e5b254d144c853d4576288f1 100644 --- a/checkpoint-4000/optimizer.pt +++ b/checkpoint-4000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b25bf4840d6e2d00e714ba870a13fc61a2f1bc0c4f9c9264556e86261cc207d5 +oid sha256:5cbae4d511bab39f44f667a9680b8673a737545b6a1fd1c80ec706b83c4dea15 size 109267450 diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth index 730743e8fd13f0cdfe86d921114a5d5b2105a888..ca6e79319903ccf7c8ee2226d569af1a0e36d016 100644 --- a/checkpoint-4000/rng_state.pth +++ b/checkpoint-4000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49df9bbd0eddb1df6cfbfa711a3094a1945c939c98f67892bac66892f54aaeba +oid sha256:d4052f738f0b9ef3f5dced51852ac3e4834750402c534ec34f42615fe36de0d7 size 14244 diff --git a/checkpoint-4000/scaler.pt b/checkpoint-4000/scaler.pt index 8e3c148798a27b525e5967dae1f764aa32ba855f..223555a15906004ce40411a5b427362211d876d5 100644 --- a/checkpoint-4000/scaler.pt +++ b/checkpoint-4000/scaler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59a4acfb0da74c479080613978839dd3cbb4608fd2c07e764b4c844401d8dd5f +oid sha256:5c92e97adf84139b7b4dad7ef87ef1261295e4cca197d068b1e45f00fd9f0717 size 988 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt index 23fa6ec8c8857315ecef971c9df074cc3faba44d..a4409163622b67c12dc55b759843038808fd69a4 100644 --- a/checkpoint-4000/scheduler.pt +++ b/checkpoint-4000/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:246a2efa7a9bd7ba9c405294fe88f57d7fd57e773d83df5b12cd0ce1b2dd97dc +oid sha256:2ae32756094790b8fbe189a6403d753b5edf3b6b69c2acdb4dbe0bd5a08425a7 size 1064 diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json index b8484309b43fbfd74d3e82f7ae4e9116da78041a..e50a1a71f2e6007485a834f003bb8dc2cb2277e0 100644 --- a/checkpoint-4000/trainer_state.json +++ b/checkpoint-4000/trainer_state.json @@ -1,8 +1,8 @@ { - "best_global_step": 3000, - "best_metric": 0.5436099171638489, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-3000", - "epoch": 2.6666666666666665, + "best_global_step": 4000, + "best_metric": 0.6616591215133667, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-4000", + "epoch": 1.6572050139852896, "eval_steps": 250, "global_step": 4000, "is_hyper_param_search": false, @@ -10,696 +10,696 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 }, { - "epoch": 1.0333333333333334, - "grad_norm": 0.3405410051345825, - "learning_rate": 0.00013413636363636365, - "loss": 0.5571, + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, "step": 1550 }, { - "epoch": 1.0666666666666667, - "grad_norm": 0.4485073983669281, - "learning_rate": 0.00013186363636363637, - "loss": 0.5674, + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, "step": 1600 }, { - "epoch": 1.1, - "grad_norm": 0.34938374161720276, - "learning_rate": 0.0001295909090909091, - "loss": 0.5354, + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, "step": 1650 }, { - "epoch": 1.1333333333333333, - "grad_norm": 0.33084195852279663, - "learning_rate": 0.00012731818181818184, - "loss": 0.5765, + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, "step": 1700 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.3667336404323578, - "learning_rate": 0.00012504545454545456, - "loss": 0.5486, + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, "step": 1750 }, { - "epoch": 1.1666666666666667, - "eval_loss": 0.5557209253311157, - "eval_runtime": 80.8386, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, "step": 1750 }, { - "epoch": 1.2, - "grad_norm": 0.33248019218444824, - "learning_rate": 0.00012277272727272728, - "loss": 0.5617, + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, "step": 1800 }, { - "epoch": 1.2333333333333334, - "grad_norm": 0.4447474479675293, - "learning_rate": 0.00012050000000000002, - "loss": 0.567, + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, "step": 1850 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.42134660482406616, - "learning_rate": 0.00011822727272727274, - "loss": 0.5319, + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, "step": 1900 }, { - "epoch": 1.3, - "grad_norm": 0.3942984640598297, - "learning_rate": 0.00011595454545454544, - "loss": 0.5325, + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, "step": 1950 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.4929428696632385, - "learning_rate": 0.00011368181818181818, - "loss": 0.5565, + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, "step": 2000 }, { - "epoch": 1.3333333333333333, - "eval_loss": 0.5535863637924194, - "eval_runtime": 80.8279, - "eval_samples_per_second": 37.116, - "eval_steps_per_second": 9.279, + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, "step": 2000 }, { - "epoch": 1.3666666666666667, - "grad_norm": 0.4141586720943451, - "learning_rate": 0.00011140909090909091, - "loss": 0.5801, + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, "step": 2050 }, { - "epoch": 1.4, - "grad_norm": 0.45937269926071167, - "learning_rate": 0.00010913636363636364, - "loss": 0.5439, + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, "step": 2100 }, { - "epoch": 1.4333333333333333, - "grad_norm": 0.47830042243003845, - "learning_rate": 0.00010686363636363637, - "loss": 0.547, + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, "step": 2150 }, { - "epoch": 1.4666666666666668, - "grad_norm": 0.40260276198387146, - "learning_rate": 0.00010459090909090909, - "loss": 0.5229, + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, "step": 2200 }, { - "epoch": 1.5, - "grad_norm": 0.5281402468681335, - "learning_rate": 0.00010231818181818183, - "loss": 0.5475, + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, "step": 2250 }, { - "epoch": 1.5, - "eval_loss": 0.5505018830299377, - "eval_runtime": 80.8409, - "eval_samples_per_second": 37.11, - "eval_steps_per_second": 9.277, + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, "step": 2250 }, { - "epoch": 1.5333333333333332, - "grad_norm": 0.3721947968006134, - "learning_rate": 0.00010004545454545455, - "loss": 0.5466, + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, "step": 2300 }, { - "epoch": 1.5666666666666667, - "grad_norm": 0.3462945818901062, - "learning_rate": 9.777272727272728e-05, - "loss": 0.5209, + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, "step": 2350 }, { - "epoch": 1.6, - "grad_norm": 0.4027090072631836, - "learning_rate": 9.55e-05, - "loss": 0.5307, + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, "step": 2400 }, { - "epoch": 1.6333333333333333, - "grad_norm": 0.3684265613555908, - "learning_rate": 9.322727272727273e-05, - "loss": 0.5118, + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, "step": 2450 }, { - "epoch": 1.6666666666666665, - "grad_norm": 0.4819887578487396, - "learning_rate": 9.095454545454546e-05, - "loss": 0.561, + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, "step": 2500 }, { - "epoch": 1.6666666666666665, - "eval_loss": 0.5476261377334595, - "eval_runtime": 80.8288, - "eval_samples_per_second": 37.115, - "eval_steps_per_second": 9.279, + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, "step": 2500 }, { - "epoch": 1.7, - "grad_norm": 0.3161783218383789, - "learning_rate": 8.86818181818182e-05, - "loss": 0.5413, + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, "step": 2550 }, { - "epoch": 1.7333333333333334, - "grad_norm": 0.34697386622428894, - "learning_rate": 8.640909090909092e-05, - "loss": 0.5366, + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, "step": 2600 }, { - "epoch": 1.7666666666666666, - "grad_norm": 0.4084527790546417, - "learning_rate": 8.413636363636364e-05, - "loss": 0.5426, + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, "step": 2650 }, { - "epoch": 1.8, - "grad_norm": 0.4053308963775635, - "learning_rate": 8.186363636363636e-05, - "loss": 0.532, + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, "step": 2700 }, { - "epoch": 1.8333333333333335, - "grad_norm": 0.3551884591579437, - "learning_rate": 7.95909090909091e-05, - "loss": 0.5399, + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, "step": 2750 }, { - "epoch": 1.8333333333333335, - "eval_loss": 0.546008288860321, - "eval_runtime": 80.8186, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, "step": 2750 }, { - "epoch": 1.8666666666666667, - "grad_norm": 0.40072572231292725, - "learning_rate": 7.731818181818183e-05, - "loss": 0.5332, + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, "step": 2800 }, { - "epoch": 1.9, - "grad_norm": 0.3773200213909149, - "learning_rate": 7.504545454545455e-05, - "loss": 0.5296, + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, "step": 2850 }, { - "epoch": 1.9333333333333333, - "grad_norm": 0.45379436016082764, - "learning_rate": 7.277272727272728e-05, - "loss": 0.5356, + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, "step": 2900 }, { - "epoch": 1.9666666666666668, - "grad_norm": 0.36246028542518616, - "learning_rate": 7.05e-05, - "loss": 0.5112, + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, "step": 2950 }, { - "epoch": 2.0, - "grad_norm": 0.40895622968673706, - "learning_rate": 6.822727272727273e-05, - "loss": 0.5358, + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, "step": 3000 }, { - "epoch": 2.0, - "eval_loss": 0.5436099171638489, - "eval_runtime": 80.8207, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, "step": 3000 }, { - "epoch": 2.033333333333333, - "grad_norm": 0.4935952425003052, - "learning_rate": 6.595454545454546e-05, - "loss": 0.5074, + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, "step": 3050 }, { - "epoch": 2.066666666666667, - "grad_norm": 0.505511999130249, - "learning_rate": 6.368181818181818e-05, - "loss": 0.4716, + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, "step": 3100 }, { - "epoch": 2.1, - "grad_norm": 0.47748756408691406, - "learning_rate": 6.140909090909092e-05, - "loss": 0.4909, + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, "step": 3150 }, { - "epoch": 2.1333333333333333, - "grad_norm": 0.3205774426460266, - "learning_rate": 5.913636363636363e-05, - "loss": 0.5009, + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, "step": 3200 }, { - "epoch": 2.1666666666666665, - "grad_norm": 0.437486469745636, - "learning_rate": 5.686363636363636e-05, - "loss": 0.5224, + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, "step": 3250 }, { - "epoch": 2.1666666666666665, - "eval_loss": 0.5484762787818909, - "eval_runtime": 80.8314, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.279, + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, "step": 3250 }, { - "epoch": 2.2, - "grad_norm": 0.49795669317245483, - "learning_rate": 5.4590909090909096e-05, - "loss": 0.516, + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, "step": 3300 }, { - "epoch": 2.2333333333333334, - "grad_norm": 0.40953299403190613, - "learning_rate": 5.2318181818181824e-05, - "loss": 0.5025, + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, "step": 3350 }, { - "epoch": 2.2666666666666666, - "grad_norm": 0.5090060830116272, - "learning_rate": 5.004545454545455e-05, - "loss": 0.5064, + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, "step": 3400 }, { - "epoch": 2.3, - "grad_norm": 0.4385254979133606, - "learning_rate": 4.777272727272727e-05, - "loss": 0.497, + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, "step": 3450 }, { - "epoch": 2.3333333333333335, - "grad_norm": 0.4746367037296295, - "learning_rate": 4.55e-05, - "loss": 0.4696, + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, "step": 3500 }, { - "epoch": 2.3333333333333335, - "eval_loss": 0.5463398098945618, - "eval_runtime": 80.8383, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, "step": 3500 }, { - "epoch": 2.3666666666666667, - "grad_norm": 0.5545032620429993, - "learning_rate": 4.322727272727273e-05, - "loss": 0.5024, + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, "step": 3550 }, { - "epoch": 2.4, - "grad_norm": 0.5639179944992065, - "learning_rate": 4.095454545454546e-05, - "loss": 0.478, + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, "step": 3600 }, { - "epoch": 2.4333333333333336, - "grad_norm": 0.418476402759552, - "learning_rate": 3.8681818181818186e-05, - "loss": 0.5017, + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, "step": 3650 }, { - "epoch": 2.466666666666667, - "grad_norm": 0.4330589771270752, - "learning_rate": 3.640909090909091e-05, - "loss": 0.5075, + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, "step": 3700 }, { - "epoch": 2.5, - "grad_norm": 0.5324570536613464, - "learning_rate": 3.413636363636364e-05, - "loss": 0.4757, + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, "step": 3750 }, { - "epoch": 2.5, - "eval_loss": 0.5462443232536316, - "eval_runtime": 80.8586, - "eval_samples_per_second": 37.102, - "eval_steps_per_second": 9.275, + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, "step": 3750 }, { - "epoch": 2.533333333333333, - "grad_norm": 0.48812639713287354, - "learning_rate": 3.186363636363637e-05, - "loss": 0.4991, + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, "step": 3800 }, { - "epoch": 2.5666666666666664, - "grad_norm": 0.5571278929710388, - "learning_rate": 2.959090909090909e-05, - "loss": 0.4837, + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, "step": 3850 }, { - "epoch": 2.6, - "grad_norm": 0.5383552312850952, - "learning_rate": 2.731818181818182e-05, - "loss": 0.4875, + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, "step": 3900 }, { - "epoch": 2.6333333333333333, - "grad_norm": 0.4364107549190521, - "learning_rate": 2.5045454545454544e-05, - "loss": 0.4762, + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, "step": 3950 }, { - "epoch": 2.6666666666666665, - "grad_norm": 0.4508945345878601, - "learning_rate": 2.2772727272727272e-05, - "loss": 0.5157, + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, "step": 4000 }, { - "epoch": 2.6666666666666665, - "eval_loss": 0.5454675555229187, - "eval_runtime": 80.8125, - "eval_samples_per_second": 37.123, - "eval_steps_per_second": 9.281, + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, "step": 4000 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -715,7 +715,7 @@ "attributes": {} } }, - "total_flos": 4.8894626300952576e+17, + "total_flos": 2.5174634214014976e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-4000/training_args.bin +++ b/checkpoint-4000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-4500/adapter_config.json +++ b/checkpoint-4500/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors index 94a0174112f35bc94b0e79ffbc8a7e2d2db64327..320f125b4953dbf6b195c7cc9d947098f31cdbdf 100644 --- a/checkpoint-4500/adapter_model.safetensors +++ b/checkpoint-4500/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb11bc3b71d97ecb5757025adcf619a2d1aab11a0a14fd32a70f3e3bb4869fe3 +oid sha256:cdcbfbfdf51cedc49052e8cf3aba4d253bfd0057682c92f1b2f6a1085207ad05 size 54560368 diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt index d7b14403cb486b3a161aa3fe76a4925ebd8f94bb..692091e9bd222d84d933c7a3768de9839559520d 100644 --- a/checkpoint-4500/optimizer.pt +++ b/checkpoint-4500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1114bc964f71b64d521ceb3f0953435182748caeb119b8ad746e7002dd8d6616 +oid sha256:81218a18899b0f5f31c32da8cb6bbb70580a0a3d59aece1e36dc30f04458b9de size 109267450 diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth index 93e17ef6a173434e9d21ed055a6daeb8d2088d22..b328c74d133b27e77c9a2742073ab3ba08066bbe 100644 --- a/checkpoint-4500/rng_state.pth +++ b/checkpoint-4500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d78276a3f58b87969f8c2b253f6aebf8881bbfa0bd2bf187f536df9936165c8 +oid sha256:e7cca4f6dc017e30f7e4f98a6b7926c18723944baf002784114c27e375afaa04 size 14244 diff --git a/checkpoint-4500/scaler.pt b/checkpoint-4500/scaler.pt index 0eb1718239e9101b6a6e609988d0ab4c61c50ef6..4d9d0ce4c85e3cc83ba46932d43cc9cf769d3b12 100644 --- a/checkpoint-4500/scaler.pt +++ b/checkpoint-4500/scaler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25aed798df1ea1f11d3cdd0a7de18dd9b632beaa14981af70f644c899539b4cc +oid sha256:5f6bb5818ced3d826d0f4db8b88459585e3b324b0f36149564b5093c71d65045 size 988 diff --git a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt index 280cc434d66cca3ec180f55ec8ddd1acd69469d5..de1c1138fab058732599edc8f6c01f61141e4733 100644 --- a/checkpoint-4500/scheduler.pt +++ b/checkpoint-4500/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8670c5c048d65fccc45f56703c267d59b5ec466e8f5777cf6ff01b0a32855ba +oid sha256:ef6e72ebe78e977145a028ad9426d261c3206a69efaed38a934a910632c5da0c size 1064 diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json index 672265ceda80b4df47b1b31e4085cc7dca6babfa..39afd1761835f3a702a821b12374d07b71a917d0 100644 --- a/checkpoint-4500/trainer_state.json +++ b/checkpoint-4500/trainer_state.json @@ -1,8 +1,8 @@ { - "best_global_step": 3000, - "best_metric": 0.5436099171638489, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-3000", - "epoch": 3.0, + "best_global_step": 4500, + "best_metric": 0.6582211852073669, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-4500", + "epoch": 1.864394488759971, "eval_steps": 250, "global_step": 4500, "is_hyper_param_search": false, @@ -10,782 +10,782 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 }, { - "epoch": 0.36666666666666664, - "grad_norm": 0.35211533308029175, - "learning_rate": 0.0001795909090909091, - "loss": 0.6128, + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, "step": 550 }, { - "epoch": 0.4, - "grad_norm": 0.36327463388442993, - "learning_rate": 0.00017731818181818183, - "loss": 0.5915, + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, "step": 600 }, { - "epoch": 0.43333333333333335, - "grad_norm": 0.40672942996025085, - "learning_rate": 0.00017504545454545455, - "loss": 0.5807, + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, "step": 650 }, { - "epoch": 0.4666666666666667, - "grad_norm": 0.4689007103443146, - "learning_rate": 0.00017277272727272728, - "loss": 0.602, + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, "step": 700 }, { - "epoch": 0.5, - "grad_norm": 0.3979697823524475, - "learning_rate": 0.00017050000000000002, - "loss": 0.5703, + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, "step": 750 }, { - "epoch": 0.5, - "eval_loss": 0.5740106701850891, - "eval_runtime": 80.8209, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, "step": 750 }, { - "epoch": 0.5333333333333333, - "grad_norm": 0.3071135878562927, - "learning_rate": 0.00016822727272727275, - "loss": 0.5746, + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, "step": 800 }, { - "epoch": 0.5666666666666667, - "grad_norm": 0.318085253238678, - "learning_rate": 0.00016595454545454544, - "loss": 0.5873, + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, "step": 850 }, { - "epoch": 0.6, - "grad_norm": 0.35915374755859375, - "learning_rate": 0.0001636818181818182, - "loss": 0.6283, + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, "step": 900 }, { - "epoch": 0.6333333333333333, - "grad_norm": 0.3174057602882385, - "learning_rate": 0.0001614090909090909, - "loss": 0.5912, + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, "step": 950 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.416111022233963, - "learning_rate": 0.00015913636363636363, - "loss": 0.5647, + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, "step": 1000 }, { - "epoch": 0.6666666666666666, - "eval_loss": 0.5663638710975647, - "eval_runtime": 80.8183, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, "step": 1000 }, { - "epoch": 0.7, - "grad_norm": 0.41202324628829956, - "learning_rate": 0.00015686363636363638, - "loss": 0.6118, + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, "step": 1050 }, { - "epoch": 0.7333333333333333, - "grad_norm": 0.3883333206176758, - "learning_rate": 0.0001545909090909091, - "loss": 0.5392, + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, "step": 1100 }, { - "epoch": 0.7666666666666667, - "grad_norm": 0.31973451375961304, - "learning_rate": 0.00015231818181818182, - "loss": 0.5602, + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, "step": 1150 }, { - "epoch": 0.8, - "grad_norm": 0.31378698348999023, - "learning_rate": 0.00015004545454545454, - "loss": 0.5642, + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, "step": 1200 }, { - "epoch": 0.8333333333333334, - "grad_norm": 0.3346308171749115, - "learning_rate": 0.0001477727272727273, - "loss": 0.5925, + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, "step": 1250 }, { - "epoch": 0.8333333333333334, - "eval_loss": 0.5619704723358154, - "eval_runtime": 80.824, - "eval_samples_per_second": 37.118, - "eval_steps_per_second": 9.279, + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, "step": 1250 }, { - "epoch": 0.8666666666666667, - "grad_norm": 0.5573959946632385, - "learning_rate": 0.0001455, - "loss": 0.5829, + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, "step": 1300 }, { - "epoch": 0.9, - "grad_norm": 0.36054643988609314, - "learning_rate": 0.00014322727272727273, - "loss": 0.5923, + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, "step": 1350 }, { - "epoch": 0.9333333333333333, - "grad_norm": 0.36059027910232544, - "learning_rate": 0.00014095454545454546, - "loss": 0.5808, + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, "step": 1400 }, { - "epoch": 0.9666666666666667, - "grad_norm": 0.3942534327507019, - "learning_rate": 0.00013868181818181818, - "loss": 0.5597, + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, "step": 1450 }, { - "epoch": 1.0, - "grad_norm": 0.3995835483074188, - "learning_rate": 0.0001364090909090909, - "loss": 0.5554, + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, "step": 1500 }, { - "epoch": 1.0, - "eval_loss": 0.5581239461898804, - "eval_runtime": 80.8326, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.278, + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, "step": 1500 }, { - "epoch": 1.0333333333333334, - "grad_norm": 0.3405410051345825, - "learning_rate": 0.00013413636363636365, - "loss": 0.5571, + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, "step": 1550 }, { - "epoch": 1.0666666666666667, - "grad_norm": 0.4485073983669281, - "learning_rate": 0.00013186363636363637, - "loss": 0.5674, + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, "step": 1600 }, { - "epoch": 1.1, - "grad_norm": 0.34938374161720276, - "learning_rate": 0.0001295909090909091, - "loss": 0.5354, + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, "step": 1650 }, { - "epoch": 1.1333333333333333, - "grad_norm": 0.33084195852279663, - "learning_rate": 0.00012731818181818184, - "loss": 0.5765, + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, "step": 1700 }, { - "epoch": 1.1666666666666667, - "grad_norm": 0.3667336404323578, - "learning_rate": 0.00012504545454545456, - "loss": 0.5486, + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, "step": 1750 }, { - "epoch": 1.1666666666666667, - "eval_loss": 0.5557209253311157, - "eval_runtime": 80.8386, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, "step": 1750 }, { - "epoch": 1.2, - "grad_norm": 0.33248019218444824, - "learning_rate": 0.00012277272727272728, - "loss": 0.5617, + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, "step": 1800 }, { - "epoch": 1.2333333333333334, - "grad_norm": 0.4447474479675293, - "learning_rate": 0.00012050000000000002, - "loss": 0.567, + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, "step": 1850 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.42134660482406616, - "learning_rate": 0.00011822727272727274, - "loss": 0.5319, + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, "step": 1900 }, { - "epoch": 1.3, - "grad_norm": 0.3942984640598297, - "learning_rate": 0.00011595454545454544, - "loss": 0.5325, + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, "step": 1950 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.4929428696632385, - "learning_rate": 0.00011368181818181818, - "loss": 0.5565, + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, "step": 2000 }, { - "epoch": 1.3333333333333333, - "eval_loss": 0.5535863637924194, - "eval_runtime": 80.8279, - "eval_samples_per_second": 37.116, - "eval_steps_per_second": 9.279, + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, "step": 2000 }, { - "epoch": 1.3666666666666667, - "grad_norm": 0.4141586720943451, - "learning_rate": 0.00011140909090909091, - "loss": 0.5801, + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, "step": 2050 }, { - "epoch": 1.4, - "grad_norm": 0.45937269926071167, - "learning_rate": 0.00010913636363636364, - "loss": 0.5439, + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, "step": 2100 }, { - "epoch": 1.4333333333333333, - "grad_norm": 0.47830042243003845, - "learning_rate": 0.00010686363636363637, - "loss": 0.547, + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, "step": 2150 }, { - "epoch": 1.4666666666666668, - "grad_norm": 0.40260276198387146, - "learning_rate": 0.00010459090909090909, - "loss": 0.5229, + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, "step": 2200 }, { - "epoch": 1.5, - "grad_norm": 0.5281402468681335, - "learning_rate": 0.00010231818181818183, - "loss": 0.5475, + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, "step": 2250 }, { - "epoch": 1.5, - "eval_loss": 0.5505018830299377, - "eval_runtime": 80.8409, - "eval_samples_per_second": 37.11, - "eval_steps_per_second": 9.277, + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, "step": 2250 }, { - "epoch": 1.5333333333333332, - "grad_norm": 0.3721947968006134, - "learning_rate": 0.00010004545454545455, - "loss": 0.5466, + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, "step": 2300 }, { - "epoch": 1.5666666666666667, - "grad_norm": 0.3462945818901062, - "learning_rate": 9.777272727272728e-05, - "loss": 0.5209, + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, "step": 2350 }, { - "epoch": 1.6, - "grad_norm": 0.4027090072631836, - "learning_rate": 9.55e-05, - "loss": 0.5307, + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, "step": 2400 }, { - "epoch": 1.6333333333333333, - "grad_norm": 0.3684265613555908, - "learning_rate": 9.322727272727273e-05, - "loss": 0.5118, + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, "step": 2450 }, { - "epoch": 1.6666666666666665, - "grad_norm": 0.4819887578487396, - "learning_rate": 9.095454545454546e-05, - "loss": 0.561, + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, "step": 2500 }, { - "epoch": 1.6666666666666665, - "eval_loss": 0.5476261377334595, - "eval_runtime": 80.8288, - "eval_samples_per_second": 37.115, - "eval_steps_per_second": 9.279, + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, "step": 2500 }, { - "epoch": 1.7, - "grad_norm": 0.3161783218383789, - "learning_rate": 8.86818181818182e-05, - "loss": 0.5413, + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, "step": 2550 }, { - "epoch": 1.7333333333333334, - "grad_norm": 0.34697386622428894, - "learning_rate": 8.640909090909092e-05, - "loss": 0.5366, + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, "step": 2600 }, { - "epoch": 1.7666666666666666, - "grad_norm": 0.4084527790546417, - "learning_rate": 8.413636363636364e-05, - "loss": 0.5426, + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, "step": 2650 }, { - "epoch": 1.8, - "grad_norm": 0.4053308963775635, - "learning_rate": 8.186363636363636e-05, - "loss": 0.532, + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, "step": 2700 }, { - "epoch": 1.8333333333333335, - "grad_norm": 0.3551884591579437, - "learning_rate": 7.95909090909091e-05, - "loss": 0.5399, + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, "step": 2750 }, { - "epoch": 1.8333333333333335, - "eval_loss": 0.546008288860321, - "eval_runtime": 80.8186, - "eval_samples_per_second": 37.12, - "eval_steps_per_second": 9.28, + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, "step": 2750 }, { - "epoch": 1.8666666666666667, - "grad_norm": 0.40072572231292725, - "learning_rate": 7.731818181818183e-05, - "loss": 0.5332, + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, "step": 2800 }, { - "epoch": 1.9, - "grad_norm": 0.3773200213909149, - "learning_rate": 7.504545454545455e-05, - "loss": 0.5296, + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, "step": 2850 }, { - "epoch": 1.9333333333333333, - "grad_norm": 0.45379436016082764, - "learning_rate": 7.277272727272728e-05, - "loss": 0.5356, + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, "step": 2900 }, { - "epoch": 1.9666666666666668, - "grad_norm": 0.36246028542518616, - "learning_rate": 7.05e-05, - "loss": 0.5112, + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, "step": 2950 }, { - "epoch": 2.0, - "grad_norm": 0.40895622968673706, - "learning_rate": 6.822727272727273e-05, - "loss": 0.5358, + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, "step": 3000 }, { - "epoch": 2.0, - "eval_loss": 0.5436099171638489, - "eval_runtime": 80.8207, - "eval_samples_per_second": 37.119, - "eval_steps_per_second": 9.28, + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, "step": 3000 }, { - "epoch": 2.033333333333333, - "grad_norm": 0.4935952425003052, - "learning_rate": 6.595454545454546e-05, - "loss": 0.5074, + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, "step": 3050 }, { - "epoch": 2.066666666666667, - "grad_norm": 0.505511999130249, - "learning_rate": 6.368181818181818e-05, - "loss": 0.4716, + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, "step": 3100 }, { - "epoch": 2.1, - "grad_norm": 0.47748756408691406, - "learning_rate": 6.140909090909092e-05, - "loss": 0.4909, + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, "step": 3150 }, { - "epoch": 2.1333333333333333, - "grad_norm": 0.3205774426460266, - "learning_rate": 5.913636363636363e-05, - "loss": 0.5009, + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, "step": 3200 }, { - "epoch": 2.1666666666666665, - "grad_norm": 0.437486469745636, - "learning_rate": 5.686363636363636e-05, - "loss": 0.5224, + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, "step": 3250 }, { - "epoch": 2.1666666666666665, - "eval_loss": 0.5484762787818909, - "eval_runtime": 80.8314, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.279, + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, "step": 3250 }, { - "epoch": 2.2, - "grad_norm": 0.49795669317245483, - "learning_rate": 5.4590909090909096e-05, - "loss": 0.516, + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, "step": 3300 }, { - "epoch": 2.2333333333333334, - "grad_norm": 0.40953299403190613, - "learning_rate": 5.2318181818181824e-05, - "loss": 0.5025, + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, "step": 3350 }, { - "epoch": 2.2666666666666666, - "grad_norm": 0.5090060830116272, - "learning_rate": 5.004545454545455e-05, - "loss": 0.5064, + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, "step": 3400 }, { - "epoch": 2.3, - "grad_norm": 0.4385254979133606, - "learning_rate": 4.777272727272727e-05, - "loss": 0.497, + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, "step": 3450 }, { - "epoch": 2.3333333333333335, - "grad_norm": 0.4746367037296295, - "learning_rate": 4.55e-05, - "loss": 0.4696, + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, "step": 3500 }, { - "epoch": 2.3333333333333335, - "eval_loss": 0.5463398098945618, - "eval_runtime": 80.8383, - "eval_samples_per_second": 37.111, - "eval_steps_per_second": 9.278, + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, "step": 3500 }, { - "epoch": 2.3666666666666667, - "grad_norm": 0.5545032620429993, - "learning_rate": 4.322727272727273e-05, - "loss": 0.5024, + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, "step": 3550 }, { - "epoch": 2.4, - "grad_norm": 0.5639179944992065, - "learning_rate": 4.095454545454546e-05, - "loss": 0.478, + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, "step": 3600 }, { - "epoch": 2.4333333333333336, - "grad_norm": 0.418476402759552, - "learning_rate": 3.8681818181818186e-05, - "loss": 0.5017, + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, "step": 3650 }, { - "epoch": 2.466666666666667, - "grad_norm": 0.4330589771270752, - "learning_rate": 3.640909090909091e-05, - "loss": 0.5075, + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, "step": 3700 }, { - "epoch": 2.5, - "grad_norm": 0.5324570536613464, - "learning_rate": 3.413636363636364e-05, - "loss": 0.4757, + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, "step": 3750 }, { - "epoch": 2.5, - "eval_loss": 0.5462443232536316, - "eval_runtime": 80.8586, - "eval_samples_per_second": 37.102, - "eval_steps_per_second": 9.275, + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, "step": 3750 }, { - "epoch": 2.533333333333333, - "grad_norm": 0.48812639713287354, - "learning_rate": 3.186363636363637e-05, - "loss": 0.4991, + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, "step": 3800 }, { - "epoch": 2.5666666666666664, - "grad_norm": 0.5571278929710388, - "learning_rate": 2.959090909090909e-05, - "loss": 0.4837, + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, "step": 3850 }, { - "epoch": 2.6, - "grad_norm": 0.5383552312850952, - "learning_rate": 2.731818181818182e-05, - "loss": 0.4875, + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, "step": 3900 }, { - "epoch": 2.6333333333333333, - "grad_norm": 0.4364107549190521, - "learning_rate": 2.5045454545454544e-05, - "loss": 0.4762, + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, "step": 3950 }, { - "epoch": 2.6666666666666665, - "grad_norm": 0.4508945345878601, - "learning_rate": 2.2772727272727272e-05, - "loss": 0.5157, + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, "step": 4000 }, { - "epoch": 2.6666666666666665, - "eval_loss": 0.5454675555229187, - "eval_runtime": 80.8125, - "eval_samples_per_second": 37.123, - "eval_steps_per_second": 9.281, + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, "step": 4000 }, { - "epoch": 2.7, - "grad_norm": 0.41898655891418457, - "learning_rate": 2.05e-05, - "loss": 0.5017, + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, "step": 4050 }, { - "epoch": 2.7333333333333334, - "grad_norm": 0.4669305086135864, - "learning_rate": 1.822727272727273e-05, - "loss": 0.4927, + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, "step": 4100 }, { - "epoch": 2.7666666666666666, - "grad_norm": 0.4099997878074646, - "learning_rate": 1.5954545454545456e-05, - "loss": 0.5123, + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, "step": 4150 }, { - "epoch": 2.8, - "grad_norm": 0.4900408685207367, - "learning_rate": 1.3681818181818181e-05, - "loss": 0.5022, + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, "step": 4200 }, { - "epoch": 2.8333333333333335, - "grad_norm": 0.5202891230583191, - "learning_rate": 1.140909090909091e-05, - "loss": 0.5114, + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, "step": 4250 }, { - "epoch": 2.8333333333333335, - "eval_loss": 0.5443527102470398, - "eval_runtime": 80.8318, - "eval_samples_per_second": 37.114, - "eval_steps_per_second": 9.279, + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, "step": 4250 }, { - "epoch": 2.8666666666666667, - "grad_norm": 0.4639386236667633, - "learning_rate": 9.136363636363637e-06, - "loss": 0.4805, + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, "step": 4300 }, { - "epoch": 2.9, - "grad_norm": 0.43289846181869507, - "learning_rate": 6.863636363636364e-06, - "loss": 0.5253, + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, "step": 4350 }, { - "epoch": 2.9333333333333336, - "grad_norm": 0.44327157735824585, - "learning_rate": 4.590909090909091e-06, - "loss": 0.4791, + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, "step": 4400 }, { - "epoch": 2.966666666666667, - "grad_norm": 0.5715689659118652, - "learning_rate": 2.318181818181818e-06, - "loss": 0.4897, + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, "step": 4450 }, { - "epoch": 3.0, - "grad_norm": 0.5023526549339294, - "learning_rate": 4.545454545454546e-08, - "loss": 0.5164, + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, "step": 4500 }, { - "epoch": 3.0, - "eval_loss": 0.5436866283416748, - "eval_runtime": 80.8493, - "eval_samples_per_second": 37.106, - "eval_steps_per_second": 9.277, + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, "step": 4500 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -796,12 +796,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} } }, - "total_flos": 5.5037527277469696e+17, + "total_flos": 2.8285963124146176e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-4500/training_args.bin +++ b/checkpoint-4500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json index 1a5356a6269df402dbde851645d59d282fadd9df..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 100644 --- a/checkpoint-500/adapter_config.json +++ b/checkpoint-500/adapter_config.json @@ -29,9 +29,9 @@ "rank_pattern": {}, "revision": null, "target_modules": [ + "q_proj", "o_proj", "v_proj", - "q_proj", "k_proj" ], "target_parameters": null, diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors index 6029a6c2a08ebd938bcce3ef22b3b5697ac11780..49173edff2a402da0bde50c771b8dcafd7c58822 100644 --- a/checkpoint-500/adapter_model.safetensors +++ b/checkpoint-500/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e362a9b5e47dada1baa67143d34dfeeb4bdf6775103cfc3f1c1a5aae083d3cd9 +oid sha256:d60d099c77ad1078e8eab0f0cba4946f7b285915cea649f819cc4e18cb458cc2 size 54560368 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt index 952788fc9b4e14941cc3a3f981b2ae8aef7cf565..55b25483699ee8d05388cd4dc7e43119e9aacddd 100644 --- a/checkpoint-500/optimizer.pt +++ b/checkpoint-500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e44d148477a8ee335f32da6c4d425e5aff613d691092e91fc2efb0ed9e554895 +oid sha256:27dc733624b9639d83903e829704fddc54f2f4e303ffcd03456c32008db7d7c8 size 109267450 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth index 71ebe830c0358634be52e84d265b4996167b30d0..7c43c60ae1acf50fe00dde9367397f96f28eda31 100644 --- a/checkpoint-500/rng_state.pth +++ b/checkpoint-500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:249ccca09cb53a3446269bd92de98c3a367bca986912540d38d45f3076993afd +oid sha256:831334df13be78faba63d682d6ceb4502ee0aa7e92683802a4df445a165995fe size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt index 843974b1d4cd09fd21c20c2df6459f749da316f3..43f03603ac4f82a48853b2c74327bd4da5df3c7b 100644 --- a/checkpoint-500/scheduler.pt +++ b/checkpoint-500/scheduler.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6694c01e5cba9064b9b75b7125be26a18ddd640754dd3d8e1e34b292a1ee6e42 +oid sha256:51e63a6db5047ce7f3d12777488f08670db355aedd3943265a9bb9235f99a5b9 size 1064 diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json index 236cd4b210abba4ecff7c0ffc8aea0cce3546ca3..ec757632d6712ed77c00b76a2963d556d51f4c9b 100644 --- a/checkpoint-500/trainer_state.json +++ b/checkpoint-500/trainer_state.json @@ -1,8 +1,8 @@ { "best_global_step": 500, - "best_metric": 0.5813060998916626, - "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-500", - "epoch": 0.3333333333333333, + "best_metric": 0.7194066047668457, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-500", + "epoch": 0.20718947477468144, "eval_steps": 250, "global_step": 500, "is_hyper_param_search": false, @@ -10,94 +10,94 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.03333333333333333, - "grad_norm": 0.5346225500106812, + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, "learning_rate": 9.8e-05, - "loss": 2.4955, + "loss": 2.6567, "step": 50 }, { - "epoch": 0.06666666666666667, - "grad_norm": 0.719093918800354, + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, "learning_rate": 0.00019800000000000002, - "loss": 0.71, + "loss": 0.9502, "step": 100 }, { - "epoch": 0.1, - "grad_norm": 0.4840560853481293, - "learning_rate": 0.0001977727272727273, - "loss": 0.6405, + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, "step": 150 }, { - "epoch": 0.13333333333333333, - "grad_norm": 0.3332301676273346, - "learning_rate": 0.0001955, - "loss": 0.6287, + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, "step": 200 }, { - "epoch": 0.16666666666666666, - "grad_norm": 0.40639588236808777, - "learning_rate": 0.00019322727272727276, - "loss": 0.5572, + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, "step": 250 }, { - "epoch": 0.16666666666666666, - "eval_loss": 0.5975945591926575, - "eval_runtime": 80.8004, - "eval_samples_per_second": 37.129, - "eval_steps_per_second": 9.282, + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, "step": 250 }, { - "epoch": 0.2, - "grad_norm": 0.3970712423324585, - "learning_rate": 0.00019095454545454545, - "loss": 0.6165, + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, "step": 300 }, { - "epoch": 0.23333333333333334, - "grad_norm": 0.38409528136253357, - "learning_rate": 0.00018868181818181817, - "loss": 0.639, + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, "step": 350 }, { - "epoch": 0.26666666666666666, - "grad_norm": 0.44628769159317017, - "learning_rate": 0.00018640909090909092, - "loss": 0.636, + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 0.3697021007537842, - "learning_rate": 0.00018413636363636364, - "loss": 0.6192, + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, "step": 450 }, { - "epoch": 0.3333333333333333, - "grad_norm": 0.36338189244270325, - "learning_rate": 0.00018186363636363636, - "loss": 0.6134, + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, "step": 500 }, { - "epoch": 0.3333333333333333, - "eval_loss": 0.5813060998916626, - "eval_runtime": 80.7819, - "eval_samples_per_second": 37.137, - "eval_steps_per_second": 9.284, + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, "step": 500 } ], "logging_steps": 50, - "max_steps": 4500, + "max_steps": 7242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -113,7 +113,7 @@ "attributes": {} } }, - "total_flos": 6.144380628369408e+16, + "total_flos": 3.165715137970176e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/checkpoint-500/training_args.bin +++ b/checkpoint-500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432 diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..575d436fd9e4a7a4d30fbc4644c1fecf7480fbbe --- /dev/null +++ b/checkpoint-5000/README.md @@ -0,0 +1,207 @@ +--- +base_model: meta-llama/Meta-Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 --- /dev/null +++ b/checkpoint-5000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e28fe303a70e46a40243e9997ae99466987f7cc6 --- /dev/null +++ b/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c995f45d0b7294dbd424b0dae33a6c8c37eaebf44e71075f9df6d2abb9546777 +size 54560368 diff --git a/checkpoint-5000/chat_template.jinja b/checkpoint-5000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-5000/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c1fc4074f2e6578c30c6fc461ada376e1d9b705 --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff3e304e469456507d6100de6aeb3759893240424ee8079653eb986d837779a +size 109267450 diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dc0146feadb19494e88eeeb9d88ccfc13aa86f5 --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:419658d43e0c65e8614af529b1cf33db5b44343e45ead24bcf37d00929578c69 +size 14244 diff --git a/checkpoint-5000/scaler.pt b/checkpoint-5000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc6ece6106f202f502cf51dff69921cd151efbbe --- /dev/null +++ b/checkpoint-5000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2abbbbb8bdb63d82d22d9d3fc6ca91f381355e628923913fe5640e59dd11c2 +size 988 diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..21f1722cfc05b150711d5731c22dae6744b680ef --- /dev/null +++ b/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1c8e5641d168b8a05f9d9ec9935b553f5c6d5496b51d91b786a5fb50c04d24a +size 1064 diff --git a/checkpoint-5000/special_tokens_map.json b/checkpoint-5000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-5000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-5000/tokenizer.json b/checkpoint-5000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b0bff70a20a3e87bcd1c207db61ae7a179bbf3 --- /dev/null +++ b/checkpoint-5000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c85066e7642934ed09b44155e6566b0b5dab2637fb9433439ba5c9c7f8b50d3 +size 17210018 diff --git a/checkpoint-5000/tokenizer_config.json b/checkpoint-5000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3beeacc86a6ca3cae14ad3004263ab74a4bac07a --- /dev/null +++ b/checkpoint-5000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f770737cbac014f3c72a5a5749f79f5b3b7dae01 --- /dev/null +++ b/checkpoint-5000/trainer_state.json @@ -0,0 +1,894 @@ +{ + "best_global_step": 4750, + "best_metric": 0.6553655862808228, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-4500", + "epoch": 2.0712731793224903, + "eval_steps": 250, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, + "learning_rate": 9.8e-05, + "loss": 2.6567, + "step": 50 + }, + { + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, + "learning_rate": 0.00019800000000000002, + "loss": 0.9502, + "step": 100 + }, + { + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, + "step": 200 + }, + { + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, + "step": 250 + }, + { + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, + "step": 250 + }, + { + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, + "step": 300 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, + "step": 400 + }, + { + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, + "step": 450 + }, + { + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, + "step": 500 + }, + { + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, + "step": 500 + }, + { + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, + "step": 550 + }, + { + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, + "step": 600 + }, + { + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, + "step": 650 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, + "step": 700 + }, + { + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, + "step": 750 + }, + { + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, + "step": 750 + }, + { + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, + "step": 800 + }, + { + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, + "step": 850 + }, + { + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, + "step": 900 + }, + { + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, + "step": 950 + }, + { + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, + "step": 1000 + }, + { + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, + "step": 1000 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, + "step": 1050 + }, + { + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, + "step": 1100 + }, + { + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, + "step": 1150 + }, + { + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, + "step": 1200 + }, + { + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, + "step": 1250 + }, + { + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, + "step": 1250 + }, + { + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, + "step": 1300 + }, + { + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, + "step": 1350 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, + "step": 1400 + }, + { + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, + "step": 1450 + }, + { + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, + "step": 1500 + }, + { + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, + "step": 1550 + }, + { + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, + "step": 1600 + }, + { + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, + "step": 1650 + }, + { + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, + "step": 1700 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, + "step": 1750 + }, + { + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, + "step": 1750 + }, + { + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, + "step": 1800 + }, + { + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, + "step": 1850 + }, + { + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, + "step": 1900 + }, + { + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, + "step": 1950 + }, + { + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, + "step": 2000 + }, + { + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, + "step": 2000 + }, + { + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, + "step": 2050 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, + "step": 2100 + }, + { + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, + "step": 2150 + }, + { + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, + "step": 2200 + }, + { + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, + "step": 2250 + }, + { + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, + "step": 2250 + }, + { + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, + "step": 2300 + }, + { + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, + "step": 2350 + }, + { + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, + "step": 2400 + }, + { + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, + "step": 2450 + }, + { + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, + "step": 2500 + }, + { + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, + "step": 2500 + }, + { + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, + "step": 2550 + }, + { + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, + "step": 2600 + }, + { + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, + "step": 2650 + }, + { + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, + "step": 2700 + }, + { + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, + "step": 2750 + }, + { + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, + "step": 2750 + }, + { + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, + "step": 2800 + }, + { + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, + "step": 2850 + }, + { + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, + "step": 2900 + }, + { + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, + "step": 2950 + }, + { + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, + "step": 3000 + }, + { + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, + "step": 3000 + }, + { + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, + "step": 3050 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, + "step": 3100 + }, + { + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, + "step": 3150 + }, + { + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, + "step": 3200 + }, + { + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, + "step": 3250 + }, + { + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, + "step": 3250 + }, + { + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, + "step": 3300 + }, + { + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, + "step": 3350 + }, + { + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, + "step": 3400 + }, + { + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, + "step": 3450 + }, + { + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, + "step": 3500 + }, + { + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, + "step": 3500 + }, + { + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, + "step": 3550 + }, + { + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, + "step": 3600 + }, + { + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, + "step": 3650 + }, + { + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, + "step": 3700 + }, + { + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, + "step": 3750 + }, + { + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, + "step": 3750 + }, + { + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, + "step": 3800 + }, + { + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, + "step": 3850 + }, + { + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, + "step": 3900 + }, + { + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, + "step": 3950 + }, + { + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, + "step": 4000 + }, + { + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, + "step": 4000 + }, + { + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, + "step": 4050 + }, + { + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, + "step": 4100 + }, + { + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, + "step": 4150 + }, + { + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, + "step": 4200 + }, + { + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, + "step": 4250 + }, + { + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, + "step": 4250 + }, + { + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, + "step": 4300 + }, + { + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, + "step": 4350 + }, + { + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, + "step": 4400 + }, + { + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, + "step": 4450 + }, + { + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, + "step": 4500 + }, + { + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 4500 + }, + { + "epoch": 1.885113436237439, + "grad_norm": 0.8670596480369568, + "learning_rate": 7.546905628675441e-05, + "loss": 0.6993, + "step": 4550 + }, + { + "epoch": 1.9058323837149072, + "grad_norm": 0.5719550251960754, + "learning_rate": 7.406888826659199e-05, + "loss": 0.7224, + "step": 4600 + }, + { + "epoch": 1.9265513311923754, + "grad_norm": 0.80317622423172, + "learning_rate": 7.266872024642958e-05, + "loss": 0.721, + "step": 4650 + }, + { + "epoch": 1.9472702786698437, + "grad_norm": 0.8696815371513367, + "learning_rate": 7.126855222626717e-05, + "loss": 0.7326, + "step": 4700 + }, + { + "epoch": 1.9679892261473118, + "grad_norm": 0.6648033857345581, + "learning_rate": 6.986838420610473e-05, + "loss": 0.6956, + "step": 4750 + }, + { + "epoch": 1.9679892261473118, + "eval_loss": 0.6553655862808228, + "eval_runtime": 86.4589, + "eval_samples_per_second": 55.818, + "eval_steps_per_second": 13.96, + "step": 4750 + }, + { + "epoch": 1.9887081736247798, + "grad_norm": 0.5694370269775391, + "learning_rate": 6.846821618594231e-05, + "loss": 0.6719, + "step": 4800 + }, + { + "epoch": 2.009116336890086, + "grad_norm": 0.7416983246803284, + "learning_rate": 6.706804816577989e-05, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 2.0298352843675542, + "grad_norm": 0.7476940751075745, + "learning_rate": 6.566788014561747e-05, + "loss": 0.6353, + "step": 4900 + }, + { + "epoch": 2.0505542318450223, + "grad_norm": 0.8248530626296997, + "learning_rate": 6.426771212545505e-05, + "loss": 0.6319, + "step": 4950 + }, + { + "epoch": 2.0712731793224903, + "grad_norm": 1.035627841949463, + "learning_rate": 6.286754410529265e-05, + "loss": 0.6003, + "step": 5000 + }, + { + "epoch": 2.0712731793224903, + "eval_loss": 0.6616777777671814, + "eval_runtime": 86.7192, + "eval_samples_per_second": 55.651, + "eval_steps_per_second": 13.918, + "step": 5000 + } + ], + "logging_steps": 50, + "max_steps": 7242, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.139251925482701e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..818731b2d637beef34d216b7461a12ef35db62df --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d +size 5432 diff --git a/checkpoint-5500/README.md b/checkpoint-5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..575d436fd9e4a7a4d30fbc4644c1fecf7480fbbe --- /dev/null +++ b/checkpoint-5500/README.md @@ -0,0 +1,207 @@ +--- +base_model: meta-llama/Meta-Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-5500/adapter_config.json b/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 --- /dev/null +++ b/checkpoint-5500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5500/adapter_model.safetensors b/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6f26e283ce219703b1361291548a074b677cac8 --- /dev/null +++ b/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef52ae12d46bf478ead1465fa20cb7d4d15b115d2777ed7404462322e9607b14 +size 54560368 diff --git a/checkpoint-5500/chat_template.jinja b/checkpoint-5500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-5500/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-5500/optimizer.pt b/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..22f8b349f51f41fdba5a4a595e49c50e3fd39ca0 --- /dev/null +++ b/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d040b2f61d391e743616b95d77c6ef451c32a0c4f2105c9fae15ed983a982027 +size 109267450 diff --git a/checkpoint-5500/rng_state.pth b/checkpoint-5500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3941229415ee3c18bc7c45b07f11f90df225f4cb --- /dev/null +++ b/checkpoint-5500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb6a5dd68bf3d6187d2f26faccb4852d253e9a01ae59766ef17fa5061d47ef4e +size 14244 diff --git a/checkpoint-5500/scaler.pt b/checkpoint-5500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3baa4b953002cc7dd8cd44e6ffd9f8912740c61b --- /dev/null +++ b/checkpoint-5500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e346cc2f09786569781f98e8d3289ec5f6857249ec7f864fcd70771c800cdf +size 988 diff --git a/checkpoint-5500/scheduler.pt b/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..33c2bdf15a297e84a99f532c5e90f10fa0d7c3bd --- /dev/null +++ b/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7117b6aa1cef75e6b7c37755fc6bbdc448d1a0a3c7a6e58a02ee0fe763a1d2c1 +size 1064 diff --git a/checkpoint-5500/special_tokens_map.json b/checkpoint-5500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-5500/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-5500/tokenizer.json b/checkpoint-5500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b0bff70a20a3e87bcd1c207db61ae7a179bbf3 --- /dev/null +++ b/checkpoint-5500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c85066e7642934ed09b44155e6566b0b5dab2637fb9433439ba5c9c7f8b50d3 +size 17210018 diff --git a/checkpoint-5500/tokenizer_config.json b/checkpoint-5500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3beeacc86a6ca3cae14ad3004263ab74a4bac07a --- /dev/null +++ b/checkpoint-5500/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-5500/trainer_state.json b/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9d74924282ddce83aea69bc074dfe77c981a1584 --- /dev/null +++ b/checkpoint-5500/trainer_state.json @@ -0,0 +1,980 @@ +{ + "best_global_step": 4750, + "best_metric": 0.6553655862808228, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-4500", + "epoch": 2.2784626540971717, + "eval_steps": 250, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, + "learning_rate": 9.8e-05, + "loss": 2.6567, + "step": 50 + }, + { + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, + "learning_rate": 0.00019800000000000002, + "loss": 0.9502, + "step": 100 + }, + { + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, + "step": 200 + }, + { + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, + "step": 250 + }, + { + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, + "step": 250 + }, + { + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, + "step": 300 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, + "step": 400 + }, + { + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, + "step": 450 + }, + { + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, + "step": 500 + }, + { + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, + "step": 500 + }, + { + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, + "step": 550 + }, + { + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, + "step": 600 + }, + { + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, + "step": 650 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, + "step": 700 + }, + { + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, + "step": 750 + }, + { + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, + "step": 750 + }, + { + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, + "step": 800 + }, + { + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, + "step": 850 + }, + { + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, + "step": 900 + }, + { + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, + "step": 950 + }, + { + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, + "step": 1000 + }, + { + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, + "step": 1000 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, + "step": 1050 + }, + { + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, + "step": 1100 + }, + { + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, + "step": 1150 + }, + { + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, + "step": 1200 + }, + { + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, + "step": 1250 + }, + { + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, + "step": 1250 + }, + { + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, + "step": 1300 + }, + { + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, + "step": 1350 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, + "step": 1400 + }, + { + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, + "step": 1450 + }, + { + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, + "step": 1500 + }, + { + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, + "step": 1550 + }, + { + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, + "step": 1600 + }, + { + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, + "step": 1650 + }, + { + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, + "step": 1700 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, + "step": 1750 + }, + { + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, + "step": 1750 + }, + { + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, + "step": 1800 + }, + { + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, + "step": 1850 + }, + { + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, + "step": 1900 + }, + { + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, + "step": 1950 + }, + { + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, + "step": 2000 + }, + { + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, + "step": 2000 + }, + { + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, + "step": 2050 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, + "step": 2100 + }, + { + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, + "step": 2150 + }, + { + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, + "step": 2200 + }, + { + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, + "step": 2250 + }, + { + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, + "step": 2250 + }, + { + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, + "step": 2300 + }, + { + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, + "step": 2350 + }, + { + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, + "step": 2400 + }, + { + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, + "step": 2450 + }, + { + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, + "step": 2500 + }, + { + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, + "step": 2500 + }, + { + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, + "step": 2550 + }, + { + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, + "step": 2600 + }, + { + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, + "step": 2650 + }, + { + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, + "step": 2700 + }, + { + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, + "step": 2750 + }, + { + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, + "step": 2750 + }, + { + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, + "step": 2800 + }, + { + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, + "step": 2850 + }, + { + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, + "step": 2900 + }, + { + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, + "step": 2950 + }, + { + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, + "step": 3000 + }, + { + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, + "step": 3000 + }, + { + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, + "step": 3050 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, + "step": 3100 + }, + { + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, + "step": 3150 + }, + { + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, + "step": 3200 + }, + { + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, + "step": 3250 + }, + { + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, + "step": 3250 + }, + { + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, + "step": 3300 + }, + { + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, + "step": 3350 + }, + { + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, + "step": 3400 + }, + { + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, + "step": 3450 + }, + { + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, + "step": 3500 + }, + { + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, + "step": 3500 + }, + { + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, + "step": 3550 + }, + { + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, + "step": 3600 + }, + { + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, + "step": 3650 + }, + { + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, + "step": 3700 + }, + { + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, + "step": 3750 + }, + { + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, + "step": 3750 + }, + { + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, + "step": 3800 + }, + { + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, + "step": 3850 + }, + { + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, + "step": 3900 + }, + { + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, + "step": 3950 + }, + { + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, + "step": 4000 + }, + { + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, + "step": 4000 + }, + { + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, + "step": 4050 + }, + { + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, + "step": 4100 + }, + { + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, + "step": 4150 + }, + { + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, + "step": 4200 + }, + { + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, + "step": 4250 + }, + { + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, + "step": 4250 + }, + { + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, + "step": 4300 + }, + { + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, + "step": 4350 + }, + { + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, + "step": 4400 + }, + { + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, + "step": 4450 + }, + { + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, + "step": 4500 + }, + { + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 4500 + }, + { + "epoch": 1.885113436237439, + "grad_norm": 0.8670596480369568, + "learning_rate": 7.546905628675441e-05, + "loss": 0.6993, + "step": 4550 + }, + { + "epoch": 1.9058323837149072, + "grad_norm": 0.5719550251960754, + "learning_rate": 7.406888826659199e-05, + "loss": 0.7224, + "step": 4600 + }, + { + "epoch": 1.9265513311923754, + "grad_norm": 0.80317622423172, + "learning_rate": 7.266872024642958e-05, + "loss": 0.721, + "step": 4650 + }, + { + "epoch": 1.9472702786698437, + "grad_norm": 0.8696815371513367, + "learning_rate": 7.126855222626717e-05, + "loss": 0.7326, + "step": 4700 + }, + { + "epoch": 1.9679892261473118, + "grad_norm": 0.6648033857345581, + "learning_rate": 6.986838420610473e-05, + "loss": 0.6956, + "step": 4750 + }, + { + "epoch": 1.9679892261473118, + "eval_loss": 0.6553655862808228, + "eval_runtime": 86.4589, + "eval_samples_per_second": 55.818, + "eval_steps_per_second": 13.96, + "step": 4750 + }, + { + "epoch": 1.9887081736247798, + "grad_norm": 0.5694370269775391, + "learning_rate": 6.846821618594231e-05, + "loss": 0.6719, + "step": 4800 + }, + { + "epoch": 2.009116336890086, + "grad_norm": 0.7416983246803284, + "learning_rate": 6.706804816577989e-05, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 2.0298352843675542, + "grad_norm": 0.7476940751075745, + "learning_rate": 6.566788014561747e-05, + "loss": 0.6353, + "step": 4900 + }, + { + "epoch": 2.0505542318450223, + "grad_norm": 0.8248530626296997, + "learning_rate": 6.426771212545505e-05, + "loss": 0.6319, + "step": 4950 + }, + { + "epoch": 2.0712731793224903, + "grad_norm": 1.035627841949463, + "learning_rate": 6.286754410529265e-05, + "loss": 0.6003, + "step": 5000 + }, + { + "epoch": 2.0712731793224903, + "eval_loss": 0.6616777777671814, + "eval_runtime": 86.7192, + "eval_samples_per_second": 55.651, + "eval_steps_per_second": 13.918, + "step": 5000 + }, + { + "epoch": 2.0919921267999584, + "grad_norm": 0.8701285719871521, + "learning_rate": 6.146737608513023e-05, + "loss": 0.6324, + "step": 5050 + }, + { + "epoch": 2.112711074277427, + "grad_norm": 0.836010217666626, + "learning_rate": 6.0067208064967795e-05, + "loss": 0.6547, + "step": 5100 + }, + { + "epoch": 2.133430021754895, + "grad_norm": 0.738888144493103, + "learning_rate": 5.8667040044805375e-05, + "loss": 0.6618, + "step": 5150 + }, + { + "epoch": 2.154148969232363, + "grad_norm": 1.0129936933517456, + "learning_rate": 5.726687202464296e-05, + "loss": 0.6855, + "step": 5200 + }, + { + "epoch": 2.174867916709831, + "grad_norm": 1.0437065362930298, + "learning_rate": 5.586670400448054e-05, + "loss": 0.6397, + "step": 5250 + }, + { + "epoch": 2.174867916709831, + "eval_loss": 0.6610354781150818, + "eval_runtime": 86.6962, + "eval_samples_per_second": 55.666, + "eval_steps_per_second": 13.922, + "step": 5250 + }, + { + "epoch": 2.195586864187299, + "grad_norm": 0.6604383587837219, + "learning_rate": 5.446653598431812e-05, + "loss": 0.6165, + "step": 5300 + }, + { + "epoch": 2.2163058116647676, + "grad_norm": 0.7305940985679626, + "learning_rate": 5.30663679641557e-05, + "loss": 0.6772, + "step": 5350 + }, + { + "epoch": 2.2370247591422356, + "grad_norm": 0.8462594747543335, + "learning_rate": 5.166619994399328e-05, + "loss": 0.6416, + "step": 5400 + }, + { + "epoch": 2.2577437066197037, + "grad_norm": 0.7274892330169678, + "learning_rate": 5.026603192383086e-05, + "loss": 0.6524, + "step": 5450 + }, + { + "epoch": 2.2784626540971717, + "grad_norm": 0.579065203666687, + "learning_rate": 4.886586390366844e-05, + "loss": 0.6464, + "step": 5500 + }, + { + "epoch": 2.2784626540971717, + "eval_loss": 0.659969687461853, + "eval_runtime": 86.3113, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 5500 + } + ], + "logging_steps": 50, + "max_steps": 7242, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.4553867615379456e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5500/training_args.bin b/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..818731b2d637beef34d216b7461a12ef35db62df --- /dev/null +++ b/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d +size 5432 diff --git a/checkpoint-6000/README.md b/checkpoint-6000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..575d436fd9e4a7a4d30fbc4644c1fecf7480fbbe --- /dev/null +++ b/checkpoint-6000/README.md @@ -0,0 +1,207 @@ +--- +base_model: meta-llama/Meta-Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-6000/adapter_config.json b/checkpoint-6000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 --- /dev/null +++ b/checkpoint-6000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6000/adapter_model.safetensors b/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..070e666235453849c61f8ab4888c8be935024032 --- /dev/null +++ b/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46bcbc5e4f824b33b28f4a7193353ac5be3f1c52add91d8dd75a4bec78f55dff +size 54560368 diff --git a/checkpoint-6000/chat_template.jinja b/checkpoint-6000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-6000/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-6000/optimizer.pt b/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba7a0c88037963c61ba6369885563f4cf56efb7e --- /dev/null +++ b/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c4af0715ce9edc6bfd15d60c252065acf406441b47f997b334f6a1eceb0baf +size 109267450 diff --git a/checkpoint-6000/rng_state.pth b/checkpoint-6000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..992146296383e323002fba8eb63d8fea45f51523 --- /dev/null +++ b/checkpoint-6000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eafccae9a074c32ce2ff040dc00e54d2bb52fae2e71dc7220d8181973a4a4f7c +size 14244 diff --git a/checkpoint-6000/scaler.pt b/checkpoint-6000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e868ea72297059bc8ec0a6befb25d7380c3d5b76 --- /dev/null +++ b/checkpoint-6000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de9ad7db8790cc0524a5effddb30ceba2d976c854929b1e489924fd3d248ab70 +size 988 diff --git a/checkpoint-6000/scheduler.pt b/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..835e41e8ba57785260e4e7154883417da8204e97 --- /dev/null +++ b/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad9c141dbe0bad9635848739bb3795b9bd35c5fc89da14623b172d3f3419dab +size 1064 diff --git a/checkpoint-6000/special_tokens_map.json b/checkpoint-6000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-6000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-6000/tokenizer.json b/checkpoint-6000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b0bff70a20a3e87bcd1c207db61ae7a179bbf3 --- /dev/null +++ b/checkpoint-6000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c85066e7642934ed09b44155e6566b0b5dab2637fb9433439ba5c9c7f8b50d3 +size 17210018 diff --git a/checkpoint-6000/tokenizer_config.json b/checkpoint-6000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3beeacc86a6ca3cae14ad3004263ab74a4bac07a --- /dev/null +++ b/checkpoint-6000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-6000/trainer_state.json b/checkpoint-6000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..350363d3cb411b663f3196e528204378203c5687 --- /dev/null +++ b/checkpoint-6000/trainer_state.json @@ -0,0 +1,1066 @@ +{ + "best_global_step": 4750, + "best_metric": 0.6553655862808228, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-4500", + "epoch": 2.485652128871853, + "eval_steps": 250, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, + "learning_rate": 9.8e-05, + "loss": 2.6567, + "step": 50 + }, + { + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, + "learning_rate": 0.00019800000000000002, + "loss": 0.9502, + "step": 100 + }, + { + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, + "step": 200 + }, + { + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, + "step": 250 + }, + { + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, + "step": 250 + }, + { + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, + "step": 300 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, + "step": 400 + }, + { + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, + "step": 450 + }, + { + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, + "step": 500 + }, + { + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, + "step": 500 + }, + { + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, + "step": 550 + }, + { + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, + "step": 600 + }, + { + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, + "step": 650 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, + "step": 700 + }, + { + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, + "step": 750 + }, + { + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, + "step": 750 + }, + { + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, + "step": 800 + }, + { + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, + "step": 850 + }, + { + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, + "step": 900 + }, + { + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, + "step": 950 + }, + { + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, + "step": 1000 + }, + { + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, + "step": 1000 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, + "step": 1050 + }, + { + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, + "step": 1100 + }, + { + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, + "step": 1150 + }, + { + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, + "step": 1200 + }, + { + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, + "step": 1250 + }, + { + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, + "step": 1250 + }, + { + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, + "step": 1300 + }, + { + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, + "step": 1350 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, + "step": 1400 + }, + { + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, + "step": 1450 + }, + { + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, + "step": 1500 + }, + { + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, + "step": 1550 + }, + { + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, + "step": 1600 + }, + { + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, + "step": 1650 + }, + { + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, + "step": 1700 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, + "step": 1750 + }, + { + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, + "step": 1750 + }, + { + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, + "step": 1800 + }, + { + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, + "step": 1850 + }, + { + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, + "step": 1900 + }, + { + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, + "step": 1950 + }, + { + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, + "step": 2000 + }, + { + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, + "step": 2000 + }, + { + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, + "step": 2050 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, + "step": 2100 + }, + { + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, + "step": 2150 + }, + { + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, + "step": 2200 + }, + { + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, + "step": 2250 + }, + { + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, + "step": 2250 + }, + { + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, + "step": 2300 + }, + { + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, + "step": 2350 + }, + { + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, + "step": 2400 + }, + { + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, + "step": 2450 + }, + { + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, + "step": 2500 + }, + { + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, + "step": 2500 + }, + { + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, + "step": 2550 + }, + { + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, + "step": 2600 + }, + { + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, + "step": 2650 + }, + { + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, + "step": 2700 + }, + { + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, + "step": 2750 + }, + { + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, + "step": 2750 + }, + { + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, + "step": 2800 + }, + { + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, + "step": 2850 + }, + { + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, + "step": 2900 + }, + { + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, + "step": 2950 + }, + { + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, + "step": 3000 + }, + { + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, + "step": 3000 + }, + { + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, + "step": 3050 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, + "step": 3100 + }, + { + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, + "step": 3150 + }, + { + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, + "step": 3200 + }, + { + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, + "step": 3250 + }, + { + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, + "step": 3250 + }, + { + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, + "step": 3300 + }, + { + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, + "step": 3350 + }, + { + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, + "step": 3400 + }, + { + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, + "step": 3450 + }, + { + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, + "step": 3500 + }, + { + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, + "step": 3500 + }, + { + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, + "step": 3550 + }, + { + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, + "step": 3600 + }, + { + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, + "step": 3650 + }, + { + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, + "step": 3700 + }, + { + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, + "step": 3750 + }, + { + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, + "step": 3750 + }, + { + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, + "step": 3800 + }, + { + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, + "step": 3850 + }, + { + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, + "step": 3900 + }, + { + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, + "step": 3950 + }, + { + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, + "step": 4000 + }, + { + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, + "step": 4000 + }, + { + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, + "step": 4050 + }, + { + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, + "step": 4100 + }, + { + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, + "step": 4150 + }, + { + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, + "step": 4200 + }, + { + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, + "step": 4250 + }, + { + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, + "step": 4250 + }, + { + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, + "step": 4300 + }, + { + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, + "step": 4350 + }, + { + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, + "step": 4400 + }, + { + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, + "step": 4450 + }, + { + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, + "step": 4500 + }, + { + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 4500 + }, + { + "epoch": 1.885113436237439, + "grad_norm": 0.8670596480369568, + "learning_rate": 7.546905628675441e-05, + "loss": 0.6993, + "step": 4550 + }, + { + "epoch": 1.9058323837149072, + "grad_norm": 0.5719550251960754, + "learning_rate": 7.406888826659199e-05, + "loss": 0.7224, + "step": 4600 + }, + { + "epoch": 1.9265513311923754, + "grad_norm": 0.80317622423172, + "learning_rate": 7.266872024642958e-05, + "loss": 0.721, + "step": 4650 + }, + { + "epoch": 1.9472702786698437, + "grad_norm": 0.8696815371513367, + "learning_rate": 7.126855222626717e-05, + "loss": 0.7326, + "step": 4700 + }, + { + "epoch": 1.9679892261473118, + "grad_norm": 0.6648033857345581, + "learning_rate": 6.986838420610473e-05, + "loss": 0.6956, + "step": 4750 + }, + { + "epoch": 1.9679892261473118, + "eval_loss": 0.6553655862808228, + "eval_runtime": 86.4589, + "eval_samples_per_second": 55.818, + "eval_steps_per_second": 13.96, + "step": 4750 + }, + { + "epoch": 1.9887081736247798, + "grad_norm": 0.5694370269775391, + "learning_rate": 6.846821618594231e-05, + "loss": 0.6719, + "step": 4800 + }, + { + "epoch": 2.009116336890086, + "grad_norm": 0.7416983246803284, + "learning_rate": 6.706804816577989e-05, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 2.0298352843675542, + "grad_norm": 0.7476940751075745, + "learning_rate": 6.566788014561747e-05, + "loss": 0.6353, + "step": 4900 + }, + { + "epoch": 2.0505542318450223, + "grad_norm": 0.8248530626296997, + "learning_rate": 6.426771212545505e-05, + "loss": 0.6319, + "step": 4950 + }, + { + "epoch": 2.0712731793224903, + "grad_norm": 1.035627841949463, + "learning_rate": 6.286754410529265e-05, + "loss": 0.6003, + "step": 5000 + }, + { + "epoch": 2.0712731793224903, + "eval_loss": 0.6616777777671814, + "eval_runtime": 86.7192, + "eval_samples_per_second": 55.651, + "eval_steps_per_second": 13.918, + "step": 5000 + }, + { + "epoch": 2.0919921267999584, + "grad_norm": 0.8701285719871521, + "learning_rate": 6.146737608513023e-05, + "loss": 0.6324, + "step": 5050 + }, + { + "epoch": 2.112711074277427, + "grad_norm": 0.836010217666626, + "learning_rate": 6.0067208064967795e-05, + "loss": 0.6547, + "step": 5100 + }, + { + "epoch": 2.133430021754895, + "grad_norm": 0.738888144493103, + "learning_rate": 5.8667040044805375e-05, + "loss": 0.6618, + "step": 5150 + }, + { + "epoch": 2.154148969232363, + "grad_norm": 1.0129936933517456, + "learning_rate": 5.726687202464296e-05, + "loss": 0.6855, + "step": 5200 + }, + { + "epoch": 2.174867916709831, + "grad_norm": 1.0437065362930298, + "learning_rate": 5.586670400448054e-05, + "loss": 0.6397, + "step": 5250 + }, + { + "epoch": 2.174867916709831, + "eval_loss": 0.6610354781150818, + "eval_runtime": 86.6962, + "eval_samples_per_second": 55.666, + "eval_steps_per_second": 13.922, + "step": 5250 + }, + { + "epoch": 2.195586864187299, + "grad_norm": 0.6604383587837219, + "learning_rate": 5.446653598431812e-05, + "loss": 0.6165, + "step": 5300 + }, + { + "epoch": 2.2163058116647676, + "grad_norm": 0.7305940985679626, + "learning_rate": 5.30663679641557e-05, + "loss": 0.6772, + "step": 5350 + }, + { + "epoch": 2.2370247591422356, + "grad_norm": 0.8462594747543335, + "learning_rate": 5.166619994399328e-05, + "loss": 0.6416, + "step": 5400 + }, + { + "epoch": 2.2577437066197037, + "grad_norm": 0.7274892330169678, + "learning_rate": 5.026603192383086e-05, + "loss": 0.6524, + "step": 5450 + }, + { + "epoch": 2.2784626540971717, + "grad_norm": 0.579065203666687, + "learning_rate": 4.886586390366844e-05, + "loss": 0.6464, + "step": 5500 + }, + { + "epoch": 2.2784626540971717, + "eval_loss": 0.659969687461853, + "eval_runtime": 86.3113, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 5500 + }, + { + "epoch": 2.29918160157464, + "grad_norm": 0.8034997582435608, + "learning_rate": 4.7465695883506025e-05, + "loss": 0.6472, + "step": 5550 + }, + { + "epoch": 2.3199005490521083, + "grad_norm": 0.8343969583511353, + "learning_rate": 4.6065527863343605e-05, + "loss": 0.6459, + "step": 5600 + }, + { + "epoch": 2.3406194965295763, + "grad_norm": 0.8522002100944519, + "learning_rate": 4.466535984318118e-05, + "loss": 0.6345, + "step": 5650 + }, + { + "epoch": 2.3613384440070444, + "grad_norm": 1.0543782711029053, + "learning_rate": 4.3265191823018766e-05, + "loss": 0.6218, + "step": 5700 + }, + { + "epoch": 2.3820573914845125, + "grad_norm": 0.9417380690574646, + "learning_rate": 4.1865023802856346e-05, + "loss": 0.68, + "step": 5750 + }, + { + "epoch": 2.3820573914845125, + "eval_loss": 0.6579350233078003, + "eval_runtime": 86.1935, + "eval_samples_per_second": 55.99, + "eval_steps_per_second": 14.003, + "step": 5750 + }, + { + "epoch": 2.4027763389619805, + "grad_norm": 0.8992893099784851, + "learning_rate": 4.046485578269393e-05, + "loss": 0.647, + "step": 5800 + }, + { + "epoch": 2.423495286439449, + "grad_norm": 0.8680675029754639, + "learning_rate": 3.906468776253151e-05, + "loss": 0.6302, + "step": 5850 + }, + { + "epoch": 2.444214233916917, + "grad_norm": 0.878776490688324, + "learning_rate": 3.766451974236909e-05, + "loss": 0.6545, + "step": 5900 + }, + { + "epoch": 2.464933181394385, + "grad_norm": 0.8039425015449524, + "learning_rate": 3.626435172220667e-05, + "loss": 0.6557, + "step": 5950 + }, + { + "epoch": 2.485652128871853, + "grad_norm": 0.8756773471832275, + "learning_rate": 3.486418370204425e-05, + "loss": 0.639, + "step": 6000 + }, + { + "epoch": 2.485652128871853, + "eval_loss": 0.658104658126831, + "eval_runtime": 86.3311, + "eval_samples_per_second": 55.901, + "eval_steps_per_second": 13.981, + "step": 6000 + } + ], + "logging_steps": 50, + "max_steps": 7242, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.770013074485248e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6000/training_args.bin b/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..818731b2d637beef34d216b7461a12ef35db62df --- /dev/null +++ b/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d +size 5432 diff --git a/checkpoint-6500/README.md b/checkpoint-6500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..575d436fd9e4a7a4d30fbc4644c1fecf7480fbbe --- /dev/null +++ b/checkpoint-6500/README.md @@ -0,0 +1,207 @@ +--- +base_model: meta-llama/Meta-Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-6500/adapter_config.json b/checkpoint-6500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 --- /dev/null +++ b/checkpoint-6500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6500/adapter_model.safetensors b/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a21e159a4c1eaa7984c37f542719801a8fdeefd --- /dev/null +++ b/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb5e3673451c0e3a0ba48cbd61009166223e681bef82750f8a6074fff94b4436 +size 54560368 diff --git a/checkpoint-6500/chat_template.jinja b/checkpoint-6500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-6500/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-6500/optimizer.pt b/checkpoint-6500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..59d7b7083cb490f592a1e66aa9d89b0428e919f7 --- /dev/null +++ b/checkpoint-6500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db99f1381c7244e2d55700d48a0a542e0c3d758d3eddad403c1917a604da36a5 +size 109267450 diff --git a/checkpoint-6500/rng_state.pth b/checkpoint-6500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..defb4ac3700f54095f6a19181c72facf95bedbdd --- /dev/null +++ b/checkpoint-6500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05ff45acbfc8e11690143c92a768898180a61be56a79ee3c662f2779a986a937 +size 14244 diff --git a/checkpoint-6500/scaler.pt b/checkpoint-6500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd6517225c75cca6be83f126d37e2371eae738bf --- /dev/null +++ b/checkpoint-6500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bd8353f58e8a87e336a1fa783685c57cbcbaf29ab1df8aaa4fc18924d7b858b +size 988 diff --git a/checkpoint-6500/scheduler.pt b/checkpoint-6500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a2bd37830efd87a613675edf8ac6376e68bb920 --- /dev/null +++ b/checkpoint-6500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8765b9a6d1d3139e5cc272c1c0c92d000b35b901284488e14bd3e2635060599b +size 1064 diff --git a/checkpoint-6500/special_tokens_map.json b/checkpoint-6500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-6500/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-6500/tokenizer.json b/checkpoint-6500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b0bff70a20a3e87bcd1c207db61ae7a179bbf3 --- /dev/null +++ b/checkpoint-6500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c85066e7642934ed09b44155e6566b0b5dab2637fb9433439ba5c9c7f8b50d3 +size 17210018 diff --git a/checkpoint-6500/tokenizer_config.json b/checkpoint-6500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3beeacc86a6ca3cae14ad3004263ab74a4bac07a --- /dev/null +++ b/checkpoint-6500/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-6500/trainer_state.json b/checkpoint-6500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8b4aa13c26ba13d7fc9e970685702cd48aa26079 --- /dev/null +++ b/checkpoint-6500/trainer_state.json @@ -0,0 +1,1152 @@ +{ + "best_global_step": 6500, + "best_metric": 0.6551876664161682, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-6500", + "epoch": 2.692841603646535, + "eval_steps": 250, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, + "learning_rate": 9.8e-05, + "loss": 2.6567, + "step": 50 + }, + { + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, + "learning_rate": 0.00019800000000000002, + "loss": 0.9502, + "step": 100 + }, + { + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, + "step": 200 + }, + { + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, + "step": 250 + }, + { + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, + "step": 250 + }, + { + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, + "step": 300 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, + "step": 400 + }, + { + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, + "step": 450 + }, + { + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, + "step": 500 + }, + { + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, + "step": 500 + }, + { + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, + "step": 550 + }, + { + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, + "step": 600 + }, + { + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, + "step": 650 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, + "step": 700 + }, + { + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, + "step": 750 + }, + { + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, + "step": 750 + }, + { + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, + "step": 800 + }, + { + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, + "step": 850 + }, + { + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, + "step": 900 + }, + { + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, + "step": 950 + }, + { + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, + "step": 1000 + }, + { + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, + "step": 1000 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, + "step": 1050 + }, + { + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, + "step": 1100 + }, + { + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, + "step": 1150 + }, + { + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, + "step": 1200 + }, + { + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, + "step": 1250 + }, + { + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, + "step": 1250 + }, + { + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, + "step": 1300 + }, + { + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, + "step": 1350 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, + "step": 1400 + }, + { + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, + "step": 1450 + }, + { + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, + "step": 1500 + }, + { + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, + "step": 1550 + }, + { + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, + "step": 1600 + }, + { + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, + "step": 1650 + }, + { + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, + "step": 1700 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, + "step": 1750 + }, + { + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, + "step": 1750 + }, + { + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, + "step": 1800 + }, + { + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, + "step": 1850 + }, + { + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, + "step": 1900 + }, + { + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, + "step": 1950 + }, + { + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, + "step": 2000 + }, + { + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, + "step": 2000 + }, + { + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, + "step": 2050 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, + "step": 2100 + }, + { + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, + "step": 2150 + }, + { + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, + "step": 2200 + }, + { + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, + "step": 2250 + }, + { + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, + "step": 2250 + }, + { + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, + "step": 2300 + }, + { + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, + "step": 2350 + }, + { + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, + "step": 2400 + }, + { + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, + "step": 2450 + }, + { + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, + "step": 2500 + }, + { + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, + "step": 2500 + }, + { + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, + "step": 2550 + }, + { + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, + "step": 2600 + }, + { + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, + "step": 2650 + }, + { + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, + "step": 2700 + }, + { + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, + "step": 2750 + }, + { + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, + "step": 2750 + }, + { + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, + "step": 2800 + }, + { + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, + "step": 2850 + }, + { + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, + "step": 2900 + }, + { + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, + "step": 2950 + }, + { + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, + "step": 3000 + }, + { + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, + "step": 3000 + }, + { + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, + "step": 3050 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, + "step": 3100 + }, + { + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, + "step": 3150 + }, + { + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, + "step": 3200 + }, + { + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, + "step": 3250 + }, + { + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, + "step": 3250 + }, + { + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, + "step": 3300 + }, + { + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, + "step": 3350 + }, + { + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, + "step": 3400 + }, + { + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, + "step": 3450 + }, + { + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, + "step": 3500 + }, + { + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, + "step": 3500 + }, + { + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, + "step": 3550 + }, + { + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, + "step": 3600 + }, + { + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, + "step": 3650 + }, + { + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, + "step": 3700 + }, + { + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, + "step": 3750 + }, + { + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, + "step": 3750 + }, + { + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, + "step": 3800 + }, + { + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, + "step": 3850 + }, + { + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, + "step": 3900 + }, + { + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, + "step": 3950 + }, + { + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, + "step": 4000 + }, + { + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, + "step": 4000 + }, + { + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, + "step": 4050 + }, + { + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, + "step": 4100 + }, + { + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, + "step": 4150 + }, + { + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, + "step": 4200 + }, + { + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, + "step": 4250 + }, + { + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, + "step": 4250 + }, + { + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, + "step": 4300 + }, + { + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, + "step": 4350 + }, + { + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, + "step": 4400 + }, + { + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, + "step": 4450 + }, + { + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, + "step": 4500 + }, + { + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 4500 + }, + { + "epoch": 1.885113436237439, + "grad_norm": 0.8670596480369568, + "learning_rate": 7.546905628675441e-05, + "loss": 0.6993, + "step": 4550 + }, + { + "epoch": 1.9058323837149072, + "grad_norm": 0.5719550251960754, + "learning_rate": 7.406888826659199e-05, + "loss": 0.7224, + "step": 4600 + }, + { + "epoch": 1.9265513311923754, + "grad_norm": 0.80317622423172, + "learning_rate": 7.266872024642958e-05, + "loss": 0.721, + "step": 4650 + }, + { + "epoch": 1.9472702786698437, + "grad_norm": 0.8696815371513367, + "learning_rate": 7.126855222626717e-05, + "loss": 0.7326, + "step": 4700 + }, + { + "epoch": 1.9679892261473118, + "grad_norm": 0.6648033857345581, + "learning_rate": 6.986838420610473e-05, + "loss": 0.6956, + "step": 4750 + }, + { + "epoch": 1.9679892261473118, + "eval_loss": 0.6553655862808228, + "eval_runtime": 86.4589, + "eval_samples_per_second": 55.818, + "eval_steps_per_second": 13.96, + "step": 4750 + }, + { + "epoch": 1.9887081736247798, + "grad_norm": 0.5694370269775391, + "learning_rate": 6.846821618594231e-05, + "loss": 0.6719, + "step": 4800 + }, + { + "epoch": 2.009116336890086, + "grad_norm": 0.7416983246803284, + "learning_rate": 6.706804816577989e-05, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 2.0298352843675542, + "grad_norm": 0.7476940751075745, + "learning_rate": 6.566788014561747e-05, + "loss": 0.6353, + "step": 4900 + }, + { + "epoch": 2.0505542318450223, + "grad_norm": 0.8248530626296997, + "learning_rate": 6.426771212545505e-05, + "loss": 0.6319, + "step": 4950 + }, + { + "epoch": 2.0712731793224903, + "grad_norm": 1.035627841949463, + "learning_rate": 6.286754410529265e-05, + "loss": 0.6003, + "step": 5000 + }, + { + "epoch": 2.0712731793224903, + "eval_loss": 0.6616777777671814, + "eval_runtime": 86.7192, + "eval_samples_per_second": 55.651, + "eval_steps_per_second": 13.918, + "step": 5000 + }, + { + "epoch": 2.0919921267999584, + "grad_norm": 0.8701285719871521, + "learning_rate": 6.146737608513023e-05, + "loss": 0.6324, + "step": 5050 + }, + { + "epoch": 2.112711074277427, + "grad_norm": 0.836010217666626, + "learning_rate": 6.0067208064967795e-05, + "loss": 0.6547, + "step": 5100 + }, + { + "epoch": 2.133430021754895, + "grad_norm": 0.738888144493103, + "learning_rate": 5.8667040044805375e-05, + "loss": 0.6618, + "step": 5150 + }, + { + "epoch": 2.154148969232363, + "grad_norm": 1.0129936933517456, + "learning_rate": 5.726687202464296e-05, + "loss": 0.6855, + "step": 5200 + }, + { + "epoch": 2.174867916709831, + "grad_norm": 1.0437065362930298, + "learning_rate": 5.586670400448054e-05, + "loss": 0.6397, + "step": 5250 + }, + { + "epoch": 2.174867916709831, + "eval_loss": 0.6610354781150818, + "eval_runtime": 86.6962, + "eval_samples_per_second": 55.666, + "eval_steps_per_second": 13.922, + "step": 5250 + }, + { + "epoch": 2.195586864187299, + "grad_norm": 0.6604383587837219, + "learning_rate": 5.446653598431812e-05, + "loss": 0.6165, + "step": 5300 + }, + { + "epoch": 2.2163058116647676, + "grad_norm": 0.7305940985679626, + "learning_rate": 5.30663679641557e-05, + "loss": 0.6772, + "step": 5350 + }, + { + "epoch": 2.2370247591422356, + "grad_norm": 0.8462594747543335, + "learning_rate": 5.166619994399328e-05, + "loss": 0.6416, + "step": 5400 + }, + { + "epoch": 2.2577437066197037, + "grad_norm": 0.7274892330169678, + "learning_rate": 5.026603192383086e-05, + "loss": 0.6524, + "step": 5450 + }, + { + "epoch": 2.2784626540971717, + "grad_norm": 0.579065203666687, + "learning_rate": 4.886586390366844e-05, + "loss": 0.6464, + "step": 5500 + }, + { + "epoch": 2.2784626540971717, + "eval_loss": 0.659969687461853, + "eval_runtime": 86.3113, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 5500 + }, + { + "epoch": 2.29918160157464, + "grad_norm": 0.8034997582435608, + "learning_rate": 4.7465695883506025e-05, + "loss": 0.6472, + "step": 5550 + }, + { + "epoch": 2.3199005490521083, + "grad_norm": 0.8343969583511353, + "learning_rate": 4.6065527863343605e-05, + "loss": 0.6459, + "step": 5600 + }, + { + "epoch": 2.3406194965295763, + "grad_norm": 0.8522002100944519, + "learning_rate": 4.466535984318118e-05, + "loss": 0.6345, + "step": 5650 + }, + { + "epoch": 2.3613384440070444, + "grad_norm": 1.0543782711029053, + "learning_rate": 4.3265191823018766e-05, + "loss": 0.6218, + "step": 5700 + }, + { + "epoch": 2.3820573914845125, + "grad_norm": 0.9417380690574646, + "learning_rate": 4.1865023802856346e-05, + "loss": 0.68, + "step": 5750 + }, + { + "epoch": 2.3820573914845125, + "eval_loss": 0.6579350233078003, + "eval_runtime": 86.1935, + "eval_samples_per_second": 55.99, + "eval_steps_per_second": 14.003, + "step": 5750 + }, + { + "epoch": 2.4027763389619805, + "grad_norm": 0.8992893099784851, + "learning_rate": 4.046485578269393e-05, + "loss": 0.647, + "step": 5800 + }, + { + "epoch": 2.423495286439449, + "grad_norm": 0.8680675029754639, + "learning_rate": 3.906468776253151e-05, + "loss": 0.6302, + "step": 5850 + }, + { + "epoch": 2.444214233916917, + "grad_norm": 0.878776490688324, + "learning_rate": 3.766451974236909e-05, + "loss": 0.6545, + "step": 5900 + }, + { + "epoch": 2.464933181394385, + "grad_norm": 0.8039425015449524, + "learning_rate": 3.626435172220667e-05, + "loss": 0.6557, + "step": 5950 + }, + { + "epoch": 2.485652128871853, + "grad_norm": 0.8756773471832275, + "learning_rate": 3.486418370204425e-05, + "loss": 0.639, + "step": 6000 + }, + { + "epoch": 2.485652128871853, + "eval_loss": 0.658104658126831, + "eval_runtime": 86.3311, + "eval_samples_per_second": 55.901, + "eval_steps_per_second": 13.981, + "step": 6000 + }, + { + "epoch": 2.506371076349321, + "grad_norm": 0.8273307085037231, + "learning_rate": 3.346401568188183e-05, + "loss": 0.6609, + "step": 6050 + }, + { + "epoch": 2.5270900238267897, + "grad_norm": 0.7528616786003113, + "learning_rate": 3.206384766171941e-05, + "loss": 0.6393, + "step": 6100 + }, + { + "epoch": 2.5478089713042578, + "grad_norm": 0.6834387183189392, + "learning_rate": 3.066367964155698e-05, + "loss": 0.6208, + "step": 6150 + }, + { + "epoch": 2.568527918781726, + "grad_norm": 0.6862203478813171, + "learning_rate": 2.9263511621394567e-05, + "loss": 0.6373, + "step": 6200 + }, + { + "epoch": 2.589246866259194, + "grad_norm": 1.0487428903579712, + "learning_rate": 2.786334360123215e-05, + "loss": 0.6481, + "step": 6250 + }, + { + "epoch": 2.589246866259194, + "eval_loss": 0.6565331816673279, + "eval_runtime": 86.2454, + "eval_samples_per_second": 55.957, + "eval_steps_per_second": 13.995, + "step": 6250 + }, + { + "epoch": 2.609965813736662, + "grad_norm": 1.1061326265335083, + "learning_rate": 2.646317558106973e-05, + "loss": 0.6502, + "step": 6300 + }, + { + "epoch": 2.6306847612141304, + "grad_norm": 0.6551749110221863, + "learning_rate": 2.5063007560907308e-05, + "loss": 0.6392, + "step": 6350 + }, + { + "epoch": 2.6514037086915985, + "grad_norm": 0.8796434998512268, + "learning_rate": 2.366283954074489e-05, + "loss": 0.6704, + "step": 6400 + }, + { + "epoch": 2.6721226561690665, + "grad_norm": 0.8105428218841553, + "learning_rate": 2.2262671520582472e-05, + "loss": 0.677, + "step": 6450 + }, + { + "epoch": 2.692841603646535, + "grad_norm": 0.9389123320579529, + "learning_rate": 2.0862503500420052e-05, + "loss": 0.6787, + "step": 6500 + }, + { + "epoch": 2.692841603646535, + "eval_loss": 0.6551876664161682, + "eval_runtime": 86.3099, + "eval_samples_per_second": 55.915, + "eval_steps_per_second": 13.984, + "step": 6500 + } + ], + "logging_steps": 50, + "max_steps": 7242, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.08819235633152e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6500/training_args.bin b/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..818731b2d637beef34d216b7461a12ef35db62df --- /dev/null +++ b/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d +size 5432 diff --git a/checkpoint-7000/README.md b/checkpoint-7000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..575d436fd9e4a7a4d30fbc4644c1fecf7480fbbe --- /dev/null +++ b/checkpoint-7000/README.md @@ -0,0 +1,207 @@ +--- +base_model: meta-llama/Meta-Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-7000/adapter_config.json b/checkpoint-7000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 --- /dev/null +++ b/checkpoint-7000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7000/adapter_model.safetensors b/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3accea21d4d6eecc8f6c4447d66d5ad0c1978920 --- /dev/null +++ b/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ed806adeae688d7c41407f6645cccc7ce2b13d73c5c283a964e550db5cccdfd +size 54560368 diff --git a/checkpoint-7000/chat_template.jinja b/checkpoint-7000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-7000/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-7000/optimizer.pt b/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cab82708955787035438d132d83e186c27af9cc2 --- /dev/null +++ b/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5868cfb31480390d040ef09f18672f0176d980f17ac94f533986e74aad2de1f +size 109267450 diff --git a/checkpoint-7000/rng_state.pth b/checkpoint-7000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d3d486b50d5d8d3d15faa3a7d0e0a52f4644aba8 --- /dev/null +++ b/checkpoint-7000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ff31d3430d6c8726b0f3c235c4687482b63a413d36ef66114a1254c8da82c1b +size 14244 diff --git a/checkpoint-7000/scaler.pt b/checkpoint-7000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b3ba3ca6e916db40e3cb3c2a278de8a1dbeb420 --- /dev/null +++ b/checkpoint-7000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70535eaa7819662c353f8dbb812ea63dba0e980bb812101aefdd8b227cb1d32e +size 988 diff --git a/checkpoint-7000/scheduler.pt b/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab58f879b1fa146d06368a8fa2e2f062f9eed2a6 --- /dev/null +++ b/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aa57ac4f72c3cfd35af5cf50413aefb37fae0429e5b093bc7593b169f5665b4 +size 1064 diff --git a/checkpoint-7000/special_tokens_map.json b/checkpoint-7000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-7000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-7000/tokenizer.json b/checkpoint-7000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b0bff70a20a3e87bcd1c207db61ae7a179bbf3 --- /dev/null +++ b/checkpoint-7000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c85066e7642934ed09b44155e6566b0b5dab2637fb9433439ba5c9c7f8b50d3 +size 17210018 diff --git a/checkpoint-7000/tokenizer_config.json b/checkpoint-7000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3beeacc86a6ca3cae14ad3004263ab74a4bac07a --- /dev/null +++ b/checkpoint-7000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-7000/trainer_state.json b/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d0b6a6c18b663039a0966eda2093e53b23379ae --- /dev/null +++ b/checkpoint-7000/trainer_state.json @@ -0,0 +1,1238 @@ +{ + "best_global_step": 7000, + "best_metric": 0.6540627479553223, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-7000", + "epoch": 2.9000310784212164, + "eval_steps": 250, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, + "learning_rate": 9.8e-05, + "loss": 2.6567, + "step": 50 + }, + { + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, + "learning_rate": 0.00019800000000000002, + "loss": 0.9502, + "step": 100 + }, + { + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, + "step": 200 + }, + { + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, + "step": 250 + }, + { + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, + "step": 250 + }, + { + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, + "step": 300 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, + "step": 400 + }, + { + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, + "step": 450 + }, + { + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, + "step": 500 + }, + { + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, + "step": 500 + }, + { + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, + "step": 550 + }, + { + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, + "step": 600 + }, + { + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, + "step": 650 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, + "step": 700 + }, + { + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, + "step": 750 + }, + { + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, + "step": 750 + }, + { + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, + "step": 800 + }, + { + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, + "step": 850 + }, + { + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, + "step": 900 + }, + { + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, + "step": 950 + }, + { + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, + "step": 1000 + }, + { + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, + "step": 1000 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, + "step": 1050 + }, + { + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, + "step": 1100 + }, + { + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, + "step": 1150 + }, + { + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, + "step": 1200 + }, + { + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, + "step": 1250 + }, + { + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, + "step": 1250 + }, + { + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, + "step": 1300 + }, + { + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, + "step": 1350 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, + "step": 1400 + }, + { + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, + "step": 1450 + }, + { + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, + "step": 1500 + }, + { + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, + "step": 1550 + }, + { + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, + "step": 1600 + }, + { + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, + "step": 1650 + }, + { + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, + "step": 1700 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, + "step": 1750 + }, + { + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, + "step": 1750 + }, + { + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, + "step": 1800 + }, + { + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, + "step": 1850 + }, + { + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, + "step": 1900 + }, + { + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, + "step": 1950 + }, + { + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, + "step": 2000 + }, + { + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, + "step": 2000 + }, + { + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, + "step": 2050 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, + "step": 2100 + }, + { + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, + "step": 2150 + }, + { + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, + "step": 2200 + }, + { + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, + "step": 2250 + }, + { + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, + "step": 2250 + }, + { + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, + "step": 2300 + }, + { + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, + "step": 2350 + }, + { + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, + "step": 2400 + }, + { + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, + "step": 2450 + }, + { + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, + "step": 2500 + }, + { + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, + "step": 2500 + }, + { + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, + "step": 2550 + }, + { + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, + "step": 2600 + }, + { + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, + "step": 2650 + }, + { + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, + "step": 2700 + }, + { + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, + "step": 2750 + }, + { + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, + "step": 2750 + }, + { + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, + "step": 2800 + }, + { + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, + "step": 2850 + }, + { + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, + "step": 2900 + }, + { + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, + "step": 2950 + }, + { + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, + "step": 3000 + }, + { + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, + "step": 3000 + }, + { + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, + "step": 3050 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, + "step": 3100 + }, + { + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, + "step": 3150 + }, + { + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, + "step": 3200 + }, + { + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, + "step": 3250 + }, + { + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, + "step": 3250 + }, + { + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, + "step": 3300 + }, + { + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, + "step": 3350 + }, + { + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, + "step": 3400 + }, + { + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, + "step": 3450 + }, + { + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, + "step": 3500 + }, + { + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, + "step": 3500 + }, + { + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, + "step": 3550 + }, + { + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, + "step": 3600 + }, + { + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, + "step": 3650 + }, + { + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, + "step": 3700 + }, + { + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, + "step": 3750 + }, + { + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, + "step": 3750 + }, + { + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, + "step": 3800 + }, + { + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, + "step": 3850 + }, + { + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, + "step": 3900 + }, + { + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, + "step": 3950 + }, + { + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, + "step": 4000 + }, + { + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, + "step": 4000 + }, + { + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, + "step": 4050 + }, + { + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, + "step": 4100 + }, + { + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, + "step": 4150 + }, + { + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, + "step": 4200 + }, + { + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, + "step": 4250 + }, + { + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, + "step": 4250 + }, + { + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, + "step": 4300 + }, + { + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, + "step": 4350 + }, + { + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, + "step": 4400 + }, + { + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, + "step": 4450 + }, + { + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, + "step": 4500 + }, + { + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 4500 + }, + { + "epoch": 1.885113436237439, + "grad_norm": 0.8670596480369568, + "learning_rate": 7.546905628675441e-05, + "loss": 0.6993, + "step": 4550 + }, + { + "epoch": 1.9058323837149072, + "grad_norm": 0.5719550251960754, + "learning_rate": 7.406888826659199e-05, + "loss": 0.7224, + "step": 4600 + }, + { + "epoch": 1.9265513311923754, + "grad_norm": 0.80317622423172, + "learning_rate": 7.266872024642958e-05, + "loss": 0.721, + "step": 4650 + }, + { + "epoch": 1.9472702786698437, + "grad_norm": 0.8696815371513367, + "learning_rate": 7.126855222626717e-05, + "loss": 0.7326, + "step": 4700 + }, + { + "epoch": 1.9679892261473118, + "grad_norm": 0.6648033857345581, + "learning_rate": 6.986838420610473e-05, + "loss": 0.6956, + "step": 4750 + }, + { + "epoch": 1.9679892261473118, + "eval_loss": 0.6553655862808228, + "eval_runtime": 86.4589, + "eval_samples_per_second": 55.818, + "eval_steps_per_second": 13.96, + "step": 4750 + }, + { + "epoch": 1.9887081736247798, + "grad_norm": 0.5694370269775391, + "learning_rate": 6.846821618594231e-05, + "loss": 0.6719, + "step": 4800 + }, + { + "epoch": 2.009116336890086, + "grad_norm": 0.7416983246803284, + "learning_rate": 6.706804816577989e-05, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 2.0298352843675542, + "grad_norm": 0.7476940751075745, + "learning_rate": 6.566788014561747e-05, + "loss": 0.6353, + "step": 4900 + }, + { + "epoch": 2.0505542318450223, + "grad_norm": 0.8248530626296997, + "learning_rate": 6.426771212545505e-05, + "loss": 0.6319, + "step": 4950 + }, + { + "epoch": 2.0712731793224903, + "grad_norm": 1.035627841949463, + "learning_rate": 6.286754410529265e-05, + "loss": 0.6003, + "step": 5000 + }, + { + "epoch": 2.0712731793224903, + "eval_loss": 0.6616777777671814, + "eval_runtime": 86.7192, + "eval_samples_per_second": 55.651, + "eval_steps_per_second": 13.918, + "step": 5000 + }, + { + "epoch": 2.0919921267999584, + "grad_norm": 0.8701285719871521, + "learning_rate": 6.146737608513023e-05, + "loss": 0.6324, + "step": 5050 + }, + { + "epoch": 2.112711074277427, + "grad_norm": 0.836010217666626, + "learning_rate": 6.0067208064967795e-05, + "loss": 0.6547, + "step": 5100 + }, + { + "epoch": 2.133430021754895, + "grad_norm": 0.738888144493103, + "learning_rate": 5.8667040044805375e-05, + "loss": 0.6618, + "step": 5150 + }, + { + "epoch": 2.154148969232363, + "grad_norm": 1.0129936933517456, + "learning_rate": 5.726687202464296e-05, + "loss": 0.6855, + "step": 5200 + }, + { + "epoch": 2.174867916709831, + "grad_norm": 1.0437065362930298, + "learning_rate": 5.586670400448054e-05, + "loss": 0.6397, + "step": 5250 + }, + { + "epoch": 2.174867916709831, + "eval_loss": 0.6610354781150818, + "eval_runtime": 86.6962, + "eval_samples_per_second": 55.666, + "eval_steps_per_second": 13.922, + "step": 5250 + }, + { + "epoch": 2.195586864187299, + "grad_norm": 0.6604383587837219, + "learning_rate": 5.446653598431812e-05, + "loss": 0.6165, + "step": 5300 + }, + { + "epoch": 2.2163058116647676, + "grad_norm": 0.7305940985679626, + "learning_rate": 5.30663679641557e-05, + "loss": 0.6772, + "step": 5350 + }, + { + "epoch": 2.2370247591422356, + "grad_norm": 0.8462594747543335, + "learning_rate": 5.166619994399328e-05, + "loss": 0.6416, + "step": 5400 + }, + { + "epoch": 2.2577437066197037, + "grad_norm": 0.7274892330169678, + "learning_rate": 5.026603192383086e-05, + "loss": 0.6524, + "step": 5450 + }, + { + "epoch": 2.2784626540971717, + "grad_norm": 0.579065203666687, + "learning_rate": 4.886586390366844e-05, + "loss": 0.6464, + "step": 5500 + }, + { + "epoch": 2.2784626540971717, + "eval_loss": 0.659969687461853, + "eval_runtime": 86.3113, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 5500 + }, + { + "epoch": 2.29918160157464, + "grad_norm": 0.8034997582435608, + "learning_rate": 4.7465695883506025e-05, + "loss": 0.6472, + "step": 5550 + }, + { + "epoch": 2.3199005490521083, + "grad_norm": 0.8343969583511353, + "learning_rate": 4.6065527863343605e-05, + "loss": 0.6459, + "step": 5600 + }, + { + "epoch": 2.3406194965295763, + "grad_norm": 0.8522002100944519, + "learning_rate": 4.466535984318118e-05, + "loss": 0.6345, + "step": 5650 + }, + { + "epoch": 2.3613384440070444, + "grad_norm": 1.0543782711029053, + "learning_rate": 4.3265191823018766e-05, + "loss": 0.6218, + "step": 5700 + }, + { + "epoch": 2.3820573914845125, + "grad_norm": 0.9417380690574646, + "learning_rate": 4.1865023802856346e-05, + "loss": 0.68, + "step": 5750 + }, + { + "epoch": 2.3820573914845125, + "eval_loss": 0.6579350233078003, + "eval_runtime": 86.1935, + "eval_samples_per_second": 55.99, + "eval_steps_per_second": 14.003, + "step": 5750 + }, + { + "epoch": 2.4027763389619805, + "grad_norm": 0.8992893099784851, + "learning_rate": 4.046485578269393e-05, + "loss": 0.647, + "step": 5800 + }, + { + "epoch": 2.423495286439449, + "grad_norm": 0.8680675029754639, + "learning_rate": 3.906468776253151e-05, + "loss": 0.6302, + "step": 5850 + }, + { + "epoch": 2.444214233916917, + "grad_norm": 0.878776490688324, + "learning_rate": 3.766451974236909e-05, + "loss": 0.6545, + "step": 5900 + }, + { + "epoch": 2.464933181394385, + "grad_norm": 0.8039425015449524, + "learning_rate": 3.626435172220667e-05, + "loss": 0.6557, + "step": 5950 + }, + { + "epoch": 2.485652128871853, + "grad_norm": 0.8756773471832275, + "learning_rate": 3.486418370204425e-05, + "loss": 0.639, + "step": 6000 + }, + { + "epoch": 2.485652128871853, + "eval_loss": 0.658104658126831, + "eval_runtime": 86.3311, + "eval_samples_per_second": 55.901, + "eval_steps_per_second": 13.981, + "step": 6000 + }, + { + "epoch": 2.506371076349321, + "grad_norm": 0.8273307085037231, + "learning_rate": 3.346401568188183e-05, + "loss": 0.6609, + "step": 6050 + }, + { + "epoch": 2.5270900238267897, + "grad_norm": 0.7528616786003113, + "learning_rate": 3.206384766171941e-05, + "loss": 0.6393, + "step": 6100 + }, + { + "epoch": 2.5478089713042578, + "grad_norm": 0.6834387183189392, + "learning_rate": 3.066367964155698e-05, + "loss": 0.6208, + "step": 6150 + }, + { + "epoch": 2.568527918781726, + "grad_norm": 0.6862203478813171, + "learning_rate": 2.9263511621394567e-05, + "loss": 0.6373, + "step": 6200 + }, + { + "epoch": 2.589246866259194, + "grad_norm": 1.0487428903579712, + "learning_rate": 2.786334360123215e-05, + "loss": 0.6481, + "step": 6250 + }, + { + "epoch": 2.589246866259194, + "eval_loss": 0.6565331816673279, + "eval_runtime": 86.2454, + "eval_samples_per_second": 55.957, + "eval_steps_per_second": 13.995, + "step": 6250 + }, + { + "epoch": 2.609965813736662, + "grad_norm": 1.1061326265335083, + "learning_rate": 2.646317558106973e-05, + "loss": 0.6502, + "step": 6300 + }, + { + "epoch": 2.6306847612141304, + "grad_norm": 0.6551749110221863, + "learning_rate": 2.5063007560907308e-05, + "loss": 0.6392, + "step": 6350 + }, + { + "epoch": 2.6514037086915985, + "grad_norm": 0.8796434998512268, + "learning_rate": 2.366283954074489e-05, + "loss": 0.6704, + "step": 6400 + }, + { + "epoch": 2.6721226561690665, + "grad_norm": 0.8105428218841553, + "learning_rate": 2.2262671520582472e-05, + "loss": 0.677, + "step": 6450 + }, + { + "epoch": 2.692841603646535, + "grad_norm": 0.9389123320579529, + "learning_rate": 2.0862503500420052e-05, + "loss": 0.6787, + "step": 6500 + }, + { + "epoch": 2.692841603646535, + "eval_loss": 0.6551876664161682, + "eval_runtime": 86.3099, + "eval_samples_per_second": 55.915, + "eval_steps_per_second": 13.984, + "step": 6500 + }, + { + "epoch": 2.7135605511240026, + "grad_norm": 0.6925713419914246, + "learning_rate": 1.9462335480257633e-05, + "loss": 0.5816, + "step": 6550 + }, + { + "epoch": 2.734279498601471, + "grad_norm": 1.048319935798645, + "learning_rate": 1.806216746009521e-05, + "loss": 0.655, + "step": 6600 + }, + { + "epoch": 2.754998446078939, + "grad_norm": 0.7885390520095825, + "learning_rate": 1.6661999439932793e-05, + "loss": 0.6496, + "step": 6650 + }, + { + "epoch": 2.7757173935564072, + "grad_norm": 0.7435409426689148, + "learning_rate": 1.5261831419770374e-05, + "loss": 0.5829, + "step": 6700 + }, + { + "epoch": 2.7964363410338757, + "grad_norm": 0.8922176361083984, + "learning_rate": 1.3861663399607954e-05, + "loss": 0.626, + "step": 6750 + }, + { + "epoch": 2.7964363410338757, + "eval_loss": 0.6544692516326904, + "eval_runtime": 86.2446, + "eval_samples_per_second": 55.957, + "eval_steps_per_second": 13.995, + "step": 6750 + }, + { + "epoch": 2.817155288511344, + "grad_norm": 0.8337688446044922, + "learning_rate": 1.2461495379445535e-05, + "loss": 0.6534, + "step": 6800 + }, + { + "epoch": 2.837874235988812, + "grad_norm": 0.7951143980026245, + "learning_rate": 1.1061327359283115e-05, + "loss": 0.6365, + "step": 6850 + }, + { + "epoch": 2.85859318346628, + "grad_norm": 0.9364272952079773, + "learning_rate": 9.661159339120695e-06, + "loss": 0.5941, + "step": 6900 + }, + { + "epoch": 2.879312130943748, + "grad_norm": 1.0142576694488525, + "learning_rate": 8.260991318958276e-06, + "loss": 0.6736, + "step": 6950 + }, + { + "epoch": 2.9000310784212164, + "grad_norm": 1.0106154680252075, + "learning_rate": 6.860823298795855e-06, + "loss": 0.6488, + "step": 7000 + }, + { + "epoch": 2.9000310784212164, + "eval_loss": 0.6540627479553223, + "eval_runtime": 86.2476, + "eval_samples_per_second": 55.955, + "eval_steps_per_second": 13.995, + "step": 7000 + } + ], + "logging_steps": 50, + "max_steps": 7242, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.397878075654963e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7000/training_args.bin b/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..818731b2d637beef34d216b7461a12ef35db62df --- /dev/null +++ b/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d +size 5432 diff --git a/checkpoint-7242/README.md b/checkpoint-7242/README.md new file mode 100644 index 0000000000000000000000000000000000000000..575d436fd9e4a7a4d30fbc4644c1fecf7480fbbe --- /dev/null +++ b/checkpoint-7242/README.md @@ -0,0 +1,207 @@ +--- +base_model: meta-llama/Meta-Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-7242/adapter_config.json b/checkpoint-7242/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84dde86ee0bd87c0f5d5a0a36710a2a58516a7c4 --- /dev/null +++ b/checkpoint-7242/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7242/adapter_model.safetensors b/checkpoint-7242/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8174cae764eba155598eaa5c3fffe7ff0821570b --- /dev/null +++ b/checkpoint-7242/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51a3ee4389f92dd44a3d8cceb996af6e3499db40aa22d14be12dd98b00d3bdcb +size 54560368 diff --git a/checkpoint-7242/chat_template.jinja b/checkpoint-7242/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-7242/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-7242/optimizer.pt b/checkpoint-7242/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5860a4d27fb28191fae5b2333ca27341e1e00763 --- /dev/null +++ b/checkpoint-7242/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fef77f47c0c040444306d728e6230cb6dbdc704cdf793a16acfbbe74279d0a9b +size 109267450 diff --git a/checkpoint-7242/rng_state.pth b/checkpoint-7242/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8098b28c8fcef412e9398eb7efe2d590d99428d --- /dev/null +++ b/checkpoint-7242/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b8cbe8c62f31a984bc16d9fba8a6622d79dbe0cb27742810a9273c590df6e4 +size 14244 diff --git a/checkpoint-7242/scaler.pt b/checkpoint-7242/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c062cc0ab59a6f2c4b95faa01d3e59c90180190a --- /dev/null +++ b/checkpoint-7242/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01c3f11748e57397c53455d94a03ad4d5819ae60cdd04bd7f5255135fc883c4 +size 988 diff --git a/checkpoint-7242/scheduler.pt b/checkpoint-7242/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d989929049fe0de35629aaf08f0b53e86ee64487 --- /dev/null +++ b/checkpoint-7242/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae1a21a819d6ea8bef9c50680c738fcdac40984fc725c1fb0ab84218830b3ff2 +size 1064 diff --git a/checkpoint-7242/special_tokens_map.json b/checkpoint-7242/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-7242/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-7242/tokenizer.json b/checkpoint-7242/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b0bff70a20a3e87bcd1c207db61ae7a179bbf3 --- /dev/null +++ b/checkpoint-7242/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c85066e7642934ed09b44155e6566b0b5dab2637fb9433439ba5c9c7f8b50d3 +size 17210018 diff --git a/checkpoint-7242/tokenizer_config.json b/checkpoint-7242/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3beeacc86a6ca3cae14ad3004263ab74a4bac07a --- /dev/null +++ b/checkpoint-7242/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-7242/trainer_state.json b/checkpoint-7242/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..29fc62d94b1813b982973a1c4e103921fc2934cc --- /dev/null +++ b/checkpoint-7242/trainer_state.json @@ -0,0 +1,1266 @@ +{ + "best_global_step": 7000, + "best_metric": 0.6540627479553223, + "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-7000", + "epoch": 3.0, + "eval_steps": 250, + "global_step": 7242, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020718947477468146, + "grad_norm": 1.0589393377304077, + "learning_rate": 9.8e-05, + "loss": 2.6567, + "step": 50 + }, + { + "epoch": 0.04143789495493629, + "grad_norm": 0.9738045334815979, + "learning_rate": 0.00019800000000000002, + "loss": 0.9502, + "step": 100 + }, + { + "epoch": 0.062156842432404436, + "grad_norm": 0.8801347017288208, + "learning_rate": 0.00019862783534024082, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.08287578990987259, + "grad_norm": 0.7272312045097351, + "learning_rate": 0.00019722766732007841, + "loss": 0.8184, + "step": 200 + }, + { + "epoch": 0.10359473738734072, + "grad_norm": 0.7850629091262817, + "learning_rate": 0.000195827499299916, + "loss": 0.8392, + "step": 250 + }, + { + "epoch": 0.10359473738734072, + "eval_loss": 0.7402811050415039, + "eval_runtime": 85.9367, + "eval_samples_per_second": 56.158, + "eval_steps_per_second": 14.045, + "step": 250 + }, + { + "epoch": 0.12431368486480887, + "grad_norm": 0.40629276633262634, + "learning_rate": 0.00019442733127975358, + "loss": 0.8108, + "step": 300 + }, + { + "epoch": 0.145032632342277, + "grad_norm": 0.5258236527442932, + "learning_rate": 0.00019302716325959117, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.16575157981974517, + "grad_norm": 0.6879925727844238, + "learning_rate": 0.00019162699523942874, + "loss": 0.9089, + "step": 400 + }, + { + "epoch": 0.1864705272972133, + "grad_norm": 0.7583937048912048, + "learning_rate": 0.00019022682721926633, + "loss": 0.874, + "step": 450 + }, + { + "epoch": 0.20718947477468144, + "grad_norm": 0.6399120688438416, + "learning_rate": 0.0001888266591991039, + "loss": 0.8366, + "step": 500 + }, + { + "epoch": 0.20718947477468144, + "eval_loss": 0.7194066047668457, + "eval_runtime": 86.2811, + "eval_samples_per_second": 55.933, + "eval_steps_per_second": 13.989, + "step": 500 + }, + { + "epoch": 0.22790842225214958, + "grad_norm": 0.7763131856918335, + "learning_rate": 0.0001874264911789415, + "loss": 0.7912, + "step": 550 + }, + { + "epoch": 0.24862736972961774, + "grad_norm": 0.6845299601554871, + "learning_rate": 0.00018602632315877906, + "loss": 0.8506, + "step": 600 + }, + { + "epoch": 0.2693463172070859, + "grad_norm": 0.8045451045036316, + "learning_rate": 0.00018462615513861665, + "loss": 0.763, + "step": 650 + }, + { + "epoch": 0.290065264684554, + "grad_norm": 0.7035927176475525, + "learning_rate": 0.00018322598711845422, + "loss": 0.7769, + "step": 700 + }, + { + "epoch": 0.3107842121620222, + "grad_norm": 0.465000718832016, + "learning_rate": 0.00018182581909829179, + "loss": 0.7705, + "step": 750 + }, + { + "epoch": 0.3107842121620222, + "eval_loss": 0.7103215456008911, + "eval_runtime": 86.1101, + "eval_samples_per_second": 56.045, + "eval_steps_per_second": 14.017, + "step": 750 + }, + { + "epoch": 0.33150315963949034, + "grad_norm": 0.4990151524543762, + "learning_rate": 0.00018042565107812938, + "loss": 0.8438, + "step": 800 + }, + { + "epoch": 0.35222210711695845, + "grad_norm": 0.7391067147254944, + "learning_rate": 0.00017902548305796695, + "loss": 0.7688, + "step": 850 + }, + { + "epoch": 0.3729410545944266, + "grad_norm": 0.8036171197891235, + "learning_rate": 0.00017762531503780454, + "loss": 0.753, + "step": 900 + }, + { + "epoch": 0.3936600020718947, + "grad_norm": 0.44744470715522766, + "learning_rate": 0.00017622514701764213, + "loss": 0.7793, + "step": 950 + }, + { + "epoch": 0.4143789495493629, + "grad_norm": 0.630820631980896, + "learning_rate": 0.00017482497899747973, + "loss": 0.7555, + "step": 1000 + }, + { + "epoch": 0.4143789495493629, + "eval_loss": 0.7030432820320129, + "eval_runtime": 86.2543, + "eval_samples_per_second": 55.951, + "eval_steps_per_second": 13.994, + "step": 1000 + }, + { + "epoch": 0.43509789702683105, + "grad_norm": 0.45690879225730896, + "learning_rate": 0.0001734248109773173, + "loss": 0.793, + "step": 1050 + }, + { + "epoch": 0.45581684450429916, + "grad_norm": 0.5000227093696594, + "learning_rate": 0.00017202464295715486, + "loss": 0.8342, + "step": 1100 + }, + { + "epoch": 0.4765357919817673, + "grad_norm": 0.47182488441467285, + "learning_rate": 0.00017062447493699246, + "loss": 0.7997, + "step": 1150 + }, + { + "epoch": 0.4972547394592355, + "grad_norm": 0.7060516476631165, + "learning_rate": 0.00016922430691683002, + "loss": 0.7788, + "step": 1200 + }, + { + "epoch": 0.5179736869367036, + "grad_norm": 0.46701857447624207, + "learning_rate": 0.00016782413889666762, + "loss": 0.7518, + "step": 1250 + }, + { + "epoch": 0.5179736869367036, + "eval_loss": 0.7023425698280334, + "eval_runtime": 86.3015, + "eval_samples_per_second": 55.92, + "eval_steps_per_second": 13.986, + "step": 1250 + }, + { + "epoch": 0.5386926344141718, + "grad_norm": 0.668192446231842, + "learning_rate": 0.00016642397087650518, + "loss": 0.7682, + "step": 1300 + }, + { + "epoch": 0.5594115818916399, + "grad_norm": 0.47292283177375793, + "learning_rate": 0.00016502380285634278, + "loss": 0.7985, + "step": 1350 + }, + { + "epoch": 0.580130529369108, + "grad_norm": 0.7327275276184082, + "learning_rate": 0.00016362363483618034, + "loss": 0.8378, + "step": 1400 + }, + { + "epoch": 0.6008494768465762, + "grad_norm": 0.8417996764183044, + "learning_rate": 0.0001622234668160179, + "loss": 0.7962, + "step": 1450 + }, + { + "epoch": 0.6215684243240444, + "grad_norm": 0.6189562678337097, + "learning_rate": 0.0001608232987958555, + "loss": 0.8028, + "step": 1500 + }, + { + "epoch": 0.6215684243240444, + "eval_loss": 0.6915447115898132, + "eval_runtime": 86.2147, + "eval_samples_per_second": 55.977, + "eval_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.6422873718015125, + "grad_norm": 0.7345826625823975, + "learning_rate": 0.0001594231307756931, + "loss": 0.7978, + "step": 1550 + }, + { + "epoch": 0.6630063192789807, + "grad_norm": 0.6538310050964355, + "learning_rate": 0.0001580229627555307, + "loss": 0.7672, + "step": 1600 + }, + { + "epoch": 0.6837252667564487, + "grad_norm": 0.661582350730896, + "learning_rate": 0.00015662279473536826, + "loss": 0.7378, + "step": 1650 + }, + { + "epoch": 0.7044442142339169, + "grad_norm": 0.3603042960166931, + "learning_rate": 0.00015522262671520583, + "loss": 0.6741, + "step": 1700 + }, + { + "epoch": 0.7251631617113851, + "grad_norm": 0.8882561326026917, + "learning_rate": 0.00015382245869504342, + "loss": 0.7695, + "step": 1750 + }, + { + "epoch": 0.7251631617113851, + "eval_loss": 0.6858941316604614, + "eval_runtime": 86.6358, + "eval_samples_per_second": 55.704, + "eval_steps_per_second": 13.932, + "step": 1750 + }, + { + "epoch": 0.7458821091888532, + "grad_norm": 0.5933266282081604, + "learning_rate": 0.000152422290674881, + "loss": 0.7548, + "step": 1800 + }, + { + "epoch": 0.7666010566663214, + "grad_norm": 0.8178608417510986, + "learning_rate": 0.00015102212265471858, + "loss": 0.7639, + "step": 1850 + }, + { + "epoch": 0.7873200041437894, + "grad_norm": 0.4378993511199951, + "learning_rate": 0.00014962195463455615, + "loss": 0.7985, + "step": 1900 + }, + { + "epoch": 0.8080389516212576, + "grad_norm": 0.3732803463935852, + "learning_rate": 0.00014822178661439374, + "loss": 0.8481, + "step": 1950 + }, + { + "epoch": 0.8287578990987258, + "grad_norm": 0.7421035170555115, + "learning_rate": 0.0001468216185942313, + "loss": 0.7223, + "step": 2000 + }, + { + "epoch": 0.8287578990987258, + "eval_loss": 0.6823315024375916, + "eval_runtime": 86.5575, + "eval_samples_per_second": 55.755, + "eval_steps_per_second": 13.944, + "step": 2000 + }, + { + "epoch": 0.8494768465761939, + "grad_norm": 0.5109913349151611, + "learning_rate": 0.00014542145057406888, + "loss": 0.7895, + "step": 2050 + }, + { + "epoch": 0.8701957940536621, + "grad_norm": 0.47988179326057434, + "learning_rate": 0.00014402128255390647, + "loss": 0.7385, + "step": 2100 + }, + { + "epoch": 0.8909147415311303, + "grad_norm": 0.7593080997467041, + "learning_rate": 0.00014262111453374404, + "loss": 0.7744, + "step": 2150 + }, + { + "epoch": 0.9116336890085983, + "grad_norm": 0.5866154432296753, + "learning_rate": 0.00014122094651358163, + "loss": 0.7062, + "step": 2200 + }, + { + "epoch": 0.9323526364860665, + "grad_norm": 0.47364088892936707, + "learning_rate": 0.00013982077849341922, + "loss": 0.7792, + "step": 2250 + }, + { + "epoch": 0.9323526364860665, + "eval_loss": 0.6785813570022583, + "eval_runtime": 86.3444, + "eval_samples_per_second": 55.892, + "eval_steps_per_second": 13.979, + "step": 2250 + }, + { + "epoch": 0.9530715839635346, + "grad_norm": 0.7610514760017395, + "learning_rate": 0.00013842061047325682, + "loss": 0.7804, + "step": 2300 + }, + { + "epoch": 0.9737905314410028, + "grad_norm": 0.7689616084098816, + "learning_rate": 0.00013702044245309438, + "loss": 0.7497, + "step": 2350 + }, + { + "epoch": 0.994509478918471, + "grad_norm": 0.542168378829956, + "learning_rate": 0.00013562027443293195, + "loss": 0.7333, + "step": 2400 + }, + { + "epoch": 1.0149176421837771, + "grad_norm": 0.33903324604034424, + "learning_rate": 0.0001342481097731728, + "loss": 0.6952, + "step": 2450 + }, + { + "epoch": 1.0356365896612452, + "grad_norm": 0.8183636665344238, + "learning_rate": 0.00013284794175301036, + "loss": 0.7386, + "step": 2500 + }, + { + "epoch": 1.0356365896612452, + "eval_loss": 0.675748348236084, + "eval_runtime": 86.2887, + "eval_samples_per_second": 55.929, + "eval_steps_per_second": 13.988, + "step": 2500 + }, + { + "epoch": 1.0563555371387134, + "grad_norm": 0.6831589937210083, + "learning_rate": 0.00013144777373284795, + "loss": 0.72, + "step": 2550 + }, + { + "epoch": 1.0770744846161815, + "grad_norm": 0.6346258521080017, + "learning_rate": 0.00013004760571268552, + "loss": 0.7026, + "step": 2600 + }, + { + "epoch": 1.0977934320936495, + "grad_norm": 0.5658385753631592, + "learning_rate": 0.0001286474376925231, + "loss": 0.7162, + "step": 2650 + }, + { + "epoch": 1.1185123795711178, + "grad_norm": 0.4242883026599884, + "learning_rate": 0.00012724726967236068, + "loss": 0.7325, + "step": 2700 + }, + { + "epoch": 1.1392313270485859, + "grad_norm": 0.5489133596420288, + "learning_rate": 0.00012584710165219827, + "loss": 0.7138, + "step": 2750 + }, + { + "epoch": 1.1392313270485859, + "eval_loss": 0.6747092604637146, + "eval_runtime": 86.4239, + "eval_samples_per_second": 55.841, + "eval_steps_per_second": 13.966, + "step": 2750 + }, + { + "epoch": 1.1599502745260541, + "grad_norm": 0.6514728665351868, + "learning_rate": 0.00012444693363203587, + "loss": 0.7105, + "step": 2800 + }, + { + "epoch": 1.1806692220035222, + "grad_norm": 0.48897412419319153, + "learning_rate": 0.00012304676561187343, + "loss": 0.7271, + "step": 2850 + }, + { + "epoch": 1.2013881694809903, + "grad_norm": 0.7159713506698608, + "learning_rate": 0.00012164659759171101, + "loss": 0.7454, + "step": 2900 + }, + { + "epoch": 1.2221071169584585, + "grad_norm": 0.7044214010238647, + "learning_rate": 0.0001202464295715486, + "loss": 0.6918, + "step": 2950 + }, + { + "epoch": 1.2428260644359266, + "grad_norm": 0.7934305667877197, + "learning_rate": 0.00011884626155138616, + "loss": 0.7018, + "step": 3000 + }, + { + "epoch": 1.2428260644359266, + "eval_loss": 0.6727278828620911, + "eval_runtime": 86.1985, + "eval_samples_per_second": 55.987, + "eval_steps_per_second": 14.003, + "step": 3000 + }, + { + "epoch": 1.2635450119133949, + "grad_norm": 0.8456618785858154, + "learning_rate": 0.00011744609353122375, + "loss": 0.763, + "step": 3050 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 0.5733729600906372, + "learning_rate": 0.00011604592551106132, + "loss": 0.7034, + "step": 3100 + }, + { + "epoch": 1.304982906868331, + "grad_norm": 0.4783104658126831, + "learning_rate": 0.00011464575749089892, + "loss": 0.762, + "step": 3150 + }, + { + "epoch": 1.3257018543457992, + "grad_norm": 0.7016689777374268, + "learning_rate": 0.0001132455894707365, + "loss": 0.7049, + "step": 3200 + }, + { + "epoch": 1.3464208018232675, + "grad_norm": 0.6739513278007507, + "learning_rate": 0.00011184542145057409, + "loss": 0.7137, + "step": 3250 + }, + { + "epoch": 1.3464208018232675, + "eval_loss": 0.6689812541007996, + "eval_runtime": 86.4895, + "eval_samples_per_second": 55.799, + "eval_steps_per_second": 13.955, + "step": 3250 + }, + { + "epoch": 1.3671397493007356, + "grad_norm": 0.8907766938209534, + "learning_rate": 0.00011044525343041166, + "loss": 0.7476, + "step": 3300 + }, + { + "epoch": 1.3878586967782036, + "grad_norm": 0.8889743089675903, + "learning_rate": 0.00010904508541024922, + "loss": 0.7059, + "step": 3350 + }, + { + "epoch": 1.408577644255672, + "grad_norm": 0.5788094401359558, + "learning_rate": 0.00010764491739008682, + "loss": 0.7018, + "step": 3400 + }, + { + "epoch": 1.42929659173314, + "grad_norm": 0.7107548713684082, + "learning_rate": 0.00010624474936992438, + "loss": 0.6796, + "step": 3450 + }, + { + "epoch": 1.4500155392106082, + "grad_norm": 0.6979348063468933, + "learning_rate": 0.00010484458134976198, + "loss": 0.7212, + "step": 3500 + }, + { + "epoch": 1.4500155392106082, + "eval_loss": 0.6663665175437927, + "eval_runtime": 86.5532, + "eval_samples_per_second": 55.758, + "eval_steps_per_second": 13.945, + "step": 3500 + }, + { + "epoch": 1.4707344866880763, + "grad_norm": 0.7232558727264404, + "learning_rate": 0.00010344441332959956, + "loss": 0.6814, + "step": 3550 + }, + { + "epoch": 1.4914534341655443, + "grad_norm": 0.8630662560462952, + "learning_rate": 0.00010204424530943715, + "loss": 0.7012, + "step": 3600 + }, + { + "epoch": 1.5121723816430124, + "grad_norm": 0.9553645253181458, + "learning_rate": 0.00010064407728927472, + "loss": 0.7247, + "step": 3650 + }, + { + "epoch": 1.5328913291204807, + "grad_norm": 0.6892822980880737, + "learning_rate": 9.92439092691123e-05, + "loss": 0.7009, + "step": 3700 + }, + { + "epoch": 1.553610276597949, + "grad_norm": 0.8881245255470276, + "learning_rate": 9.787174460935312e-05, + "loss": 0.7579, + "step": 3750 + }, + { + "epoch": 1.553610276597949, + "eval_loss": 0.6655827164649963, + "eval_runtime": 86.317, + "eval_samples_per_second": 55.91, + "eval_steps_per_second": 13.983, + "step": 3750 + }, + { + "epoch": 1.574329224075417, + "grad_norm": 0.6604064702987671, + "learning_rate": 9.64715765891907e-05, + "loss": 0.7003, + "step": 3800 + }, + { + "epoch": 1.595048171552885, + "grad_norm": 0.5936245918273926, + "learning_rate": 9.507140856902829e-05, + "loss": 0.7093, + "step": 3850 + }, + { + "epoch": 1.6157671190303533, + "grad_norm": 0.6983786225318909, + "learning_rate": 9.367124054886587e-05, + "loss": 0.7009, + "step": 3900 + }, + { + "epoch": 1.6364860665078214, + "grad_norm": 0.6833502054214478, + "learning_rate": 9.227107252870345e-05, + "loss": 0.7293, + "step": 3950 + }, + { + "epoch": 1.6572050139852896, + "grad_norm": 0.5032167434692383, + "learning_rate": 9.087090450854103e-05, + "loss": 0.6968, + "step": 4000 + }, + { + "epoch": 1.6572050139852896, + "eval_loss": 0.6616591215133667, + "eval_runtime": 86.2625, + "eval_samples_per_second": 55.946, + "eval_steps_per_second": 13.992, + "step": 4000 + }, + { + "epoch": 1.6779239614627577, + "grad_norm": 0.73284912109375, + "learning_rate": 8.947073648837862e-05, + "loss": 0.723, + "step": 4050 + }, + { + "epoch": 1.6986429089402257, + "grad_norm": 0.7727170586585999, + "learning_rate": 8.807056846821619e-05, + "loss": 0.6965, + "step": 4100 + }, + { + "epoch": 1.719361856417694, + "grad_norm": 0.6575957536697388, + "learning_rate": 8.667040044805377e-05, + "loss": 0.6804, + "step": 4150 + }, + { + "epoch": 1.7400808038951623, + "grad_norm": 0.7174975275993347, + "learning_rate": 8.527023242789135e-05, + "loss": 0.7388, + "step": 4200 + }, + { + "epoch": 1.7607997513726303, + "grad_norm": 0.7730789184570312, + "learning_rate": 8.387006440772893e-05, + "loss": 0.6716, + "step": 4250 + }, + { + "epoch": 1.7607997513726303, + "eval_loss": 0.6594452857971191, + "eval_runtime": 86.4796, + "eval_samples_per_second": 55.805, + "eval_steps_per_second": 13.957, + "step": 4250 + }, + { + "epoch": 1.7815186988500984, + "grad_norm": 0.6953691840171814, + "learning_rate": 8.246989638756651e-05, + "loss": 0.6997, + "step": 4300 + }, + { + "epoch": 1.8022376463275664, + "grad_norm": 0.5468209385871887, + "learning_rate": 8.10697283674041e-05, + "loss": 0.7069, + "step": 4350 + }, + { + "epoch": 1.8229565938050347, + "grad_norm": 0.649025559425354, + "learning_rate": 7.966956034724167e-05, + "loss": 0.7179, + "step": 4400 + }, + { + "epoch": 1.843675541282503, + "grad_norm": 0.9825453162193298, + "learning_rate": 7.826939232707925e-05, + "loss": 0.7224, + "step": 4450 + }, + { + "epoch": 1.864394488759971, + "grad_norm": 0.5808931589126587, + "learning_rate": 7.686922430691683e-05, + "loss": 0.6139, + "step": 4500 + }, + { + "epoch": 1.864394488759971, + "eval_loss": 0.6582211852073669, + "eval_runtime": 86.3106, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 4500 + }, + { + "epoch": 1.885113436237439, + "grad_norm": 0.8670596480369568, + "learning_rate": 7.546905628675441e-05, + "loss": 0.6993, + "step": 4550 + }, + { + "epoch": 1.9058323837149072, + "grad_norm": 0.5719550251960754, + "learning_rate": 7.406888826659199e-05, + "loss": 0.7224, + "step": 4600 + }, + { + "epoch": 1.9265513311923754, + "grad_norm": 0.80317622423172, + "learning_rate": 7.266872024642958e-05, + "loss": 0.721, + "step": 4650 + }, + { + "epoch": 1.9472702786698437, + "grad_norm": 0.8696815371513367, + "learning_rate": 7.126855222626717e-05, + "loss": 0.7326, + "step": 4700 + }, + { + "epoch": 1.9679892261473118, + "grad_norm": 0.6648033857345581, + "learning_rate": 6.986838420610473e-05, + "loss": 0.6956, + "step": 4750 + }, + { + "epoch": 1.9679892261473118, + "eval_loss": 0.6553655862808228, + "eval_runtime": 86.4589, + "eval_samples_per_second": 55.818, + "eval_steps_per_second": 13.96, + "step": 4750 + }, + { + "epoch": 1.9887081736247798, + "grad_norm": 0.5694370269775391, + "learning_rate": 6.846821618594231e-05, + "loss": 0.6719, + "step": 4800 + }, + { + "epoch": 2.009116336890086, + "grad_norm": 0.7416983246803284, + "learning_rate": 6.706804816577989e-05, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 2.0298352843675542, + "grad_norm": 0.7476940751075745, + "learning_rate": 6.566788014561747e-05, + "loss": 0.6353, + "step": 4900 + }, + { + "epoch": 2.0505542318450223, + "grad_norm": 0.8248530626296997, + "learning_rate": 6.426771212545505e-05, + "loss": 0.6319, + "step": 4950 + }, + { + "epoch": 2.0712731793224903, + "grad_norm": 1.035627841949463, + "learning_rate": 6.286754410529265e-05, + "loss": 0.6003, + "step": 5000 + }, + { + "epoch": 2.0712731793224903, + "eval_loss": 0.6616777777671814, + "eval_runtime": 86.7192, + "eval_samples_per_second": 55.651, + "eval_steps_per_second": 13.918, + "step": 5000 + }, + { + "epoch": 2.0919921267999584, + "grad_norm": 0.8701285719871521, + "learning_rate": 6.146737608513023e-05, + "loss": 0.6324, + "step": 5050 + }, + { + "epoch": 2.112711074277427, + "grad_norm": 0.836010217666626, + "learning_rate": 6.0067208064967795e-05, + "loss": 0.6547, + "step": 5100 + }, + { + "epoch": 2.133430021754895, + "grad_norm": 0.738888144493103, + "learning_rate": 5.8667040044805375e-05, + "loss": 0.6618, + "step": 5150 + }, + { + "epoch": 2.154148969232363, + "grad_norm": 1.0129936933517456, + "learning_rate": 5.726687202464296e-05, + "loss": 0.6855, + "step": 5200 + }, + { + "epoch": 2.174867916709831, + "grad_norm": 1.0437065362930298, + "learning_rate": 5.586670400448054e-05, + "loss": 0.6397, + "step": 5250 + }, + { + "epoch": 2.174867916709831, + "eval_loss": 0.6610354781150818, + "eval_runtime": 86.6962, + "eval_samples_per_second": 55.666, + "eval_steps_per_second": 13.922, + "step": 5250 + }, + { + "epoch": 2.195586864187299, + "grad_norm": 0.6604383587837219, + "learning_rate": 5.446653598431812e-05, + "loss": 0.6165, + "step": 5300 + }, + { + "epoch": 2.2163058116647676, + "grad_norm": 0.7305940985679626, + "learning_rate": 5.30663679641557e-05, + "loss": 0.6772, + "step": 5350 + }, + { + "epoch": 2.2370247591422356, + "grad_norm": 0.8462594747543335, + "learning_rate": 5.166619994399328e-05, + "loss": 0.6416, + "step": 5400 + }, + { + "epoch": 2.2577437066197037, + "grad_norm": 0.7274892330169678, + "learning_rate": 5.026603192383086e-05, + "loss": 0.6524, + "step": 5450 + }, + { + "epoch": 2.2784626540971717, + "grad_norm": 0.579065203666687, + "learning_rate": 4.886586390366844e-05, + "loss": 0.6464, + "step": 5500 + }, + { + "epoch": 2.2784626540971717, + "eval_loss": 0.659969687461853, + "eval_runtime": 86.3113, + "eval_samples_per_second": 55.914, + "eval_steps_per_second": 13.984, + "step": 5500 + }, + { + "epoch": 2.29918160157464, + "grad_norm": 0.8034997582435608, + "learning_rate": 4.7465695883506025e-05, + "loss": 0.6472, + "step": 5550 + }, + { + "epoch": 2.3199005490521083, + "grad_norm": 0.8343969583511353, + "learning_rate": 4.6065527863343605e-05, + "loss": 0.6459, + "step": 5600 + }, + { + "epoch": 2.3406194965295763, + "grad_norm": 0.8522002100944519, + "learning_rate": 4.466535984318118e-05, + "loss": 0.6345, + "step": 5650 + }, + { + "epoch": 2.3613384440070444, + "grad_norm": 1.0543782711029053, + "learning_rate": 4.3265191823018766e-05, + "loss": 0.6218, + "step": 5700 + }, + { + "epoch": 2.3820573914845125, + "grad_norm": 0.9417380690574646, + "learning_rate": 4.1865023802856346e-05, + "loss": 0.68, + "step": 5750 + }, + { + "epoch": 2.3820573914845125, + "eval_loss": 0.6579350233078003, + "eval_runtime": 86.1935, + "eval_samples_per_second": 55.99, + "eval_steps_per_second": 14.003, + "step": 5750 + }, + { + "epoch": 2.4027763389619805, + "grad_norm": 0.8992893099784851, + "learning_rate": 4.046485578269393e-05, + "loss": 0.647, + "step": 5800 + }, + { + "epoch": 2.423495286439449, + "grad_norm": 0.8680675029754639, + "learning_rate": 3.906468776253151e-05, + "loss": 0.6302, + "step": 5850 + }, + { + "epoch": 2.444214233916917, + "grad_norm": 0.878776490688324, + "learning_rate": 3.766451974236909e-05, + "loss": 0.6545, + "step": 5900 + }, + { + "epoch": 2.464933181394385, + "grad_norm": 0.8039425015449524, + "learning_rate": 3.626435172220667e-05, + "loss": 0.6557, + "step": 5950 + }, + { + "epoch": 2.485652128871853, + "grad_norm": 0.8756773471832275, + "learning_rate": 3.486418370204425e-05, + "loss": 0.639, + "step": 6000 + }, + { + "epoch": 2.485652128871853, + "eval_loss": 0.658104658126831, + "eval_runtime": 86.3311, + "eval_samples_per_second": 55.901, + "eval_steps_per_second": 13.981, + "step": 6000 + }, + { + "epoch": 2.506371076349321, + "grad_norm": 0.8273307085037231, + "learning_rate": 3.346401568188183e-05, + "loss": 0.6609, + "step": 6050 + }, + { + "epoch": 2.5270900238267897, + "grad_norm": 0.7528616786003113, + "learning_rate": 3.206384766171941e-05, + "loss": 0.6393, + "step": 6100 + }, + { + "epoch": 2.5478089713042578, + "grad_norm": 0.6834387183189392, + "learning_rate": 3.066367964155698e-05, + "loss": 0.6208, + "step": 6150 + }, + { + "epoch": 2.568527918781726, + "grad_norm": 0.6862203478813171, + "learning_rate": 2.9263511621394567e-05, + "loss": 0.6373, + "step": 6200 + }, + { + "epoch": 2.589246866259194, + "grad_norm": 1.0487428903579712, + "learning_rate": 2.786334360123215e-05, + "loss": 0.6481, + "step": 6250 + }, + { + "epoch": 2.589246866259194, + "eval_loss": 0.6565331816673279, + "eval_runtime": 86.2454, + "eval_samples_per_second": 55.957, + "eval_steps_per_second": 13.995, + "step": 6250 + }, + { + "epoch": 2.609965813736662, + "grad_norm": 1.1061326265335083, + "learning_rate": 2.646317558106973e-05, + "loss": 0.6502, + "step": 6300 + }, + { + "epoch": 2.6306847612141304, + "grad_norm": 0.6551749110221863, + "learning_rate": 2.5063007560907308e-05, + "loss": 0.6392, + "step": 6350 + }, + { + "epoch": 2.6514037086915985, + "grad_norm": 0.8796434998512268, + "learning_rate": 2.366283954074489e-05, + "loss": 0.6704, + "step": 6400 + }, + { + "epoch": 2.6721226561690665, + "grad_norm": 0.8105428218841553, + "learning_rate": 2.2262671520582472e-05, + "loss": 0.677, + "step": 6450 + }, + { + "epoch": 2.692841603646535, + "grad_norm": 0.9389123320579529, + "learning_rate": 2.0862503500420052e-05, + "loss": 0.6787, + "step": 6500 + }, + { + "epoch": 2.692841603646535, + "eval_loss": 0.6551876664161682, + "eval_runtime": 86.3099, + "eval_samples_per_second": 55.915, + "eval_steps_per_second": 13.984, + "step": 6500 + }, + { + "epoch": 2.7135605511240026, + "grad_norm": 0.6925713419914246, + "learning_rate": 1.9462335480257633e-05, + "loss": 0.5816, + "step": 6550 + }, + { + "epoch": 2.734279498601471, + "grad_norm": 1.048319935798645, + "learning_rate": 1.806216746009521e-05, + "loss": 0.655, + "step": 6600 + }, + { + "epoch": 2.754998446078939, + "grad_norm": 0.7885390520095825, + "learning_rate": 1.6661999439932793e-05, + "loss": 0.6496, + "step": 6650 + }, + { + "epoch": 2.7757173935564072, + "grad_norm": 0.7435409426689148, + "learning_rate": 1.5261831419770374e-05, + "loss": 0.5829, + "step": 6700 + }, + { + "epoch": 2.7964363410338757, + "grad_norm": 0.8922176361083984, + "learning_rate": 1.3861663399607954e-05, + "loss": 0.626, + "step": 6750 + }, + { + "epoch": 2.7964363410338757, + "eval_loss": 0.6544692516326904, + "eval_runtime": 86.2446, + "eval_samples_per_second": 55.957, + "eval_steps_per_second": 13.995, + "step": 6750 + }, + { + "epoch": 2.817155288511344, + "grad_norm": 0.8337688446044922, + "learning_rate": 1.2461495379445535e-05, + "loss": 0.6534, + "step": 6800 + }, + { + "epoch": 2.837874235988812, + "grad_norm": 0.7951143980026245, + "learning_rate": 1.1061327359283115e-05, + "loss": 0.6365, + "step": 6850 + }, + { + "epoch": 2.85859318346628, + "grad_norm": 0.9364272952079773, + "learning_rate": 9.661159339120695e-06, + "loss": 0.5941, + "step": 6900 + }, + { + "epoch": 2.879312130943748, + "grad_norm": 1.0142576694488525, + "learning_rate": 8.260991318958276e-06, + "loss": 0.6736, + "step": 6950 + }, + { + "epoch": 2.9000310784212164, + "grad_norm": 1.0106154680252075, + "learning_rate": 6.860823298795855e-06, + "loss": 0.6488, + "step": 7000 + }, + { + "epoch": 2.9000310784212164, + "eval_loss": 0.6540627479553223, + "eval_runtime": 86.2476, + "eval_samples_per_second": 55.955, + "eval_steps_per_second": 13.995, + "step": 7000 + }, + { + "epoch": 2.9207500258986845, + "grad_norm": 0.9546412825584412, + "learning_rate": 5.4606552786334365e-06, + "loss": 0.6435, + "step": 7050 + }, + { + "epoch": 2.9414689733761525, + "grad_norm": 0.6337213516235352, + "learning_rate": 4.060487258471016e-06, + "loss": 0.6192, + "step": 7100 + }, + { + "epoch": 2.9621879208536206, + "grad_norm": 0.8504210114479065, + "learning_rate": 2.6603192383085973e-06, + "loss": 0.626, + "step": 7150 + }, + { + "epoch": 2.9829068683310886, + "grad_norm": 0.9242532849311829, + "learning_rate": 1.2601512181461777e-06, + "loss": 0.6625, + "step": 7200 + } + ], + "logging_steps": 50, + "max_steps": 7242, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.549942076959949e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7242/training_args.bin b/checkpoint-7242/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..818731b2d637beef34d216b7461a12ef35db62df --- /dev/null +++ b/checkpoint-7242/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d +size 5432 diff --git a/training_args.bin b/training_args.bin index eb236cf9d6a781057673256e00030110ee72a3cc..818731b2d637beef34d216b7461a12ef35db62df 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108 +oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d size 5432