{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9972247918593897,
  "eval_steps": 500,
  "global_step": 810,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03700277520814061,
      "grad_norm": 2.5574069547071865,
      "learning_rate": 5e-06,
      "loss": 1.0349,
      "step": 10
    },
    {
      "epoch": 0.07400555041628122,
      "grad_norm": 5.900418380831986,
      "learning_rate": 5e-06,
      "loss": 0.9193,
      "step": 20
    },
    {
      "epoch": 0.11100832562442182,
      "grad_norm": 1.343234502219187,
      "learning_rate": 5e-06,
      "loss": 0.8706,
      "step": 30
    },
    {
      "epoch": 0.14801110083256244,
      "grad_norm": 1.3710163438367058,
      "learning_rate": 5e-06,
      "loss": 0.8479,
      "step": 40
    },
    {
      "epoch": 0.18501387604070305,
      "grad_norm": 1.0033978639288033,
      "learning_rate": 5e-06,
      "loss": 0.8243,
      "step": 50
    },
    {
      "epoch": 0.22201665124884365,
      "grad_norm": 1.3608933171871491,
      "learning_rate": 5e-06,
      "loss": 0.8107,
      "step": 60
    },
    {
      "epoch": 0.2590194264569843,
      "grad_norm": 1.1028614841829054,
      "learning_rate": 5e-06,
      "loss": 0.799,
      "step": 70
    },
    {
      "epoch": 0.2960222016651249,
      "grad_norm": 1.0434200254799575,
      "learning_rate": 5e-06,
      "loss": 0.7885,
      "step": 80
    },
    {
      "epoch": 0.3330249768732655,
      "grad_norm": 1.229274798797147,
      "learning_rate": 5e-06,
      "loss": 0.7834,
      "step": 90
    },
    {
      "epoch": 0.3700277520814061,
      "grad_norm": 0.9205856097213055,
      "learning_rate": 5e-06,
      "loss": 0.7777,
      "step": 100
    },
    {
      "epoch": 0.4070305272895467,
      "grad_norm": 0.7497767563492606,
      "learning_rate": 5e-06,
      "loss": 0.7724,
      "step": 110
    },
    {
      "epoch": 0.4440333024976873,
      "grad_norm": 0.8373352830011821,
      "learning_rate": 5e-06,
      "loss": 0.7683,
      "step": 120
    },
    {
      "epoch": 0.48103607770582796,
      "grad_norm": 0.9313579962563663,
      "learning_rate": 5e-06,
      "loss": 0.7616,
      "step": 130
    },
    {
      "epoch": 0.5180388529139686,
      "grad_norm": 0.6590049252216683,
      "learning_rate": 5e-06,
      "loss": 0.761,
      "step": 140
    },
    {
      "epoch": 0.5550416281221091,
      "grad_norm": 0.7121322312495858,
      "learning_rate": 5e-06,
      "loss": 0.7581,
      "step": 150
    },
    {
      "epoch": 0.5920444033302498,
      "grad_norm": 0.6148417146408288,
      "learning_rate": 5e-06,
      "loss": 0.7596,
      "step": 160
    },
    {
      "epoch": 0.6290471785383904,
      "grad_norm": 0.7859140889960612,
      "learning_rate": 5e-06,
      "loss": 0.7607,
      "step": 170
    },
    {
      "epoch": 0.666049953746531,
      "grad_norm": 0.6262995603184957,
      "learning_rate": 5e-06,
      "loss": 0.7536,
      "step": 180
    },
    {
      "epoch": 0.7030527289546716,
      "grad_norm": 0.8388208254030984,
      "learning_rate": 5e-06,
      "loss": 0.7509,
      "step": 190
    },
    {
      "epoch": 0.7400555041628122,
      "grad_norm": 0.6363523499932093,
      "learning_rate": 5e-06,
      "loss": 0.7477,
      "step": 200
    },
    {
      "epoch": 0.7770582793709528,
      "grad_norm": 0.557587555852944,
      "learning_rate": 5e-06,
      "loss": 0.7452,
      "step": 210
    },
    {
      "epoch": 0.8140610545790934,
      "grad_norm": 0.5789377197252322,
      "learning_rate": 5e-06,
      "loss": 0.7475,
      "step": 220
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.7489244924234153,
      "learning_rate": 5e-06,
      "loss": 0.7423,
      "step": 230
    },
    {
      "epoch": 0.8880666049953746,
      "grad_norm": 0.7658948623639423,
      "learning_rate": 5e-06,
      "loss": 0.7407,
      "step": 240
    },
    {
      "epoch": 0.9250693802035153,
      "grad_norm": 0.6418871797978494,
      "learning_rate": 5e-06,
      "loss": 0.7429,
      "step": 250
    },
    {
      "epoch": 0.9620721554116559,
      "grad_norm": 0.6374125796179182,
      "learning_rate": 5e-06,
      "loss": 0.7457,
      "step": 260
    },
    {
      "epoch": 0.9990749306197965,
      "grad_norm": 0.6225924769184604,
      "learning_rate": 5e-06,
      "loss": 0.7398,
      "step": 270
    },
    {
      "epoch": 0.9990749306197965,
      "eval_loss": 0.7373877167701721,
      "eval_runtime": 285.6741,
      "eval_samples_per_second": 25.487,
      "eval_steps_per_second": 0.399,
      "step": 270
    },
    {
      "epoch": 1.0360777058279371,
      "grad_norm": 1.0999156797786174,
      "learning_rate": 5e-06,
      "loss": 0.7465,
      "step": 280
    },
    {
      "epoch": 1.0730804810360777,
      "grad_norm": 0.9033567059221405,
      "learning_rate": 5e-06,
      "loss": 0.6916,
      "step": 290
    },
    {
      "epoch": 1.1100832562442182,
      "grad_norm": 0.7225203751531558,
      "learning_rate": 5e-06,
      "loss": 0.6891,
      "step": 300
    },
    {
      "epoch": 1.147086031452359,
      "grad_norm": 0.7744016164334468,
      "learning_rate": 5e-06,
      "loss": 0.6924,
      "step": 310
    },
    {
      "epoch": 1.1840888066604995,
      "grad_norm": 0.5778670419847766,
      "learning_rate": 5e-06,
      "loss": 0.6847,
      "step": 320
    },
    {
      "epoch": 1.22109158186864,
      "grad_norm": 0.7216085851109396,
      "learning_rate": 5e-06,
      "loss": 0.6874,
      "step": 330
    },
    {
      "epoch": 1.2580943570767809,
      "grad_norm": 0.6834497791089044,
      "learning_rate": 5e-06,
      "loss": 0.685,
      "step": 340
    },
    {
      "epoch": 1.2950971322849214,
      "grad_norm": 0.5864187616860316,
      "learning_rate": 5e-06,
      "loss": 0.6887,
      "step": 350
    },
    {
      "epoch": 1.332099907493062,
      "grad_norm": 0.6435850583829653,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 360
    },
    {
      "epoch": 1.3691026827012025,
      "grad_norm": 1.026308036346174,
      "learning_rate": 5e-06,
      "loss": 0.6914,
      "step": 370
    },
    {
      "epoch": 1.4061054579093433,
      "grad_norm": 0.6177344760653564,
      "learning_rate": 5e-06,
      "loss": 0.6895,
      "step": 380
    },
    {
      "epoch": 1.4431082331174838,
      "grad_norm": 0.5769701433909521,
      "learning_rate": 5e-06,
      "loss": 0.6899,
      "step": 390
    },
    {
      "epoch": 1.4801110083256244,
      "grad_norm": 0.6884699856368363,
      "learning_rate": 5e-06,
      "loss": 0.6846,
      "step": 400
    },
    {
      "epoch": 1.5171137835337651,
      "grad_norm": 0.6251420323292153,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 410
    },
    {
      "epoch": 1.5541165587419057,
      "grad_norm": 0.6980683262318477,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 420
    },
    {
      "epoch": 1.5911193339500462,
      "grad_norm": 0.5739588719749933,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 430
    },
    {
      "epoch": 1.6281221091581868,
      "grad_norm": 0.5948644114545361,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 440
    },
    {
      "epoch": 1.6651248843663273,
      "grad_norm": 0.5752931750826541,
      "learning_rate": 5e-06,
      "loss": 0.6844,
      "step": 450
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.6226615522398108,
      "learning_rate": 5e-06,
      "loss": 0.6873,
      "step": 460
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 0.709209259109901,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 470
    },
    {
      "epoch": 1.7761332099907494,
      "grad_norm": 0.6748952075449096,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 480
    },
    {
      "epoch": 1.81313598519889,
      "grad_norm": 0.7823171266955319,
      "learning_rate": 5e-06,
      "loss": 0.6865,
      "step": 490
    },
    {
      "epoch": 1.8501387604070305,
      "grad_norm": 0.9392626078254421,
      "learning_rate": 5e-06,
      "loss": 0.6879,
      "step": 500
    },
    {
      "epoch": 1.887141535615171,
      "grad_norm": 0.7275512298704835,
      "learning_rate": 5e-06,
      "loss": 0.6826,
      "step": 510
    },
    {
      "epoch": 1.9241443108233116,
      "grad_norm": 0.8091982613912542,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 520
    },
    {
      "epoch": 1.9611470860314524,
      "grad_norm": 0.6148273009527061,
      "learning_rate": 5e-06,
      "loss": 0.6882,
      "step": 530
    },
    {
      "epoch": 1.998149861239593,
      "grad_norm": 0.6033253124158411,
      "learning_rate": 5e-06,
      "loss": 0.6816,
      "step": 540
    },
    {
      "epoch": 1.998149861239593,
      "eval_loss": 0.7247459888458252,
      "eval_runtime": 286.0565,
      "eval_samples_per_second": 25.453,
      "eval_steps_per_second": 0.399,
      "step": 540
    },
    {
      "epoch": 2.0351526364477337,
      "grad_norm": 0.6676285360839004,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 550
    },
    {
      "epoch": 2.0721554116558742,
      "grad_norm": 0.6670166248278999,
      "learning_rate": 5e-06,
      "loss": 0.6302,
      "step": 560
    },
    {
      "epoch": 2.109158186864015,
      "grad_norm": 0.7594092171720456,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 570
    },
    {
      "epoch": 2.1461609620721553,
      "grad_norm": 0.734159893671992,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 580
    },
    {
      "epoch": 2.183163737280296,
      "grad_norm": 0.7620304153842595,
      "learning_rate": 5e-06,
      "loss": 0.6331,
      "step": 590
    },
    {
      "epoch": 2.2201665124884364,
      "grad_norm": 0.8062383311336945,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 600
    },
    {
      "epoch": 2.2571692876965774,
      "grad_norm": 0.6298902178051499,
      "learning_rate": 5e-06,
      "loss": 0.6324,
      "step": 610
    },
    {
      "epoch": 2.294172062904718,
      "grad_norm": 0.6917188810207621,
      "learning_rate": 5e-06,
      "loss": 0.6373,
      "step": 620
    },
    {
      "epoch": 2.3311748381128585,
      "grad_norm": 0.9550757835002316,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 630
    },
    {
      "epoch": 2.368177613320999,
      "grad_norm": 0.5913508720714069,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 640
    },
    {
      "epoch": 2.4051803885291396,
      "grad_norm": 0.6758217219286833,
      "learning_rate": 5e-06,
      "loss": 0.6349,
      "step": 650
    },
    {
      "epoch": 2.44218316373728,
      "grad_norm": 0.5787936091702227,
      "learning_rate": 5e-06,
      "loss": 0.632,
      "step": 660
    },
    {
      "epoch": 2.4791859389454207,
      "grad_norm": 0.5976179095981347,
      "learning_rate": 5e-06,
      "loss": 0.6303,
      "step": 670
    },
    {
      "epoch": 2.5161887141535617,
      "grad_norm": 0.7954067986819094,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 680
    },
    {
      "epoch": 2.5531914893617023,
      "grad_norm": 0.6891125209012705,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 690
    },
    {
      "epoch": 2.590194264569843,
      "grad_norm": 0.8230201955415605,
      "learning_rate": 5e-06,
      "loss": 0.6371,
      "step": 700
    },
    {
      "epoch": 2.6271970397779834,
      "grad_norm": 0.6635633756561687,
      "learning_rate": 5e-06,
      "loss": 0.6382,
      "step": 710
    },
    {
      "epoch": 2.664199814986124,
      "grad_norm": 0.6306477728740528,
      "learning_rate": 5e-06,
      "loss": 0.6411,
      "step": 720
    },
    {
      "epoch": 2.7012025901942645,
      "grad_norm": 0.5984777601069516,
      "learning_rate": 5e-06,
      "loss": 0.6369,
      "step": 730
    },
    {
      "epoch": 2.738205365402405,
      "grad_norm": 0.7644851120709378,
      "learning_rate": 5e-06,
      "loss": 0.6348,
      "step": 740
    },
    {
      "epoch": 2.7752081406105455,
      "grad_norm": 0.6478127083239548,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 750
    },
    {
      "epoch": 2.8122109158186865,
      "grad_norm": 0.6453201797896143,
      "learning_rate": 5e-06,
      "loss": 0.6396,
      "step": 760
    },
    {
      "epoch": 2.849213691026827,
      "grad_norm": 0.7223841425019709,
      "learning_rate": 5e-06,
      "loss": 0.639,
      "step": 770
    },
    {
      "epoch": 2.8862164662349676,
      "grad_norm": 0.8854103875073065,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 780
    },
    {
      "epoch": 2.923219241443108,
      "grad_norm": 0.74480497953526,
      "learning_rate": 5e-06,
      "loss": 0.6387,
      "step": 790
    },
    {
      "epoch": 2.9602220166512487,
      "grad_norm": 0.5468951542823913,
      "learning_rate": 5e-06,
      "loss": 0.6346,
      "step": 800
    },
    {
      "epoch": 2.9972247918593897,
      "grad_norm": 0.7117523367143715,
      "learning_rate": 5e-06,
      "loss": 0.6406,
      "step": 810
    },
    {
      "epoch": 2.9972247918593897,
      "eval_loss": 0.7273637056350708,
      "eval_runtime": 286.127,
      "eval_samples_per_second": 25.447,
      "eval_steps_per_second": 0.398,
      "step": 810
    },
    {
      "epoch": 2.9972247918593897,
      "step": 810,
      "total_flos": 1356570789150720.0,
      "train_loss": 0.7045574435481319,
      "train_runtime": 47714.0524,
      "train_samples_per_second": 8.697,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 810,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1356570789150720.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}