{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9972247918593897, "eval_steps": 500, "global_step": 810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03700277520814061, "grad_norm": 2.5574069547071865, "learning_rate": 5e-06, "loss": 1.0349, "step": 10 }, { "epoch": 0.07400555041628122, "grad_norm": 5.900418380831986, "learning_rate": 5e-06, "loss": 0.9193, "step": 20 }, { "epoch": 0.11100832562442182, "grad_norm": 1.343234502219187, "learning_rate": 5e-06, "loss": 0.8706, "step": 30 }, { "epoch": 0.14801110083256244, "grad_norm": 1.3710163438367058, "learning_rate": 5e-06, "loss": 0.8479, "step": 40 }, { "epoch": 0.18501387604070305, "grad_norm": 1.0033978639288033, "learning_rate": 5e-06, "loss": 0.8243, "step": 50 }, { "epoch": 0.22201665124884365, "grad_norm": 1.3608933171871491, "learning_rate": 5e-06, "loss": 0.8107, "step": 60 }, { "epoch": 0.2590194264569843, "grad_norm": 1.1028614841829054, "learning_rate": 5e-06, "loss": 0.799, "step": 70 }, { "epoch": 0.2960222016651249, "grad_norm": 1.0434200254799575, "learning_rate": 5e-06, "loss": 0.7885, "step": 80 }, { "epoch": 0.3330249768732655, "grad_norm": 1.229274798797147, "learning_rate": 5e-06, "loss": 0.7834, "step": 90 }, { "epoch": 0.3700277520814061, "grad_norm": 0.9205856097213055, "learning_rate": 5e-06, "loss": 0.7777, "step": 100 }, { "epoch": 0.4070305272895467, "grad_norm": 0.7497767563492606, "learning_rate": 5e-06, "loss": 0.7724, "step": 110 }, { "epoch": 0.4440333024976873, "grad_norm": 0.8373352830011821, "learning_rate": 5e-06, "loss": 0.7683, "step": 120 }, { "epoch": 0.48103607770582796, "grad_norm": 0.9313579962563663, "learning_rate": 5e-06, "loss": 0.7616, "step": 130 }, { "epoch": 0.5180388529139686, "grad_norm": 0.6590049252216683, "learning_rate": 5e-06, "loss": 0.761, "step": 140 }, { "epoch": 0.5550416281221091, "grad_norm": 0.7121322312495858, "learning_rate": 5e-06, "loss": 0.7581, "step": 150 }, { "epoch": 0.5920444033302498, "grad_norm": 0.6148417146408288, "learning_rate": 5e-06, "loss": 0.7596, "step": 160 }, { "epoch": 0.6290471785383904, "grad_norm": 0.7859140889960612, "learning_rate": 5e-06, "loss": 0.7607, "step": 170 }, { "epoch": 0.666049953746531, "grad_norm": 0.6262995603184957, "learning_rate": 5e-06, "loss": 0.7536, "step": 180 }, { "epoch": 0.7030527289546716, "grad_norm": 0.8388208254030984, "learning_rate": 5e-06, "loss": 0.7509, "step": 190 }, { "epoch": 0.7400555041628122, "grad_norm": 0.6363523499932093, "learning_rate": 5e-06, "loss": 0.7477, "step": 200 }, { "epoch": 0.7770582793709528, "grad_norm": 0.557587555852944, "learning_rate": 5e-06, "loss": 0.7452, "step": 210 }, { "epoch": 0.8140610545790934, "grad_norm": 0.5789377197252322, "learning_rate": 5e-06, "loss": 0.7475, "step": 220 }, { "epoch": 0.851063829787234, "grad_norm": 0.7489244924234153, "learning_rate": 5e-06, "loss": 0.7423, "step": 230 }, { "epoch": 0.8880666049953746, "grad_norm": 0.7658948623639423, "learning_rate": 5e-06, "loss": 0.7407, "step": 240 }, { "epoch": 0.9250693802035153, "grad_norm": 0.6418871797978494, "learning_rate": 5e-06, "loss": 0.7429, "step": 250 }, { "epoch": 0.9620721554116559, "grad_norm": 0.6374125796179182, "learning_rate": 5e-06, "loss": 0.7457, "step": 260 }, { "epoch": 0.9990749306197965, "grad_norm": 0.6225924769184604, "learning_rate": 5e-06, "loss": 0.7398, "step": 270 }, { "epoch": 0.9990749306197965, "eval_loss": 0.7373877167701721, "eval_runtime": 285.6741, "eval_samples_per_second": 25.487, "eval_steps_per_second": 0.399, "step": 270 }, { "epoch": 1.0360777058279371, "grad_norm": 1.0999156797786174, "learning_rate": 5e-06, "loss": 0.7465, "step": 280 }, { "epoch": 1.0730804810360777, "grad_norm": 0.9033567059221405, "learning_rate": 5e-06, "loss": 0.6916, "step": 290 }, { "epoch": 1.1100832562442182, "grad_norm": 0.7225203751531558, "learning_rate": 5e-06, "loss": 0.6891, "step": 300 }, { "epoch": 1.147086031452359, "grad_norm": 0.7744016164334468, "learning_rate": 5e-06, "loss": 0.6924, "step": 310 }, { "epoch": 1.1840888066604995, "grad_norm": 0.5778670419847766, "learning_rate": 5e-06, "loss": 0.6847, "step": 320 }, { "epoch": 1.22109158186864, "grad_norm": 0.7216085851109396, "learning_rate": 5e-06, "loss": 0.6874, "step": 330 }, { "epoch": 1.2580943570767809, "grad_norm": 0.6834497791089044, "learning_rate": 5e-06, "loss": 0.685, "step": 340 }, { "epoch": 1.2950971322849214, "grad_norm": 0.5864187616860316, "learning_rate": 5e-06, "loss": 0.6887, "step": 350 }, { "epoch": 1.332099907493062, "grad_norm": 0.6435850583829653, "learning_rate": 5e-06, "loss": 0.6871, "step": 360 }, { "epoch": 1.3691026827012025, "grad_norm": 1.026308036346174, "learning_rate": 5e-06, "loss": 0.6914, "step": 370 }, { "epoch": 1.4061054579093433, "grad_norm": 0.6177344760653564, "learning_rate": 5e-06, "loss": 0.6895, "step": 380 }, { "epoch": 1.4431082331174838, "grad_norm": 0.5769701433909521, "learning_rate": 5e-06, "loss": 0.6899, "step": 390 }, { "epoch": 1.4801110083256244, "grad_norm": 0.6884699856368363, "learning_rate": 5e-06, "loss": 0.6846, "step": 400 }, { "epoch": 1.5171137835337651, "grad_norm": 0.6251420323292153, "learning_rate": 5e-06, "loss": 0.6886, "step": 410 }, { "epoch": 1.5541165587419057, "grad_norm": 0.6980683262318477, "learning_rate": 5e-06, "loss": 0.6845, "step": 420 }, { "epoch": 1.5911193339500462, "grad_norm": 0.5739588719749933, "learning_rate": 5e-06, "loss": 0.6885, "step": 430 }, { "epoch": 1.6281221091581868, "grad_norm": 0.5948644114545361, "learning_rate": 5e-06, "loss": 0.6894, "step": 440 }, { "epoch": 1.6651248843663273, "grad_norm": 0.5752931750826541, "learning_rate": 5e-06, "loss": 0.6844, "step": 450 }, { "epoch": 1.702127659574468, "grad_norm": 0.6226615522398108, "learning_rate": 5e-06, "loss": 0.6873, "step": 460 }, { "epoch": 1.7391304347826086, "grad_norm": 0.709209259109901, "learning_rate": 5e-06, "loss": 0.6822, "step": 470 }, { "epoch": 1.7761332099907494, "grad_norm": 0.6748952075449096, "learning_rate": 5e-06, "loss": 0.6866, "step": 480 }, { "epoch": 1.81313598519889, "grad_norm": 0.7823171266955319, "learning_rate": 5e-06, "loss": 0.6865, "step": 490 }, { "epoch": 1.8501387604070305, "grad_norm": 0.9392626078254421, "learning_rate": 5e-06, "loss": 0.6879, "step": 500 }, { "epoch": 1.887141535615171, "grad_norm": 0.7275512298704835, "learning_rate": 5e-06, "loss": 0.6826, "step": 510 }, { "epoch": 1.9241443108233116, "grad_norm": 0.8091982613912542, "learning_rate": 5e-06, "loss": 0.6841, "step": 520 }, { "epoch": 1.9611470860314524, "grad_norm": 0.6148273009527061, "learning_rate": 5e-06, "loss": 0.6882, "step": 530 }, { "epoch": 1.998149861239593, "grad_norm": 0.6033253124158411, "learning_rate": 5e-06, "loss": 0.6816, "step": 540 }, { "epoch": 1.998149861239593, "eval_loss": 0.7247459888458252, "eval_runtime": 286.0565, "eval_samples_per_second": 25.453, "eval_steps_per_second": 0.399, "step": 540 }, { "epoch": 2.0351526364477337, "grad_norm": 0.6676285360839004, "learning_rate": 5e-06, "loss": 0.6817, "step": 550 }, { "epoch": 2.0721554116558742, "grad_norm": 0.6670166248278999, "learning_rate": 5e-06, "loss": 0.6302, "step": 560 }, { "epoch": 2.109158186864015, "grad_norm": 0.7594092171720456, "learning_rate": 5e-06, "loss": 0.6333, "step": 570 }, { "epoch": 2.1461609620721553, "grad_norm": 0.734159893671992, "learning_rate": 5e-06, "loss": 0.631, "step": 580 }, { "epoch": 2.183163737280296, "grad_norm": 0.7620304153842595, "learning_rate": 5e-06, "loss": 0.6331, "step": 590 }, { "epoch": 2.2201665124884364, "grad_norm": 0.8062383311336945, "learning_rate": 5e-06, "loss": 0.6352, "step": 600 }, { "epoch": 2.2571692876965774, "grad_norm": 0.6298902178051499, "learning_rate": 5e-06, "loss": 0.6324, "step": 610 }, { "epoch": 2.294172062904718, "grad_norm": 0.6917188810207621, "learning_rate": 5e-06, "loss": 0.6373, "step": 620 }, { "epoch": 2.3311748381128585, "grad_norm": 0.9550757835002316, "learning_rate": 5e-06, "loss": 0.6357, "step": 630 }, { "epoch": 2.368177613320999, "grad_norm": 0.5913508720714069, "learning_rate": 5e-06, "loss": 0.6351, "step": 640 }, { "epoch": 2.4051803885291396, "grad_norm": 0.6758217219286833, "learning_rate": 5e-06, "loss": 0.6349, "step": 650 }, { "epoch": 2.44218316373728, "grad_norm": 0.5787936091702227, "learning_rate": 5e-06, "loss": 0.632, "step": 660 }, { "epoch": 2.4791859389454207, "grad_norm": 0.5976179095981347, "learning_rate": 5e-06, "loss": 0.6303, "step": 670 }, { "epoch": 2.5161887141535617, "grad_norm": 0.7954067986819094, "learning_rate": 5e-06, "loss": 0.6319, "step": 680 }, { "epoch": 2.5531914893617023, "grad_norm": 0.6891125209012705, "learning_rate": 5e-06, "loss": 0.6383, "step": 690 }, { "epoch": 2.590194264569843, "grad_norm": 0.8230201955415605, "learning_rate": 5e-06, "loss": 0.6371, "step": 700 }, { "epoch": 2.6271970397779834, "grad_norm": 0.6635633756561687, "learning_rate": 5e-06, "loss": 0.6382, "step": 710 }, { "epoch": 2.664199814986124, "grad_norm": 0.6306477728740528, "learning_rate": 5e-06, "loss": 0.6411, "step": 720 }, { "epoch": 2.7012025901942645, "grad_norm": 0.5984777601069516, "learning_rate": 5e-06, "loss": 0.6369, "step": 730 }, { "epoch": 2.738205365402405, "grad_norm": 0.7644851120709378, "learning_rate": 5e-06, "loss": 0.6348, "step": 740 }, { "epoch": 2.7752081406105455, "grad_norm": 0.6478127083239548, "learning_rate": 5e-06, "loss": 0.636, "step": 750 }, { "epoch": 2.8122109158186865, "grad_norm": 0.6453201797896143, "learning_rate": 5e-06, "loss": 0.6396, "step": 760 }, { "epoch": 2.849213691026827, "grad_norm": 0.7223841425019709, "learning_rate": 5e-06, "loss": 0.639, "step": 770 }, { "epoch": 2.8862164662349676, "grad_norm": 0.8854103875073065, "learning_rate": 5e-06, "loss": 0.6352, "step": 780 }, { "epoch": 2.923219241443108, "grad_norm": 0.74480497953526, "learning_rate": 5e-06, "loss": 0.6387, "step": 790 }, { "epoch": 2.9602220166512487, "grad_norm": 0.5468951542823913, "learning_rate": 5e-06, "loss": 0.6346, "step": 800 }, { "epoch": 2.9972247918593897, "grad_norm": 0.7117523367143715, "learning_rate": 5e-06, "loss": 0.6406, "step": 810 }, { "epoch": 2.9972247918593897, "eval_loss": 0.7273637056350708, "eval_runtime": 286.127, "eval_samples_per_second": 25.447, "eval_steps_per_second": 0.398, "step": 810 }, { "epoch": 2.9972247918593897, "step": 810, "total_flos": 1356570789150720.0, "train_loss": 0.7045574435481319, "train_runtime": 47714.0524, "train_samples_per_second": 8.697, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 810, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1356570789150720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }