{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9948364888123926, "eval_steps": 500, "global_step": 870, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03442340791738382, "grad_norm": 3.6819635678440528, "learning_rate": 5e-06, "loss": 1.0121, "step": 10 }, { "epoch": 0.06884681583476764, "grad_norm": 5.925274879357907, "learning_rate": 5e-06, "loss": 0.8959, "step": 20 }, { "epoch": 0.10327022375215146, "grad_norm": 1.9452350190286591, "learning_rate": 5e-06, "loss": 0.877, "step": 30 }, { "epoch": 0.13769363166953527, "grad_norm": 1.180696985048175, "learning_rate": 5e-06, "loss": 0.8454, "step": 40 }, { "epoch": 0.1721170395869191, "grad_norm": 0.9223333997999451, "learning_rate": 5e-06, "loss": 0.8155, "step": 50 }, { "epoch": 0.20654044750430292, "grad_norm": 0.9307430421223478, "learning_rate": 5e-06, "loss": 0.7988, "step": 60 }, { "epoch": 0.24096385542168675, "grad_norm": 0.791390062841487, "learning_rate": 5e-06, "loss": 0.7889, "step": 70 }, { "epoch": 0.27538726333907054, "grad_norm": 0.6218724507386446, "learning_rate": 5e-06, "loss": 0.7806, "step": 80 }, { "epoch": 0.3098106712564544, "grad_norm": 0.7264494941987614, "learning_rate": 5e-06, "loss": 0.7705, "step": 90 }, { "epoch": 0.3442340791738382, "grad_norm": 0.6924488359878446, "learning_rate": 5e-06, "loss": 0.7613, "step": 100 }, { "epoch": 0.37865748709122204, "grad_norm": 0.8803235860035055, "learning_rate": 5e-06, "loss": 0.761, "step": 110 }, { "epoch": 0.41308089500860584, "grad_norm": 0.6841880149421407, "learning_rate": 5e-06, "loss": 0.7555, "step": 120 }, { "epoch": 0.4475043029259897, "grad_norm": 0.667216275591224, "learning_rate": 5e-06, "loss": 0.7507, "step": 130 }, { "epoch": 0.4819277108433735, "grad_norm": 0.6640712451607944, "learning_rate": 5e-06, "loss": 0.7474, "step": 140 }, { "epoch": 0.5163511187607573, "grad_norm": 0.5645446586749623, "learning_rate": 5e-06, "loss": 0.7452, "step": 150 }, { "epoch": 0.5507745266781411, "grad_norm": 0.7052276498344197, "learning_rate": 5e-06, "loss": 0.7461, "step": 160 }, { "epoch": 0.5851979345955249, "grad_norm": 0.7026553647920556, "learning_rate": 5e-06, "loss": 0.7467, "step": 170 }, { "epoch": 0.6196213425129088, "grad_norm": 0.5956245872933223, "learning_rate": 5e-06, "loss": 0.7483, "step": 180 }, { "epoch": 0.6540447504302926, "grad_norm": 0.5945615019725103, "learning_rate": 5e-06, "loss": 0.7424, "step": 190 }, { "epoch": 0.6884681583476764, "grad_norm": 0.5926282356688969, "learning_rate": 5e-06, "loss": 0.7393, "step": 200 }, { "epoch": 0.7228915662650602, "grad_norm": 0.529261150364574, "learning_rate": 5e-06, "loss": 0.734, "step": 210 }, { "epoch": 0.7573149741824441, "grad_norm": 0.70244337869977, "learning_rate": 5e-06, "loss": 0.7344, "step": 220 }, { "epoch": 0.7917383820998278, "grad_norm": 0.5208460618800276, "learning_rate": 5e-06, "loss": 0.7332, "step": 230 }, { "epoch": 0.8261617900172117, "grad_norm": 0.6781034767797038, "learning_rate": 5e-06, "loss": 0.7323, "step": 240 }, { "epoch": 0.8605851979345955, "grad_norm": 0.6364816529741125, "learning_rate": 5e-06, "loss": 0.7311, "step": 250 }, { "epoch": 0.8950086058519794, "grad_norm": 0.6857634900258707, "learning_rate": 5e-06, "loss": 0.73, "step": 260 }, { "epoch": 0.9294320137693631, "grad_norm": 0.7533537266440626, "learning_rate": 5e-06, "loss": 0.7274, "step": 270 }, { "epoch": 0.963855421686747, "grad_norm": 0.668483865607749, "learning_rate": 5e-06, "loss": 0.7281, "step": 280 }, { "epoch": 0.9982788296041308, "grad_norm": 0.640731101658078, "learning_rate": 5e-06, "loss": 0.73, "step": 290 }, { "epoch": 0.9982788296041308, "eval_loss": 0.7240723967552185, "eval_runtime": 311.5856, "eval_samples_per_second": 25.123, "eval_steps_per_second": 0.395, "step": 290 }, { "epoch": 1.0327022375215147, "grad_norm": 0.8232801616508892, "learning_rate": 5e-06, "loss": 0.7164, "step": 300 }, { "epoch": 1.0671256454388984, "grad_norm": 0.6877231272007057, "learning_rate": 5e-06, "loss": 0.6796, "step": 310 }, { "epoch": 1.1015490533562822, "grad_norm": 0.7867017260974334, "learning_rate": 5e-06, "loss": 0.6731, "step": 320 }, { "epoch": 1.1359724612736661, "grad_norm": 0.6102991857765998, "learning_rate": 5e-06, "loss": 0.6804, "step": 330 }, { "epoch": 1.1703958691910499, "grad_norm": 0.7250816197036796, "learning_rate": 5e-06, "loss": 0.676, "step": 340 }, { "epoch": 1.2048192771084336, "grad_norm": 0.6971293258638788, "learning_rate": 5e-06, "loss": 0.6758, "step": 350 }, { "epoch": 1.2392426850258176, "grad_norm": 0.6980055976515607, "learning_rate": 5e-06, "loss": 0.6812, "step": 360 }, { "epoch": 1.2736660929432013, "grad_norm": 0.6257924181521026, "learning_rate": 5e-06, "loss": 0.6829, "step": 370 }, { "epoch": 1.3080895008605853, "grad_norm": 0.6620444223829324, "learning_rate": 5e-06, "loss": 0.6787, "step": 380 }, { "epoch": 1.342512908777969, "grad_norm": 0.6019054885784155, "learning_rate": 5e-06, "loss": 0.6793, "step": 390 }, { "epoch": 1.3769363166953528, "grad_norm": 0.6430051610733118, "learning_rate": 5e-06, "loss": 0.6774, "step": 400 }, { "epoch": 1.4113597246127367, "grad_norm": 0.5807368932507306, "learning_rate": 5e-06, "loss": 0.6812, "step": 410 }, { "epoch": 1.4457831325301205, "grad_norm": 0.6470925978408152, "learning_rate": 5e-06, "loss": 0.6747, "step": 420 }, { "epoch": 1.4802065404475044, "grad_norm": 0.6423216146537339, "learning_rate": 5e-06, "loss": 0.6764, "step": 430 }, { "epoch": 1.5146299483648882, "grad_norm": 0.5134608684735672, "learning_rate": 5e-06, "loss": 0.6744, "step": 440 }, { "epoch": 1.549053356282272, "grad_norm": 0.5563124728753217, "learning_rate": 5e-06, "loss": 0.6728, "step": 450 }, { "epoch": 1.5834767641996557, "grad_norm": 0.6269436233978866, "learning_rate": 5e-06, "loss": 0.6761, "step": 460 }, { "epoch": 1.6179001721170396, "grad_norm": 0.589734978264397, "learning_rate": 5e-06, "loss": 0.6792, "step": 470 }, { "epoch": 1.6523235800344234, "grad_norm": 0.6327759222361318, "learning_rate": 5e-06, "loss": 0.6768, "step": 480 }, { "epoch": 1.6867469879518073, "grad_norm": 0.6962103362892431, "learning_rate": 5e-06, "loss": 0.677, "step": 490 }, { "epoch": 1.721170395869191, "grad_norm": 0.5760289071453567, "learning_rate": 5e-06, "loss": 0.6799, "step": 500 }, { "epoch": 1.7555938037865748, "grad_norm": 0.6442600102377914, "learning_rate": 5e-06, "loss": 0.6773, "step": 510 }, { "epoch": 1.7900172117039586, "grad_norm": 0.7715377748849698, "learning_rate": 5e-06, "loss": 0.6761, "step": 520 }, { "epoch": 1.8244406196213425, "grad_norm": 0.5533000553027299, "learning_rate": 5e-06, "loss": 0.6736, "step": 530 }, { "epoch": 1.8588640275387265, "grad_norm": 0.6543045883003663, "learning_rate": 5e-06, "loss": 0.6724, "step": 540 }, { "epoch": 1.8932874354561102, "grad_norm": 0.7812179906299692, "learning_rate": 5e-06, "loss": 0.6745, "step": 550 }, { "epoch": 1.927710843373494, "grad_norm": 0.7706494630311692, "learning_rate": 5e-06, "loss": 0.6744, "step": 560 }, { "epoch": 1.9621342512908777, "grad_norm": 0.6182434646754749, "learning_rate": 5e-06, "loss": 0.6755, "step": 570 }, { "epoch": 1.9965576592082617, "grad_norm": 0.6295557645635617, "learning_rate": 5e-06, "loss": 0.6787, "step": 580 }, { "epoch": 2.0, "eval_loss": 0.7113586664199829, "eval_runtime": 311.8588, "eval_samples_per_second": 25.101, "eval_steps_per_second": 0.394, "step": 581 }, { "epoch": 2.0309810671256456, "grad_norm": 1.2061368930293643, "learning_rate": 5e-06, "loss": 0.6659, "step": 590 }, { "epoch": 2.0654044750430294, "grad_norm": 1.3280429163631766, "learning_rate": 5e-06, "loss": 0.6232, "step": 600 }, { "epoch": 2.099827882960413, "grad_norm": 0.8615634723401497, "learning_rate": 5e-06, "loss": 0.6239, "step": 610 }, { "epoch": 2.134251290877797, "grad_norm": 0.7137137740055365, "learning_rate": 5e-06, "loss": 0.6196, "step": 620 }, { "epoch": 2.1686746987951806, "grad_norm": 0.7012119673623688, "learning_rate": 5e-06, "loss": 0.6257, "step": 630 }, { "epoch": 2.2030981067125643, "grad_norm": 0.7539553553577881, "learning_rate": 5e-06, "loss": 0.6232, "step": 640 }, { "epoch": 2.2375215146299485, "grad_norm": 0.7635231238603634, "learning_rate": 5e-06, "loss": 0.6203, "step": 650 }, { "epoch": 2.2719449225473323, "grad_norm": 0.6908410296367468, "learning_rate": 5e-06, "loss": 0.6254, "step": 660 }, { "epoch": 2.306368330464716, "grad_norm": 0.6587745940287006, "learning_rate": 5e-06, "loss": 0.6301, "step": 670 }, { "epoch": 2.3407917383820998, "grad_norm": 0.5798868468674587, "learning_rate": 5e-06, "loss": 0.6279, "step": 680 }, { "epoch": 2.3752151462994835, "grad_norm": 0.8440728118550425, "learning_rate": 5e-06, "loss": 0.6287, "step": 690 }, { "epoch": 2.4096385542168672, "grad_norm": 0.6066489275997706, "learning_rate": 5e-06, "loss": 0.6295, "step": 700 }, { "epoch": 2.4440619621342514, "grad_norm": 0.7165812340817078, "learning_rate": 5e-06, "loss": 0.6277, "step": 710 }, { "epoch": 2.478485370051635, "grad_norm": 0.6122168594678861, "learning_rate": 5e-06, "loss": 0.6293, "step": 720 }, { "epoch": 2.512908777969019, "grad_norm": 0.81573767147419, "learning_rate": 5e-06, "loss": 0.6287, "step": 730 }, { "epoch": 2.5473321858864026, "grad_norm": 0.5606648215554753, "learning_rate": 5e-06, "loss": 0.6294, "step": 740 }, { "epoch": 2.581755593803787, "grad_norm": 0.6406975384981994, "learning_rate": 5e-06, "loss": 0.6232, "step": 750 }, { "epoch": 2.6161790017211706, "grad_norm": 0.6856546267607884, "learning_rate": 5e-06, "loss": 0.6291, "step": 760 }, { "epoch": 2.6506024096385543, "grad_norm": 0.6347450877099359, "learning_rate": 5e-06, "loss": 0.6283, "step": 770 }, { "epoch": 2.685025817555938, "grad_norm": 0.6621950677045059, "learning_rate": 5e-06, "loss": 0.6305, "step": 780 }, { "epoch": 2.719449225473322, "grad_norm": 0.6174426541448764, "learning_rate": 5e-06, "loss": 0.6255, "step": 790 }, { "epoch": 2.7538726333907055, "grad_norm": 0.6772601640104119, "learning_rate": 5e-06, "loss": 0.6314, "step": 800 }, { "epoch": 2.7882960413080893, "grad_norm": 0.5940690265376317, "learning_rate": 5e-06, "loss": 0.6261, "step": 810 }, { "epoch": 2.8227194492254735, "grad_norm": 0.5557807625472435, "learning_rate": 5e-06, "loss": 0.6266, "step": 820 }, { "epoch": 2.857142857142857, "grad_norm": 0.7023723168127282, "learning_rate": 5e-06, "loss": 0.6278, "step": 830 }, { "epoch": 2.891566265060241, "grad_norm": 0.5869122563169644, "learning_rate": 5e-06, "loss": 0.6272, "step": 840 }, { "epoch": 2.9259896729776247, "grad_norm": 0.6112033798118853, "learning_rate": 5e-06, "loss": 0.6304, "step": 850 }, { "epoch": 2.960413080895009, "grad_norm": 0.6445615202118182, "learning_rate": 5e-06, "loss": 0.631, "step": 860 }, { "epoch": 2.9948364888123926, "grad_norm": 0.6223406143063472, "learning_rate": 5e-06, "loss": 0.6305, "step": 870 }, { "epoch": 2.9948364888123926, "eval_loss": 0.7126539349555969, "eval_runtime": 315.6621, "eval_samples_per_second": 24.799, "eval_steps_per_second": 0.39, "step": 870 }, { "epoch": 2.9948364888123926, "step": 870, "total_flos": 1457073023877120.0, "train_loss": 0.6931865083760229, "train_runtime": 51465.7493, "train_samples_per_second": 8.669, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 870, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1457073023877120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }