{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9948364888123926,
  "eval_steps": 500,
  "global_step": 870,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03442340791738382,
      "grad_norm": 3.6819635678440528,
      "learning_rate": 5e-06,
      "loss": 1.0121,
      "step": 10
    },
    {
      "epoch": 0.06884681583476764,
      "grad_norm": 5.925274879357907,
      "learning_rate": 5e-06,
      "loss": 0.8959,
      "step": 20
    },
    {
      "epoch": 0.10327022375215146,
      "grad_norm": 1.9452350190286591,
      "learning_rate": 5e-06,
      "loss": 0.877,
      "step": 30
    },
    {
      "epoch": 0.13769363166953527,
      "grad_norm": 1.180696985048175,
      "learning_rate": 5e-06,
      "loss": 0.8454,
      "step": 40
    },
    {
      "epoch": 0.1721170395869191,
      "grad_norm": 0.9223333997999451,
      "learning_rate": 5e-06,
      "loss": 0.8155,
      "step": 50
    },
    {
      "epoch": 0.20654044750430292,
      "grad_norm": 0.9307430421223478,
      "learning_rate": 5e-06,
      "loss": 0.7988,
      "step": 60
    },
    {
      "epoch": 0.24096385542168675,
      "grad_norm": 0.791390062841487,
      "learning_rate": 5e-06,
      "loss": 0.7889,
      "step": 70
    },
    {
      "epoch": 0.27538726333907054,
      "grad_norm": 0.6218724507386446,
      "learning_rate": 5e-06,
      "loss": 0.7806,
      "step": 80
    },
    {
      "epoch": 0.3098106712564544,
      "grad_norm": 0.7264494941987614,
      "learning_rate": 5e-06,
      "loss": 0.7705,
      "step": 90
    },
    {
      "epoch": 0.3442340791738382,
      "grad_norm": 0.6924488359878446,
      "learning_rate": 5e-06,
      "loss": 0.7613,
      "step": 100
    },
    {
      "epoch": 0.37865748709122204,
      "grad_norm": 0.8803235860035055,
      "learning_rate": 5e-06,
      "loss": 0.761,
      "step": 110
    },
    {
      "epoch": 0.41308089500860584,
      "grad_norm": 0.6841880149421407,
      "learning_rate": 5e-06,
      "loss": 0.7555,
      "step": 120
    },
    {
      "epoch": 0.4475043029259897,
      "grad_norm": 0.667216275591224,
      "learning_rate": 5e-06,
      "loss": 0.7507,
      "step": 130
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.6640712451607944,
      "learning_rate": 5e-06,
      "loss": 0.7474,
      "step": 140
    },
    {
      "epoch": 0.5163511187607573,
      "grad_norm": 0.5645446586749623,
      "learning_rate": 5e-06,
      "loss": 0.7452,
      "step": 150
    },
    {
      "epoch": 0.5507745266781411,
      "grad_norm": 0.7052276498344197,
      "learning_rate": 5e-06,
      "loss": 0.7461,
      "step": 160
    },
    {
      "epoch": 0.5851979345955249,
      "grad_norm": 0.7026553647920556,
      "learning_rate": 5e-06,
      "loss": 0.7467,
      "step": 170
    },
    {
      "epoch": 0.6196213425129088,
      "grad_norm": 0.5956245872933223,
      "learning_rate": 5e-06,
      "loss": 0.7483,
      "step": 180
    },
    {
      "epoch": 0.6540447504302926,
      "grad_norm": 0.5945615019725103,
      "learning_rate": 5e-06,
      "loss": 0.7424,
      "step": 190
    },
    {
      "epoch": 0.6884681583476764,
      "grad_norm": 0.5926282356688969,
      "learning_rate": 5e-06,
      "loss": 0.7393,
      "step": 200
    },
    {
      "epoch": 0.7228915662650602,
      "grad_norm": 0.529261150364574,
      "learning_rate": 5e-06,
      "loss": 0.734,
      "step": 210
    },
    {
      "epoch": 0.7573149741824441,
      "grad_norm": 0.70244337869977,
      "learning_rate": 5e-06,
      "loss": 0.7344,
      "step": 220
    },
    {
      "epoch": 0.7917383820998278,
      "grad_norm": 0.5208460618800276,
      "learning_rate": 5e-06,
      "loss": 0.7332,
      "step": 230
    },
    {
      "epoch": 0.8261617900172117,
      "grad_norm": 0.6781034767797038,
      "learning_rate": 5e-06,
      "loss": 0.7323,
      "step": 240
    },
    {
      "epoch": 0.8605851979345955,
      "grad_norm": 0.6364816529741125,
      "learning_rate": 5e-06,
      "loss": 0.7311,
      "step": 250
    },
    {
      "epoch": 0.8950086058519794,
      "grad_norm": 0.6857634900258707,
      "learning_rate": 5e-06,
      "loss": 0.73,
      "step": 260
    },
    {
      "epoch": 0.9294320137693631,
      "grad_norm": 0.7533537266440626,
      "learning_rate": 5e-06,
      "loss": 0.7274,
      "step": 270
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 0.668483865607749,
      "learning_rate": 5e-06,
      "loss": 0.7281,
      "step": 280
    },
    {
      "epoch": 0.9982788296041308,
      "grad_norm": 0.640731101658078,
      "learning_rate": 5e-06,
      "loss": 0.73,
      "step": 290
    },
    {
      "epoch": 0.9982788296041308,
      "eval_loss": 0.7240723967552185,
      "eval_runtime": 311.5856,
      "eval_samples_per_second": 25.123,
      "eval_steps_per_second": 0.395,
      "step": 290
    },
    {
      "epoch": 1.0327022375215147,
      "grad_norm": 0.8232801616508892,
      "learning_rate": 5e-06,
      "loss": 0.7164,
      "step": 300
    },
    {
      "epoch": 1.0671256454388984,
      "grad_norm": 0.6877231272007057,
      "learning_rate": 5e-06,
      "loss": 0.6796,
      "step": 310
    },
    {
      "epoch": 1.1015490533562822,
      "grad_norm": 0.7867017260974334,
      "learning_rate": 5e-06,
      "loss": 0.6731,
      "step": 320
    },
    {
      "epoch": 1.1359724612736661,
      "grad_norm": 0.6102991857765998,
      "learning_rate": 5e-06,
      "loss": 0.6804,
      "step": 330
    },
    {
      "epoch": 1.1703958691910499,
      "grad_norm": 0.7250816197036796,
      "learning_rate": 5e-06,
      "loss": 0.676,
      "step": 340
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 0.6971293258638788,
      "learning_rate": 5e-06,
      "loss": 0.6758,
      "step": 350
    },
    {
      "epoch": 1.2392426850258176,
      "grad_norm": 0.6980055976515607,
      "learning_rate": 5e-06,
      "loss": 0.6812,
      "step": 360
    },
    {
      "epoch": 1.2736660929432013,
      "grad_norm": 0.6257924181521026,
      "learning_rate": 5e-06,
      "loss": 0.6829,
      "step": 370
    },
    {
      "epoch": 1.3080895008605853,
      "grad_norm": 0.6620444223829324,
      "learning_rate": 5e-06,
      "loss": 0.6787,
      "step": 380
    },
    {
      "epoch": 1.342512908777969,
      "grad_norm": 0.6019054885784155,
      "learning_rate": 5e-06,
      "loss": 0.6793,
      "step": 390
    },
    {
      "epoch": 1.3769363166953528,
      "grad_norm": 0.6430051610733118,
      "learning_rate": 5e-06,
      "loss": 0.6774,
      "step": 400
    },
    {
      "epoch": 1.4113597246127367,
      "grad_norm": 0.5807368932507306,
      "learning_rate": 5e-06,
      "loss": 0.6812,
      "step": 410
    },
    {
      "epoch": 1.4457831325301205,
      "grad_norm": 0.6470925978408152,
      "learning_rate": 5e-06,
      "loss": 0.6747,
      "step": 420
    },
    {
      "epoch": 1.4802065404475044,
      "grad_norm": 0.6423216146537339,
      "learning_rate": 5e-06,
      "loss": 0.6764,
      "step": 430
    },
    {
      "epoch": 1.5146299483648882,
      "grad_norm": 0.5134608684735672,
      "learning_rate": 5e-06,
      "loss": 0.6744,
      "step": 440
    },
    {
      "epoch": 1.549053356282272,
      "grad_norm": 0.5563124728753217,
      "learning_rate": 5e-06,
      "loss": 0.6728,
      "step": 450
    },
    {
      "epoch": 1.5834767641996557,
      "grad_norm": 0.6269436233978866,
      "learning_rate": 5e-06,
      "loss": 0.6761,
      "step": 460
    },
    {
      "epoch": 1.6179001721170396,
      "grad_norm": 0.589734978264397,
      "learning_rate": 5e-06,
      "loss": 0.6792,
      "step": 470
    },
    {
      "epoch": 1.6523235800344234,
      "grad_norm": 0.6327759222361318,
      "learning_rate": 5e-06,
      "loss": 0.6768,
      "step": 480
    },
    {
      "epoch": 1.6867469879518073,
      "grad_norm": 0.6962103362892431,
      "learning_rate": 5e-06,
      "loss": 0.677,
      "step": 490
    },
    {
      "epoch": 1.721170395869191,
      "grad_norm": 0.5760289071453567,
      "learning_rate": 5e-06,
      "loss": 0.6799,
      "step": 500
    },
    {
      "epoch": 1.7555938037865748,
      "grad_norm": 0.6442600102377914,
      "learning_rate": 5e-06,
      "loss": 0.6773,
      "step": 510
    },
    {
      "epoch": 1.7900172117039586,
      "grad_norm": 0.7715377748849698,
      "learning_rate": 5e-06,
      "loss": 0.6761,
      "step": 520
    },
    {
      "epoch": 1.8244406196213425,
      "grad_norm": 0.5533000553027299,
      "learning_rate": 5e-06,
      "loss": 0.6736,
      "step": 530
    },
    {
      "epoch": 1.8588640275387265,
      "grad_norm": 0.6543045883003663,
      "learning_rate": 5e-06,
      "loss": 0.6724,
      "step": 540
    },
    {
      "epoch": 1.8932874354561102,
      "grad_norm": 0.7812179906299692,
      "learning_rate": 5e-06,
      "loss": 0.6745,
      "step": 550
    },
    {
      "epoch": 1.927710843373494,
      "grad_norm": 0.7706494630311692,
      "learning_rate": 5e-06,
      "loss": 0.6744,
      "step": 560
    },
    {
      "epoch": 1.9621342512908777,
      "grad_norm": 0.6182434646754749,
      "learning_rate": 5e-06,
      "loss": 0.6755,
      "step": 570
    },
    {
      "epoch": 1.9965576592082617,
      "grad_norm": 0.6295557645635617,
      "learning_rate": 5e-06,
      "loss": 0.6787,
      "step": 580
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7113586664199829,
      "eval_runtime": 311.8588,
      "eval_samples_per_second": 25.101,
      "eval_steps_per_second": 0.394,
      "step": 581
    },
    {
      "epoch": 2.0309810671256456,
      "grad_norm": 1.2061368930293643,
      "learning_rate": 5e-06,
      "loss": 0.6659,
      "step": 590
    },
    {
      "epoch": 2.0654044750430294,
      "grad_norm": 1.3280429163631766,
      "learning_rate": 5e-06,
      "loss": 0.6232,
      "step": 600
    },
    {
      "epoch": 2.099827882960413,
      "grad_norm": 0.8615634723401497,
      "learning_rate": 5e-06,
      "loss": 0.6239,
      "step": 610
    },
    {
      "epoch": 2.134251290877797,
      "grad_norm": 0.7137137740055365,
      "learning_rate": 5e-06,
      "loss": 0.6196,
      "step": 620
    },
    {
      "epoch": 2.1686746987951806,
      "grad_norm": 0.7012119673623688,
      "learning_rate": 5e-06,
      "loss": 0.6257,
      "step": 630
    },
    {
      "epoch": 2.2030981067125643,
      "grad_norm": 0.7539553553577881,
      "learning_rate": 5e-06,
      "loss": 0.6232,
      "step": 640
    },
    {
      "epoch": 2.2375215146299485,
      "grad_norm": 0.7635231238603634,
      "learning_rate": 5e-06,
      "loss": 0.6203,
      "step": 650
    },
    {
      "epoch": 2.2719449225473323,
      "grad_norm": 0.6908410296367468,
      "learning_rate": 5e-06,
      "loss": 0.6254,
      "step": 660
    },
    {
      "epoch": 2.306368330464716,
      "grad_norm": 0.6587745940287006,
      "learning_rate": 5e-06,
      "loss": 0.6301,
      "step": 670
    },
    {
      "epoch": 2.3407917383820998,
      "grad_norm": 0.5798868468674587,
      "learning_rate": 5e-06,
      "loss": 0.6279,
      "step": 680
    },
    {
      "epoch": 2.3752151462994835,
      "grad_norm": 0.8440728118550425,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 690
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 0.6066489275997706,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 700
    },
    {
      "epoch": 2.4440619621342514,
      "grad_norm": 0.7165812340817078,
      "learning_rate": 5e-06,
      "loss": 0.6277,
      "step": 710
    },
    {
      "epoch": 2.478485370051635,
      "grad_norm": 0.6122168594678861,
      "learning_rate": 5e-06,
      "loss": 0.6293,
      "step": 720
    },
    {
      "epoch": 2.512908777969019,
      "grad_norm": 0.81573767147419,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 730
    },
    {
      "epoch": 2.5473321858864026,
      "grad_norm": 0.5606648215554753,
      "learning_rate": 5e-06,
      "loss": 0.6294,
      "step": 740
    },
    {
      "epoch": 2.581755593803787,
      "grad_norm": 0.6406975384981994,
      "learning_rate": 5e-06,
      "loss": 0.6232,
      "step": 750
    },
    {
      "epoch": 2.6161790017211706,
      "grad_norm": 0.6856546267607884,
      "learning_rate": 5e-06,
      "loss": 0.6291,
      "step": 760
    },
    {
      "epoch": 2.6506024096385543,
      "grad_norm": 0.6347450877099359,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 770
    },
    {
      "epoch": 2.685025817555938,
      "grad_norm": 0.6621950677045059,
      "learning_rate": 5e-06,
      "loss": 0.6305,
      "step": 780
    },
    {
      "epoch": 2.719449225473322,
      "grad_norm": 0.6174426541448764,
      "learning_rate": 5e-06,
      "loss": 0.6255,
      "step": 790
    },
    {
      "epoch": 2.7538726333907055,
      "grad_norm": 0.6772601640104119,
      "learning_rate": 5e-06,
      "loss": 0.6314,
      "step": 800
    },
    {
      "epoch": 2.7882960413080893,
      "grad_norm": 0.5940690265376317,
      "learning_rate": 5e-06,
      "loss": 0.6261,
      "step": 810
    },
    {
      "epoch": 2.8227194492254735,
      "grad_norm": 0.5557807625472435,
      "learning_rate": 5e-06,
      "loss": 0.6266,
      "step": 820
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.7023723168127282,
      "learning_rate": 5e-06,
      "loss": 0.6278,
      "step": 830
    },
    {
      "epoch": 2.891566265060241,
      "grad_norm": 0.5869122563169644,
      "learning_rate": 5e-06,
      "loss": 0.6272,
      "step": 840
    },
    {
      "epoch": 2.9259896729776247,
      "grad_norm": 0.6112033798118853,
      "learning_rate": 5e-06,
      "loss": 0.6304,
      "step": 850
    },
    {
      "epoch": 2.960413080895009,
      "grad_norm": 0.6445615202118182,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 860
    },
    {
      "epoch": 2.9948364888123926,
      "grad_norm": 0.6223406143063472,
      "learning_rate": 5e-06,
      "loss": 0.6305,
      "step": 870
    },
    {
      "epoch": 2.9948364888123926,
      "eval_loss": 0.7126539349555969,
      "eval_runtime": 315.6621,
      "eval_samples_per_second": 24.799,
      "eval_steps_per_second": 0.39,
      "step": 870
    },
    {
      "epoch": 2.9948364888123926,
      "step": 870,
      "total_flos": 1457073023877120.0,
      "train_loss": 0.6931865083760229,
      "train_runtime": 51465.7493,
      "train_samples_per_second": 8.669,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 870,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1457073023877120.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}