| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 7.0, |
| "eval_steps": 500, |
| "global_step": 910, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.07716049382716049, |
| "grad_norm": 0.6794719696044922, |
| "learning_rate": 0.0004955, |
| "loss": 1.8548, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.15432098765432098, |
| "grad_norm": 0.5799520611763, |
| "learning_rate": 0.0004905, |
| "loss": 1.3989, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.23148148148148148, |
| "grad_norm": 0.44065240025520325, |
| "learning_rate": 0.0004855, |
| "loss": 1.2791, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.30864197530864196, |
| "grad_norm": 0.4982340931892395, |
| "learning_rate": 0.00048049999999999997, |
| "loss": 1.2222, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.38580246913580246, |
| "grad_norm": 0.441045880317688, |
| "learning_rate": 0.0004755, |
| "loss": 1.1567, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.46296296296296297, |
| "grad_norm": 0.5147080421447754, |
| "learning_rate": 0.0004705, |
| "loss": 1.1233, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5401234567901234, |
| "grad_norm": 0.4304927587509155, |
| "learning_rate": 0.00046550000000000004, |
| "loss": 1.1042, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6172839506172839, |
| "grad_norm": 0.4338945150375366, |
| "learning_rate": 0.0004605, |
| "loss": 1.0991, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6944444444444444, |
| "grad_norm": 0.4131755232810974, |
| "learning_rate": 0.0004555, |
| "loss": 1.0721, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.7716049382716049, |
| "grad_norm": 0.42770788073539734, |
| "learning_rate": 0.0004505, |
| "loss": 1.061, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.8487654320987654, |
| "grad_norm": 0.40293624997138977, |
| "learning_rate": 0.00044550000000000004, |
| "loss": 1.0456, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.9259259259259259, |
| "grad_norm": 0.405519038438797, |
| "learning_rate": 0.00044050000000000003, |
| "loss": 1.0443, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.49841395020484924, |
| "learning_rate": 0.0004355, |
| "loss": 1.0311, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.0771604938271604, |
| "grad_norm": 0.40889298915863037, |
| "learning_rate": 0.0004305, |
| "loss": 0.9679, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.154320987654321, |
| "grad_norm": 0.43716397881507874, |
| "learning_rate": 0.0004255, |
| "loss": 0.963, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.2314814814814814, |
| "grad_norm": 0.41822507977485657, |
| "learning_rate": 0.0004205, |
| "loss": 0.9622, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.308641975308642, |
| "grad_norm": 0.4274284541606903, |
| "learning_rate": 0.00041549999999999996, |
| "loss": 0.9557, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.3858024691358024, |
| "grad_norm": 0.4369387924671173, |
| "learning_rate": 0.0004105, |
| "loss": 0.9537, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.462962962962963, |
| "grad_norm": 0.41516125202178955, |
| "learning_rate": 0.00040550000000000004, |
| "loss": 0.9556, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.5401234567901234, |
| "grad_norm": 0.4300761818885803, |
| "learning_rate": 0.00040050000000000003, |
| "loss": 0.9422, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.617283950617284, |
| "grad_norm": 0.4175296723842621, |
| "learning_rate": 0.0003955, |
| "loss": 0.9542, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.6944444444444444, |
| "grad_norm": 0.42389756441116333, |
| "learning_rate": 0.0003905, |
| "loss": 0.9491, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.7716049382716048, |
| "grad_norm": 0.41654592752456665, |
| "learning_rate": 0.0003855, |
| "loss": 0.9419, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.8487654320987654, |
| "grad_norm": 0.43183737993240356, |
| "learning_rate": 0.00038050000000000003, |
| "loss": 0.9379, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.925925925925926, |
| "grad_norm": 0.42406436800956726, |
| "learning_rate": 0.0003755, |
| "loss": 0.9458, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.555463433265686, |
| "learning_rate": 0.0003705, |
| "loss": 0.936, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.0771604938271606, |
| "grad_norm": 0.44835132360458374, |
| "learning_rate": 0.0003655, |
| "loss": 0.8446, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.154320987654321, |
| "grad_norm": 0.4730922281742096, |
| "learning_rate": 0.0003605, |
| "loss": 0.8375, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.2314814814814814, |
| "grad_norm": 0.49563348293304443, |
| "learning_rate": 0.00035549999999999997, |
| "loss": 0.8404, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.308641975308642, |
| "grad_norm": 0.4677387475967407, |
| "learning_rate": 0.0003505, |
| "loss": 0.8498, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.3858024691358026, |
| "grad_norm": 0.48064374923706055, |
| "learning_rate": 0.0003455, |
| "loss": 0.847, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.462962962962963, |
| "grad_norm": 0.4703057110309601, |
| "learning_rate": 0.00034050000000000004, |
| "loss": 0.8548, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.5401234567901234, |
| "grad_norm": 0.4649062156677246, |
| "learning_rate": 0.0003355, |
| "loss": 0.8513, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.617283950617284, |
| "grad_norm": 0.5000564455986023, |
| "learning_rate": 0.0003305, |
| "loss": 0.865, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.6944444444444446, |
| "grad_norm": 0.47587794065475464, |
| "learning_rate": 0.0003255, |
| "loss": 0.8638, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.771604938271605, |
| "grad_norm": 0.4875640273094177, |
| "learning_rate": 0.00032050000000000004, |
| "loss": 0.8568, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.8487654320987654, |
| "grad_norm": 0.4738157093524933, |
| "learning_rate": 0.0003155, |
| "loss": 0.8538, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.925925925925926, |
| "grad_norm": 0.4902881681919098, |
| "learning_rate": 0.0003105, |
| "loss": 0.8571, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.6196104884147644, |
| "learning_rate": 0.0003055, |
| "loss": 0.8598, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.0771604938271606, |
| "grad_norm": 0.5065454244613647, |
| "learning_rate": 0.0003005, |
| "loss": 0.7451, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.154320987654321, |
| "grad_norm": 0.5143163204193115, |
| "learning_rate": 0.00029549999999999997, |
| "loss": 0.7511, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.2314814814814814, |
| "grad_norm": 0.5133650302886963, |
| "learning_rate": 0.00029049999999999996, |
| "loss": 0.7565, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.308641975308642, |
| "grad_norm": 0.5453213453292847, |
| "learning_rate": 0.0002855, |
| "loss": 0.7533, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.3858024691358026, |
| "grad_norm": 0.5292612314224243, |
| "learning_rate": 0.00028050000000000004, |
| "loss": 0.7688, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.462962962962963, |
| "grad_norm": 0.5598148107528687, |
| "learning_rate": 0.00027550000000000003, |
| "loss": 0.7697, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.5401234567901234, |
| "grad_norm": 0.5388815402984619, |
| "learning_rate": 0.0002705, |
| "loss": 0.7717, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.617283950617284, |
| "grad_norm": 0.5305619835853577, |
| "learning_rate": 0.0002655, |
| "loss": 0.7775, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.6944444444444446, |
| "grad_norm": 0.5664732456207275, |
| "learning_rate": 0.0002605, |
| "loss": 0.7758, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.771604938271605, |
| "grad_norm": 0.5241378545761108, |
| "learning_rate": 0.00025550000000000003, |
| "loss": 0.7861, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.8487654320987654, |
| "grad_norm": 0.5400358438491821, |
| "learning_rate": 0.0002505, |
| "loss": 0.7788, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.925925925925926, |
| "grad_norm": 0.5382837653160095, |
| "learning_rate": 0.0002455, |
| "loss": 0.7829, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.72385573387146, |
| "learning_rate": 0.0002405, |
| "loss": 0.7822, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.077160493827161, |
| "grad_norm": 0.549626350402832, |
| "learning_rate": 0.0002355, |
| "loss": 0.677, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.154320987654321, |
| "grad_norm": 0.5500410795211792, |
| "learning_rate": 0.00023050000000000002, |
| "loss": 0.6724, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.231481481481482, |
| "grad_norm": 0.5754207372665405, |
| "learning_rate": 0.0002255, |
| "loss": 0.683, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.308641975308642, |
| "grad_norm": 0.5669319033622742, |
| "learning_rate": 0.0002205, |
| "loss": 0.681, |
| "step": 560 |
| }, |
| { |
| "epoch": 4.385802469135802, |
| "grad_norm": 0.5849311947822571, |
| "learning_rate": 0.0002155, |
| "loss": 0.6851, |
| "step": 570 |
| }, |
| { |
| "epoch": 4.462962962962963, |
| "grad_norm": 0.5972752571105957, |
| "learning_rate": 0.0002105, |
| "loss": 0.692, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.540123456790123, |
| "grad_norm": 0.5939822196960449, |
| "learning_rate": 0.00020549999999999998, |
| "loss": 0.6997, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.617283950617284, |
| "grad_norm": 0.5963209867477417, |
| "learning_rate": 0.00020050000000000002, |
| "loss": 0.7016, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.694444444444445, |
| "grad_norm": 0.5815416574478149, |
| "learning_rate": 0.0001955, |
| "loss": 0.7036, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.771604938271605, |
| "grad_norm": 0.5937020182609558, |
| "learning_rate": 0.0001905, |
| "loss": 0.7036, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.848765432098766, |
| "grad_norm": 0.5897537469863892, |
| "learning_rate": 0.0001855, |
| "loss": 0.7072, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.925925925925926, |
| "grad_norm": 0.5775209069252014, |
| "learning_rate": 0.0001805, |
| "loss": 0.7119, |
| "step": 640 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.7759206295013428, |
| "learning_rate": 0.00017549999999999998, |
| "loss": 0.7135, |
| "step": 650 |
| }, |
| { |
| "epoch": 5.077160493827161, |
| "grad_norm": 0.600871741771698, |
| "learning_rate": 0.00017050000000000002, |
| "loss": 0.6034, |
| "step": 660 |
| }, |
| { |
| "epoch": 5.154320987654321, |
| "grad_norm": 0.6454735994338989, |
| "learning_rate": 0.0001655, |
| "loss": 0.6104, |
| "step": 670 |
| }, |
| { |
| "epoch": 5.231481481481482, |
| "grad_norm": 0.63775235414505, |
| "learning_rate": 0.0001605, |
| "loss": 0.6146, |
| "step": 680 |
| }, |
| { |
| "epoch": 5.308641975308642, |
| "grad_norm": 0.6179593205451965, |
| "learning_rate": 0.0001555, |
| "loss": 0.6223, |
| "step": 690 |
| }, |
| { |
| "epoch": 5.385802469135802, |
| "grad_norm": 0.6479745507240295, |
| "learning_rate": 0.0001505, |
| "loss": 0.6166, |
| "step": 700 |
| }, |
| { |
| "epoch": 5.462962962962963, |
| "grad_norm": 0.6485195159912109, |
| "learning_rate": 0.00014549999999999999, |
| "loss": 0.6304, |
| "step": 710 |
| }, |
| { |
| "epoch": 5.540123456790123, |
| "grad_norm": 0.6457725167274475, |
| "learning_rate": 0.00014050000000000003, |
| "loss": 0.6317, |
| "step": 720 |
| }, |
| { |
| "epoch": 5.617283950617284, |
| "grad_norm": 0.6250383257865906, |
| "learning_rate": 0.00013550000000000001, |
| "loss": 0.6368, |
| "step": 730 |
| }, |
| { |
| "epoch": 5.694444444444445, |
| "grad_norm": 0.6302134990692139, |
| "learning_rate": 0.0001305, |
| "loss": 0.6349, |
| "step": 740 |
| }, |
| { |
| "epoch": 5.771604938271605, |
| "grad_norm": 0.6694247126579285, |
| "learning_rate": 0.00012550000000000001, |
| "loss": 0.6381, |
| "step": 750 |
| }, |
| { |
| "epoch": 5.848765432098766, |
| "grad_norm": 0.6490413546562195, |
| "learning_rate": 0.0001205, |
| "loss": 0.6388, |
| "step": 760 |
| }, |
| { |
| "epoch": 5.925925925925926, |
| "grad_norm": 0.6362209916114807, |
| "learning_rate": 0.0001155, |
| "loss": 0.6368, |
| "step": 770 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.8477674722671509, |
| "learning_rate": 0.0001105, |
| "loss": 0.6389, |
| "step": 780 |
| }, |
| { |
| "epoch": 6.077160493827161, |
| "grad_norm": 0.6751664876937866, |
| "learning_rate": 0.0001055, |
| "loss": 0.5601, |
| "step": 790 |
| }, |
| { |
| "epoch": 6.154320987654321, |
| "grad_norm": 0.6612330079078674, |
| "learning_rate": 0.0001005, |
| "loss": 0.5574, |
| "step": 800 |
| }, |
| { |
| "epoch": 6.231481481481482, |
| "grad_norm": 0.6463121175765991, |
| "learning_rate": 9.55e-05, |
| "loss": 0.5605, |
| "step": 810 |
| }, |
| { |
| "epoch": 6.308641975308642, |
| "grad_norm": 0.7031050324440002, |
| "learning_rate": 9.05e-05, |
| "loss": 0.5602, |
| "step": 820 |
| }, |
| { |
| "epoch": 6.385802469135802, |
| "grad_norm": 0.6518180966377258, |
| "learning_rate": 8.55e-05, |
| "loss": 0.5609, |
| "step": 830 |
| }, |
| { |
| "epoch": 6.462962962962963, |
| "grad_norm": 0.6783486008644104, |
| "learning_rate": 8.05e-05, |
| "loss": 0.5705, |
| "step": 840 |
| }, |
| { |
| "epoch": 6.540123456790123, |
| "grad_norm": 0.6883763670921326, |
| "learning_rate": 7.55e-05, |
| "loss": 0.5684, |
| "step": 850 |
| }, |
| { |
| "epoch": 6.617283950617284, |
| "grad_norm": 0.6695120334625244, |
| "learning_rate": 7.049999999999999e-05, |
| "loss": 0.5625, |
| "step": 860 |
| }, |
| { |
| "epoch": 6.694444444444445, |
| "grad_norm": 0.678608238697052, |
| "learning_rate": 6.55e-05, |
| "loss": 0.5772, |
| "step": 870 |
| }, |
| { |
| "epoch": 6.771604938271605, |
| "grad_norm": 0.6769826412200928, |
| "learning_rate": 6.05e-05, |
| "loss": 0.5718, |
| "step": 880 |
| }, |
| { |
| "epoch": 6.848765432098766, |
| "grad_norm": 0.6706126928329468, |
| "learning_rate": 5.55e-05, |
| "loss": 0.5748, |
| "step": 890 |
| }, |
| { |
| "epoch": 6.925925925925926, |
| "grad_norm": 0.7031095027923584, |
| "learning_rate": 5.05e-05, |
| "loss": 0.5788, |
| "step": 900 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 0.9296043515205383, |
| "learning_rate": 4.55e-05, |
| "loss": 0.5746, |
| "step": 910 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.91601644111061e+17, |
| "train_batch_size": 20, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|