| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.941747572815534, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009708737864077669, |
| "grad_norm": 13.123837981408181, |
| "learning_rate": 2.3809523809523811e-07, |
| "loss": 3.4152, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.019417475728155338, |
| "grad_norm": 11.341939206512826, |
| "learning_rate": 4.7619047619047623e-07, |
| "loss": 3.3469, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.02912621359223301, |
| "grad_norm": 12.666841013000388, |
| "learning_rate": 7.142857142857143e-07, |
| "loss": 3.3404, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.038834951456310676, |
| "grad_norm": 12.38576292589342, |
| "learning_rate": 9.523809523809525e-07, |
| "loss": 3.3661, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.04854368932038835, |
| "grad_norm": 13.50943060392901, |
| "learning_rate": 1.1904761904761906e-06, |
| "loss": 3.6456, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.05825242718446602, |
| "grad_norm": 12.60070687752704, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 3.4586, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.06796116504854369, |
| "grad_norm": 12.207104870760183, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 3.3265, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.07766990291262135, |
| "grad_norm": 11.396052085771197, |
| "learning_rate": 1.904761904761905e-06, |
| "loss": 3.1192, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.08737864077669903, |
| "grad_norm": 10.849802772604386, |
| "learning_rate": 2.1428571428571427e-06, |
| "loss": 3.2892, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0970873786407767, |
| "grad_norm": 9.303944959402013, |
| "learning_rate": 2.380952380952381e-06, |
| "loss": 2.9846, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10679611650485436, |
| "grad_norm": 9.020909167429956, |
| "learning_rate": 2.6190476190476192e-06, |
| "loss": 3.0565, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.11650485436893204, |
| "grad_norm": 8.408768996145499, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 3.1117, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1262135922330097, |
| "grad_norm": 5.2970938586045, |
| "learning_rate": 3.0952380952380957e-06, |
| "loss": 2.6557, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.13592233009708737, |
| "grad_norm": 4.966874609089927, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 2.5166, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.14563106796116504, |
| "grad_norm": 5.3359290318286865, |
| "learning_rate": 3.5714285714285718e-06, |
| "loss": 2.5143, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.1553398058252427, |
| "grad_norm": 5.692915225500684, |
| "learning_rate": 3.80952380952381e-06, |
| "loss": 2.4596, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.1650485436893204, |
| "grad_norm": 5.942402422601951, |
| "learning_rate": 4.047619047619048e-06, |
| "loss": 2.3662, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.17475728155339806, |
| "grad_norm": 10.92147820851386, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": 2.1289, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.18446601941747573, |
| "grad_norm": 8.249551104487301, |
| "learning_rate": 4.523809523809524e-06, |
| "loss": 1.9372, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.1941747572815534, |
| "grad_norm": 5.912610258594789, |
| "learning_rate": 4.761904761904762e-06, |
| "loss": 1.9367, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.20388349514563106, |
| "grad_norm": 3.466900934428963, |
| "learning_rate": 5e-06, |
| "loss": 1.9218, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.21359223300970873, |
| "grad_norm": 2.3996812791210087, |
| "learning_rate": 5.2380952380952384e-06, |
| "loss": 1.7494, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.22330097087378642, |
| "grad_norm": 2.071520914915747, |
| "learning_rate": 5.476190476190477e-06, |
| "loss": 1.7066, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.23300970873786409, |
| "grad_norm": 1.9787564899007526, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 1.679, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.24271844660194175, |
| "grad_norm": 2.0523892248740663, |
| "learning_rate": 5.9523809523809525e-06, |
| "loss": 1.6847, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2524271844660194, |
| "grad_norm": 2.728014645148073, |
| "learning_rate": 6.1904761904761914e-06, |
| "loss": 1.5834, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.2621359223300971, |
| "grad_norm": 1.9243753468407738, |
| "learning_rate": 6.4285714285714295e-06, |
| "loss": 1.6023, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.27184466019417475, |
| "grad_norm": 1.7443060926437244, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.5585, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.2815533980582524, |
| "grad_norm": 1.9667089972611325, |
| "learning_rate": 6.9047619047619055e-06, |
| "loss": 1.5523, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.2912621359223301, |
| "grad_norm": 1.6638656525516557, |
| "learning_rate": 7.1428571428571436e-06, |
| "loss": 1.4028, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.30097087378640774, |
| "grad_norm": 1.8288468644809657, |
| "learning_rate": 7.380952380952382e-06, |
| "loss": 1.5225, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.3106796116504854, |
| "grad_norm": 1.9316948734534611, |
| "learning_rate": 7.61904761904762e-06, |
| "loss": 1.6076, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.32038834951456313, |
| "grad_norm": 1.994402411129292, |
| "learning_rate": 7.857142857142858e-06, |
| "loss": 1.6018, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.3300970873786408, |
| "grad_norm": 2.0025094595893855, |
| "learning_rate": 8.095238095238097e-06, |
| "loss": 1.5295, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.33980582524271846, |
| "grad_norm": 2.198795362858049, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 1.5475, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.34951456310679613, |
| "grad_norm": 2.0797645643032685, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 1.4739, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.3592233009708738, |
| "grad_norm": 2.071926286682864, |
| "learning_rate": 8.80952380952381e-06, |
| "loss": 1.4679, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.36893203883495146, |
| "grad_norm": 2.4010215614859707, |
| "learning_rate": 9.047619047619049e-06, |
| "loss": 1.4364, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.3786407766990291, |
| "grad_norm": 2.1438204031361723, |
| "learning_rate": 9.285714285714288e-06, |
| "loss": 1.365, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.3883495145631068, |
| "grad_norm": 1.9436034900136812, |
| "learning_rate": 9.523809523809525e-06, |
| "loss": 1.4371, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.39805825242718446, |
| "grad_norm": 2.5388267486674665, |
| "learning_rate": 9.761904761904762e-06, |
| "loss": 1.4111, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.4077669902912621, |
| "grad_norm": 2.113398146300134, |
| "learning_rate": 1e-05, |
| "loss": 1.29, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.4174757281553398, |
| "grad_norm": 2.375511847477045, |
| "learning_rate": 9.999819767255175e-06, |
| "loss": 1.5152, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.42718446601941745, |
| "grad_norm": 2.2832270982086604, |
| "learning_rate": 9.999279082014233e-06, |
| "loss": 1.3026, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.4368932038834951, |
| "grad_norm": 2.180896773052596, |
| "learning_rate": 9.998377983256851e-06, |
| "loss": 1.2955, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.44660194174757284, |
| "grad_norm": 2.2006360002608916, |
| "learning_rate": 9.997116535946028e-06, |
| "loss": 1.284, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.4563106796116505, |
| "grad_norm": 2.013155184934404, |
| "learning_rate": 9.99549483102341e-06, |
| "loss": 1.357, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.46601941747572817, |
| "grad_norm": 2.0776157335492043, |
| "learning_rate": 9.993512985402724e-06, |
| "loss": 1.3026, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.47572815533980584, |
| "grad_norm": 1.9996729741636705, |
| "learning_rate": 9.99117114196137e-06, |
| "loss": 1.2625, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.4854368932038835, |
| "grad_norm": 1.9864825080235484, |
| "learning_rate": 9.988469469530086e-06, |
| "loss": 1.2091, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.49514563106796117, |
| "grad_norm": 1.8435037809122465, |
| "learning_rate": 9.985408162880813e-06, |
| "loss": 1.1999, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.5048543689320388, |
| "grad_norm": 1.9791449702400086, |
| "learning_rate": 9.981987442712634e-06, |
| "loss": 1.2577, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.5145631067961165, |
| "grad_norm": 1.9659977957936314, |
| "learning_rate": 9.978207555635856e-06, |
| "loss": 1.26, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.5242718446601942, |
| "grad_norm": 2.0385176269260747, |
| "learning_rate": 9.974068774154252e-06, |
| "loss": 1.2092, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.5339805825242718, |
| "grad_norm": 2.1519843859764816, |
| "learning_rate": 9.9695713966454e-06, |
| "loss": 1.3111, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5436893203883495, |
| "grad_norm": 1.9718179273705034, |
| "learning_rate": 9.964715747339178e-06, |
| "loss": 1.1383, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.5533980582524272, |
| "grad_norm": 1.8488095555708863, |
| "learning_rate": 9.959502176294384e-06, |
| "loss": 1.1171, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.5631067961165048, |
| "grad_norm": 1.8823723202588312, |
| "learning_rate": 9.95393105937351e-06, |
| "loss": 1.1922, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.5728155339805825, |
| "grad_norm": 2.130166627458029, |
| "learning_rate": 9.948002798215632e-06, |
| "loss": 1.1811, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.5825242718446602, |
| "grad_norm": 1.972322379680146, |
| "learning_rate": 9.941717820207461e-06, |
| "loss": 1.122, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5922330097087378, |
| "grad_norm": 1.9974554600923444, |
| "learning_rate": 9.935076578452535e-06, |
| "loss": 1.1378, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.6019417475728155, |
| "grad_norm": 2.1883383076564726, |
| "learning_rate": 9.928079551738542e-06, |
| "loss": 1.0968, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.6116504854368932, |
| "grad_norm": 1.8247180245095709, |
| "learning_rate": 9.92072724450282e-06, |
| "loss": 1.2031, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.6213592233009708, |
| "grad_norm": 1.8317145241143062, |
| "learning_rate": 9.913020186795967e-06, |
| "loss": 1.0359, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.6310679611650486, |
| "grad_norm": 1.8482608019696258, |
| "learning_rate": 9.904958934243655e-06, |
| "loss": 1.1466, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6407766990291263, |
| "grad_norm": 2.0820089056154076, |
| "learning_rate": 9.89654406800655e-06, |
| "loss": 1.174, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.6504854368932039, |
| "grad_norm": 1.8203921553393068, |
| "learning_rate": 9.887776194738433e-06, |
| "loss": 1.0907, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.6601941747572816, |
| "grad_norm": 1.8417874931856297, |
| "learning_rate": 9.878655946542443e-06, |
| "loss": 1.0885, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.6699029126213593, |
| "grad_norm": 1.7661948434512647, |
| "learning_rate": 9.869183980925531e-06, |
| "loss": 1.1211, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.6796116504854369, |
| "grad_norm": 1.8387238745376497, |
| "learning_rate": 9.85936098075104e-06, |
| "loss": 1.0653, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6893203883495146, |
| "grad_norm": 1.8949611010862242, |
| "learning_rate": 9.849187654189486e-06, |
| "loss": 1.1204, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.6990291262135923, |
| "grad_norm": 1.805814771954187, |
| "learning_rate": 9.838664734667496e-06, |
| "loss": 1.0253, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.7087378640776699, |
| "grad_norm": 1.6208527085431477, |
| "learning_rate": 9.827792980814934e-06, |
| "loss": 1.0949, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.7184466019417476, |
| "grad_norm": 1.7031116347714181, |
| "learning_rate": 9.81657317641022e-06, |
| "loss": 0.9673, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.7281553398058253, |
| "grad_norm": 1.6938233899501458, |
| "learning_rate": 9.80500613032381e-06, |
| "loss": 1.1198, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7378640776699029, |
| "grad_norm": 1.7001778982567737, |
| "learning_rate": 9.79309267645989e-06, |
| "loss": 1.081, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.7475728155339806, |
| "grad_norm": 1.4923266262441406, |
| "learning_rate": 9.780833673696255e-06, |
| "loss": 1.0177, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.7572815533980582, |
| "grad_norm": 1.7039304530928836, |
| "learning_rate": 9.768230005822394e-06, |
| "loss": 1.0759, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.7669902912621359, |
| "grad_norm": 1.5967078271617257, |
| "learning_rate": 9.755282581475769e-06, |
| "loss": 1.0678, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.7766990291262136, |
| "grad_norm": 1.6218746165164817, |
| "learning_rate": 9.741992334076309e-06, |
| "loss": 1.033, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.7864077669902912, |
| "grad_norm": 1.4392158120730842, |
| "learning_rate": 9.728360221759125e-06, |
| "loss": 1.0064, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.7961165048543689, |
| "grad_norm": 1.5842251493264055, |
| "learning_rate": 9.714387227305422e-06, |
| "loss": 1.001, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.8058252427184466, |
| "grad_norm": 1.4858409988510537, |
| "learning_rate": 9.700074358071658e-06, |
| "loss": 1.0617, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.8155339805825242, |
| "grad_norm": 1.511143998651756, |
| "learning_rate": 9.68542264591692e-06, |
| "loss": 1.0232, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.8252427184466019, |
| "grad_norm": 1.477684418863534, |
| "learning_rate": 9.670433147128522e-06, |
| "loss": 1.0368, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.8349514563106796, |
| "grad_norm": 1.3902620258730627, |
| "learning_rate": 9.65510694234587e-06, |
| "loss": 1.0055, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.8446601941747572, |
| "grad_norm": 1.4791276503499884, |
| "learning_rate": 9.639445136482549e-06, |
| "loss": 0.9644, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.8543689320388349, |
| "grad_norm": 1.2960748086935863, |
| "learning_rate": 9.623448858646658e-06, |
| "loss": 1.0314, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.8640776699029126, |
| "grad_norm": 1.4330209334150434, |
| "learning_rate": 9.607119262059426e-06, |
| "loss": 0.9931, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.8737864077669902, |
| "grad_norm": 1.4711730042923779, |
| "learning_rate": 9.590457523972055e-06, |
| "loss": 1.0706, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.883495145631068, |
| "grad_norm": 1.2162238046460445, |
| "learning_rate": 9.573464845580864e-06, |
| "loss": 0.9255, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.8932038834951457, |
| "grad_norm": 1.512646007631836, |
| "learning_rate": 9.55614245194068e-06, |
| "loss": 1.0073, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.9029126213592233, |
| "grad_norm": 1.1882859794319478, |
| "learning_rate": 9.538491591876522e-06, |
| "loss": 0.9323, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.912621359223301, |
| "grad_norm": 1.3411887286252628, |
| "learning_rate": 9.520513537893574e-06, |
| "loss": 0.9872, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.9223300970873787, |
| "grad_norm": 1.4043520951877004, |
| "learning_rate": 9.502209586085444e-06, |
| "loss": 0.9911, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.9320388349514563, |
| "grad_norm": 1.3308705634842204, |
| "learning_rate": 9.48358105604072e-06, |
| "loss": 1.0069, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.941747572815534, |
| "grad_norm": 1.2655373953481186, |
| "learning_rate": 9.464629290747844e-06, |
| "loss": 0.8943, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.9514563106796117, |
| "grad_norm": 1.2978200030362377, |
| "learning_rate": 9.445355656498284e-06, |
| "loss": 1.042, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.9611650485436893, |
| "grad_norm": 1.364983716861643, |
| "learning_rate": 9.425761542788049e-06, |
| "loss": 0.9119, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "grad_norm": 1.2833466396457305, |
| "learning_rate": 9.40584836221749e-06, |
| "loss": 0.9067, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.9805825242718447, |
| "grad_norm": 1.3691910240046643, |
| "learning_rate": 9.38561755038949e-06, |
| "loss": 0.9509, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.9902912621359223, |
| "grad_norm": 1.4140939858483665, |
| "learning_rate": 9.365070565805941e-06, |
| "loss": 0.8915, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.2298760928406687, |
| "learning_rate": 9.34420888976262e-06, |
| "loss": 0.9611, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.0097087378640777, |
| "grad_norm": 1.1700332930023898, |
| "learning_rate": 9.323034026242378e-06, |
| "loss": 0.7946, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.0194174757281553, |
| "grad_norm": 1.2030270990453835, |
| "learning_rate": 9.301547501806725e-06, |
| "loss": 0.7579, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.029126213592233, |
| "grad_norm": 1.1566037512030418, |
| "learning_rate": 9.279750865485772e-06, |
| "loss": 0.7817, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.0388349514563107, |
| "grad_norm": 1.2824945991000376, |
| "learning_rate": 9.257645688666557e-06, |
| "loss": 0.8459, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.0485436893203883, |
| "grad_norm": 1.192436325000586, |
| "learning_rate": 9.235233564979756e-06, |
| "loss": 0.7788, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.058252427184466, |
| "grad_norm": 1.2523853200153154, |
| "learning_rate": 9.212516110184794e-06, |
| "loss": 0.8145, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.0679611650485437, |
| "grad_norm": 1.180542349517411, |
| "learning_rate": 9.18949496205337e-06, |
| "loss": 0.803, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.0776699029126213, |
| "grad_norm": 1.162079773122146, |
| "learning_rate": 9.166171780251365e-06, |
| "loss": 0.7449, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.087378640776699, |
| "grad_norm": 1.244662719759827, |
| "learning_rate": 9.142548246219212e-06, |
| "loss": 0.8265, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.0970873786407767, |
| "grad_norm": 1.1330628483376846, |
| "learning_rate": 9.118626063050661e-06, |
| "loss": 0.8005, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.1067961165048543, |
| "grad_norm": 1.1569031627998947, |
| "learning_rate": 9.09440695537001e-06, |
| "loss": 0.7452, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.116504854368932, |
| "grad_norm": 1.2810507561815097, |
| "learning_rate": 9.069892669207757e-06, |
| "loss": 0.866, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.1262135922330097, |
| "grad_norm": 1.0927498157835924, |
| "learning_rate": 9.045084971874738e-06, |
| "loss": 0.7016, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.1359223300970873, |
| "grad_norm": 1.0548124074919196, |
| "learning_rate": 9.019985651834703e-06, |
| "loss": 0.7704, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.145631067961165, |
| "grad_norm": 1.1211671094064979, |
| "learning_rate": 8.994596518575393e-06, |
| "loss": 0.7425, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.1553398058252426, |
| "grad_norm": 1.13515459595643, |
| "learning_rate": 8.968919402478076e-06, |
| "loss": 0.7805, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.1650485436893203, |
| "grad_norm": 1.1759276131825478, |
| "learning_rate": 8.942956154685596e-06, |
| "loss": 0.7369, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.174757281553398, |
| "grad_norm": 1.1070873796167529, |
| "learning_rate": 8.916708646968924e-06, |
| "loss": 0.8016, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.1844660194174756, |
| "grad_norm": 1.0511845426686406, |
| "learning_rate": 8.890178771592198e-06, |
| "loss": 0.7011, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.1941747572815533, |
| "grad_norm": 1.0579418108836893, |
| "learning_rate": 8.863368441176326e-06, |
| "loss": 0.7059, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.203883495145631, |
| "grad_norm": 1.1049403220659926, |
| "learning_rate": 8.836279588561084e-06, |
| "loss": 0.7075, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.2135922330097086, |
| "grad_norm": 1.0755124949327624, |
| "learning_rate": 8.808914166665773e-06, |
| "loss": 0.7112, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.2233009708737863, |
| "grad_norm": 1.0976870364730078, |
| "learning_rate": 8.781274148348438e-06, |
| "loss": 0.7248, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.233009708737864, |
| "grad_norm": 1.2025667720330029, |
| "learning_rate": 8.753361526263622e-06, |
| "loss": 0.7521, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.2427184466019416, |
| "grad_norm": 1.1124911845617471, |
| "learning_rate": 8.725178312718727e-06, |
| "loss": 0.7417, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.2524271844660193, |
| "grad_norm": 1.2386224168172428, |
| "learning_rate": 8.696726539528924e-06, |
| "loss": 0.7229, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.262135922330097, |
| "grad_norm": 1.0679761458027965, |
| "learning_rate": 8.668008257870684e-06, |
| "loss": 0.7551, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.2718446601941746, |
| "grad_norm": 1.0235025944068437, |
| "learning_rate": 8.639025538133899e-06, |
| "loss": 0.7074, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.2815533980582523, |
| "grad_norm": 0.9990534132254134, |
| "learning_rate": 8.609780469772623e-06, |
| "loss": 0.6883, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.29126213592233, |
| "grad_norm": 1.0705087766695287, |
| "learning_rate": 8.580275161154432e-06, |
| "loss": 0.7848, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.3009708737864076, |
| "grad_norm": 1.0702664701898648, |
| "learning_rate": 8.550511739408428e-06, |
| "loss": 0.685, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.3106796116504853, |
| "grad_norm": 1.0357856853514515, |
| "learning_rate": 8.520492350271895e-06, |
| "loss": 0.7346, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.3203883495145632, |
| "grad_norm": 0.9933104397502625, |
| "learning_rate": 8.490219157935589e-06, |
| "loss": 0.7512, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.3300970873786409, |
| "grad_norm": 0.9959095652454518, |
| "learning_rate": 8.459694344887732e-06, |
| "loss": 0.7059, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.3398058252427185, |
| "grad_norm": 1.0857087498373814, |
| "learning_rate": 8.428920111756658e-06, |
| "loss": 0.7045, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.3495145631067962, |
| "grad_norm": 0.978326232822382, |
| "learning_rate": 8.397898677152173e-06, |
| "loss": 0.655, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.3592233009708738, |
| "grad_norm": 1.1179745234558807, |
| "learning_rate": 8.366632277505598e-06, |
| "loss": 0.759, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.3689320388349515, |
| "grad_norm": 0.9862359110930711, |
| "learning_rate": 8.335123166908544e-06, |
| "loss": 0.6789, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.3786407766990292, |
| "grad_norm": 0.9490832081233238, |
| "learning_rate": 8.303373616950408e-06, |
| "loss": 0.6631, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.3883495145631068, |
| "grad_norm": 0.9707214432827141, |
| "learning_rate": 8.271385916554605e-06, |
| "loss": 0.6937, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.3980582524271845, |
| "grad_norm": 1.0214073506468517, |
| "learning_rate": 8.239162371813552e-06, |
| "loss": 0.6327, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.4077669902912622, |
| "grad_norm": 1.0274559550351532, |
| "learning_rate": 8.206705305822414e-06, |
| "loss": 0.7009, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.4174757281553398, |
| "grad_norm": 1.0594563883841244, |
| "learning_rate": 8.17401705851163e-06, |
| "loss": 0.745, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.4271844660194175, |
| "grad_norm": 0.9655417791148342, |
| "learning_rate": 8.141099986478212e-06, |
| "loss": 0.6486, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.4368932038834952, |
| "grad_norm": 1.0220480965295933, |
| "learning_rate": 8.107956462815862e-06, |
| "loss": 0.6688, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.4466019417475728, |
| "grad_norm": 0.9683390807392493, |
| "learning_rate": 8.074588876943872e-06, |
| "loss": 0.659, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.4563106796116505, |
| "grad_norm": 1.0767779292378918, |
| "learning_rate": 8.040999634434883e-06, |
| "loss": 0.685, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.4660194174757282, |
| "grad_norm": 0.9698677024653224, |
| "learning_rate": 8.00719115684144e-06, |
| "loss": 0.6567, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.4757281553398058, |
| "grad_norm": 1.077389631746702, |
| "learning_rate": 7.973165881521435e-06, |
| "loss": 0.6843, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.4854368932038835, |
| "grad_norm": 0.9955066354735376, |
| "learning_rate": 7.938926261462366e-06, |
| "loss": 0.6845, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.4951456310679612, |
| "grad_norm": 0.9488957169160419, |
| "learning_rate": 7.90447476510452e-06, |
| "loss": 0.6019, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.5048543689320388, |
| "grad_norm": 0.9938234421719504, |
| "learning_rate": 7.869813876162997e-06, |
| "loss": 0.7082, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.5145631067961165, |
| "grad_norm": 0.9603100487668675, |
| "learning_rate": 7.834946093448658e-06, |
| "loss": 0.6412, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.5242718446601942, |
| "grad_norm": 1.005992503007623, |
| "learning_rate": 7.799873930687979e-06, |
| "loss": 0.6879, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.5339805825242718, |
| "grad_norm": 0.9916500135350472, |
| "learning_rate": 7.764599916341817e-06, |
| "loss": 0.6709, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.5436893203883495, |
| "grad_norm": 0.9257591953412015, |
| "learning_rate": 7.729126593423151e-06, |
| "loss": 0.6196, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.5533980582524272, |
| "grad_norm": 1.0569458652900285, |
| "learning_rate": 7.69345651931372e-06, |
| "loss": 0.696, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.5631067961165048, |
| "grad_norm": 0.9735255979741266, |
| "learning_rate": 7.65759226557967e-06, |
| "loss": 0.6523, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.5728155339805825, |
| "grad_norm": 0.9767326365838914, |
| "learning_rate": 7.621536417786159e-06, |
| "loss": 0.6609, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.5825242718446602, |
| "grad_norm": 1.0593912081912753, |
| "learning_rate": 7.585291575310952e-06, |
| "loss": 0.664, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.5922330097087378, |
| "grad_norm": 0.942168510730449, |
| "learning_rate": 7.548860351157028e-06, |
| "loss": 0.5929, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.6019417475728155, |
| "grad_norm": 1.0180139630792668, |
| "learning_rate": 7.512245371764197e-06, |
| "loss": 0.6571, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.6116504854368932, |
| "grad_norm": 0.9895426752938254, |
| "learning_rate": 7.475449276819753e-06, |
| "loss": 0.6581, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.6213592233009708, |
| "grad_norm": 1.0191365953291556, |
| "learning_rate": 7.438474719068174e-06, |
| "loss": 0.726, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.6310679611650487, |
| "grad_norm": 1.0066738404323723, |
| "learning_rate": 7.401324364119872e-06, |
| "loss": 0.6901, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.6407766990291264, |
| "grad_norm": 1.002278806498304, |
| "learning_rate": 7.364000890259024e-06, |
| "loss": 0.6324, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.650485436893204, |
| "grad_norm": 0.907526837788578, |
| "learning_rate": 7.326506988250488e-06, |
| "loss": 0.6219, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.6601941747572817, |
| "grad_norm": 1.1098079007322723, |
| "learning_rate": 7.288845361145812e-06, |
| "loss": 0.7004, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.6699029126213594, |
| "grad_norm": 1.0550771463928161, |
| "learning_rate": 7.251018724088367e-06, |
| "loss": 0.6032, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.679611650485437, |
| "grad_norm": 1.0536071947022911, |
| "learning_rate": 7.213029804117604e-06, |
| "loss": 0.6874, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.6893203883495147, |
| "grad_norm": 0.9509232087171413, |
| "learning_rate": 7.174881339972448e-06, |
| "loss": 0.6271, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.6990291262135924, |
| "grad_norm": 0.9765447626747497, |
| "learning_rate": 7.136576081893863e-06, |
| "loss": 0.578, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.70873786407767, |
| "grad_norm": 0.8766277871753961, |
| "learning_rate": 7.09811679142657e-06, |
| "loss": 0.62, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.7184466019417477, |
| "grad_norm": 0.9544049210539662, |
| "learning_rate": 7.059506241219964e-06, |
| "loss": 0.6478, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.7281553398058254, |
| "grad_norm": 0.9873530148187425, |
| "learning_rate": 7.020747214828221e-06, |
| "loss": 0.5994, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.737864077669903, |
| "grad_norm": 0.902538459879482, |
| "learning_rate": 6.981842506509626e-06, |
| "loss": 0.5824, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.7475728155339807, |
| "grad_norm": 1.0465611114075961, |
| "learning_rate": 6.942794921025127e-06, |
| "loss": 0.6444, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.7572815533980584, |
| "grad_norm": 0.9406932219924345, |
| "learning_rate": 6.903607273436128e-06, |
| "loss": 0.6602, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.766990291262136, |
| "grad_norm": 1.9912252671577682, |
| "learning_rate": 6.864282388901544e-06, |
| "loss": 0.6426, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.7766990291262137, |
| "grad_norm": 0.9157876015013201, |
| "learning_rate": 6.824823102474127e-06, |
| "loss": 0.6018, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.7864077669902914, |
| "grad_norm": 0.9118371479046088, |
| "learning_rate": 6.785232258896078e-06, |
| "loss": 0.6573, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.796116504854369, |
| "grad_norm": 0.8991339248484931, |
| "learning_rate": 6.745512712393958e-06, |
| "loss": 0.6388, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.8058252427184467, |
| "grad_norm": 0.9972017158456973, |
| "learning_rate": 6.705667326472926e-06, |
| "loss": 0.5934, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.8155339805825244, |
| "grad_norm": 0.9988069506780758, |
| "learning_rate": 6.665698973710289e-06, |
| "loss": 0.5952, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.825242718446602, |
| "grad_norm": 0.8514983462528519, |
| "learning_rate": 6.625610535548418e-06, |
| "loss": 0.5318, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.8349514563106797, |
| "grad_norm": 0.9294990543655399, |
| "learning_rate": 6.585404902087011e-06, |
| "loss": 0.5717, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.8446601941747574, |
| "grad_norm": 0.9938842716480702, |
| "learning_rate": 6.545084971874738e-06, |
| "loss": 0.6195, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.854368932038835, |
| "grad_norm": 0.9123365141247879, |
| "learning_rate": 6.504653651700278e-06, |
| "loss": 0.5967, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.8640776699029127, |
| "grad_norm": 1.0008141571407345, |
| "learning_rate": 6.464113856382752e-06, |
| "loss": 0.6277, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.8737864077669903, |
| "grad_norm": 0.8919950398755992, |
| "learning_rate": 6.423468508561599e-06, |
| "loss": 0.5958, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.883495145631068, |
| "grad_norm": 0.9351239526172137, |
| "learning_rate": 6.382720538485856e-06, |
| "loss": 0.5581, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.8932038834951457, |
| "grad_norm": 1.0249486162494306, |
| "learning_rate": 6.341872883802923e-06, |
| "loss": 0.6901, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.9029126213592233, |
| "grad_norm": 0.8637050153467829, |
| "learning_rate": 6.3009284893467655e-06, |
| "loss": 0.6034, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.912621359223301, |
| "grad_norm": 0.9202467333329478, |
| "learning_rate": 6.259890306925627e-06, |
| "loss": 0.5609, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.9223300970873787, |
| "grad_norm": 0.9798270645068964, |
| "learning_rate": 6.218761295109209e-06, |
| "loss": 0.6688, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.9320388349514563, |
| "grad_norm": 1.0149231730700408, |
| "learning_rate": 6.177544419015388e-06, |
| "loss": 0.573, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.941747572815534, |
| "grad_norm": 1.008333410310924, |
| "learning_rate": 6.136242650096451e-06, |
| "loss": 0.6607, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 412, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 161207176396800.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|