{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03949447077409163, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019747235387045813, "grad_norm": 5.788590908050537, "learning_rate": 0.0, "loss": 1.0436, "step": 1 }, { "epoch": 0.00039494470774091627, "grad_norm": 6.300467491149902, "learning_rate": 1.0000000000000002e-06, "loss": 1.3195, "step": 2 }, { "epoch": 0.0005924170616113745, "grad_norm": 6.489816665649414, "learning_rate": 2.0000000000000003e-06, "loss": 1.5645, "step": 3 }, { "epoch": 0.0007898894154818325, "grad_norm": 5.287850379943848, "learning_rate": 3e-06, "loss": 1.154, "step": 4 }, { "epoch": 0.0009873617693522906, "grad_norm": 6.123354434967041, "learning_rate": 4.000000000000001e-06, "loss": 1.4052, "step": 5 }, { "epoch": 0.001184834123222749, "grad_norm": 5.501026630401611, "learning_rate": 5e-06, "loss": 1.708, "step": 6 }, { "epoch": 0.001382306477093207, "grad_norm": 4.960855007171631, "learning_rate": 6e-06, "loss": 1.6143, "step": 7 }, { "epoch": 0.001579778830963665, "grad_norm": 5.855900764465332, "learning_rate": 7e-06, "loss": 1.6376, "step": 8 }, { "epoch": 0.0017772511848341231, "grad_norm": 5.17061710357666, "learning_rate": 8.000000000000001e-06, "loss": 1.2182, "step": 9 }, { "epoch": 0.0019747235387045812, "grad_norm": 4.436169624328613, "learning_rate": 9e-06, "loss": 1.9812, "step": 10 }, { "epoch": 0.0021721958925750395, "grad_norm": 4.55659818649292, "learning_rate": 1e-05, "loss": 1.1695, "step": 11 }, { "epoch": 0.002369668246445498, "grad_norm": 5.105171203613281, "learning_rate": 9.947368421052632e-06, "loss": 1.1243, "step": 12 }, { "epoch": 0.0025671406003159557, "grad_norm": 5.100945472717285, "learning_rate": 9.894736842105264e-06, "loss": 1.8183, "step": 13 }, { "epoch": 0.002764612954186414, "grad_norm": 4.643738746643066, "learning_rate": 9.842105263157896e-06, "loss": 1.2569, "step": 14 }, { "epoch": 0.002962085308056872, "grad_norm": 4.557013034820557, "learning_rate": 9.789473684210527e-06, "loss": 1.7372, "step": 15 }, { "epoch": 0.00315955766192733, "grad_norm": 6.458473205566406, "learning_rate": 9.736842105263159e-06, "loss": 2.1888, "step": 16 }, { "epoch": 0.0033570300157977884, "grad_norm": 5.135770320892334, "learning_rate": 9.68421052631579e-06, "loss": 1.2197, "step": 17 }, { "epoch": 0.0035545023696682463, "grad_norm": 5.171187400817871, "learning_rate": 9.631578947368422e-06, "loss": 1.6326, "step": 18 }, { "epoch": 0.0037519747235387046, "grad_norm": 4.825802326202393, "learning_rate": 9.578947368421054e-06, "loss": 1.4698, "step": 19 }, { "epoch": 0.0039494470774091624, "grad_norm": 4.34477424621582, "learning_rate": 9.526315789473684e-06, "loss": 1.6102, "step": 20 }, { "epoch": 0.004146919431279621, "grad_norm": 4.5733866691589355, "learning_rate": 9.473684210526315e-06, "loss": 1.8448, "step": 21 }, { "epoch": 0.004344391785150079, "grad_norm": 4.53968620300293, "learning_rate": 9.421052631578949e-06, "loss": 1.2972, "step": 22 }, { "epoch": 0.004541864139020537, "grad_norm": 5.408173084259033, "learning_rate": 9.36842105263158e-06, "loss": 1.7215, "step": 23 }, { "epoch": 0.004739336492890996, "grad_norm": 4.2717461585998535, "learning_rate": 9.315789473684212e-06, "loss": 1.137, "step": 24 }, { "epoch": 0.0049368088467614535, "grad_norm": 4.3075785636901855, "learning_rate": 9.263157894736842e-06, "loss": 1.3048, "step": 25 }, { "epoch": 0.005134281200631911, "grad_norm": 4.659534454345703, "learning_rate": 9.210526315789474e-06, "loss": 0.9883, "step": 26 }, { "epoch": 0.00533175355450237, "grad_norm": 4.719169616699219, "learning_rate": 9.157894736842105e-06, "loss": 1.4926, "step": 27 }, { "epoch": 0.005529225908372828, "grad_norm": 4.502306938171387, "learning_rate": 9.105263157894739e-06, "loss": 1.8264, "step": 28 }, { "epoch": 0.005726698262243286, "grad_norm": 4.353489875793457, "learning_rate": 9.05263157894737e-06, "loss": 1.6681, "step": 29 }, { "epoch": 0.005924170616113744, "grad_norm": 5.161799907684326, "learning_rate": 9e-06, "loss": 1.6566, "step": 30 }, { "epoch": 0.006121642969984202, "grad_norm": 4.235696315765381, "learning_rate": 8.947368421052632e-06, "loss": 1.7276, "step": 31 }, { "epoch": 0.00631911532385466, "grad_norm": 6.545216083526611, "learning_rate": 8.894736842105264e-06, "loss": 1.0481, "step": 32 }, { "epoch": 0.006516587677725118, "grad_norm": 4.9834113121032715, "learning_rate": 8.842105263157895e-06, "loss": 1.4457, "step": 33 }, { "epoch": 0.006714060031595577, "grad_norm": 4.448666572570801, "learning_rate": 8.789473684210527e-06, "loss": 1.7361, "step": 34 }, { "epoch": 0.006911532385466035, "grad_norm": 4.735658168792725, "learning_rate": 8.736842105263158e-06, "loss": 1.3361, "step": 35 }, { "epoch": 0.0071090047393364926, "grad_norm": 5.857210636138916, "learning_rate": 8.68421052631579e-06, "loss": 1.4395, "step": 36 }, { "epoch": 0.007306477093206951, "grad_norm": 4.746231555938721, "learning_rate": 8.631578947368422e-06, "loss": 1.4588, "step": 37 }, { "epoch": 0.007503949447077409, "grad_norm": 5.420529365539551, "learning_rate": 8.578947368421053e-06, "loss": 1.4352, "step": 38 }, { "epoch": 0.007701421800947867, "grad_norm": 3.966956853866577, "learning_rate": 8.526315789473685e-06, "loss": 1.7395, "step": 39 }, { "epoch": 0.007898894154818325, "grad_norm": 4.626506805419922, "learning_rate": 8.473684210526317e-06, "loss": 1.2152, "step": 40 }, { "epoch": 0.008096366508688783, "grad_norm": 5.946537971496582, "learning_rate": 8.421052631578948e-06, "loss": 1.2569, "step": 41 }, { "epoch": 0.008293838862559242, "grad_norm": 6.078729152679443, "learning_rate": 8.36842105263158e-06, "loss": 2.0072, "step": 42 }, { "epoch": 0.0084913112164297, "grad_norm": 5.362630844116211, "learning_rate": 8.315789473684212e-06, "loss": 1.6189, "step": 43 }, { "epoch": 0.008688783570300158, "grad_norm": 4.6319098472595215, "learning_rate": 8.263157894736843e-06, "loss": 1.1937, "step": 44 }, { "epoch": 0.008886255924170616, "grad_norm": 5.145988464355469, "learning_rate": 8.210526315789475e-06, "loss": 1.48, "step": 45 }, { "epoch": 0.009083728278041074, "grad_norm": 5.191286563873291, "learning_rate": 8.157894736842106e-06, "loss": 1.6115, "step": 46 }, { "epoch": 0.009281200631911532, "grad_norm": 4.6075544357299805, "learning_rate": 8.105263157894736e-06, "loss": 1.3963, "step": 47 }, { "epoch": 0.009478672985781991, "grad_norm": 4.724617958068848, "learning_rate": 8.052631578947368e-06, "loss": 1.7869, "step": 48 }, { "epoch": 0.009676145339652449, "grad_norm": 4.976570129394531, "learning_rate": 8.000000000000001e-06, "loss": 1.5635, "step": 49 }, { "epoch": 0.009873617693522907, "grad_norm": 5.121829032897949, "learning_rate": 7.947368421052633e-06, "loss": 1.246, "step": 50 }, { "epoch": 0.010071090047393365, "grad_norm": 5.260928153991699, "learning_rate": 7.894736842105265e-06, "loss": 1.98, "step": 51 }, { "epoch": 0.010268562401263823, "grad_norm": 5.072506904602051, "learning_rate": 7.842105263157895e-06, "loss": 1.0555, "step": 52 }, { "epoch": 0.01046603475513428, "grad_norm": 4.6401872634887695, "learning_rate": 7.789473684210526e-06, "loss": 1.5456, "step": 53 }, { "epoch": 0.01066350710900474, "grad_norm": 5.566153526306152, "learning_rate": 7.736842105263158e-06, "loss": 1.7965, "step": 54 }, { "epoch": 0.010860979462875198, "grad_norm": 5.0998215675354, "learning_rate": 7.68421052631579e-06, "loss": 1.3262, "step": 55 }, { "epoch": 0.011058451816745656, "grad_norm": 4.437518119812012, "learning_rate": 7.631578947368423e-06, "loss": 1.7174, "step": 56 }, { "epoch": 0.011255924170616114, "grad_norm": 3.8838698863983154, "learning_rate": 7.578947368421054e-06, "loss": 1.185, "step": 57 }, { "epoch": 0.011453396524486572, "grad_norm": 4.112951278686523, "learning_rate": 7.526315789473685e-06, "loss": 1.3785, "step": 58 }, { "epoch": 0.01165086887835703, "grad_norm": 4.612501621246338, "learning_rate": 7.473684210526316e-06, "loss": 1.2861, "step": 59 }, { "epoch": 0.011848341232227487, "grad_norm": 4.541945457458496, "learning_rate": 7.421052631578948e-06, "loss": 1.0096, "step": 60 }, { "epoch": 0.012045813586097947, "grad_norm": 6.567755699157715, "learning_rate": 7.368421052631579e-06, "loss": 1.0163, "step": 61 }, { "epoch": 0.012243285939968405, "grad_norm": 4.897511959075928, "learning_rate": 7.315789473684212e-06, "loss": 1.8322, "step": 62 }, { "epoch": 0.012440758293838863, "grad_norm": 4.3046064376831055, "learning_rate": 7.263157894736843e-06, "loss": 1.0891, "step": 63 }, { "epoch": 0.01263823064770932, "grad_norm": 4.533966064453125, "learning_rate": 7.210526315789474e-06, "loss": 1.3371, "step": 64 }, { "epoch": 0.012835703001579778, "grad_norm": 4.470656394958496, "learning_rate": 7.157894736842106e-06, "loss": 2.3414, "step": 65 }, { "epoch": 0.013033175355450236, "grad_norm": 4.738101959228516, "learning_rate": 7.1052631578947375e-06, "loss": 1.765, "step": 66 }, { "epoch": 0.013230647709320696, "grad_norm": 3.870649814605713, "learning_rate": 7.052631578947369e-06, "loss": 1.6424, "step": 67 }, { "epoch": 0.013428120063191154, "grad_norm": 7.043670177459717, "learning_rate": 7e-06, "loss": 1.5049, "step": 68 }, { "epoch": 0.013625592417061612, "grad_norm": 4.199113368988037, "learning_rate": 6.947368421052632e-06, "loss": 1.5253, "step": 69 }, { "epoch": 0.01382306477093207, "grad_norm": 4.738468170166016, "learning_rate": 6.894736842105264e-06, "loss": 1.4544, "step": 70 }, { "epoch": 0.014020537124802527, "grad_norm": 5.083221912384033, "learning_rate": 6.842105263157896e-06, "loss": 1.4874, "step": 71 }, { "epoch": 0.014218009478672985, "grad_norm": 4.9555253982543945, "learning_rate": 6.789473684210527e-06, "loss": 1.7049, "step": 72 }, { "epoch": 0.014415481832543445, "grad_norm": 4.266180992126465, "learning_rate": 6.736842105263158e-06, "loss": 1.2864, "step": 73 }, { "epoch": 0.014612954186413903, "grad_norm": 4.510780334472656, "learning_rate": 6.68421052631579e-06, "loss": 1.4615, "step": 74 }, { "epoch": 0.01481042654028436, "grad_norm": 4.075244426727295, "learning_rate": 6.631578947368421e-06, "loss": 1.0669, "step": 75 }, { "epoch": 0.015007898894154818, "grad_norm": 4.169254302978516, "learning_rate": 6.578947368421054e-06, "loss": 1.4458, "step": 76 }, { "epoch": 0.015205371248025276, "grad_norm": 4.540365695953369, "learning_rate": 6.526315789473685e-06, "loss": 1.4148, "step": 77 }, { "epoch": 0.015402843601895734, "grad_norm": 4.700695037841797, "learning_rate": 6.473684210526316e-06, "loss": 2.0347, "step": 78 }, { "epoch": 0.015600315955766192, "grad_norm": 4.982248306274414, "learning_rate": 6.421052631578948e-06, "loss": 1.3731, "step": 79 }, { "epoch": 0.01579778830963665, "grad_norm": 5.1645941734313965, "learning_rate": 6.3684210526315795e-06, "loss": 1.1041, "step": 80 }, { "epoch": 0.015995260663507108, "grad_norm": 3.988223075866699, "learning_rate": 6.31578947368421e-06, "loss": 1.8261, "step": 81 }, { "epoch": 0.016192733017377565, "grad_norm": 5.132425785064697, "learning_rate": 6.263157894736842e-06, "loss": 1.3754, "step": 82 }, { "epoch": 0.016390205371248027, "grad_norm": 4.158195972442627, "learning_rate": 6.2105263157894745e-06, "loss": 1.233, "step": 83 }, { "epoch": 0.016587677725118485, "grad_norm": 5.385928630828857, "learning_rate": 6.157894736842106e-06, "loss": 1.6838, "step": 84 }, { "epoch": 0.016785150078988943, "grad_norm": 4.63645076751709, "learning_rate": 6.105263157894738e-06, "loss": 1.9314, "step": 85 }, { "epoch": 0.0169826224328594, "grad_norm": 5.3244757652282715, "learning_rate": 6.0526315789473685e-06, "loss": 1.4223, "step": 86 }, { "epoch": 0.017180094786729858, "grad_norm": 4.603305339813232, "learning_rate": 6e-06, "loss": 1.7243, "step": 87 }, { "epoch": 0.017377567140600316, "grad_norm": 5.223212718963623, "learning_rate": 5.947368421052632e-06, "loss": 1.337, "step": 88 }, { "epoch": 0.017575039494470774, "grad_norm": 4.344864845275879, "learning_rate": 5.8947368421052634e-06, "loss": 1.4767, "step": 89 }, { "epoch": 0.017772511848341232, "grad_norm": 4.268956184387207, "learning_rate": 5.842105263157896e-06, "loss": 1.0763, "step": 90 }, { "epoch": 0.01796998420221169, "grad_norm": 4.882099151611328, "learning_rate": 5.789473684210527e-06, "loss": 1.7006, "step": 91 }, { "epoch": 0.018167456556082148, "grad_norm": 5.254153728485107, "learning_rate": 5.736842105263158e-06, "loss": 2.0318, "step": 92 }, { "epoch": 0.018364928909952605, "grad_norm": 4.496589660644531, "learning_rate": 5.68421052631579e-06, "loss": 1.0075, "step": 93 }, { "epoch": 0.018562401263823063, "grad_norm": 4.610377788543701, "learning_rate": 5.631578947368422e-06, "loss": 0.973, "step": 94 }, { "epoch": 0.018759873617693525, "grad_norm": 4.102587699890137, "learning_rate": 5.578947368421052e-06, "loss": 1.4508, "step": 95 }, { "epoch": 0.018957345971563982, "grad_norm": 4.820801258087158, "learning_rate": 5.526315789473685e-06, "loss": 2.234, "step": 96 }, { "epoch": 0.01915481832543444, "grad_norm": 4.17614221572876, "learning_rate": 5.4736842105263165e-06, "loss": 0.9472, "step": 97 }, { "epoch": 0.019352290679304898, "grad_norm": 4.681643962860107, "learning_rate": 5.421052631578948e-06, "loss": 0.952, "step": 98 }, { "epoch": 0.019549763033175356, "grad_norm": 4.793570041656494, "learning_rate": 5.36842105263158e-06, "loss": 2.1524, "step": 99 }, { "epoch": 0.019747235387045814, "grad_norm": 5.580649375915527, "learning_rate": 5.315789473684211e-06, "loss": 1.168, "step": 100 }, { "epoch": 0.019944707740916272, "grad_norm": 4.42297887802124, "learning_rate": 5.263157894736842e-06, "loss": 1.0549, "step": 101 }, { "epoch": 0.02014218009478673, "grad_norm": 4.290710926055908, "learning_rate": 5.210526315789474e-06, "loss": 1.4328, "step": 102 }, { "epoch": 0.020339652448657188, "grad_norm": 4.0357255935668945, "learning_rate": 5.157894736842106e-06, "loss": 0.9894, "step": 103 }, { "epoch": 0.020537124802527645, "grad_norm": 4.805973529815674, "learning_rate": 5.105263157894738e-06, "loss": 1.3093, "step": 104 }, { "epoch": 0.020734597156398103, "grad_norm": 4.185112953186035, "learning_rate": 5.052631578947369e-06, "loss": 1.4606, "step": 105 }, { "epoch": 0.02093206951026856, "grad_norm": 4.384559631347656, "learning_rate": 5e-06, "loss": 1.0051, "step": 106 }, { "epoch": 0.02112954186413902, "grad_norm": 4.920189380645752, "learning_rate": 4.947368421052632e-06, "loss": 1.1721, "step": 107 }, { "epoch": 0.02132701421800948, "grad_norm": 4.506773471832275, "learning_rate": 4.894736842105264e-06, "loss": 1.4764, "step": 108 }, { "epoch": 0.021524486571879938, "grad_norm": 4.980959415435791, "learning_rate": 4.842105263157895e-06, "loss": 1.1304, "step": 109 }, { "epoch": 0.021721958925750396, "grad_norm": 4.118868827819824, "learning_rate": 4.789473684210527e-06, "loss": 1.1892, "step": 110 }, { "epoch": 0.021919431279620854, "grad_norm": 6.19287109375, "learning_rate": 4.736842105263158e-06, "loss": 1.6254, "step": 111 }, { "epoch": 0.022116903633491312, "grad_norm": 5.811559200286865, "learning_rate": 4.68421052631579e-06, "loss": 1.8662, "step": 112 }, { "epoch": 0.02231437598736177, "grad_norm": 4.6513352394104, "learning_rate": 4.631578947368421e-06, "loss": 1.3816, "step": 113 }, { "epoch": 0.022511848341232227, "grad_norm": 5.178617477416992, "learning_rate": 4.578947368421053e-06, "loss": 1.2535, "step": 114 }, { "epoch": 0.022709320695102685, "grad_norm": 3.830137014389038, "learning_rate": 4.526315789473685e-06, "loss": 1.5312, "step": 115 }, { "epoch": 0.022906793048973143, "grad_norm": 4.620641231536865, "learning_rate": 4.473684210526316e-06, "loss": 1.5528, "step": 116 }, { "epoch": 0.0231042654028436, "grad_norm": 5.0326738357543945, "learning_rate": 4.4210526315789476e-06, "loss": 1.3443, "step": 117 }, { "epoch": 0.02330173775671406, "grad_norm": 4.62188720703125, "learning_rate": 4.368421052631579e-06, "loss": 1.0445, "step": 118 }, { "epoch": 0.023499210110584517, "grad_norm": 3.9635679721832275, "learning_rate": 4.315789473684211e-06, "loss": 1.5812, "step": 119 }, { "epoch": 0.023696682464454975, "grad_norm": 5.0226664543151855, "learning_rate": 4.2631578947368425e-06, "loss": 1.2235, "step": 120 }, { "epoch": 0.023894154818325436, "grad_norm": 5.353757858276367, "learning_rate": 4.210526315789474e-06, "loss": 1.5831, "step": 121 }, { "epoch": 0.024091627172195894, "grad_norm": 4.838202476501465, "learning_rate": 4.157894736842106e-06, "loss": 1.7689, "step": 122 }, { "epoch": 0.02428909952606635, "grad_norm": 4.49991512298584, "learning_rate": 4.105263157894737e-06, "loss": 0.828, "step": 123 }, { "epoch": 0.02448657187993681, "grad_norm": 4.480604648590088, "learning_rate": 4.052631578947368e-06, "loss": 1.6869, "step": 124 }, { "epoch": 0.024684044233807267, "grad_norm": 4.725250244140625, "learning_rate": 4.000000000000001e-06, "loss": 0.7038, "step": 125 }, { "epoch": 0.024881516587677725, "grad_norm": 4.806907653808594, "learning_rate": 3.947368421052632e-06, "loss": 1.6985, "step": 126 }, { "epoch": 0.025078988941548183, "grad_norm": 4.734091758728027, "learning_rate": 3.894736842105263e-06, "loss": 1.5138, "step": 127 }, { "epoch": 0.02527646129541864, "grad_norm": 5.3406243324279785, "learning_rate": 3.842105263157895e-06, "loss": 1.7152, "step": 128 }, { "epoch": 0.0254739336492891, "grad_norm": 4.890450477600098, "learning_rate": 3.789473684210527e-06, "loss": 1.734, "step": 129 }, { "epoch": 0.025671406003159557, "grad_norm": 5.458994388580322, "learning_rate": 3.736842105263158e-06, "loss": 1.5888, "step": 130 }, { "epoch": 0.025868878357030015, "grad_norm": 4.655605316162109, "learning_rate": 3.6842105263157896e-06, "loss": 1.7478, "step": 131 }, { "epoch": 0.026066350710900472, "grad_norm": 7.664575576782227, "learning_rate": 3.6315789473684217e-06, "loss": 1.5616, "step": 132 }, { "epoch": 0.026263823064770934, "grad_norm": 5.207353115081787, "learning_rate": 3.578947368421053e-06, "loss": 1.8918, "step": 133 }, { "epoch": 0.02646129541864139, "grad_norm": 3.969021797180176, "learning_rate": 3.5263157894736846e-06, "loss": 1.0807, "step": 134 }, { "epoch": 0.02665876777251185, "grad_norm": 5.148044586181641, "learning_rate": 3.473684210526316e-06, "loss": 1.5419, "step": 135 }, { "epoch": 0.026856240126382307, "grad_norm": 5.609622955322266, "learning_rate": 3.421052631578948e-06, "loss": 1.2194, "step": 136 }, { "epoch": 0.027053712480252765, "grad_norm": 4.281411170959473, "learning_rate": 3.368421052631579e-06, "loss": 0.9111, "step": 137 }, { "epoch": 0.027251184834123223, "grad_norm": 4.415678977966309, "learning_rate": 3.3157894736842107e-06, "loss": 1.999, "step": 138 }, { "epoch": 0.02744865718799368, "grad_norm": 4.46459436416626, "learning_rate": 3.2631578947368423e-06, "loss": 1.749, "step": 139 }, { "epoch": 0.02764612954186414, "grad_norm": 4.180970191955566, "learning_rate": 3.210526315789474e-06, "loss": 1.4696, "step": 140 }, { "epoch": 0.027843601895734597, "grad_norm": 4.308414459228516, "learning_rate": 3.157894736842105e-06, "loss": 1.6192, "step": 141 }, { "epoch": 0.028041074249605055, "grad_norm": 6.27396821975708, "learning_rate": 3.1052631578947372e-06, "loss": 0.9774, "step": 142 }, { "epoch": 0.028238546603475512, "grad_norm": 5.713791847229004, "learning_rate": 3.052631578947369e-06, "loss": 1.2108, "step": 143 }, { "epoch": 0.02843601895734597, "grad_norm": 3.94455885887146, "learning_rate": 3e-06, "loss": 0.928, "step": 144 }, { "epoch": 0.028633491311216428, "grad_norm": 3.577357769012451, "learning_rate": 2.9473684210526317e-06, "loss": 1.2109, "step": 145 }, { "epoch": 0.02883096366508689, "grad_norm": 4.142393112182617, "learning_rate": 2.8947368421052634e-06, "loss": 0.8309, "step": 146 }, { "epoch": 0.029028436018957347, "grad_norm": 5.459789276123047, "learning_rate": 2.842105263157895e-06, "loss": 1.6238, "step": 147 }, { "epoch": 0.029225908372827805, "grad_norm": 4.358528137207031, "learning_rate": 2.789473684210526e-06, "loss": 1.2229, "step": 148 }, { "epoch": 0.029423380726698263, "grad_norm": 4.201858043670654, "learning_rate": 2.7368421052631583e-06, "loss": 0.8191, "step": 149 }, { "epoch": 0.02962085308056872, "grad_norm": 5.12843656539917, "learning_rate": 2.68421052631579e-06, "loss": 2.0575, "step": 150 }, { "epoch": 0.02981832543443918, "grad_norm": 3.9480583667755127, "learning_rate": 2.631578947368421e-06, "loss": 1.2752, "step": 151 }, { "epoch": 0.030015797788309637, "grad_norm": 4.900203227996826, "learning_rate": 2.578947368421053e-06, "loss": 0.6156, "step": 152 }, { "epoch": 0.030213270142180094, "grad_norm": 5.017102241516113, "learning_rate": 2.5263157894736844e-06, "loss": 1.4791, "step": 153 }, { "epoch": 0.030410742496050552, "grad_norm": 4.326578140258789, "learning_rate": 2.473684210526316e-06, "loss": 1.1194, "step": 154 }, { "epoch": 0.03060821484992101, "grad_norm": 4.385910511016846, "learning_rate": 2.4210526315789477e-06, "loss": 1.8029, "step": 155 }, { "epoch": 0.030805687203791468, "grad_norm": 3.988187551498413, "learning_rate": 2.368421052631579e-06, "loss": 0.9693, "step": 156 }, { "epoch": 0.031003159557661926, "grad_norm": 4.455026149749756, "learning_rate": 2.3157894736842105e-06, "loss": 1.3069, "step": 157 }, { "epoch": 0.031200631911532384, "grad_norm": 5.547366142272949, "learning_rate": 2.2631578947368426e-06, "loss": 1.5911, "step": 158 }, { "epoch": 0.03139810426540284, "grad_norm": 5.407074928283691, "learning_rate": 2.2105263157894738e-06, "loss": 1.4276, "step": 159 }, { "epoch": 0.0315955766192733, "grad_norm": 4.332579135894775, "learning_rate": 2.1578947368421054e-06, "loss": 1.0162, "step": 160 }, { "epoch": 0.03179304897314376, "grad_norm": 4.770085334777832, "learning_rate": 2.105263157894737e-06, "loss": 1.5811, "step": 161 }, { "epoch": 0.031990521327014215, "grad_norm": 4.930882930755615, "learning_rate": 2.0526315789473687e-06, "loss": 1.1303, "step": 162 }, { "epoch": 0.03218799368088467, "grad_norm": 5.203794479370117, "learning_rate": 2.0000000000000003e-06, "loss": 1.135, "step": 163 }, { "epoch": 0.03238546603475513, "grad_norm": 3.9070425033569336, "learning_rate": 1.9473684210526315e-06, "loss": 0.9977, "step": 164 }, { "epoch": 0.032582938388625596, "grad_norm": 5.830733299255371, "learning_rate": 1.8947368421052634e-06, "loss": 1.8587, "step": 165 }, { "epoch": 0.032780410742496054, "grad_norm": 4.05476713180542, "learning_rate": 1.8421052631578948e-06, "loss": 1.9495, "step": 166 }, { "epoch": 0.03297788309636651, "grad_norm": 3.980226755142212, "learning_rate": 1.7894736842105265e-06, "loss": 1.6, "step": 167 }, { "epoch": 0.03317535545023697, "grad_norm": 6.381178379058838, "learning_rate": 1.736842105263158e-06, "loss": 1.6905, "step": 168 }, { "epoch": 0.03337282780410743, "grad_norm": 4.4184889793396, "learning_rate": 1.6842105263157895e-06, "loss": 1.4703, "step": 169 }, { "epoch": 0.033570300157977885, "grad_norm": 4.5157470703125, "learning_rate": 1.6315789473684212e-06, "loss": 1.273, "step": 170 }, { "epoch": 0.03376777251184834, "grad_norm": 4.454701900482178, "learning_rate": 1.5789473684210526e-06, "loss": 1.9122, "step": 171 }, { "epoch": 0.0339652448657188, "grad_norm": 4.891290664672852, "learning_rate": 1.5263157894736844e-06, "loss": 1.514, "step": 172 }, { "epoch": 0.03416271721958926, "grad_norm": 4.397899627685547, "learning_rate": 1.4736842105263159e-06, "loss": 1.7544, "step": 173 }, { "epoch": 0.034360189573459717, "grad_norm": 5.422823429107666, "learning_rate": 1.4210526315789475e-06, "loss": 1.5294, "step": 174 }, { "epoch": 0.034557661927330174, "grad_norm": 5.267470359802246, "learning_rate": 1.3684210526315791e-06, "loss": 1.2176, "step": 175 }, { "epoch": 0.03475513428120063, "grad_norm": 4.583755016326904, "learning_rate": 1.3157894736842106e-06, "loss": 1.3583, "step": 176 }, { "epoch": 0.03495260663507109, "grad_norm": 4.745589733123779, "learning_rate": 1.2631578947368422e-06, "loss": 1.6765, "step": 177 }, { "epoch": 0.03515007898894155, "grad_norm": 4.703863620758057, "learning_rate": 1.2105263157894738e-06, "loss": 1.2263, "step": 178 }, { "epoch": 0.035347551342812006, "grad_norm": 4.113995552062988, "learning_rate": 1.1578947368421053e-06, "loss": 1.8098, "step": 179 }, { "epoch": 0.035545023696682464, "grad_norm": 5.088428020477295, "learning_rate": 1.1052631578947369e-06, "loss": 1.7887, "step": 180 }, { "epoch": 0.03574249605055292, "grad_norm": 4.361863613128662, "learning_rate": 1.0526315789473685e-06, "loss": 1.8482, "step": 181 }, { "epoch": 0.03593996840442338, "grad_norm": 3.7712790966033936, "learning_rate": 1.0000000000000002e-06, "loss": 1.0318, "step": 182 }, { "epoch": 0.03613744075829384, "grad_norm": 4.38617467880249, "learning_rate": 9.473684210526317e-07, "loss": 1.1747, "step": 183 }, { "epoch": 0.036334913112164295, "grad_norm": 4.757498741149902, "learning_rate": 8.947368421052632e-07, "loss": 1.591, "step": 184 }, { "epoch": 0.03653238546603475, "grad_norm": 4.661401748657227, "learning_rate": 8.421052631578948e-07, "loss": 1.2443, "step": 185 }, { "epoch": 0.03672985781990521, "grad_norm": 4.178214073181152, "learning_rate": 7.894736842105263e-07, "loss": 1.6486, "step": 186 }, { "epoch": 0.03692733017377567, "grad_norm": 4.652418613433838, "learning_rate": 7.368421052631579e-07, "loss": 1.7046, "step": 187 }, { "epoch": 0.03712480252764613, "grad_norm": 5.367217540740967, "learning_rate": 6.842105263157896e-07, "loss": 1.1643, "step": 188 }, { "epoch": 0.037322274881516584, "grad_norm": 5.026525020599365, "learning_rate": 6.315789473684211e-07, "loss": 1.8984, "step": 189 }, { "epoch": 0.03751974723538705, "grad_norm": 5.15156888961792, "learning_rate": 5.789473684210526e-07, "loss": 1.199, "step": 190 }, { "epoch": 0.03771721958925751, "grad_norm": 4.1339111328125, "learning_rate": 5.263157894736843e-07, "loss": 1.2138, "step": 191 }, { "epoch": 0.037914691943127965, "grad_norm": 4.904068946838379, "learning_rate": 4.7368421052631585e-07, "loss": 0.9992, "step": 192 }, { "epoch": 0.03811216429699842, "grad_norm": 4.655853271484375, "learning_rate": 4.210526315789474e-07, "loss": 0.9699, "step": 193 }, { "epoch": 0.03830963665086888, "grad_norm": 4.786022663116455, "learning_rate": 3.6842105263157896e-07, "loss": 1.1047, "step": 194 }, { "epoch": 0.03850710900473934, "grad_norm": 4.2019362449646, "learning_rate": 3.1578947368421055e-07, "loss": 1.3494, "step": 195 }, { "epoch": 0.038704581358609796, "grad_norm": 4.608132839202881, "learning_rate": 2.6315789473684213e-07, "loss": 1.5102, "step": 196 }, { "epoch": 0.038902053712480254, "grad_norm": 3.955866813659668, "learning_rate": 2.105263157894737e-07, "loss": 0.9329, "step": 197 }, { "epoch": 0.03909952606635071, "grad_norm": 3.948068857192993, "learning_rate": 1.5789473684210527e-07, "loss": 0.9115, "step": 198 }, { "epoch": 0.03929699842022117, "grad_norm": 4.394677639007568, "learning_rate": 1.0526315789473685e-07, "loss": 0.842, "step": 199 }, { "epoch": 0.03949447077409163, "grad_norm": 4.138340473175049, "learning_rate": 5.263157894736842e-08, "loss": 1.0818, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6305414963527680.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }