| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.992511233150275, |
| "eval_steps": 500, |
| "global_step": 625, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00798801797304044, |
| "grad_norm": 5.8922959558083035, |
| "learning_rate": 1.26984126984127e-06, |
| "loss": 0.9284, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01597603594608088, |
| "grad_norm": 5.937587864934546, |
| "learning_rate": 2.53968253968254e-06, |
| "loss": 0.9318, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.023964053919121316, |
| "grad_norm": 5.861772382161128, |
| "learning_rate": 3.80952380952381e-06, |
| "loss": 0.9331, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.03195207189216176, |
| "grad_norm": 5.239005126601421, |
| "learning_rate": 5.07936507936508e-06, |
| "loss": 0.9119, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0399400898652022, |
| "grad_norm": 3.629499849213271, |
| "learning_rate": 6.349206349206349e-06, |
| "loss": 0.8754, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04792810783824263, |
| "grad_norm": 2.106015204146543, |
| "learning_rate": 7.61904761904762e-06, |
| "loss": 0.836, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.05591612581128307, |
| "grad_norm": 4.356383106407689, |
| "learning_rate": 8.888888888888888e-06, |
| "loss": 0.8711, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.06390414378432352, |
| "grad_norm": 4.748038669119492, |
| "learning_rate": 1.015873015873016e-05, |
| "loss": 0.8657, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.07189216175736396, |
| "grad_norm": 4.437164847463165, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": 0.8249, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0798801797304044, |
| "grad_norm": 4.231505558889787, |
| "learning_rate": 1.2698412698412699e-05, |
| "loss": 0.8205, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08786819770344484, |
| "grad_norm": 2.771780905554085, |
| "learning_rate": 1.3968253968253968e-05, |
| "loss": 0.8071, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.09585621567648527, |
| "grad_norm": 1.7918482116297212, |
| "learning_rate": 1.523809523809524e-05, |
| "loss": 0.7653, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1038442336495257, |
| "grad_norm": 1.6236141779129738, |
| "learning_rate": 1.6507936507936507e-05, |
| "loss": 0.7437, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.11183225162256615, |
| "grad_norm": 1.2870146428263272, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 0.736, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.11982026959560658, |
| "grad_norm": 1.0068702786417012, |
| "learning_rate": 1.904761904761905e-05, |
| "loss": 0.7124, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.12780828756864704, |
| "grad_norm": 1.1636059875738414, |
| "learning_rate": 2.031746031746032e-05, |
| "loss": 0.7004, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.13579630554168748, |
| "grad_norm": 0.8899548891950194, |
| "learning_rate": 2.158730158730159e-05, |
| "loss": 0.6953, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.14378432351472792, |
| "grad_norm": 0.8171634825879731, |
| "learning_rate": 2.2857142857142858e-05, |
| "loss": 0.6899, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.15177234148776836, |
| "grad_norm": 0.8423601505147725, |
| "learning_rate": 2.4126984126984128e-05, |
| "loss": 0.6759, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.1597603594608088, |
| "grad_norm": 0.9144240660639567, |
| "learning_rate": 2.5396825396825397e-05, |
| "loss": 0.6768, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.16774837743384924, |
| "grad_norm": 0.7527042679957461, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 0.6664, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.17573639540688968, |
| "grad_norm": 0.9115589252395023, |
| "learning_rate": 2.7936507936507936e-05, |
| "loss": 0.6685, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.18372441337993012, |
| "grad_norm": 0.7794511419641769, |
| "learning_rate": 2.9206349206349206e-05, |
| "loss": 0.6476, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.19171243135297053, |
| "grad_norm": 0.8206145936410231, |
| "learning_rate": 3.047619047619048e-05, |
| "loss": 0.6555, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.19970044932601097, |
| "grad_norm": 0.8660748611689925, |
| "learning_rate": 3.1746031746031745e-05, |
| "loss": 0.6504, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2076884672990514, |
| "grad_norm": 1.09005656089158, |
| "learning_rate": 3.3015873015873014e-05, |
| "loss": 0.6468, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.21567648527209185, |
| "grad_norm": 1.2233269812335474, |
| "learning_rate": 3.4285714285714284e-05, |
| "loss": 0.6554, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.2236645032451323, |
| "grad_norm": 0.7202441107469458, |
| "learning_rate": 3.555555555555555e-05, |
| "loss": 0.6351, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.23165252121817273, |
| "grad_norm": 1.549610538416556, |
| "learning_rate": 3.682539682539683e-05, |
| "loss": 0.6386, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.23964053919121317, |
| "grad_norm": 0.7964826077261805, |
| "learning_rate": 3.80952380952381e-05, |
| "loss": 0.6282, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.2476285571642536, |
| "grad_norm": 0.6903264777596222, |
| "learning_rate": 3.936507936507937e-05, |
| "loss": 0.6281, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.2556165751372941, |
| "grad_norm": 1.2761326884044875, |
| "learning_rate": 4.063492063492064e-05, |
| "loss": 0.6216, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.2636045931103345, |
| "grad_norm": 1.3286354473207003, |
| "learning_rate": 4.190476190476191e-05, |
| "loss": 0.6196, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.27159261108337496, |
| "grad_norm": 0.6908894226839724, |
| "learning_rate": 4.317460317460318e-05, |
| "loss": 0.6144, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.2795806290564154, |
| "grad_norm": 1.0386450814645398, |
| "learning_rate": 4.444444444444445e-05, |
| "loss": 0.6155, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.28756864702945584, |
| "grad_norm": 0.7231485406568985, |
| "learning_rate": 4.5714285714285716e-05, |
| "loss": 0.6081, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.2955566650024963, |
| "grad_norm": 1.0158040603959178, |
| "learning_rate": 4.698412698412699e-05, |
| "loss": 0.6071, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.3035446829755367, |
| "grad_norm": 1.5638712924845808, |
| "learning_rate": 4.8253968253968255e-05, |
| "loss": 0.6011, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.31153270094857716, |
| "grad_norm": 0.9158856622661424, |
| "learning_rate": 4.952380952380953e-05, |
| "loss": 0.6029, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.3195207189216176, |
| "grad_norm": 1.533932723524169, |
| "learning_rate": 5.0793650793650794e-05, |
| "loss": 0.6007, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.32750873689465804, |
| "grad_norm": 1.1151203871839255, |
| "learning_rate": 5.206349206349207e-05, |
| "loss": 0.614, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.3354967548676985, |
| "grad_norm": 1.7936006261869704, |
| "learning_rate": 5.333333333333333e-05, |
| "loss": 0.5964, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.3434847728407389, |
| "grad_norm": 1.6373220709210505, |
| "learning_rate": 5.460317460317461e-05, |
| "loss": 0.6048, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.35147279081377936, |
| "grad_norm": 1.1820238351172419, |
| "learning_rate": 5.587301587301587e-05, |
| "loss": 0.5983, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.3594608087868198, |
| "grad_norm": 1.0448595195163097, |
| "learning_rate": 5.714285714285715e-05, |
| "loss": 0.6015, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.36744882675986024, |
| "grad_norm": 0.9595564806215681, |
| "learning_rate": 5.841269841269841e-05, |
| "loss": 0.5845, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.3754368447329007, |
| "grad_norm": 1.5962786237575002, |
| "learning_rate": 5.968253968253969e-05, |
| "loss": 0.5995, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.38342486270594106, |
| "grad_norm": 1.5871239780794693, |
| "learning_rate": 6.095238095238096e-05, |
| "loss": 0.5884, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.3914128806789815, |
| "grad_norm": 1.1244276800474557, |
| "learning_rate": 6.222222222222223e-05, |
| "loss": 0.597, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.39940089865202194, |
| "grad_norm": 1.971108394067141, |
| "learning_rate": 6.349206349206349e-05, |
| "loss": 0.5959, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4073889166250624, |
| "grad_norm": 1.119155483104472, |
| "learning_rate": 6.476190476190477e-05, |
| "loss": 0.595, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.4153769345981028, |
| "grad_norm": 2.3293959233637813, |
| "learning_rate": 6.603174603174603e-05, |
| "loss": 0.5968, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.42336495257114326, |
| "grad_norm": 1.7690872710201135, |
| "learning_rate": 6.730158730158731e-05, |
| "loss": 0.5942, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.4313529705441837, |
| "grad_norm": 1.5082563438406895, |
| "learning_rate": 6.857142857142857e-05, |
| "loss": 0.5929, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.43934098851722414, |
| "grad_norm": 1.705505860185178, |
| "learning_rate": 6.984126984126985e-05, |
| "loss": 0.5886, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.4473290064902646, |
| "grad_norm": 1.311562682930509, |
| "learning_rate": 7.11111111111111e-05, |
| "loss": 0.5942, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.455317024463305, |
| "grad_norm": 1.3702806631104458, |
| "learning_rate": 7.238095238095239e-05, |
| "loss": 0.5918, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.46330504243634546, |
| "grad_norm": 1.3735083834456305, |
| "learning_rate": 7.365079365079366e-05, |
| "loss": 0.5944, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.4712930604093859, |
| "grad_norm": 1.5109411814380815, |
| "learning_rate": 7.492063492063493e-05, |
| "loss": 0.5859, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.47928107838242634, |
| "grad_norm": 1.8414765598754854, |
| "learning_rate": 7.61904761904762e-05, |
| "loss": 0.5932, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4872690963554668, |
| "grad_norm": 1.1402061244328228, |
| "learning_rate": 7.746031746031747e-05, |
| "loss": 0.5828, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.4952571143285072, |
| "grad_norm": 1.794539731996526, |
| "learning_rate": 7.873015873015874e-05, |
| "loss": 0.5792, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.5032451323015477, |
| "grad_norm": 1.4047554942240879, |
| "learning_rate": 8e-05, |
| "loss": 0.5804, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.5112331502745882, |
| "grad_norm": 1.3931552496353343, |
| "learning_rate": 7.999937503459301e-05, |
| "loss": 0.5775, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.5192211682476285, |
| "grad_norm": 1.1656900196646254, |
| "learning_rate": 7.999750015790111e-05, |
| "loss": 0.5909, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.527209186220669, |
| "grad_norm": 1.1493581998452567, |
| "learning_rate": 7.999437542851095e-05, |
| "loss": 0.5754, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.5351972041937094, |
| "grad_norm": 1.9412467459743252, |
| "learning_rate": 7.999000094406493e-05, |
| "loss": 0.5932, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.5431852221667499, |
| "grad_norm": 1.3410903514703634, |
| "learning_rate": 7.998437684125812e-05, |
| "loss": 0.5849, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.5511732401397903, |
| "grad_norm": 1.1599213167605864, |
| "learning_rate": 7.997750329583402e-05, |
| "loss": 0.5779, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.5591612581128308, |
| "grad_norm": 2.611492770456904, |
| "learning_rate": 7.9969380522579e-05, |
| "loss": 0.5936, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5671492760858712, |
| "grad_norm": 1.5087257150690652, |
| "learning_rate": 7.996000877531569e-05, |
| "loss": 0.5884, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.5751372940589117, |
| "grad_norm": 2.6141462248634086, |
| "learning_rate": 7.9949388346895e-05, |
| "loss": 0.5951, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.5831253120319521, |
| "grad_norm": 1.9836349293790256, |
| "learning_rate": 7.993751956918693e-05, |
| "loss": 0.5874, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.5911133300049926, |
| "grad_norm": 1.5274699003911547, |
| "learning_rate": 7.992440281307027e-05, |
| "loss": 0.5962, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.5991013479780329, |
| "grad_norm": 1.2158179637702575, |
| "learning_rate": 7.991003848842093e-05, |
| "loss": 0.5801, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.6070893659510734, |
| "grad_norm": 1.2430162793293555, |
| "learning_rate": 7.989442704409925e-05, |
| "loss": 0.5757, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.6150773839241138, |
| "grad_norm": 0.9546052456533828, |
| "learning_rate": 7.987756896793583e-05, |
| "loss": 0.5836, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.6230654018971543, |
| "grad_norm": 1.051061984198158, |
| "learning_rate": 7.985946478671642e-05, |
| "loss": 0.575, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.6310534198701947, |
| "grad_norm": 1.025823090309492, |
| "learning_rate": 7.984011506616534e-05, |
| "loss": 0.5792, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.6390414378432352, |
| "grad_norm": 1.0879892769571216, |
| "learning_rate": 7.981952041092792e-05, |
| "loss": 0.575, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6470294558162756, |
| "grad_norm": 1.3203984543837413, |
| "learning_rate": 7.979768146455148e-05, |
| "loss": 0.5725, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.6550174737893161, |
| "grad_norm": 0.8793400599633049, |
| "learning_rate": 7.977459890946534e-05, |
| "loss": 0.5643, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.6630054917623565, |
| "grad_norm": 0.9839614386276342, |
| "learning_rate": 7.975027346695943e-05, |
| "loss": 0.5609, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.670993509735397, |
| "grad_norm": 1.087269282291481, |
| "learning_rate": 7.972470589716175e-05, |
| "loss": 0.5706, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.6789815277084373, |
| "grad_norm": 0.8949957037226873, |
| "learning_rate": 7.969789699901462e-05, |
| "loss": 0.5718, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.6869695456814778, |
| "grad_norm": 0.5685210804043624, |
| "learning_rate": 7.966984761024974e-05, |
| "loss": 0.5651, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.6949575636545182, |
| "grad_norm": 0.7365421304468946, |
| "learning_rate": 7.964055860736199e-05, |
| "loss": 0.5625, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.7029455816275587, |
| "grad_norm": 0.6519155688073771, |
| "learning_rate": 7.961003090558208e-05, |
| "loss": 0.5602, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.7109335996005991, |
| "grad_norm": 0.47928031192412984, |
| "learning_rate": 7.957826545884786e-05, |
| "loss": 0.5549, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.7189216175736396, |
| "grad_norm": 0.7685907979864348, |
| "learning_rate": 7.95452632597746e-05, |
| "loss": 0.5558, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.72690963554668, |
| "grad_norm": 0.7342212353643156, |
| "learning_rate": 7.951102533962393e-05, |
| "loss": 0.5539, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.7348976535197205, |
| "grad_norm": 0.526378766562186, |
| "learning_rate": 7.947555276827166e-05, |
| "loss": 0.5604, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.7428856714927609, |
| "grad_norm": 0.763635167097638, |
| "learning_rate": 7.94388466541743e-05, |
| "loss": 0.5604, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.7508736894658014, |
| "grad_norm": 1.1133910887739713, |
| "learning_rate": 7.940090814433437e-05, |
| "loss": 0.5502, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.7588617074388417, |
| "grad_norm": 1.350450301452925, |
| "learning_rate": 7.936173842426473e-05, |
| "loss": 0.5607, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.7668497254118821, |
| "grad_norm": 0.47766209706502316, |
| "learning_rate": 7.932133871795136e-05, |
| "loss": 0.5584, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.7748377433849226, |
| "grad_norm": 0.8964819495426043, |
| "learning_rate": 7.927971028781522e-05, |
| "loss": 0.5533, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.782825761357963, |
| "grad_norm": 1.2844756885032345, |
| "learning_rate": 7.923685443467275e-05, |
| "loss": 0.5439, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.7908137793310035, |
| "grad_norm": 0.7076588316414215, |
| "learning_rate": 7.919277249769522e-05, |
| "loss": 0.5516, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.7988017973040439, |
| "grad_norm": 0.9548748366290979, |
| "learning_rate": 7.914746585436692e-05, |
| "loss": 0.5622, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.8067898152770844, |
| "grad_norm": 1.0033397557294186, |
| "learning_rate": 7.91009359204421e-05, |
| "loss": 0.55, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.8147778332501248, |
| "grad_norm": 0.8058153670114928, |
| "learning_rate": 7.90531841499007e-05, |
| "loss": 0.5472, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.8227658512231653, |
| "grad_norm": 0.7522684804995226, |
| "learning_rate": 7.900421203490295e-05, |
| "loss": 0.5475, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.8307538691962056, |
| "grad_norm": 0.8260701286176672, |
| "learning_rate": 7.895402110574277e-05, |
| "loss": 0.546, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.8387418871692461, |
| "grad_norm": 0.9294034148971123, |
| "learning_rate": 7.890261293079985e-05, |
| "loss": 0.5486, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.8467299051422865, |
| "grad_norm": 0.6210859012554373, |
| "learning_rate": 7.884998911649077e-05, |
| "loss": 0.5565, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.854717923115327, |
| "grad_norm": 0.6446646964930844, |
| "learning_rate": 7.879615130721868e-05, |
| "loss": 0.539, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.8627059410883674, |
| "grad_norm": 0.8996911090197094, |
| "learning_rate": 7.8741101185322e-05, |
| "loss": 0.5422, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.8706939590614079, |
| "grad_norm": 0.9338087827721026, |
| "learning_rate": 7.868484047102183e-05, |
| "loss": 0.5535, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.8786819770344483, |
| "grad_norm": 1.1026810388479344, |
| "learning_rate": 7.862737092236818e-05, |
| "loss": 0.5453, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.8866699950074888, |
| "grad_norm": 0.9663842431402072, |
| "learning_rate": 7.856869433518506e-05, |
| "loss": 0.5452, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.8946580129805292, |
| "grad_norm": 1.0210253102387117, |
| "learning_rate": 7.850881254301432e-05, |
| "loss": 0.5568, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.9026460309535697, |
| "grad_norm": 0.8477567856764551, |
| "learning_rate": 7.844772741705835e-05, |
| "loss": 0.545, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.91063404892661, |
| "grad_norm": 0.5613356829580358, |
| "learning_rate": 7.838544086612174e-05, |
| "loss": 0.5438, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.9186220668996505, |
| "grad_norm": 0.6248181380373118, |
| "learning_rate": 7.832195483655144e-05, |
| "loss": 0.5366, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.9266100848726909, |
| "grad_norm": 0.8519302343250585, |
| "learning_rate": 7.825727131217609e-05, |
| "loss": 0.5401, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.9345981028457314, |
| "grad_norm": 0.45919068712258837, |
| "learning_rate": 7.81913923142439e-05, |
| "loss": 0.5518, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.9425861208187718, |
| "grad_norm": 0.5491942320357649, |
| "learning_rate": 7.812431990135965e-05, |
| "loss": 0.545, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.9505741387918123, |
| "grad_norm": 0.7204970814629463, |
| "learning_rate": 7.805605616942023e-05, |
| "loss": 0.5502, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.9585621567648527, |
| "grad_norm": 0.624445399157028, |
| "learning_rate": 7.798660325154917e-05, |
| "loss": 0.5465, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.9665501747378932, |
| "grad_norm": 0.43723265221924457, |
| "learning_rate": 7.791596331803003e-05, |
| "loss": 0.5387, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.9745381927109336, |
| "grad_norm": 0.40296635700807665, |
| "learning_rate": 7.784413857623856e-05, |
| "loss": 0.5384, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.982526210683974, |
| "grad_norm": 0.4355607269982166, |
| "learning_rate": 7.77711312705737e-05, |
| "loss": 0.5391, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.9905142286570144, |
| "grad_norm": 0.37094543758250353, |
| "learning_rate": 7.769694368238746e-05, |
| "loss": 0.534, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.9985022466300549, |
| "grad_norm": 0.36678250566452825, |
| "learning_rate": 7.762157812991369e-05, |
| "loss": 0.535, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.0064902646030953, |
| "grad_norm": 0.7573100076200363, |
| "learning_rate": 7.754503696819553e-05, |
| "loss": 0.955, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.0144782825761358, |
| "grad_norm": 1.0167041671110564, |
| "learning_rate": 7.74673225890119e-05, |
| "loss": 0.5181, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.0224663005491763, |
| "grad_norm": 1.0181250181107355, |
| "learning_rate": 7.738843742080269e-05, |
| "loss": 0.5237, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.0304543185222166, |
| "grad_norm": 1.1080605772500498, |
| "learning_rate": 7.730838392859303e-05, |
| "loss": 0.5312, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.038442336495257, |
| "grad_norm": 0.7638562997222614, |
| "learning_rate": 7.722716461391603e-05, |
| "loss": 0.5338, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.0464303544682976, |
| "grad_norm": 0.8619620628236141, |
| "learning_rate": 7.714478201473483e-05, |
| "loss": 0.5249, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.054418372441338, |
| "grad_norm": 1.1654304124994774, |
| "learning_rate": 7.706123870536315e-05, |
| "loss": 0.5208, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.0624063904143783, |
| "grad_norm": 0.5330720342927018, |
| "learning_rate": 7.697653729638489e-05, |
| "loss": 0.5184, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.0703944083874188, |
| "grad_norm": 1.020325885284434, |
| "learning_rate": 7.689068043457261e-05, |
| "loss": 0.5128, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.0783824263604593, |
| "grad_norm": 0.6983781848617573, |
| "learning_rate": 7.68036708028047e-05, |
| "loss": 0.518, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.0863704443334998, |
| "grad_norm": 0.6057523169656847, |
| "learning_rate": 7.671551111998169e-05, |
| "loss": 0.5196, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.09435846230654, |
| "grad_norm": 0.5211411106516707, |
| "learning_rate": 7.662620414094117e-05, |
| "loss": 0.5199, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.1023464802795806, |
| "grad_norm": 0.5166573997289899, |
| "learning_rate": 7.653575265637177e-05, |
| "loss": 0.5154, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.110334498252621, |
| "grad_norm": 0.4470708726865469, |
| "learning_rate": 7.644415949272591e-05, |
| "loss": 0.5098, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.1183225162256616, |
| "grad_norm": 0.5357218094920962, |
| "learning_rate": 7.635142751213156e-05, |
| "loss": 0.5196, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.1263105341987019, |
| "grad_norm": 0.48982578714373154, |
| "learning_rate": 7.62575596123027e-05, |
| "loss": 0.5112, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.1342985521717424, |
| "grad_norm": 0.3953911478616972, |
| "learning_rate": 7.616255872644888e-05, |
| "loss": 0.5022, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.1422865701447829, |
| "grad_norm": 0.46599322968658796, |
| "learning_rate": 7.60664278231834e-05, |
| "loss": 0.5067, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.1502745881178233, |
| "grad_norm": 0.47850160868681485, |
| "learning_rate": 7.596916990643077e-05, |
| "loss": 0.5028, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.1582626060908636, |
| "grad_norm": 0.42978953466708475, |
| "learning_rate": 7.587078801533262e-05, |
| "loss": 0.5015, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.1662506240639041, |
| "grad_norm": 0.3540055333518291, |
| "learning_rate": 7.577128522415292e-05, |
| "loss": 0.5076, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.1742386420369446, |
| "grad_norm": 0.3351153000601574, |
| "learning_rate": 7.567066464218178e-05, |
| "loss": 0.4989, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.182226660009985, |
| "grad_norm": 0.3005800301999229, |
| "learning_rate": 7.556892941363833e-05, |
| "loss": 0.4967, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.1902146779830254, |
| "grad_norm": 0.3563502792477842, |
| "learning_rate": 7.546608271757251e-05, |
| "loss": 0.5107, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.1982026959560659, |
| "grad_norm": 0.38770493909399334, |
| "learning_rate": 7.536212776776567e-05, |
| "loss": 0.5104, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.2061907139291064, |
| "grad_norm": 0.3767151991317555, |
| "learning_rate": 7.525706781263023e-05, |
| "loss": 0.5102, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.2141787319021469, |
| "grad_norm": 0.4105950587040687, |
| "learning_rate": 7.515090613510801e-05, |
| "loss": 0.4986, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.2221667498751871, |
| "grad_norm": 0.42936249879191585, |
| "learning_rate": 7.504364605256784e-05, |
| "loss": 0.5035, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.2301547678482276, |
| "grad_norm": 0.4346225237944244, |
| "learning_rate": 7.493529091670181e-05, |
| "loss": 0.4988, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.2381427858212681, |
| "grad_norm": 0.4396844168311194, |
| "learning_rate": 7.482584411342043e-05, |
| "loss": 0.5077, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.2461308037943086, |
| "grad_norm": 0.431746092302867, |
| "learning_rate": 7.471530906274704e-05, |
| "loss": 0.4983, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.254118821767349, |
| "grad_norm": 0.5889910567664702, |
| "learning_rate": 7.460368921871077e-05, |
| "loss": 0.5122, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.2621068397403894, |
| "grad_norm": 0.6830133790630488, |
| "learning_rate": 7.44909880692387e-05, |
| "loss": 0.5073, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.27009485771343, |
| "grad_norm": 0.6354350767066138, |
| "learning_rate": 7.437720913604681e-05, |
| "loss": 0.5117, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.2780828756864704, |
| "grad_norm": 0.4963286720098572, |
| "learning_rate": 7.426235597452995e-05, |
| "loss": 0.4993, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.2860708936595107, |
| "grad_norm": 0.418831779419711, |
| "learning_rate": 7.41464321736508e-05, |
| "loss": 0.5021, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.2940589116325512, |
| "grad_norm": 0.4787432347277129, |
| "learning_rate": 7.402944135582758e-05, |
| "loss": 0.502, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.3020469296055917, |
| "grad_norm": 0.50062737801301, |
| "learning_rate": 7.391138717682103e-05, |
| "loss": 0.4937, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.310034947578632, |
| "grad_norm": 0.39201954318713855, |
| "learning_rate": 7.379227332562005e-05, |
| "loss": 0.5003, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.3180229655516724, |
| "grad_norm": 0.31007216413114186, |
| "learning_rate": 7.367210352432645e-05, |
| "loss": 0.502, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.326010983524713, |
| "grad_norm": 0.42076785863557453, |
| "learning_rate": 7.355088152803866e-05, |
| "loss": 0.501, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.3339990014977534, |
| "grad_norm": 0.4745296323176778, |
| "learning_rate": 7.342861112473442e-05, |
| "loss": 0.4979, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.341987019470794, |
| "grad_norm": 0.4199718916823893, |
| "learning_rate": 7.330529613515232e-05, |
| "loss": 0.4984, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.3499750374438342, |
| "grad_norm": 0.3814943625708202, |
| "learning_rate": 7.318094041267253e-05, |
| "loss": 0.4946, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.3579630554168747, |
| "grad_norm": 0.3584958844621985, |
| "learning_rate": 7.305554784319625e-05, |
| "loss": 0.4945, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.3659510733899152, |
| "grad_norm": 0.3258027404514737, |
| "learning_rate": 7.29291223450244e-05, |
| "loss": 0.4936, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.3739390913629554, |
| "grad_norm": 0.3304823682468289, |
| "learning_rate": 7.280166786873514e-05, |
| "loss": 0.4957, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.381927109335996, |
| "grad_norm": 0.285695277322611, |
| "learning_rate": 7.267318839706038e-05, |
| "loss": 0.5004, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.3899151273090364, |
| "grad_norm": 0.360711874339804, |
| "learning_rate": 7.25436879447614e-05, |
| "loss": 0.4946, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.397903145282077, |
| "grad_norm": 0.4690067762041838, |
| "learning_rate": 7.241317055850336e-05, |
| "loss": 0.4933, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.4058911632551174, |
| "grad_norm": 0.48954294072750454, |
| "learning_rate": 7.228164031672879e-05, |
| "loss": 0.4958, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.4138791812281577, |
| "grad_norm": 0.5871985410108085, |
| "learning_rate": 7.214910132953027e-05, |
| "loss": 0.495, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.4218671992011982, |
| "grad_norm": 0.720040324723498, |
| "learning_rate": 7.201555773852189e-05, |
| "loss": 0.4989, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.4298552171742387, |
| "grad_norm": 0.8159522745469254, |
| "learning_rate": 7.188101371670991e-05, |
| "loss": 0.5006, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.437843235147279, |
| "grad_norm": 0.8363865485901019, |
| "learning_rate": 7.174547346836228e-05, |
| "loss": 0.5069, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.4458312531203195, |
| "grad_norm": 0.7345453619769279, |
| "learning_rate": 7.160894122887733e-05, |
| "loss": 0.4927, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.45381927109336, |
| "grad_norm": 0.589527692471703, |
| "learning_rate": 7.147142126465138e-05, |
| "loss": 0.4955, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.4618072890664005, |
| "grad_norm": 0.4423587194525166, |
| "learning_rate": 7.133291787294547e-05, |
| "loss": 0.5094, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.469795307039441, |
| "grad_norm": 0.40340279142628255, |
| "learning_rate": 7.119343538175102e-05, |
| "loss": 0.4967, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.4777833250124812, |
| "grad_norm": 0.4982976531352129, |
| "learning_rate": 7.10529781496546e-05, |
| "loss": 0.4951, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.4857713429855217, |
| "grad_norm": 0.45741686448136076, |
| "learning_rate": 7.09115505657018e-05, |
| "loss": 0.4839, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.4937593609585622, |
| "grad_norm": 0.32134532426731377, |
| "learning_rate": 7.076915704926e-05, |
| "loss": 0.4947, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.5017473789316025, |
| "grad_norm": 0.2578730665869774, |
| "learning_rate": 7.062580204988028e-05, |
| "loss": 0.4885, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.509735396904643, |
| "grad_norm": 0.3424320920246288, |
| "learning_rate": 7.048149004715843e-05, |
| "loss": 0.4968, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.5177234148776835, |
| "grad_norm": 0.40215949965851383, |
| "learning_rate": 7.033622555059491e-05, |
| "loss": 0.4964, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.525711432850724, |
| "grad_norm": 0.3989533402101727, |
| "learning_rate": 7.0190013099454e-05, |
| "loss": 0.4993, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.5336994508237645, |
| "grad_norm": 0.2863829598271095, |
| "learning_rate": 7.004285726262188e-05, |
| "loss": 0.5058, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.5416874687968047, |
| "grad_norm": 0.24052248409440963, |
| "learning_rate": 6.989476263846396e-05, |
| "loss": 0.4861, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.5496754867698452, |
| "grad_norm": 0.4598329169035325, |
| "learning_rate": 6.974573385468105e-05, |
| "loss": 0.5007, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.5576635047428855, |
| "grad_norm": 0.6079055307812807, |
| "learning_rate": 6.95957755681649e-05, |
| "loss": 0.5008, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.565651522715926, |
| "grad_norm": 0.5580132412627938, |
| "learning_rate": 6.944489246485257e-05, |
| "loss": 0.4962, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.5736395406889665, |
| "grad_norm": 0.42994805656529084, |
| "learning_rate": 6.929308925958009e-05, |
| "loss": 0.5076, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.581627558662007, |
| "grad_norm": 0.3842832421038355, |
| "learning_rate": 6.914037069593504e-05, |
| "loss": 0.4924, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.5896155766350475, |
| "grad_norm": 0.32699055905703517, |
| "learning_rate": 6.898674154610839e-05, |
| "loss": 0.4921, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.597603594608088, |
| "grad_norm": 0.42528398283904756, |
| "learning_rate": 6.883220661074534e-05, |
| "loss": 0.4928, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.6055916125811283, |
| "grad_norm": 0.6183497108648602, |
| "learning_rate": 6.867677071879535e-05, |
| "loss": 0.4993, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.6135796305541688, |
| "grad_norm": 0.7584925576329896, |
| "learning_rate": 6.852043872736116e-05, |
| "loss": 0.4846, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.621567648527209, |
| "grad_norm": 0.6243564361060799, |
| "learning_rate": 6.836321552154714e-05, |
| "loss": 0.5007, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.6295556665002495, |
| "grad_norm": 0.3651441665883393, |
| "learning_rate": 6.820510601430649e-05, |
| "loss": 0.4936, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.63754368447329, |
| "grad_norm": 0.23834669483267124, |
| "learning_rate": 6.804611514628788e-05, |
| "loss": 0.4857, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.6455317024463305, |
| "grad_norm": 0.3073254289591667, |
| "learning_rate": 6.78862478856809e-05, |
| "loss": 0.4974, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.653519720419371, |
| "grad_norm": 0.3183758714531585, |
| "learning_rate": 6.772550922806096e-05, |
| "loss": 0.4915, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.6615077383924115, |
| "grad_norm": 0.2710320114390746, |
| "learning_rate": 6.756390419623307e-05, |
| "loss": 0.4901, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.6694957563654518, |
| "grad_norm": 0.27532630096114225, |
| "learning_rate": 6.740143784007495e-05, |
| "loss": 0.4885, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.6774837743384923, |
| "grad_norm": 0.24949516998489749, |
| "learning_rate": 6.723811523637923e-05, |
| "loss": 0.4948, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.6854717923115325, |
| "grad_norm": 0.27385769367337703, |
| "learning_rate": 6.707394148869479e-05, |
| "loss": 0.4963, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.693459810284573, |
| "grad_norm": 0.3041551075828834, |
| "learning_rate": 6.690892172716726e-05, |
| "loss": 0.486, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.7014478282576135, |
| "grad_norm": 0.3555632959677351, |
| "learning_rate": 6.674306110837881e-05, |
| "loss": 0.499, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.709435846230654, |
| "grad_norm": 0.3329437137508577, |
| "learning_rate": 6.657636481518683e-05, |
| "loss": 0.4949, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.7174238642036945, |
| "grad_norm": 0.3417126321251888, |
| "learning_rate": 6.640883805656221e-05, |
| "loss": 0.4913, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.725411882176735, |
| "grad_norm": 0.3989241732557222, |
| "learning_rate": 6.624048606742636e-05, |
| "loss": 0.4911, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.7333999001497753, |
| "grad_norm": 0.45014562286637283, |
| "learning_rate": 6.607131410848777e-05, |
| "loss": 0.4932, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.7413879181228158, |
| "grad_norm": 0.4927365755110579, |
| "learning_rate": 6.590132746607755e-05, |
| "loss": 0.4929, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.749375936095856, |
| "grad_norm": 0.5486106005274718, |
| "learning_rate": 6.573053145198422e-05, |
| "loss": 0.4924, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.7573639540688966, |
| "grad_norm": 0.5493013804791822, |
| "learning_rate": 6.555893140328787e-05, |
| "loss": 0.5029, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.765351972041937, |
| "grad_norm": 0.4921038998096511, |
| "learning_rate": 6.538653268219316e-05, |
| "loss": 0.501, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.7733399900149776, |
| "grad_norm": 0.36708379922405937, |
| "learning_rate": 6.521334067586194e-05, |
| "loss": 0.4912, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.781328007988018, |
| "grad_norm": 0.2934447036565008, |
| "learning_rate": 6.503936079624486e-05, |
| "loss": 0.4924, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.7893160259610585, |
| "grad_norm": 0.41971512428606667, |
| "learning_rate": 6.486459847991226e-05, |
| "loss": 0.4867, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.7973040439340988, |
| "grad_norm": 0.38954075869198324, |
| "learning_rate": 6.46890591878842e-05, |
| "loss": 0.4833, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.8052920619071393, |
| "grad_norm": 0.34504882506932716, |
| "learning_rate": 6.451274840545995e-05, |
| "loss": 0.4952, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.8132800798801796, |
| "grad_norm": 0.3115751552302506, |
| "learning_rate": 6.433567164204652e-05, |
| "loss": 0.4838, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.82126809785322, |
| "grad_norm": 0.3412485251072806, |
| "learning_rate": 6.415783443098645e-05, |
| "loss": 0.4855, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.8292561158262606, |
| "grad_norm": 0.4108218843875664, |
| "learning_rate": 6.397924232938504e-05, |
| "loss": 0.4911, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.837244133799301, |
| "grad_norm": 0.348838980704177, |
| "learning_rate": 6.379990091793653e-05, |
| "loss": 0.4924, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.8452321517723416, |
| "grad_norm": 0.2727569106903297, |
| "learning_rate": 6.361981580074983e-05, |
| "loss": 0.4875, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.853220169745382, |
| "grad_norm": 0.31966296310063425, |
| "learning_rate": 6.343899260517339e-05, |
| "loss": 0.4929, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.8612081877184223, |
| "grad_norm": 0.2973479822646696, |
| "learning_rate": 6.325743698161927e-05, |
| "loss": 0.4929, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.8691962056914628, |
| "grad_norm": 0.34272092476530364, |
| "learning_rate": 6.307515460338672e-05, |
| "loss": 0.4896, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.877184223664503, |
| "grad_norm": 0.3581061926529654, |
| "learning_rate": 6.289215116648477e-05, |
| "loss": 0.486, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.8851722416375436, |
| "grad_norm": 0.2528403776001991, |
| "learning_rate": 6.270843238945426e-05, |
| "loss": 0.4941, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.893160259610584, |
| "grad_norm": 0.2684767914087712, |
| "learning_rate": 6.252400401318924e-05, |
| "loss": 0.495, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.9011482775836246, |
| "grad_norm": 0.3089206948515233, |
| "learning_rate": 6.233887180075744e-05, |
| "loss": 0.4952, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.909136295556665, |
| "grad_norm": 0.30351254889018653, |
| "learning_rate": 6.21530415372203e-05, |
| "loss": 0.4846, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.9171243135297056, |
| "grad_norm": 0.4047998399516971, |
| "learning_rate": 6.196651902945213e-05, |
| "loss": 0.4961, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.9251123315027459, |
| "grad_norm": 0.34718079097807986, |
| "learning_rate": 6.17793101059587e-05, |
| "loss": 0.4784, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.9331003494757864, |
| "grad_norm": 0.23676859947641374, |
| "learning_rate": 6.159142061669504e-05, |
| "loss": 0.4816, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.9410883674488266, |
| "grad_norm": 0.3083982484226228, |
| "learning_rate": 6.14028564328827e-05, |
| "loss": 0.4846, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.9490763854218671, |
| "grad_norm": 0.23280924719224474, |
| "learning_rate": 6.12136234468263e-05, |
| "loss": 0.4901, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.9570644033949076, |
| "grad_norm": 0.23217318367899584, |
| "learning_rate": 6.1023727571729334e-05, |
| "loss": 0.4922, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.965052421367948, |
| "grad_norm": 0.3110861621844553, |
| "learning_rate": 6.083317474150943e-05, |
| "loss": 0.4897, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.9730404393409886, |
| "grad_norm": 0.2740981225422537, |
| "learning_rate": 6.0641970910612966e-05, |
| "loss": 0.4884, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.981028457314029, |
| "grad_norm": 0.30045631025591646, |
| "learning_rate": 6.045012205382894e-05, |
| "loss": 0.4842, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.9890164752870694, |
| "grad_norm": 0.3426504942977091, |
| "learning_rate": 6.025763416610229e-05, |
| "loss": 0.4805, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.9970044932601099, |
| "grad_norm": 0.2696833408525596, |
| "learning_rate": 6.006451326234656e-05, |
| "loss": 0.4955, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.00499251123315, |
| "grad_norm": 0.5162311215778072, |
| "learning_rate": 5.987076537725598e-05, |
| "loss": 0.8356, |
| "step": 251 |
| }, |
| { |
| "epoch": 2.0129805292061906, |
| "grad_norm": 0.8755278174646857, |
| "learning_rate": 5.9676396565116814e-05, |
| "loss": 0.4597, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.020968547179231, |
| "grad_norm": 1.2654521868820567, |
| "learning_rate": 5.9481412899618286e-05, |
| "loss": 0.4832, |
| "step": 253 |
| }, |
| { |
| "epoch": 2.0289565651522716, |
| "grad_norm": 0.7005128945439788, |
| "learning_rate": 5.9285820473662676e-05, |
| "loss": 0.4576, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.036944583125312, |
| "grad_norm": 0.8900852330925937, |
| "learning_rate": 5.9089625399174975e-05, |
| "loss": 0.4677, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.0449326010983526, |
| "grad_norm": 0.9295293387128268, |
| "learning_rate": 5.8892833806911934e-05, |
| "loss": 0.4581, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.052920619071393, |
| "grad_norm": 0.7632251727706844, |
| "learning_rate": 5.869545184627041e-05, |
| "loss": 0.4564, |
| "step": 257 |
| }, |
| { |
| "epoch": 2.060908637044433, |
| "grad_norm": 0.606887179521497, |
| "learning_rate": 5.849748568509529e-05, |
| "loss": 0.4446, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.0688966550174737, |
| "grad_norm": 0.7617777810480713, |
| "learning_rate": 5.829894150948668e-05, |
| "loss": 0.4501, |
| "step": 259 |
| }, |
| { |
| "epoch": 2.076884672990514, |
| "grad_norm": 0.6040763884991026, |
| "learning_rate": 5.8099825523606675e-05, |
| "loss": 0.4468, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.0848726909635547, |
| "grad_norm": 0.6051469481172999, |
| "learning_rate": 5.790014394948542e-05, |
| "loss": 0.4543, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.092860708936595, |
| "grad_norm": 0.478413783344682, |
| "learning_rate": 5.769990302682672e-05, |
| "loss": 0.4506, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.1008487269096356, |
| "grad_norm": 0.562558957244333, |
| "learning_rate": 5.749910901281309e-05, |
| "loss": 0.453, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.108836744882676, |
| "grad_norm": 0.4282466955885263, |
| "learning_rate": 5.729776818191014e-05, |
| "loss": 0.4545, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.1168247628557166, |
| "grad_norm": 0.5285703751553213, |
| "learning_rate": 5.709588682567059e-05, |
| "loss": 0.4479, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.1248127808287567, |
| "grad_norm": 0.40043659559155015, |
| "learning_rate": 5.689347125253765e-05, |
| "loss": 0.4442, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.132800798801797, |
| "grad_norm": 0.45748239783102446, |
| "learning_rate": 5.6690527787647856e-05, |
| "loss": 0.4507, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.1407888167748377, |
| "grad_norm": 0.4448537769428446, |
| "learning_rate": 5.6487062772633455e-05, |
| "loss": 0.4518, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.148776834747878, |
| "grad_norm": 0.3496452875829841, |
| "learning_rate": 5.628308256542428e-05, |
| "loss": 0.4511, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.1567648527209187, |
| "grad_norm": 0.36851827820489447, |
| "learning_rate": 5.607859354004897e-05, |
| "loss": 0.4475, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.164752870693959, |
| "grad_norm": 0.3581014245926748, |
| "learning_rate": 5.5873602086435876e-05, |
| "loss": 0.4559, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.1727408886669997, |
| "grad_norm": 0.3124251429586786, |
| "learning_rate": 5.566811461021335e-05, |
| "loss": 0.4507, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.1807289066400397, |
| "grad_norm": 0.363939895859037, |
| "learning_rate": 5.5462137532509624e-05, |
| "loss": 0.4488, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.18871692461308, |
| "grad_norm": 0.26872286843640025, |
| "learning_rate": 5.5255677289752086e-05, |
| "loss": 0.445, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.1967049425861207, |
| "grad_norm": 0.31188711856580686, |
| "learning_rate": 5.504874033346623e-05, |
| "loss": 0.4518, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.204692960559161, |
| "grad_norm": 0.27440306176835016, |
| "learning_rate": 5.4841333130074015e-05, |
| "loss": 0.4398, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.2126809785322017, |
| "grad_norm": 0.2443244556857597, |
| "learning_rate": 5.4633462160691793e-05, |
| "loss": 0.4496, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.220668996505242, |
| "grad_norm": 0.3469310336287689, |
| "learning_rate": 5.442513392092783e-05, |
| "loss": 0.4434, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.2286570144782827, |
| "grad_norm": 0.2103072041810048, |
| "learning_rate": 5.4216354920679256e-05, |
| "loss": 0.4536, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.236645032451323, |
| "grad_norm": 0.302897592656899, |
| "learning_rate": 5.400713168392874e-05, |
| "loss": 0.4469, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.2446330504243637, |
| "grad_norm": 0.26907620566043555, |
| "learning_rate": 5.379747074854054e-05, |
| "loss": 0.4429, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.2526210683974037, |
| "grad_norm": 0.242767529010096, |
| "learning_rate": 5.358737866605624e-05, |
| "loss": 0.4526, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.260609086370444, |
| "grad_norm": 0.24059729283753153, |
| "learning_rate": 5.337686200149004e-05, |
| "loss": 0.4496, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.2685971043434847, |
| "grad_norm": 0.16892626513698825, |
| "learning_rate": 5.316592733312359e-05, |
| "loss": 0.4444, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.276585122316525, |
| "grad_norm": 0.2428921866442825, |
| "learning_rate": 5.2954581252300416e-05, |
| "loss": 0.4475, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.2845731402895657, |
| "grad_norm": 0.24079102043869002, |
| "learning_rate": 5.2742830363220014e-05, |
| "loss": 0.4443, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.292561158262606, |
| "grad_norm": 0.1691131754858366, |
| "learning_rate": 5.25306812827314e-05, |
| "loss": 0.4423, |
| "step": 287 |
| }, |
| { |
| "epoch": 2.3005491762356467, |
| "grad_norm": 0.26332279757319926, |
| "learning_rate": 5.231814064012639e-05, |
| "loss": 0.4482, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.3085371942086867, |
| "grad_norm": 0.30874064763423864, |
| "learning_rate": 5.210521507693245e-05, |
| "loss": 0.4439, |
| "step": 289 |
| }, |
| { |
| "epoch": 2.3165252121817272, |
| "grad_norm": 0.22311973873687838, |
| "learning_rate": 5.189191124670514e-05, |
| "loss": 0.4402, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.3245132301547677, |
| "grad_norm": 0.1922497454060213, |
| "learning_rate": 5.167823581482022e-05, |
| "loss": 0.4409, |
| "step": 291 |
| }, |
| { |
| "epoch": 2.3325012481278082, |
| "grad_norm": 0.16710905147214794, |
| "learning_rate": 5.146419545826535e-05, |
| "loss": 0.4471, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.3404892661008487, |
| "grad_norm": 0.18694588888380953, |
| "learning_rate": 5.124979686543145e-05, |
| "loss": 0.4514, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.3484772840738892, |
| "grad_norm": 0.19041976798949875, |
| "learning_rate": 5.103504673590372e-05, |
| "loss": 0.4385, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.3564653020469297, |
| "grad_norm": 0.20694395753288766, |
| "learning_rate": 5.081995178025228e-05, |
| "loss": 0.4486, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.36445332001997, |
| "grad_norm": 0.16778281147710722, |
| "learning_rate": 5.060451871982242e-05, |
| "loss": 0.455, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.3724413379930107, |
| "grad_norm": 0.17343940615670786, |
| "learning_rate": 5.038875428652468e-05, |
| "loss": 0.447, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.3804293559660508, |
| "grad_norm": 0.17734566622982126, |
| "learning_rate": 5.0172665222624395e-05, |
| "loss": 0.4481, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.3884173739390913, |
| "grad_norm": 0.1766718931672107, |
| "learning_rate": 4.995625828053106e-05, |
| "loss": 0.4524, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.3964053919121318, |
| "grad_norm": 0.19583636193380063, |
| "learning_rate": 4.973954022258729e-05, |
| "loss": 0.4547, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.4043934098851723, |
| "grad_norm": 0.17026857168289744, |
| "learning_rate": 4.952251782085757e-05, |
| "loss": 0.448, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.4123814278582127, |
| "grad_norm": 0.1394946256958487, |
| "learning_rate": 4.930519785691657e-05, |
| "loss": 0.4482, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.4203694458312532, |
| "grad_norm": 0.1507130531191368, |
| "learning_rate": 4.9087587121637284e-05, |
| "loss": 0.4489, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.4283574638042937, |
| "grad_norm": 0.19875894846238537, |
| "learning_rate": 4.886969241497878e-05, |
| "loss": 0.4445, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.436345481777334, |
| "grad_norm": 0.23769686285223604, |
| "learning_rate": 4.865152054577379e-05, |
| "loss": 0.4524, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.4443334997503743, |
| "grad_norm": 0.22560472662810682, |
| "learning_rate": 4.843307833151583e-05, |
| "loss": 0.4473, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.452321517723415, |
| "grad_norm": 0.15975420253786612, |
| "learning_rate": 4.82143725981463e-05, |
| "loss": 0.4474, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.4603095356964553, |
| "grad_norm": 0.1453747344586306, |
| "learning_rate": 4.7995410179841065e-05, |
| "loss": 0.4496, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.4682975536694958, |
| "grad_norm": 0.15320122247522389, |
| "learning_rate": 4.777619791879698e-05, |
| "loss": 0.4445, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.4762855716425363, |
| "grad_norm": 0.20898054566985402, |
| "learning_rate": 4.755674266501802e-05, |
| "loss": 0.4557, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.4842735896155768, |
| "grad_norm": 0.21741215675606498, |
| "learning_rate": 4.73370512761013e-05, |
| "loss": 0.4417, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.4922616075886173, |
| "grad_norm": 0.16889794561130403, |
| "learning_rate": 4.711713061702274e-05, |
| "loss": 0.4443, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.5002496255616578, |
| "grad_norm": 0.17993307076723922, |
| "learning_rate": 4.689698755992255e-05, |
| "loss": 0.4479, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.508237643534698, |
| "grad_norm": 0.19257453660181062, |
| "learning_rate": 4.667662898389048e-05, |
| "loss": 0.4491, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.5162256615077383, |
| "grad_norm": 0.1472085090976699, |
| "learning_rate": 4.645606177475089e-05, |
| "loss": 0.4373, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.524213679480779, |
| "grad_norm": 0.19033455613068187, |
| "learning_rate": 4.6235292824847575e-05, |
| "loss": 0.4544, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.5322016974538193, |
| "grad_norm": 0.18170601952075063, |
| "learning_rate": 4.601432903282836e-05, |
| "loss": 0.4412, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.54018971542686, |
| "grad_norm": 0.15727860647785666, |
| "learning_rate": 4.579317730342955e-05, |
| "loss": 0.4399, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.5481777333999003, |
| "grad_norm": 0.17970878529305648, |
| "learning_rate": 4.5571844547260184e-05, |
| "loss": 0.4403, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.5561657513729408, |
| "grad_norm": 0.15429718810042514, |
| "learning_rate": 4.535033768058604e-05, |
| "loss": 0.4485, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.564153769345981, |
| "grad_norm": 0.15715864822910056, |
| "learning_rate": 4.512866362511361e-05, |
| "loss": 0.4467, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.5721417873190213, |
| "grad_norm": 0.14222629722842062, |
| "learning_rate": 4.490682930777368e-05, |
| "loss": 0.4374, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.580129805292062, |
| "grad_norm": 0.16416055580887054, |
| "learning_rate": 4.468484166050499e-05, |
| "loss": 0.4429, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.5881178232651023, |
| "grad_norm": 0.1378665667643313, |
| "learning_rate": 4.446270762003754e-05, |
| "loss": 0.4439, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.596105841238143, |
| "grad_norm": 0.14749790568854468, |
| "learning_rate": 4.424043412767589e-05, |
| "loss": 0.4466, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.6040938592111833, |
| "grad_norm": 0.146540138127552, |
| "learning_rate": 4.401802812908221e-05, |
| "loss": 0.4419, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.612081877184224, |
| "grad_norm": 0.17339116836008553, |
| "learning_rate": 4.379549657405928e-05, |
| "loss": 0.4467, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.620069895157264, |
| "grad_norm": 0.18348099975421248, |
| "learning_rate": 4.35728464163333e-05, |
| "loss": 0.4416, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.628057913130305, |
| "grad_norm": 0.13620309620113327, |
| "learning_rate": 4.335008461333657e-05, |
| "loss": 0.4427, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.636045931103345, |
| "grad_norm": 0.1709480972281254, |
| "learning_rate": 4.312721812599016e-05, |
| "loss": 0.4414, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.6440339490763853, |
| "grad_norm": 0.16164451064940724, |
| "learning_rate": 4.2904253918486295e-05, |
| "loss": 0.4535, |
| "step": 331 |
| }, |
| { |
| "epoch": 2.652021967049426, |
| "grad_norm": 0.14081917286088105, |
| "learning_rate": 4.268119895807084e-05, |
| "loss": 0.4429, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.6600099850224663, |
| "grad_norm": 0.18137180021156257, |
| "learning_rate": 4.245806021482547e-05, |
| "loss": 0.4427, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.667998002995507, |
| "grad_norm": 0.13800609298110714, |
| "learning_rate": 4.2234844661449964e-05, |
| "loss": 0.44, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.6759860209685473, |
| "grad_norm": 0.1551146252415665, |
| "learning_rate": 4.20115592730443e-05, |
| "loss": 0.4507, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.683974038941588, |
| "grad_norm": 0.15173038583107296, |
| "learning_rate": 4.178821102689064e-05, |
| "loss": 0.4426, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.691962056914628, |
| "grad_norm": 0.15116080328062176, |
| "learning_rate": 4.156480690223537e-05, |
| "loss": 0.447, |
| "step": 337 |
| }, |
| { |
| "epoch": 2.6999500748876684, |
| "grad_norm": 0.17450805671193279, |
| "learning_rate": 4.134135388007097e-05, |
| "loss": 0.4469, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.707938092860709, |
| "grad_norm": 0.17281860373285934, |
| "learning_rate": 4.111785894291789e-05, |
| "loss": 0.4427, |
| "step": 339 |
| }, |
| { |
| "epoch": 2.7159261108337494, |
| "grad_norm": 0.13324453353593427, |
| "learning_rate": 4.089432907460634e-05, |
| "loss": 0.45, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.72391412880679, |
| "grad_norm": 0.15126807617639215, |
| "learning_rate": 4.0670771260058106e-05, |
| "loss": 0.4486, |
| "step": 341 |
| }, |
| { |
| "epoch": 2.7319021467798303, |
| "grad_norm": 0.16029221354477333, |
| "learning_rate": 4.044719248506819e-05, |
| "loss": 0.4408, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.739890164752871, |
| "grad_norm": 0.1463219695798821, |
| "learning_rate": 4.0223599736086596e-05, |
| "loss": 0.4479, |
| "step": 343 |
| }, |
| { |
| "epoch": 2.747878182725911, |
| "grad_norm": 0.14595637335852438, |
| "learning_rate": 4e-05, |
| "loss": 0.4473, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.755866200698952, |
| "grad_norm": 0.13738467367514962, |
| "learning_rate": 3.9776400263913404e-05, |
| "loss": 0.4541, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.763854218671992, |
| "grad_norm": 0.1439562510526391, |
| "learning_rate": 3.9552807514931824e-05, |
| "loss": 0.4436, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.7718422366450324, |
| "grad_norm": 0.13006608621756496, |
| "learning_rate": 3.93292287399419e-05, |
| "loss": 0.4397, |
| "step": 347 |
| }, |
| { |
| "epoch": 2.779830254618073, |
| "grad_norm": 0.14041358992697037, |
| "learning_rate": 3.9105670925393665e-05, |
| "loss": 0.4322, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.7878182725911134, |
| "grad_norm": 0.1495382630742624, |
| "learning_rate": 3.8882141057082117e-05, |
| "loss": 0.449, |
| "step": 349 |
| }, |
| { |
| "epoch": 2.795806290564154, |
| "grad_norm": 0.13422760316245289, |
| "learning_rate": 3.8658646119929046e-05, |
| "loss": 0.4481, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.8037943085371944, |
| "grad_norm": 0.16641223994983959, |
| "learning_rate": 3.843519309776464e-05, |
| "loss": 0.4454, |
| "step": 351 |
| }, |
| { |
| "epoch": 2.811782326510235, |
| "grad_norm": 0.12812350342466014, |
| "learning_rate": 3.821178897310938e-05, |
| "loss": 0.4535, |
| "step": 352 |
| }, |
| { |
| "epoch": 2.819770344483275, |
| "grad_norm": 0.15337686560279318, |
| "learning_rate": 3.798844072695571e-05, |
| "loss": 0.4455, |
| "step": 353 |
| }, |
| { |
| "epoch": 2.8277583624563154, |
| "grad_norm": 0.13561487109024523, |
| "learning_rate": 3.776515533855004e-05, |
| "loss": 0.4421, |
| "step": 354 |
| }, |
| { |
| "epoch": 2.835746380429356, |
| "grad_norm": 0.12405473708728454, |
| "learning_rate": 3.7541939785174545e-05, |
| "loss": 0.4433, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.8437343984023964, |
| "grad_norm": 0.12633600414835006, |
| "learning_rate": 3.731880104192917e-05, |
| "loss": 0.4432, |
| "step": 356 |
| }, |
| { |
| "epoch": 2.851722416375437, |
| "grad_norm": 0.1317080752956006, |
| "learning_rate": 3.709574608151371e-05, |
| "loss": 0.4465, |
| "step": 357 |
| }, |
| { |
| "epoch": 2.8597104343484774, |
| "grad_norm": 0.1475249153982226, |
| "learning_rate": 3.687278187400985e-05, |
| "loss": 0.4401, |
| "step": 358 |
| }, |
| { |
| "epoch": 2.867698452321518, |
| "grad_norm": 0.1458288492905671, |
| "learning_rate": 3.664991538666344e-05, |
| "loss": 0.4344, |
| "step": 359 |
| }, |
| { |
| "epoch": 2.875686470294558, |
| "grad_norm": 0.11939958255100196, |
| "learning_rate": 3.6427153583666715e-05, |
| "loss": 0.4367, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.883674488267599, |
| "grad_norm": 0.16554239338436524, |
| "learning_rate": 3.620450342594073e-05, |
| "loss": 0.4418, |
| "step": 361 |
| }, |
| { |
| "epoch": 2.891662506240639, |
| "grad_norm": 0.1187974636724584, |
| "learning_rate": 3.59819718709178e-05, |
| "loss": 0.45, |
| "step": 362 |
| }, |
| { |
| "epoch": 2.8996505242136794, |
| "grad_norm": 0.15936228812392336, |
| "learning_rate": 3.575956587232413e-05, |
| "loss": 0.4508, |
| "step": 363 |
| }, |
| { |
| "epoch": 2.90763854218672, |
| "grad_norm": 0.13367105463505505, |
| "learning_rate": 3.5537292379962474e-05, |
| "loss": 0.4465, |
| "step": 364 |
| }, |
| { |
| "epoch": 2.9156265601597604, |
| "grad_norm": 0.14243006994077556, |
| "learning_rate": 3.5315158339495015e-05, |
| "loss": 0.4464, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.923614578132801, |
| "grad_norm": 0.1399001261869002, |
| "learning_rate": 3.509317069222633e-05, |
| "loss": 0.4502, |
| "step": 366 |
| }, |
| { |
| "epoch": 2.9316025961058414, |
| "grad_norm": 0.13108273735056272, |
| "learning_rate": 3.487133637488639e-05, |
| "loss": 0.4369, |
| "step": 367 |
| }, |
| { |
| "epoch": 2.939590614078882, |
| "grad_norm": 0.14943325684519726, |
| "learning_rate": 3.464966231941397e-05, |
| "loss": 0.4415, |
| "step": 368 |
| }, |
| { |
| "epoch": 2.947578632051922, |
| "grad_norm": 0.13558373438864768, |
| "learning_rate": 3.442815545273983e-05, |
| "loss": 0.4382, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.9555666500249624, |
| "grad_norm": 0.12912584792295748, |
| "learning_rate": 3.420682269657047e-05, |
| "loss": 0.4363, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.963554667998003, |
| "grad_norm": 0.12458007215100302, |
| "learning_rate": 3.398567096717165e-05, |
| "loss": 0.4409, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.9715426859710434, |
| "grad_norm": 0.12840111428281253, |
| "learning_rate": 3.376470717515244e-05, |
| "loss": 0.4407, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.979530703944084, |
| "grad_norm": 0.13058809738960123, |
| "learning_rate": 3.354393822524913e-05, |
| "loss": 0.4407, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.9875187219171244, |
| "grad_norm": 0.15613845334671814, |
| "learning_rate": 3.332337101610953e-05, |
| "loss": 0.4473, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.995506739890165, |
| "grad_norm": 0.13389617942366203, |
| "learning_rate": 3.310301244007747e-05, |
| "loss": 0.4352, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.0034947578632054, |
| "grad_norm": 0.30944417126328405, |
| "learning_rate": 3.2882869382977265e-05, |
| "loss": 0.7723, |
| "step": 376 |
| }, |
| { |
| "epoch": 3.0114827758362455, |
| "grad_norm": 0.29354627871039446, |
| "learning_rate": 3.266294872389871e-05, |
| "loss": 0.4025, |
| "step": 377 |
| }, |
| { |
| "epoch": 3.019470793809286, |
| "grad_norm": 0.2010591684487564, |
| "learning_rate": 3.2443257334981985e-05, |
| "loss": 0.4024, |
| "step": 378 |
| }, |
| { |
| "epoch": 3.0274588117823265, |
| "grad_norm": 0.27298247297612654, |
| "learning_rate": 3.222380208120304e-05, |
| "loss": 0.4089, |
| "step": 379 |
| }, |
| { |
| "epoch": 3.035446829755367, |
| "grad_norm": 0.23270934832932566, |
| "learning_rate": 3.200458982015894e-05, |
| "loss": 0.4072, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.0434348477284074, |
| "grad_norm": 0.20268308202991778, |
| "learning_rate": 3.178562740185372e-05, |
| "loss": 0.4022, |
| "step": 381 |
| }, |
| { |
| "epoch": 3.051422865701448, |
| "grad_norm": 0.20766736812021794, |
| "learning_rate": 3.156692166848418e-05, |
| "loss": 0.4024, |
| "step": 382 |
| }, |
| { |
| "epoch": 3.0594108836744884, |
| "grad_norm": 0.2547479854625852, |
| "learning_rate": 3.134847945422622e-05, |
| "loss": 0.4072, |
| "step": 383 |
| }, |
| { |
| "epoch": 3.067398901647529, |
| "grad_norm": 0.1969866280565691, |
| "learning_rate": 3.113030758502123e-05, |
| "loss": 0.4118, |
| "step": 384 |
| }, |
| { |
| "epoch": 3.075386919620569, |
| "grad_norm": 0.23153499880928385, |
| "learning_rate": 3.091241287836272e-05, |
| "loss": 0.4077, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.0833749375936095, |
| "grad_norm": 0.20503882652518132, |
| "learning_rate": 3.0694802143083436e-05, |
| "loss": 0.4132, |
| "step": 386 |
| }, |
| { |
| "epoch": 3.09136295556665, |
| "grad_norm": 0.17320798113782282, |
| "learning_rate": 3.0477482179142432e-05, |
| "loss": 0.4097, |
| "step": 387 |
| }, |
| { |
| "epoch": 3.0993509735396905, |
| "grad_norm": 0.20168474769945824, |
| "learning_rate": 3.026045977741272e-05, |
| "loss": 0.3965, |
| "step": 388 |
| }, |
| { |
| "epoch": 3.107338991512731, |
| "grad_norm": 0.19398918365065387, |
| "learning_rate": 3.004374171946895e-05, |
| "loss": 0.402, |
| "step": 389 |
| }, |
| { |
| "epoch": 3.1153270094857715, |
| "grad_norm": 0.16700046485980305, |
| "learning_rate": 2.9827334777375622e-05, |
| "loss": 0.4136, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.123315027458812, |
| "grad_norm": 0.25279169008131713, |
| "learning_rate": 2.9611245713475328e-05, |
| "loss": 0.4003, |
| "step": 391 |
| }, |
| { |
| "epoch": 3.131303045431852, |
| "grad_norm": 0.16080528287954057, |
| "learning_rate": 2.9395481280177596e-05, |
| "loss": 0.4011, |
| "step": 392 |
| }, |
| { |
| "epoch": 3.1392910634048925, |
| "grad_norm": 0.22759163441812938, |
| "learning_rate": 2.9180048219747736e-05, |
| "loss": 0.4034, |
| "step": 393 |
| }, |
| { |
| "epoch": 3.147279081377933, |
| "grad_norm": 0.17841534466968145, |
| "learning_rate": 2.8964953264096277e-05, |
| "loss": 0.4086, |
| "step": 394 |
| }, |
| { |
| "epoch": 3.1552670993509735, |
| "grad_norm": 0.17487802806512123, |
| "learning_rate": 2.8750203134568564e-05, |
| "loss": 0.408, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.163255117324014, |
| "grad_norm": 0.18241625540198192, |
| "learning_rate": 2.8535804541734663e-05, |
| "loss": 0.4077, |
| "step": 396 |
| }, |
| { |
| "epoch": 3.1712431352970545, |
| "grad_norm": 0.16398724549614757, |
| "learning_rate": 2.832176418517979e-05, |
| "loss": 0.4098, |
| "step": 397 |
| }, |
| { |
| "epoch": 3.179231153270095, |
| "grad_norm": 0.16170229114317095, |
| "learning_rate": 2.8108088753294864e-05, |
| "loss": 0.4, |
| "step": 398 |
| }, |
| { |
| "epoch": 3.1872191712431355, |
| "grad_norm": 0.14606650542275093, |
| "learning_rate": 2.7894784923067563e-05, |
| "loss": 0.4081, |
| "step": 399 |
| }, |
| { |
| "epoch": 3.195207189216176, |
| "grad_norm": 0.154688060281461, |
| "learning_rate": 2.768185935987362e-05, |
| "loss": 0.4095, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.203195207189216, |
| "grad_norm": 0.14458385897335363, |
| "learning_rate": 2.7469318717268622e-05, |
| "loss": 0.4083, |
| "step": 401 |
| }, |
| { |
| "epoch": 3.2111832251622565, |
| "grad_norm": 0.14953811526297756, |
| "learning_rate": 2.7257169636779992e-05, |
| "loss": 0.4082, |
| "step": 402 |
| }, |
| { |
| "epoch": 3.219171243135297, |
| "grad_norm": 0.13312099784173914, |
| "learning_rate": 2.704541874769958e-05, |
| "loss": 0.4068, |
| "step": 403 |
| }, |
| { |
| "epoch": 3.2271592611083375, |
| "grad_norm": 0.1386674411611782, |
| "learning_rate": 2.6834072666876427e-05, |
| "loss": 0.402, |
| "step": 404 |
| }, |
| { |
| "epoch": 3.235147279081378, |
| "grad_norm": 0.12924251838188583, |
| "learning_rate": 2.6623137998509964e-05, |
| "loss": 0.4113, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.2431352970544185, |
| "grad_norm": 0.13222743176356805, |
| "learning_rate": 2.641262133394378e-05, |
| "loss": 0.4093, |
| "step": 406 |
| }, |
| { |
| "epoch": 3.251123315027459, |
| "grad_norm": 0.13021912109847186, |
| "learning_rate": 2.6202529251459475e-05, |
| "loss": 0.4104, |
| "step": 407 |
| }, |
| { |
| "epoch": 3.259111333000499, |
| "grad_norm": 0.13606000089551518, |
| "learning_rate": 2.599286831607127e-05, |
| "loss": 0.4089, |
| "step": 408 |
| }, |
| { |
| "epoch": 3.2670993509735395, |
| "grad_norm": 0.13357003115707924, |
| "learning_rate": 2.5783645079320757e-05, |
| "loss": 0.4055, |
| "step": 409 |
| }, |
| { |
| "epoch": 3.27508736894658, |
| "grad_norm": 0.1232470250676397, |
| "learning_rate": 2.5574866079072188e-05, |
| "loss": 0.4133, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.2830753869196205, |
| "grad_norm": 0.14061126711951444, |
| "learning_rate": 2.5366537839308213e-05, |
| "loss": 0.4023, |
| "step": 411 |
| }, |
| { |
| "epoch": 3.291063404892661, |
| "grad_norm": 0.12020419683198272, |
| "learning_rate": 2.515866686992599e-05, |
| "loss": 0.406, |
| "step": 412 |
| }, |
| { |
| "epoch": 3.2990514228657015, |
| "grad_norm": 0.13624018306536384, |
| "learning_rate": 2.4951259666533778e-05, |
| "loss": 0.4137, |
| "step": 413 |
| }, |
| { |
| "epoch": 3.307039440838742, |
| "grad_norm": 0.13470595005125394, |
| "learning_rate": 2.4744322710247914e-05, |
| "loss": 0.4072, |
| "step": 414 |
| }, |
| { |
| "epoch": 3.3150274588117825, |
| "grad_norm": 0.11406991036845995, |
| "learning_rate": 2.4537862467490393e-05, |
| "loss": 0.4032, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.323015476784823, |
| "grad_norm": 0.12469392558548403, |
| "learning_rate": 2.4331885389786648e-05, |
| "loss": 0.4061, |
| "step": 416 |
| }, |
| { |
| "epoch": 3.331003494757863, |
| "grad_norm": 0.11240496673470576, |
| "learning_rate": 2.4126397913564138e-05, |
| "loss": 0.3972, |
| "step": 417 |
| }, |
| { |
| "epoch": 3.3389915127309036, |
| "grad_norm": 0.11440176304944144, |
| "learning_rate": 2.3921406459951038e-05, |
| "loss": 0.401, |
| "step": 418 |
| }, |
| { |
| "epoch": 3.346979530703944, |
| "grad_norm": 0.12061267695807164, |
| "learning_rate": 2.371691743457573e-05, |
| "loss": 0.4042, |
| "step": 419 |
| }, |
| { |
| "epoch": 3.3549675486769845, |
| "grad_norm": 0.12408924452739928, |
| "learning_rate": 2.3512937227366548e-05, |
| "loss": 0.4042, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.362955566650025, |
| "grad_norm": 0.119324320832681, |
| "learning_rate": 2.330947221235217e-05, |
| "loss": 0.3999, |
| "step": 421 |
| }, |
| { |
| "epoch": 3.3709435846230655, |
| "grad_norm": 0.11372319294009971, |
| "learning_rate": 2.3106528747462374e-05, |
| "loss": 0.411, |
| "step": 422 |
| }, |
| { |
| "epoch": 3.378931602596106, |
| "grad_norm": 0.11440578627516848, |
| "learning_rate": 2.290411317432942e-05, |
| "loss": 0.4103, |
| "step": 423 |
| }, |
| { |
| "epoch": 3.386919620569146, |
| "grad_norm": 0.11396557333843903, |
| "learning_rate": 2.270223181808988e-05, |
| "loss": 0.4056, |
| "step": 424 |
| }, |
| { |
| "epoch": 3.3949076385421866, |
| "grad_norm": 0.1073175497389294, |
| "learning_rate": 2.250089098718692e-05, |
| "loss": 0.4001, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.402895656515227, |
| "grad_norm": 0.11142545752473547, |
| "learning_rate": 2.2300096973173276e-05, |
| "loss": 0.4013, |
| "step": 426 |
| }, |
| { |
| "epoch": 3.4108836744882676, |
| "grad_norm": 0.11528253053702402, |
| "learning_rate": 2.2099856050514593e-05, |
| "loss": 0.4074, |
| "step": 427 |
| }, |
| { |
| "epoch": 3.418871692461308, |
| "grad_norm": 0.1075239061798206, |
| "learning_rate": 2.1900174476393335e-05, |
| "loss": 0.4035, |
| "step": 428 |
| }, |
| { |
| "epoch": 3.4268597104343486, |
| "grad_norm": 0.10808021553369461, |
| "learning_rate": 2.170105849051332e-05, |
| "loss": 0.4052, |
| "step": 429 |
| }, |
| { |
| "epoch": 3.434847728407389, |
| "grad_norm": 0.11387661467604573, |
| "learning_rate": 2.1502514314904723e-05, |
| "loss": 0.4011, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.442835746380429, |
| "grad_norm": 0.10171924087995715, |
| "learning_rate": 2.1304548153729596e-05, |
| "loss": 0.4077, |
| "step": 431 |
| }, |
| { |
| "epoch": 3.4508237643534696, |
| "grad_norm": 0.1285002444781682, |
| "learning_rate": 2.1107166193088073e-05, |
| "loss": 0.4063, |
| "step": 432 |
| }, |
| { |
| "epoch": 3.45881178232651, |
| "grad_norm": 0.11335168282371334, |
| "learning_rate": 2.091037460082503e-05, |
| "loss": 0.4154, |
| "step": 433 |
| }, |
| { |
| "epoch": 3.4667998002995506, |
| "grad_norm": 0.11343444669438019, |
| "learning_rate": 2.0714179526337334e-05, |
| "loss": 0.41, |
| "step": 434 |
| }, |
| { |
| "epoch": 3.474787818272591, |
| "grad_norm": 0.1217156602130217, |
| "learning_rate": 2.0518587100381727e-05, |
| "loss": 0.4075, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.4827758362456316, |
| "grad_norm": 0.10725293992167916, |
| "learning_rate": 2.0323603434883186e-05, |
| "loss": 0.4066, |
| "step": 436 |
| }, |
| { |
| "epoch": 3.490763854218672, |
| "grad_norm": 0.12028103178489573, |
| "learning_rate": 2.0129234622744044e-05, |
| "loss": 0.4103, |
| "step": 437 |
| }, |
| { |
| "epoch": 3.4987518721917126, |
| "grad_norm": 0.1029854987347421, |
| "learning_rate": 1.9935486737653452e-05, |
| "loss": 0.4038, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.506739890164753, |
| "grad_norm": 0.11857347505878003, |
| "learning_rate": 1.9742365833897733e-05, |
| "loss": 0.4074, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.514727908137793, |
| "grad_norm": 0.1105825700379065, |
| "learning_rate": 1.954987794617107e-05, |
| "loss": 0.4105, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.5227159261108336, |
| "grad_norm": 0.11511596034752838, |
| "learning_rate": 1.9358029089387034e-05, |
| "loss": 0.4131, |
| "step": 441 |
| }, |
| { |
| "epoch": 3.530703944083874, |
| "grad_norm": 0.11612657903144337, |
| "learning_rate": 1.916682525849058e-05, |
| "loss": 0.4068, |
| "step": 442 |
| }, |
| { |
| "epoch": 3.5386919620569146, |
| "grad_norm": 0.10575599755099882, |
| "learning_rate": 1.897627242827068e-05, |
| "loss": 0.4038, |
| "step": 443 |
| }, |
| { |
| "epoch": 3.546679980029955, |
| "grad_norm": 0.11088748332110426, |
| "learning_rate": 1.878637655317372e-05, |
| "loss": 0.4078, |
| "step": 444 |
| }, |
| { |
| "epoch": 3.5546679980029956, |
| "grad_norm": 0.11466223345296331, |
| "learning_rate": 1.859714356711731e-05, |
| "loss": 0.3939, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.562656015976036, |
| "grad_norm": 0.11673865175002288, |
| "learning_rate": 1.8408579383304985e-05, |
| "loss": 0.4049, |
| "step": 446 |
| }, |
| { |
| "epoch": 3.570644033949076, |
| "grad_norm": 0.11577952607867907, |
| "learning_rate": 1.8220689894041314e-05, |
| "loss": 0.4088, |
| "step": 447 |
| }, |
| { |
| "epoch": 3.578632051922117, |
| "grad_norm": 0.10690091900937719, |
| "learning_rate": 1.8033480970547872e-05, |
| "loss": 0.4056, |
| "step": 448 |
| }, |
| { |
| "epoch": 3.586620069895157, |
| "grad_norm": 0.11541573082426308, |
| "learning_rate": 1.7846958462779716e-05, |
| "loss": 0.4007, |
| "step": 449 |
| }, |
| { |
| "epoch": 3.5946080878681976, |
| "grad_norm": 0.1100114302346526, |
| "learning_rate": 1.7661128199242576e-05, |
| "loss": 0.4089, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.602596105841238, |
| "grad_norm": 0.10956511339867736, |
| "learning_rate": 1.7475995986810775e-05, |
| "loss": 0.4018, |
| "step": 451 |
| }, |
| { |
| "epoch": 3.6105841238142786, |
| "grad_norm": 0.10850454028936493, |
| "learning_rate": 1.7291567610545738e-05, |
| "loss": 0.4051, |
| "step": 452 |
| }, |
| { |
| "epoch": 3.618572141787319, |
| "grad_norm": 0.1131878747175685, |
| "learning_rate": 1.7107848833515244e-05, |
| "loss": 0.4079, |
| "step": 453 |
| }, |
| { |
| "epoch": 3.6265601597603596, |
| "grad_norm": 0.09884020665129564, |
| "learning_rate": 1.6924845396613275e-05, |
| "loss": 0.407, |
| "step": 454 |
| }, |
| { |
| "epoch": 3.6345481777334, |
| "grad_norm": 0.11216709502149264, |
| "learning_rate": 1.6742563018380734e-05, |
| "loss": 0.4087, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.64253619570644, |
| "grad_norm": 0.0996580768122796, |
| "learning_rate": 1.6561007394826623e-05, |
| "loss": 0.4039, |
| "step": 456 |
| }, |
| { |
| "epoch": 3.6505242136794807, |
| "grad_norm": 0.10651639312645377, |
| "learning_rate": 1.638018419925018e-05, |
| "loss": 0.3996, |
| "step": 457 |
| }, |
| { |
| "epoch": 3.658512231652521, |
| "grad_norm": 0.09841162160967377, |
| "learning_rate": 1.6200099082063477e-05, |
| "loss": 0.4055, |
| "step": 458 |
| }, |
| { |
| "epoch": 3.6665002496255616, |
| "grad_norm": 0.11559374542937897, |
| "learning_rate": 1.602075767061497e-05, |
| "loss": 0.4088, |
| "step": 459 |
| }, |
| { |
| "epoch": 3.674488267598602, |
| "grad_norm": 0.11049592658320795, |
| "learning_rate": 1.584216556901355e-05, |
| "loss": 0.4053, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.6824762855716426, |
| "grad_norm": 0.09690459875455099, |
| "learning_rate": 1.566432835795349e-05, |
| "loss": 0.4052, |
| "step": 461 |
| }, |
| { |
| "epoch": 3.690464303544683, |
| "grad_norm": 0.11084043420560455, |
| "learning_rate": 1.5487251594540062e-05, |
| "loss": 0.4013, |
| "step": 462 |
| }, |
| { |
| "epoch": 3.698452321517723, |
| "grad_norm": 0.11145942008644477, |
| "learning_rate": 1.5310940812115812e-05, |
| "loss": 0.404, |
| "step": 463 |
| }, |
| { |
| "epoch": 3.706440339490764, |
| "grad_norm": 0.09702858045834936, |
| "learning_rate": 1.5135401520087757e-05, |
| "loss": 0.4033, |
| "step": 464 |
| }, |
| { |
| "epoch": 3.714428357463804, |
| "grad_norm": 0.10073536549329104, |
| "learning_rate": 1.4960639203755136e-05, |
| "loss": 0.4046, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.7224163754368447, |
| "grad_norm": 0.09948648507952308, |
| "learning_rate": 1.4786659324138075e-05, |
| "loss": 0.4041, |
| "step": 466 |
| }, |
| { |
| "epoch": 3.730404393409885, |
| "grad_norm": 0.09373041246826647, |
| "learning_rate": 1.4613467317806861e-05, |
| "loss": 0.4075, |
| "step": 467 |
| }, |
| { |
| "epoch": 3.7383924113829257, |
| "grad_norm": 0.10208986391007283, |
| "learning_rate": 1.4441068596712157e-05, |
| "loss": 0.3999, |
| "step": 468 |
| }, |
| { |
| "epoch": 3.746380429355966, |
| "grad_norm": 0.10239549924151786, |
| "learning_rate": 1.4269468548015785e-05, |
| "loss": 0.3954, |
| "step": 469 |
| }, |
| { |
| "epoch": 3.7543684473290067, |
| "grad_norm": 0.10434926470085772, |
| "learning_rate": 1.4098672533922471e-05, |
| "loss": 0.4103, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.762356465302047, |
| "grad_norm": 0.1022671854724037, |
| "learning_rate": 1.3928685891512248e-05, |
| "loss": 0.4068, |
| "step": 471 |
| }, |
| { |
| "epoch": 3.770344483275087, |
| "grad_norm": 0.10372672313209318, |
| "learning_rate": 1.375951393257365e-05, |
| "loss": 0.4063, |
| "step": 472 |
| }, |
| { |
| "epoch": 3.7783325012481277, |
| "grad_norm": 0.1001467709798247, |
| "learning_rate": 1.35911619434378e-05, |
| "loss": 0.3982, |
| "step": 473 |
| }, |
| { |
| "epoch": 3.786320519221168, |
| "grad_norm": 0.10848171250475616, |
| "learning_rate": 1.3423635184813182e-05, |
| "loss": 0.3994, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.7943085371942087, |
| "grad_norm": 0.10297059459791853, |
| "learning_rate": 1.3256938891621208e-05, |
| "loss": 0.4051, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.802296555167249, |
| "grad_norm": 0.09850487513786725, |
| "learning_rate": 1.3091078272832732e-05, |
| "loss": 0.4039, |
| "step": 476 |
| }, |
| { |
| "epoch": 3.8102845731402897, |
| "grad_norm": 0.09654837279347964, |
| "learning_rate": 1.2926058511305221e-05, |
| "loss": 0.4027, |
| "step": 477 |
| }, |
| { |
| "epoch": 3.81827259111333, |
| "grad_norm": 0.10106233469187086, |
| "learning_rate": 1.2761884763620773e-05, |
| "loss": 0.4028, |
| "step": 478 |
| }, |
| { |
| "epoch": 3.8262606090863702, |
| "grad_norm": 0.10521144963578496, |
| "learning_rate": 1.2598562159925068e-05, |
| "loss": 0.4047, |
| "step": 479 |
| }, |
| { |
| "epoch": 3.8342486270594107, |
| "grad_norm": 0.10055170090272858, |
| "learning_rate": 1.2436095803766946e-05, |
| "loss": 0.408, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.842236645032451, |
| "grad_norm": 0.10030963723827738, |
| "learning_rate": 1.2274490771939047e-05, |
| "loss": 0.4139, |
| "step": 481 |
| }, |
| { |
| "epoch": 3.8502246630054917, |
| "grad_norm": 0.11291159209305866, |
| "learning_rate": 1.2113752114319107e-05, |
| "loss": 0.4075, |
| "step": 482 |
| }, |
| { |
| "epoch": 3.858212680978532, |
| "grad_norm": 0.09711749318081525, |
| "learning_rate": 1.195388485371213e-05, |
| "loss": 0.4008, |
| "step": 483 |
| }, |
| { |
| "epoch": 3.8662006989515727, |
| "grad_norm": 0.09587092246627478, |
| "learning_rate": 1.1794893985693517e-05, |
| "loss": 0.4072, |
| "step": 484 |
| }, |
| { |
| "epoch": 3.874188716924613, |
| "grad_norm": 0.10842252534792915, |
| "learning_rate": 1.1636784478452872e-05, |
| "loss": 0.3983, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.8821767348976532, |
| "grad_norm": 0.10030989962078998, |
| "learning_rate": 1.1479561272638851e-05, |
| "loss": 0.405, |
| "step": 486 |
| }, |
| { |
| "epoch": 3.890164752870694, |
| "grad_norm": 0.09668292596476558, |
| "learning_rate": 1.1323229281204667e-05, |
| "loss": 0.4046, |
| "step": 487 |
| }, |
| { |
| "epoch": 3.8981527708437342, |
| "grad_norm": 0.11229884300303226, |
| "learning_rate": 1.1167793389254671e-05, |
| "loss": 0.4077, |
| "step": 488 |
| }, |
| { |
| "epoch": 3.9061407888167747, |
| "grad_norm": 0.1007265262970734, |
| "learning_rate": 1.1013258453891624e-05, |
| "loss": 0.4079, |
| "step": 489 |
| }, |
| { |
| "epoch": 3.9141288067898152, |
| "grad_norm": 0.09800596022091544, |
| "learning_rate": 1.0859629304064966e-05, |
| "loss": 0.4124, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.9221168247628557, |
| "grad_norm": 0.0972237859271068, |
| "learning_rate": 1.0706910740419927e-05, |
| "loss": 0.3995, |
| "step": 491 |
| }, |
| { |
| "epoch": 3.930104842735896, |
| "grad_norm": 0.09568160375704794, |
| "learning_rate": 1.055510753514744e-05, |
| "loss": 0.4044, |
| "step": 492 |
| }, |
| { |
| "epoch": 3.9380928607089367, |
| "grad_norm": 0.10293942587001009, |
| "learning_rate": 1.0404224431835127e-05, |
| "loss": 0.3999, |
| "step": 493 |
| }, |
| { |
| "epoch": 3.946080878681977, |
| "grad_norm": 0.09547704742606819, |
| "learning_rate": 1.025426614531897e-05, |
| "loss": 0.4012, |
| "step": 494 |
| }, |
| { |
| "epoch": 3.9540688966550173, |
| "grad_norm": 0.09843903422495338, |
| "learning_rate": 1.0105237361536058e-05, |
| "loss": 0.4029, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.9620569146280578, |
| "grad_norm": 0.0995011244677626, |
| "learning_rate": 9.957142737378128e-06, |
| "loss": 0.4084, |
| "step": 496 |
| }, |
| { |
| "epoch": 3.9700449326010983, |
| "grad_norm": 0.10559619287684664, |
| "learning_rate": 9.809986900546011e-06, |
| "loss": 0.4031, |
| "step": 497 |
| }, |
| { |
| "epoch": 3.9780329505741387, |
| "grad_norm": 0.09619833393540202, |
| "learning_rate": 9.663774449405095e-06, |
| "loss": 0.3986, |
| "step": 498 |
| }, |
| { |
| "epoch": 3.9860209685471792, |
| "grad_norm": 0.09183866726575214, |
| "learning_rate": 9.518509952841586e-06, |
| "loss": 0.4066, |
| "step": 499 |
| }, |
| { |
| "epoch": 3.9940089865202197, |
| "grad_norm": 0.09366222741801747, |
| "learning_rate": 9.374197950119726e-06, |
| "loss": 0.4039, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.00199700449326, |
| "grad_norm": 0.2243873122878777, |
| "learning_rate": 9.230842950740002e-06, |
| "loss": 0.7111, |
| "step": 501 |
| }, |
| { |
| "epoch": 4.0099850224663, |
| "grad_norm": 0.16901611231637365, |
| "learning_rate": 9.088449434298204e-06, |
| "loss": 0.3809, |
| "step": 502 |
| }, |
| { |
| "epoch": 4.017973040439341, |
| "grad_norm": 0.13423991777929192, |
| "learning_rate": 8.947021850345398e-06, |
| "loss": 0.3726, |
| "step": 503 |
| }, |
| { |
| "epoch": 4.025961058412381, |
| "grad_norm": 0.1178503561561421, |
| "learning_rate": 8.806564618248999e-06, |
| "loss": 0.3808, |
| "step": 504 |
| }, |
| { |
| "epoch": 4.033949076385422, |
| "grad_norm": 0.14732236266291146, |
| "learning_rate": 8.667082127054533e-06, |
| "loss": 0.3832, |
| "step": 505 |
| }, |
| { |
| "epoch": 4.041937094358462, |
| "grad_norm": 0.15778749749862814, |
| "learning_rate": 8.52857873534862e-06, |
| "loss": 0.3779, |
| "step": 506 |
| }, |
| { |
| "epoch": 4.049925112331502, |
| "grad_norm": 0.1386627835810346, |
| "learning_rate": 8.391058771122673e-06, |
| "loss": 0.3831, |
| "step": 507 |
| }, |
| { |
| "epoch": 4.057913130304543, |
| "grad_norm": 0.12467125366104446, |
| "learning_rate": 8.254526531637727e-06, |
| "loss": 0.3874, |
| "step": 508 |
| }, |
| { |
| "epoch": 4.065901148277583, |
| "grad_norm": 0.1255433393864893, |
| "learning_rate": 8.118986283290096e-06, |
| "loss": 0.3873, |
| "step": 509 |
| }, |
| { |
| "epoch": 4.073889166250624, |
| "grad_norm": 0.13259474782114336, |
| "learning_rate": 7.984442261478108e-06, |
| "loss": 0.3779, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.081877184223664, |
| "grad_norm": 0.1313024483917481, |
| "learning_rate": 7.850898670469745e-06, |
| "loss": 0.3796, |
| "step": 511 |
| }, |
| { |
| "epoch": 4.089865202196705, |
| "grad_norm": 0.12062485435615429, |
| "learning_rate": 7.718359683271224e-06, |
| "loss": 0.3801, |
| "step": 512 |
| }, |
| { |
| "epoch": 4.097853220169745, |
| "grad_norm": 0.11323572660175975, |
| "learning_rate": 7.586829441496668e-06, |
| "loss": 0.3692, |
| "step": 513 |
| }, |
| { |
| "epoch": 4.105841238142786, |
| "grad_norm": 0.12334975202412422, |
| "learning_rate": 7.456312055238606e-06, |
| "loss": 0.3792, |
| "step": 514 |
| }, |
| { |
| "epoch": 4.113829256115826, |
| "grad_norm": 0.12055598843637728, |
| "learning_rate": 7.326811602939634e-06, |
| "loss": 0.3825, |
| "step": 515 |
| }, |
| { |
| "epoch": 4.121817274088866, |
| "grad_norm": 0.11922302158014507, |
| "learning_rate": 7.198332131264876e-06, |
| "loss": 0.3827, |
| "step": 516 |
| }, |
| { |
| "epoch": 4.129805292061907, |
| "grad_norm": 0.1197396216655153, |
| "learning_rate": 7.070877654975614e-06, |
| "loss": 0.3858, |
| "step": 517 |
| }, |
| { |
| "epoch": 4.137793310034947, |
| "grad_norm": 0.10303380534845168, |
| "learning_rate": 6.944452156803763e-06, |
| "loss": 0.3763, |
| "step": 518 |
| }, |
| { |
| "epoch": 4.145781328007988, |
| "grad_norm": 0.10771564322360738, |
| "learning_rate": 6.819059587327479e-06, |
| "loss": 0.3798, |
| "step": 519 |
| }, |
| { |
| "epoch": 4.153769345981028, |
| "grad_norm": 0.11083630377147478, |
| "learning_rate": 6.694703864847673e-06, |
| "loss": 0.3812, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.161757363954069, |
| "grad_norm": 0.1036678793429057, |
| "learning_rate": 6.571388875265592e-06, |
| "loss": 0.3804, |
| "step": 521 |
| }, |
| { |
| "epoch": 4.169745381927109, |
| "grad_norm": 0.10290514415858039, |
| "learning_rate": 6.449118471961342e-06, |
| "loss": 0.3815, |
| "step": 522 |
| }, |
| { |
| "epoch": 4.177733399900149, |
| "grad_norm": 0.09999602037947594, |
| "learning_rate": 6.327896475673561e-06, |
| "loss": 0.3796, |
| "step": 523 |
| }, |
| { |
| "epoch": 4.18572141787319, |
| "grad_norm": 0.10176649553175782, |
| "learning_rate": 6.207726674379961e-06, |
| "loss": 0.3802, |
| "step": 524 |
| }, |
| { |
| "epoch": 4.19370943584623, |
| "grad_norm": 0.10341756297649503, |
| "learning_rate": 6.088612823178968e-06, |
| "loss": 0.3752, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.201697453819271, |
| "grad_norm": 0.10010538520744762, |
| "learning_rate": 5.970558644172424e-06, |
| "loss": 0.3772, |
| "step": 526 |
| }, |
| { |
| "epoch": 4.209685471792311, |
| "grad_norm": 0.09383564748055143, |
| "learning_rate": 5.853567826349213e-06, |
| "loss": 0.3738, |
| "step": 527 |
| }, |
| { |
| "epoch": 4.217673489765352, |
| "grad_norm": 0.09458974198014311, |
| "learning_rate": 5.737644025470057e-06, |
| "loss": 0.3752, |
| "step": 528 |
| }, |
| { |
| "epoch": 4.225661507738392, |
| "grad_norm": 0.10200091444940393, |
| "learning_rate": 5.6227908639532045e-06, |
| "loss": 0.3822, |
| "step": 529 |
| }, |
| { |
| "epoch": 4.233649525711433, |
| "grad_norm": 0.09730500481091861, |
| "learning_rate": 5.509011930761308e-06, |
| "loss": 0.381, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.241637543684473, |
| "grad_norm": 0.09532139450671104, |
| "learning_rate": 5.396310781289243e-06, |
| "loss": 0.3816, |
| "step": 531 |
| }, |
| { |
| "epoch": 4.249625561657513, |
| "grad_norm": 0.09644789239600618, |
| "learning_rate": 5.284690937252977e-06, |
| "loss": 0.3696, |
| "step": 532 |
| }, |
| { |
| "epoch": 4.257613579630554, |
| "grad_norm": 0.10066108394874461, |
| "learning_rate": 5.1741558865795906e-06, |
| "loss": 0.3859, |
| "step": 533 |
| }, |
| { |
| "epoch": 4.265601597603594, |
| "grad_norm": 0.09693373503450557, |
| "learning_rate": 5.064709083298214e-06, |
| "loss": 0.3822, |
| "step": 534 |
| }, |
| { |
| "epoch": 4.273589615576635, |
| "grad_norm": 0.08926912859612744, |
| "learning_rate": 4.95635394743216e-06, |
| "loss": 0.3782, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.281577633549675, |
| "grad_norm": 0.09076499502790894, |
| "learning_rate": 4.849093864891994e-06, |
| "loss": 0.3822, |
| "step": 536 |
| }, |
| { |
| "epoch": 4.289565651522716, |
| "grad_norm": 0.08773426720097247, |
| "learning_rate": 4.7429321873697865e-06, |
| "loss": 0.3783, |
| "step": 537 |
| }, |
| { |
| "epoch": 4.297553669495756, |
| "grad_norm": 0.0879998661027265, |
| "learning_rate": 4.637872232234326e-06, |
| "loss": 0.3805, |
| "step": 538 |
| }, |
| { |
| "epoch": 4.305541687468796, |
| "grad_norm": 0.09409764491066522, |
| "learning_rate": 4.5339172824274955e-06, |
| "loss": 0.3795, |
| "step": 539 |
| }, |
| { |
| "epoch": 4.313529705441837, |
| "grad_norm": 0.089547761049764, |
| "learning_rate": 4.4310705863616835e-06, |
| "loss": 0.3794, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.321517723414877, |
| "grad_norm": 0.09066889486649515, |
| "learning_rate": 4.329335357818236e-06, |
| "loss": 0.3759, |
| "step": 541 |
| }, |
| { |
| "epoch": 4.329505741387918, |
| "grad_norm": 0.09166858551173564, |
| "learning_rate": 4.228714775847084e-06, |
| "loss": 0.3877, |
| "step": 542 |
| }, |
| { |
| "epoch": 4.337493759360958, |
| "grad_norm": 0.09606401143384108, |
| "learning_rate": 4.129211984667385e-06, |
| "loss": 0.3803, |
| "step": 543 |
| }, |
| { |
| "epoch": 4.345481777333999, |
| "grad_norm": 0.08718065900580216, |
| "learning_rate": 4.030830093569247e-06, |
| "loss": 0.3764, |
| "step": 544 |
| }, |
| { |
| "epoch": 4.353469795307039, |
| "grad_norm": 0.09279816372084171, |
| "learning_rate": 3.933572176816602e-06, |
| "loss": 0.3818, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.361457813280079, |
| "grad_norm": 0.0895802314032739, |
| "learning_rate": 3.837441273551137e-06, |
| "loss": 0.3749, |
| "step": 546 |
| }, |
| { |
| "epoch": 4.36944583125312, |
| "grad_norm": 0.08925866632093443, |
| "learning_rate": 3.7424403876972924e-06, |
| "loss": 0.3741, |
| "step": 547 |
| }, |
| { |
| "epoch": 4.37743384922616, |
| "grad_norm": 0.09077149176473304, |
| "learning_rate": 3.6485724878684382e-06, |
| "loss": 0.3889, |
| "step": 548 |
| }, |
| { |
| "epoch": 4.385421867199201, |
| "grad_norm": 0.08624641665702638, |
| "learning_rate": 3.555840507274093e-06, |
| "loss": 0.3788, |
| "step": 549 |
| }, |
| { |
| "epoch": 4.393409885172241, |
| "grad_norm": 0.09155307608035071, |
| "learning_rate": 3.464247343628242e-06, |
| "loss": 0.3833, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.401397903145282, |
| "grad_norm": 0.08659198159245704, |
| "learning_rate": 3.373795859058837e-06, |
| "loss": 0.3756, |
| "step": 551 |
| }, |
| { |
| "epoch": 4.409385921118322, |
| "grad_norm": 0.08959149189104454, |
| "learning_rate": 3.284488880018315e-06, |
| "loss": 0.3809, |
| "step": 552 |
| }, |
| { |
| "epoch": 4.417373939091363, |
| "grad_norm": 0.08570866197339067, |
| "learning_rate": 3.196329197195307e-06, |
| "loss": 0.379, |
| "step": 553 |
| }, |
| { |
| "epoch": 4.425361957064403, |
| "grad_norm": 0.08585759689716206, |
| "learning_rate": 3.1093195654274024e-06, |
| "loss": 0.3844, |
| "step": 554 |
| }, |
| { |
| "epoch": 4.433349975037443, |
| "grad_norm": 0.08851894364844058, |
| "learning_rate": 3.0234627036151186e-06, |
| "loss": 0.3754, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.441337993010484, |
| "grad_norm": 0.08546993455255109, |
| "learning_rate": 2.9387612946368647e-06, |
| "loss": 0.3767, |
| "step": 556 |
| }, |
| { |
| "epoch": 4.449326010983524, |
| "grad_norm": 0.08689133858962513, |
| "learning_rate": 2.855217985265184e-06, |
| "loss": 0.3818, |
| "step": 557 |
| }, |
| { |
| "epoch": 4.457314028956565, |
| "grad_norm": 0.08705508747400349, |
| "learning_rate": 2.7728353860839763e-06, |
| "loss": 0.3789, |
| "step": 558 |
| }, |
| { |
| "epoch": 4.465302046929605, |
| "grad_norm": 0.08598514484683649, |
| "learning_rate": 2.6916160714069817e-06, |
| "loss": 0.3721, |
| "step": 559 |
| }, |
| { |
| "epoch": 4.473290064902646, |
| "grad_norm": 0.08768951265999986, |
| "learning_rate": 2.6115625791973155e-06, |
| "loss": 0.3777, |
| "step": 560 |
| }, |
| { |
| "epoch": 4.481278082875686, |
| "grad_norm": 0.08479223708104064, |
| "learning_rate": 2.5326774109881223e-06, |
| "loss": 0.3805, |
| "step": 561 |
| }, |
| { |
| "epoch": 4.489266100848727, |
| "grad_norm": 0.08131123805163427, |
| "learning_rate": 2.454963031804485e-06, |
| "loss": 0.3746, |
| "step": 562 |
| }, |
| { |
| "epoch": 4.497254118821767, |
| "grad_norm": 0.08329047935604311, |
| "learning_rate": 2.378421870086314e-06, |
| "loss": 0.3761, |
| "step": 563 |
| }, |
| { |
| "epoch": 4.5052421367948075, |
| "grad_norm": 0.08462162107210089, |
| "learning_rate": 2.3030563176125444e-06, |
| "loss": 0.3738, |
| "step": 564 |
| }, |
| { |
| "epoch": 4.513230154767848, |
| "grad_norm": 0.09812143956960612, |
| "learning_rate": 2.228868729426319e-06, |
| "loss": 0.3765, |
| "step": 565 |
| }, |
| { |
| "epoch": 4.521218172740888, |
| "grad_norm": 0.08490273500457897, |
| "learning_rate": 2.1558614237614516e-06, |
| "loss": 0.3778, |
| "step": 566 |
| }, |
| { |
| "epoch": 4.529206190713929, |
| "grad_norm": 0.08570430572140957, |
| "learning_rate": 2.0840366819699788e-06, |
| "loss": 0.3857, |
| "step": 567 |
| }, |
| { |
| "epoch": 4.537194208686969, |
| "grad_norm": 0.08300561137308456, |
| "learning_rate": 2.013396748450842e-06, |
| "loss": 0.3761, |
| "step": 568 |
| }, |
| { |
| "epoch": 4.54518222666001, |
| "grad_norm": 0.08443227783552133, |
| "learning_rate": 1.9439438305797776e-06, |
| "loss": 0.3756, |
| "step": 569 |
| }, |
| { |
| "epoch": 4.55317024463305, |
| "grad_norm": 0.08135395570142633, |
| "learning_rate": 1.8756800986403466e-06, |
| "loss": 0.3782, |
| "step": 570 |
| }, |
| { |
| "epoch": 4.5611582626060905, |
| "grad_norm": 0.08279967533402854, |
| "learning_rate": 1.808607685756103e-06, |
| "loss": 0.3776, |
| "step": 571 |
| }, |
| { |
| "epoch": 4.569146280579131, |
| "grad_norm": 0.0834623870625263, |
| "learning_rate": 1.7427286878239247e-06, |
| "loss": 0.3713, |
| "step": 572 |
| }, |
| { |
| "epoch": 4.5771342985521715, |
| "grad_norm": 0.08512591892730595, |
| "learning_rate": 1.6780451634485606e-06, |
| "loss": 0.3781, |
| "step": 573 |
| }, |
| { |
| "epoch": 4.585122316525212, |
| "grad_norm": 0.08121169235017031, |
| "learning_rate": 1.614559133878264e-06, |
| "loss": 0.3822, |
| "step": 574 |
| }, |
| { |
| "epoch": 4.5931103344982525, |
| "grad_norm": 0.0815454483422227, |
| "learning_rate": 1.5522725829416474e-06, |
| "loss": 0.3789, |
| "step": 575 |
| }, |
| { |
| "epoch": 4.601098352471293, |
| "grad_norm": 0.0819923460505712, |
| "learning_rate": 1.4911874569856965e-06, |
| "loss": 0.3777, |
| "step": 576 |
| }, |
| { |
| "epoch": 4.6090863704443334, |
| "grad_norm": 0.08276809528907374, |
| "learning_rate": 1.4313056648149393e-06, |
| "loss": 0.3818, |
| "step": 577 |
| }, |
| { |
| "epoch": 4.6170743884173735, |
| "grad_norm": 0.08123407989783393, |
| "learning_rate": 1.3726290776318175e-06, |
| "loss": 0.3752, |
| "step": 578 |
| }, |
| { |
| "epoch": 4.625062406390414, |
| "grad_norm": 0.08137283984240884, |
| "learning_rate": 1.3151595289781738e-06, |
| "loss": 0.3846, |
| "step": 579 |
| }, |
| { |
| "epoch": 4.6330504243634545, |
| "grad_norm": 0.08150026114578374, |
| "learning_rate": 1.2588988146780135e-06, |
| "loss": 0.3884, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.641038442336495, |
| "grad_norm": 0.08281920320562544, |
| "learning_rate": 1.2038486927813354e-06, |
| "loss": 0.3841, |
| "step": 581 |
| }, |
| { |
| "epoch": 4.6490264603095355, |
| "grad_norm": 0.08355306503400638, |
| "learning_rate": 1.1500108835092472e-06, |
| "loss": 0.3812, |
| "step": 582 |
| }, |
| { |
| "epoch": 4.657014478282576, |
| "grad_norm": 0.08418060141581976, |
| "learning_rate": 1.0973870692001554e-06, |
| "loss": 0.3792, |
| "step": 583 |
| }, |
| { |
| "epoch": 4.6650024962556165, |
| "grad_norm": 0.08223524263153421, |
| "learning_rate": 1.0459788942572423e-06, |
| "loss": 0.3843, |
| "step": 584 |
| }, |
| { |
| "epoch": 4.6729905142286565, |
| "grad_norm": 0.08271968804038993, |
| "learning_rate": 9.957879650970549e-07, |
| "loss": 0.3857, |
| "step": 585 |
| }, |
| { |
| "epoch": 4.6809785322016975, |
| "grad_norm": 0.08244656434489289, |
| "learning_rate": 9.468158500993207e-07, |
| "loss": 0.3874, |
| "step": 586 |
| }, |
| { |
| "epoch": 4.6889665501747375, |
| "grad_norm": 0.0819506533129172, |
| "learning_rate": 8.990640795579186e-07, |
| "loss": 0.3808, |
| "step": 587 |
| }, |
| { |
| "epoch": 4.6969545681477785, |
| "grad_norm": 0.08149745500782653, |
| "learning_rate": 8.525341456330883e-07, |
| "loss": 0.3727, |
| "step": 588 |
| }, |
| { |
| "epoch": 4.7049425861208185, |
| "grad_norm": 0.08076187044142838, |
| "learning_rate": 8.072275023047926e-07, |
| "loss": 0.3761, |
| "step": 589 |
| }, |
| { |
| "epoch": 4.712930604093859, |
| "grad_norm": 0.08151591065997134, |
| "learning_rate": 7.631455653272613e-07, |
| "loss": 0.3832, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.7209186220668995, |
| "grad_norm": 0.08462271380326744, |
| "learning_rate": 7.202897121847852e-07, |
| "loss": 0.3749, |
| "step": 591 |
| }, |
| { |
| "epoch": 4.72890664003994, |
| "grad_norm": 0.08308313300815548, |
| "learning_rate": 6.786612820486449e-07, |
| "loss": 0.3742, |
| "step": 592 |
| }, |
| { |
| "epoch": 4.7368946580129805, |
| "grad_norm": 0.08421663571704587, |
| "learning_rate": 6.382615757352817e-07, |
| "loss": 0.383, |
| "step": 593 |
| }, |
| { |
| "epoch": 4.744882675986021, |
| "grad_norm": 0.08208417725816322, |
| "learning_rate": 5.990918556656411e-07, |
| "loss": 0.3802, |
| "step": 594 |
| }, |
| { |
| "epoch": 4.7528706939590615, |
| "grad_norm": 0.08235652164981158, |
| "learning_rate": 5.611533458257245e-07, |
| "loss": 0.3826, |
| "step": 595 |
| }, |
| { |
| "epoch": 4.7608587119321015, |
| "grad_norm": 0.0823525460961533, |
| "learning_rate": 5.2444723172834e-07, |
| "loss": 0.375, |
| "step": 596 |
| }, |
| { |
| "epoch": 4.7688467299051425, |
| "grad_norm": 0.08291828155167397, |
| "learning_rate": 4.889746603760693e-07, |
| "loss": 0.3841, |
| "step": 597 |
| }, |
| { |
| "epoch": 4.7768347478781825, |
| "grad_norm": 0.0809741018796145, |
| "learning_rate": 4.5473674022541213e-07, |
| "loss": 0.3753, |
| "step": 598 |
| }, |
| { |
| "epoch": 4.7848227658512235, |
| "grad_norm": 0.08124124038278724, |
| "learning_rate": 4.2173454115214783e-07, |
| "loss": 0.3838, |
| "step": 599 |
| }, |
| { |
| "epoch": 4.7928107838242635, |
| "grad_norm": 0.08103520713384339, |
| "learning_rate": 3.899690944179257e-07, |
| "loss": 0.3765, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.8007988017973044, |
| "grad_norm": 0.08227217638870313, |
| "learning_rate": 3.5944139263800694e-07, |
| "loss": 0.3834, |
| "step": 601 |
| }, |
| { |
| "epoch": 4.8087868197703445, |
| "grad_norm": 0.07899228317121158, |
| "learning_rate": 3.3015238975026675e-07, |
| "loss": 0.3694, |
| "step": 602 |
| }, |
| { |
| "epoch": 4.8167748377433846, |
| "grad_norm": 0.09227389493594652, |
| "learning_rate": 3.021030009853876e-07, |
| "loss": 0.3783, |
| "step": 603 |
| }, |
| { |
| "epoch": 4.8247628557164255, |
| "grad_norm": 0.08106531182197436, |
| "learning_rate": 2.752941028382594e-07, |
| "loss": 0.3773, |
| "step": 604 |
| }, |
| { |
| "epoch": 4.8327508736894655, |
| "grad_norm": 0.08015145752167932, |
| "learning_rate": 2.4972653304057073e-07, |
| "loss": 0.3777, |
| "step": 605 |
| }, |
| { |
| "epoch": 4.8407388916625065, |
| "grad_norm": 0.08160453860592876, |
| "learning_rate": 2.25401090534656e-07, |
| "loss": 0.3808, |
| "step": 606 |
| }, |
| { |
| "epoch": 4.8487269096355465, |
| "grad_norm": 0.07966427336497452, |
| "learning_rate": 2.0231853544852465e-07, |
| "loss": 0.3744, |
| "step": 607 |
| }, |
| { |
| "epoch": 4.8567149276085875, |
| "grad_norm": 0.08123242623536424, |
| "learning_rate": 1.8047958907209339e-07, |
| "loss": 0.3825, |
| "step": 608 |
| }, |
| { |
| "epoch": 4.8647029455816275, |
| "grad_norm": 0.0805412707928896, |
| "learning_rate": 1.5988493383466198e-07, |
| "loss": 0.3749, |
| "step": 609 |
| }, |
| { |
| "epoch": 4.872690963554668, |
| "grad_norm": 0.08036474123731352, |
| "learning_rate": 1.40535213283588e-07, |
| "loss": 0.3748, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.8806789815277085, |
| "grad_norm": 0.08213950898863626, |
| "learning_rate": 1.2243103206417418e-07, |
| "loss": 0.3819, |
| "step": 611 |
| }, |
| { |
| "epoch": 4.888666999500749, |
| "grad_norm": 0.07935174486804004, |
| "learning_rate": 1.05572955900759e-07, |
| "loss": 0.3827, |
| "step": 612 |
| }, |
| { |
| "epoch": 4.8966550174737895, |
| "grad_norm": 0.07731873027438858, |
| "learning_rate": 8.996151157907306e-08, |
| "loss": 0.3674, |
| "step": 613 |
| }, |
| { |
| "epoch": 4.90464303544683, |
| "grad_norm": 0.07905308777211134, |
| "learning_rate": 7.559718692974116e-08, |
| "loss": 0.3755, |
| "step": 614 |
| }, |
| { |
| "epoch": 4.9126310534198705, |
| "grad_norm": 0.08188223266669394, |
| "learning_rate": 6.248043081307664e-08, |
| "loss": 0.3848, |
| "step": 615 |
| }, |
| { |
| "epoch": 4.9206190713929105, |
| "grad_norm": 0.07960614583875532, |
| "learning_rate": 5.0611653105003824e-08, |
| "loss": 0.3754, |
| "step": 616 |
| }, |
| { |
| "epoch": 4.928607089365951, |
| "grad_norm": 0.08159036451816658, |
| "learning_rate": 3.99912246843126e-08, |
| "loss": 0.384, |
| "step": 617 |
| }, |
| { |
| "epoch": 4.9365951073389915, |
| "grad_norm": 0.08068916515639828, |
| "learning_rate": 3.061947742101001e-08, |
| "loss": 0.3797, |
| "step": 618 |
| }, |
| { |
| "epoch": 4.944583125312032, |
| "grad_norm": 0.07945334308304049, |
| "learning_rate": 2.2496704165995142e-08, |
| "loss": 0.378, |
| "step": 619 |
| }, |
| { |
| "epoch": 4.9525711432850725, |
| "grad_norm": 0.08051278989431843, |
| "learning_rate": 1.5623158741884247e-08, |
| "loss": 0.3804, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.960559161258113, |
| "grad_norm": 0.07952219440080063, |
| "learning_rate": 9.999055935074887e-09, |
| "loss": 0.3661, |
| "step": 621 |
| }, |
| { |
| "epoch": 4.9685471792311535, |
| "grad_norm": 0.08077553977056519, |
| "learning_rate": 5.624571489053488e-09, |
| "loss": 0.3829, |
| "step": 622 |
| }, |
| { |
| "epoch": 4.976535197204194, |
| "grad_norm": 0.08255739947277718, |
| "learning_rate": 2.499842098901972e-09, |
| "loss": 0.3842, |
| "step": 623 |
| }, |
| { |
| "epoch": 4.9845232151772345, |
| "grad_norm": 0.08097383031020737, |
| "learning_rate": 6.249654069989674e-10, |
| "loss": 0.3817, |
| "step": 624 |
| }, |
| { |
| "epoch": 4.992511233150275, |
| "grad_norm": 0.08105708331755175, |
| "learning_rate": 0.0, |
| "loss": 0.377, |
| "step": 625 |
| }, |
| { |
| "epoch": 4.992511233150275, |
| "step": 625, |
| "total_flos": 1.6083110655493669e+19, |
| "train_loss": 0.47168621559143065, |
| "train_runtime": 96267.4715, |
| "train_samples_per_second": 3.329, |
| "train_steps_per_second": 0.006 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 625, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6083110655493669e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
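The object above is a Hugging Face `transformers` trainer state (`trainer_state.json`): `log_history` holds one record per logged step with `epoch`, `grad_norm`, `learning_rate`, and `loss`, and the final entry plus the top-level keys summarize the run (625 steps over 5 epochs, average train loss ≈ 0.47). Below is a minimal sketch of how such a file can be loaded and plotted; the input filename `trainer_state.json` and the output path `loss_curve.png` are illustrative assumptions, not part of the original state.

```python
# Minimal sketch: load a trainer_state.json and plot loss and learning rate per step.
# Assumes the JSON above is saved as "trainer_state.json"; that filename and the
# output path "loss_curve.png" are placeholders chosen for this example.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step records; the trailing summary entry (train_loss, train_runtime, ...)
# has no per-step "loss"/"learning_rate" fields and is skipped here.
records = [r for r in state["log_history"] if "loss" in r and "learning_rate" in r]
steps = [r["step"] for r in records]
loss = [r["loss"] for r in records]
lr = [r["learning_rate"] for r in records]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, loss)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lr)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
fig.savefig("loss_curve.png")

print(f"{len(records)} logged steps, final loss {loss[-1]:.4f}, "
      f"mean logged loss {sum(loss) / len(loss):.4f}")
```

Plotting the curve this way also makes the single-step loss spikes at the start of each epoch (e.g. steps 376 and 501 above) easy to spot against the otherwise smooth decline.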