| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9970559371933267, | |
| "eval_steps": 500, | |
| "global_step": 254, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003925417075564278, | |
| "grad_norm": 116.9738323646099, | |
| "learning_rate": 7.692307692307694e-07, | |
| "loss": 9.3997, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.007850834151128557, | |
| "grad_norm": 123.91257504333558, | |
| "learning_rate": 1.5384615384615387e-06, | |
| "loss": 9.3549, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.011776251226692836, | |
| "grad_norm": 48.57135571075974, | |
| "learning_rate": 2.307692307692308e-06, | |
| "loss": 9.0614, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.015701668302257114, | |
| "grad_norm": 58.59648343522018, | |
| "learning_rate": 3.0769230769230774e-06, | |
| "loss": 8.8509, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.019627085377821395, | |
| "grad_norm": 50.91025788096803, | |
| "learning_rate": 3.846153846153847e-06, | |
| "loss": 8.636, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.023552502453385672, | |
| "grad_norm": 44.45780070375837, | |
| "learning_rate": 4.615384615384616e-06, | |
| "loss": 8.3173, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.02747791952894995, | |
| "grad_norm": 42.84702479489111, | |
| "learning_rate": 5.384615384615385e-06, | |
| "loss": 7.8859, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.03140333660451423, | |
| "grad_norm": 42.34819567131082, | |
| "learning_rate": 6.153846153846155e-06, | |
| "loss": 7.5329, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.03532875368007851, | |
| "grad_norm": 39.50937586543066, | |
| "learning_rate": 6.923076923076923e-06, | |
| "loss": 6.857, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.03925417075564279, | |
| "grad_norm": 34.115530165648025, | |
| "learning_rate": 7.692307692307694e-06, | |
| "loss": 6.5921, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04317958783120707, | |
| "grad_norm": 29.69677549924329, | |
| "learning_rate": 8.461538461538462e-06, | |
| "loss": 6.0138, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.047105004906771344, | |
| "grad_norm": 23.64207176396817, | |
| "learning_rate": 9.230769230769232e-06, | |
| "loss": 5.6403, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05103042198233562, | |
| "grad_norm": 22.631328757859926, | |
| "learning_rate": 1e-05, | |
| "loss": 5.314, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0549558390578999, | |
| "grad_norm": 37.01468512712713, | |
| "learning_rate": 1.076923076923077e-05, | |
| "loss": 5.1368, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.058881256133464184, | |
| "grad_norm": 14.98399004019133, | |
| "learning_rate": 1.1538461538461538e-05, | |
| "loss": 4.9973, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06280667320902845, | |
| "grad_norm": 24.12417626781245, | |
| "learning_rate": 1.230769230769231e-05, | |
| "loss": 4.5107, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.06673209028459273, | |
| "grad_norm": 22.007146508271816, | |
| "learning_rate": 1.3076923076923078e-05, | |
| "loss": 4.7359, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.07065750736015702, | |
| "grad_norm": 11.616076478834982, | |
| "learning_rate": 1.3846153846153847e-05, | |
| "loss": 4.4023, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0745829244357213, | |
| "grad_norm": 17.952985681304522, | |
| "learning_rate": 1.4615384615384615e-05, | |
| "loss": 4.5416, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.07850834151128558, | |
| "grad_norm": 8.98080811280273, | |
| "learning_rate": 1.5384615384615387e-05, | |
| "loss": 4.1081, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08243375858684986, | |
| "grad_norm": 10.149578559605182, | |
| "learning_rate": 1.6153846153846154e-05, | |
| "loss": 3.8817, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.08635917566241413, | |
| "grad_norm": 8.571053946987243, | |
| "learning_rate": 1.6923076923076924e-05, | |
| "loss": 3.7572, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.09028459273797841, | |
| "grad_norm": 7.502672983259189, | |
| "learning_rate": 1.7692307692307694e-05, | |
| "loss": 3.5601, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.09421000981354269, | |
| "grad_norm": 7.085495749229153, | |
| "learning_rate": 1.8461538461538465e-05, | |
| "loss": 3.8353, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.09813542688910697, | |
| "grad_norm": 7.138103529726273, | |
| "learning_rate": 1.923076923076923e-05, | |
| "loss": 3.6063, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.10206084396467124, | |
| "grad_norm": 6.667151300603169, | |
| "learning_rate": 2e-05, | |
| "loss": 3.3784, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.10598626104023552, | |
| "grad_norm": 15.159134026094918, | |
| "learning_rate": 1.999905072250599e-05, | |
| "loss": 3.1382, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.1099116781157998, | |
| "grad_norm": 23.620617884079227, | |
| "learning_rate": 1.9996203070249516e-05, | |
| "loss": 3.42, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.11383709519136408, | |
| "grad_norm": 6.525857331664067, | |
| "learning_rate": 1.999145758387301e-05, | |
| "loss": 3.234, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.11776251226692837, | |
| "grad_norm": 6.5567605227849075, | |
| "learning_rate": 1.9984815164333163e-05, | |
| "loss": 3.0821, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.12168792934249265, | |
| "grad_norm": 6.091812823958394, | |
| "learning_rate": 1.9976277072729845e-05, | |
| "loss": 3.1632, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.1256133464180569, | |
| "grad_norm": 5.473117028293557, | |
| "learning_rate": 1.99658449300667e-05, | |
| "loss": 3.0885, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.1295387634936212, | |
| "grad_norm": 5.217399804685527, | |
| "learning_rate": 1.9953520716943373e-05, | |
| "loss": 3.1615, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.13346418056918546, | |
| "grad_norm": 4.589181632635667, | |
| "learning_rate": 1.9939306773179498e-05, | |
| "loss": 2.8962, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.13738959764474976, | |
| "grad_norm": 6.8821193201718485, | |
| "learning_rate": 1.992320579737045e-05, | |
| "loss": 2.9593, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.14131501472031405, | |
| "grad_norm": 4.053937279650924, | |
| "learning_rate": 1.990522084637503e-05, | |
| "loss": 2.6995, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.1452404317958783, | |
| "grad_norm": 4.624820726258902, | |
| "learning_rate": 1.9885355334735082e-05, | |
| "loss": 2.7838, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.1491658488714426, | |
| "grad_norm": 3.3512823024660494, | |
| "learning_rate": 1.9863613034027224e-05, | |
| "loss": 2.5849, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.15309126594700687, | |
| "grad_norm": 3.8260211624317937, | |
| "learning_rate": 1.98399980721468e-05, | |
| "loss": 2.8336, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.15701668302257116, | |
| "grad_norm": 7.819647460383096, | |
| "learning_rate": 1.981451493252418e-05, | |
| "loss": 2.6306, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.16094210009813542, | |
| "grad_norm": 4.867901933457006, | |
| "learning_rate": 1.9787168453273546e-05, | |
| "loss": 2.5791, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1648675171736997, | |
| "grad_norm": 4.84141951604172, | |
| "learning_rate": 1.9757963826274357e-05, | |
| "loss": 2.4821, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.16879293424926398, | |
| "grad_norm": 7.411142957673967, | |
| "learning_rate": 1.972690659618564e-05, | |
| "loss": 2.4986, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.17271835132482827, | |
| "grad_norm": 9.801242167429267, | |
| "learning_rate": 1.9694002659393306e-05, | |
| "loss": 2.4249, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.17664376840039253, | |
| "grad_norm": 6.830738579764302, | |
| "learning_rate": 1.9659258262890683e-05, | |
| "loss": 2.4422, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.18056918547595682, | |
| "grad_norm": 7.668891548488468, | |
| "learning_rate": 1.9622680003092503e-05, | |
| "loss": 2.387, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.1844946025515211, | |
| "grad_norm": 4.403569193476907, | |
| "learning_rate": 1.958427482458253e-05, | |
| "loss": 2.2889, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.18842001962708538, | |
| "grad_norm": 4.411285728525577, | |
| "learning_rate": 1.9544050018795076e-05, | |
| "loss": 2.3454, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.19234543670264967, | |
| "grad_norm": 5.339863771977718, | |
| "learning_rate": 1.9502013222630714e-05, | |
| "loss": 2.3232, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.19627085377821393, | |
| "grad_norm": 6.4580414109536965, | |
| "learning_rate": 1.9458172417006347e-05, | |
| "loss": 2.2459, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.20019627085377822, | |
| "grad_norm": 4.598940394609222, | |
| "learning_rate": 1.9412535925339998e-05, | |
| "loss": 2.2329, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.2041216879293425, | |
| "grad_norm": 3.3724555313255684, | |
| "learning_rate": 1.936511241197055e-05, | |
| "loss": 2.1502, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.20804710500490678, | |
| "grad_norm": 3.1443440005312557, | |
| "learning_rate": 1.9315910880512792e-05, | |
| "loss": 2.1237, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.21197252208047104, | |
| "grad_norm": 3.181286637321292, | |
| "learning_rate": 1.9264940672148018e-05, | |
| "loss": 2.1416, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.21589793915603533, | |
| "grad_norm": 4.727806393906144, | |
| "learning_rate": 1.921221146385057e-05, | |
| "loss": 2.1783, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.2198233562315996, | |
| "grad_norm": 5.178218482524286, | |
| "learning_rate": 1.9157733266550577e-05, | |
| "loss": 2.1399, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.2237487733071639, | |
| "grad_norm": 2.6620699759243935, | |
| "learning_rate": 1.910151642323337e-05, | |
| "loss": 2.1205, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.22767419038272815, | |
| "grad_norm": 2.322126256967793, | |
| "learning_rate": 1.9043571606975776e-05, | |
| "loss": 2.1135, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.23159960745829244, | |
| "grad_norm": 5.38527462128201, | |
| "learning_rate": 1.898390981891979e-05, | |
| "loss": 2.1041, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.23552502453385674, | |
| "grad_norm": 2.4579464277480065, | |
| "learning_rate": 1.8922542386183942e-05, | |
| "loss": 2.1206, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.239450441609421, | |
| "grad_norm": 2.3666339533811818, | |
| "learning_rate": 1.885948095971278e-05, | |
| "loss": 2.0352, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2433758586849853, | |
| "grad_norm": 2.1445114411608808, | |
| "learning_rate": 1.879473751206489e-05, | |
| "loss": 2.0884, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.24730127576054955, | |
| "grad_norm": 2.4373855388744756, | |
| "learning_rate": 1.8728324335139814e-05, | |
| "loss": 2.0655, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2512266928361138, | |
| "grad_norm": 3.572187608420911, | |
| "learning_rate": 1.866025403784439e-05, | |
| "loss": 2.0285, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.25515210991167814, | |
| "grad_norm": 2.1895539404446964, | |
| "learning_rate": 1.8590539543698852e-05, | |
| "loss": 2.0263, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2590775269872424, | |
| "grad_norm": 2.5873674686273627, | |
| "learning_rate": 1.851919408838327e-05, | |
| "loss": 2.1084, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.26300294406280667, | |
| "grad_norm": 1.894183797101693, | |
| "learning_rate": 1.844623121722465e-05, | |
| "loss": 2.0226, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.26692836113837093, | |
| "grad_norm": 1.6854205851946853, | |
| "learning_rate": 1.8371664782625287e-05, | |
| "loss": 1.9757, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.27085377821393525, | |
| "grad_norm": 1.881677615447827, | |
| "learning_rate": 1.8295508941432814e-05, | |
| "loss": 2.0371, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.2747791952894995, | |
| "grad_norm": 1.8170043507499531, | |
| "learning_rate": 1.821777815225245e-05, | |
| "loss": 1.9245, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2787046123650638, | |
| "grad_norm": 1.6628780800262886, | |
| "learning_rate": 1.813848717270195e-05, | |
| "loss": 1.9581, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.2826300294406281, | |
| "grad_norm": 1.5452111684921823, | |
| "learning_rate": 1.8057651056609784e-05, | |
| "loss": 1.9222, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.28655544651619236, | |
| "grad_norm": 1.8541605025179029, | |
| "learning_rate": 1.797528515115709e-05, | |
| "loss": 1.9187, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.2904808635917566, | |
| "grad_norm": 1.7009298828174881, | |
| "learning_rate": 1.789140509396394e-05, | |
| "loss": 1.9104, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.2944062806673209, | |
| "grad_norm": 1.4258711689883241, | |
| "learning_rate": 1.7806026810120423e-05, | |
| "loss": 1.953, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2983316977428852, | |
| "grad_norm": 5.440289983178362, | |
| "learning_rate": 1.771916650916321e-05, | |
| "loss": 1.8837, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.30225711481844947, | |
| "grad_norm": 1.8948358808766053, | |
| "learning_rate": 1.7630840681998068e-05, | |
| "loss": 1.9148, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.30618253189401373, | |
| "grad_norm": 11.033261500781018, | |
| "learning_rate": 1.7541066097768965e-05, | |
| "loss": 1.8635, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.310107948969578, | |
| "grad_norm": 1.6724370104935717, | |
| "learning_rate": 1.744985980067437e-05, | |
| "loss": 1.9074, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.3140333660451423, | |
| "grad_norm": 10.261440916511484, | |
| "learning_rate": 1.735723910673132e-05, | |
| "loss": 1.9702, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3179587831207066, | |
| "grad_norm": 1.7073614580018202, | |
| "learning_rate": 1.7263221600487852e-05, | |
| "loss": 1.8482, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.32188420019627084, | |
| "grad_norm": 1.760995341965249, | |
| "learning_rate": 1.7167825131684516e-05, | |
| "loss": 1.88, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3258096172718351, | |
| "grad_norm": 1.529927666349978, | |
| "learning_rate": 1.7071067811865477e-05, | |
| "loss": 1.8582, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3297350343473994, | |
| "grad_norm": 1.3260010690935662, | |
| "learning_rate": 1.6972968010939953e-05, | |
| "loss": 1.8592, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.3336604514229637, | |
| "grad_norm": 1.4020723238969466, | |
| "learning_rate": 1.687354435369459e-05, | |
| "loss": 1.9269, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.33758586849852795, | |
| "grad_norm": 1.3647315777910027, | |
| "learning_rate": 1.6772815716257414e-05, | |
| "loss": 1.8492, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.34151128557409227, | |
| "grad_norm": 1.4217128715416125, | |
| "learning_rate": 1.6670801222514135e-05, | |
| "loss": 1.8466, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.34543670264965654, | |
| "grad_norm": 1.3181609641675227, | |
| "learning_rate": 1.6567520240477344e-05, | |
| "loss": 1.8846, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.3493621197252208, | |
| "grad_norm": 1.4153897003847502, | |
| "learning_rate": 1.646299237860941e-05, | |
| "loss": 1.8809, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.35328753680078506, | |
| "grad_norm": 1.4414064552826915, | |
| "learning_rate": 1.6357237482099682e-05, | |
| "loss": 1.8003, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3572129538763494, | |
| "grad_norm": 1.3760039651998437, | |
| "learning_rate": 1.625027562909679e-05, | |
| "loss": 1.8299, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.36113837095191365, | |
| "grad_norm": 2.517776896681609, | |
| "learning_rate": 1.6142127126896682e-05, | |
| "loss": 1.8708, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.3650637880274779, | |
| "grad_norm": 1.700692015059516, | |
| "learning_rate": 1.603281250808719e-05, | |
| "loss": 1.8774, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3689892051030422, | |
| "grad_norm": 1.2064699534529657, | |
| "learning_rate": 1.5922352526649803e-05, | |
| "loss": 1.7571, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.3729146221786065, | |
| "grad_norm": 1.1558175580795997, | |
| "learning_rate": 1.5810768154019386e-05, | |
| "loss": 1.8125, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.37684003925417076, | |
| "grad_norm": 1.3409669812920715, | |
| "learning_rate": 1.5698080575102662e-05, | |
| "loss": 1.7661, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.380765456329735, | |
| "grad_norm": 1.601135716797025, | |
| "learning_rate": 1.5584311184256144e-05, | |
| "loss": 1.7495, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.38469087340529934, | |
| "grad_norm": 1.2027385581838417, | |
| "learning_rate": 1.5469481581224274e-05, | |
| "loss": 1.8115, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.3886162904808636, | |
| "grad_norm": 1.1975748712372833, | |
| "learning_rate": 1.5353613567038607e-05, | |
| "loss": 1.801, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.39254170755642787, | |
| "grad_norm": 3.09366210815099, | |
| "learning_rate": 1.523672913987878e-05, | |
| "loss": 1.7824, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.39646712463199213, | |
| "grad_norm": 1.1400962542750739, | |
| "learning_rate": 1.5118850490896012e-05, | |
| "loss": 1.7159, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.40039254170755645, | |
| "grad_norm": 1.2331105843413726, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 1.7095, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.4043179587831207, | |
| "grad_norm": 1.1816337676783275, | |
| "learning_rate": 1.4880200231609982e-05, | |
| "loss": 1.7716, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.408243375858685, | |
| "grad_norm": 1.2284944812544702, | |
| "learning_rate": 1.4759473930370738e-05, | |
| "loss": 1.7902, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.41216879293424924, | |
| "grad_norm": 1.0847382800246375, | |
| "learning_rate": 1.4637844016834407e-05, | |
| "loss": 1.7647, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.41609421000981356, | |
| "grad_norm": 1.020608852405514, | |
| "learning_rate": 1.4515333583108896e-05, | |
| "loss": 1.7278, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.4200196270853778, | |
| "grad_norm": 1.2263117526234197, | |
| "learning_rate": 1.4391965888473705e-05, | |
| "loss": 1.7765, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.4239450441609421, | |
| "grad_norm": 1.0785380085141927, | |
| "learning_rate": 1.4267764354964038e-05, | |
| "loss": 1.7331, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.4278704612365064, | |
| "grad_norm": 1.0224592858163286, | |
| "learning_rate": 1.4142752562923988e-05, | |
| "loss": 1.7239, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.43179587831207067, | |
| "grad_norm": 1.1123423771410839, | |
| "learning_rate": 1.4016954246529697e-05, | |
| "loss": 1.7455, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.43572129538763493, | |
| "grad_norm": 1.0692956641607927, | |
| "learning_rate": 1.3890393289283262e-05, | |
| "loss": 1.7406, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.4396467124631992, | |
| "grad_norm": 1.0406766628092503, | |
| "learning_rate": 1.3763093719478357e-05, | |
| "loss": 1.7804, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.4435721295387635, | |
| "grad_norm": 1.034270520095407, | |
| "learning_rate": 1.3635079705638298e-05, | |
| "loss": 1.7499, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.4474975466143278, | |
| "grad_norm": 1.325505380155415, | |
| "learning_rate": 1.3506375551927546e-05, | |
| "loss": 1.7379, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.45142296368989204, | |
| "grad_norm": 2.8627791225075465, | |
| "learning_rate": 1.3377005693537393e-05, | |
| "loss": 1.7488, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.4553483807654563, | |
| "grad_norm": 1.1692597051088127, | |
| "learning_rate": 1.3246994692046837e-05, | |
| "loss": 1.727, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.4592737978410206, | |
| "grad_norm": 1.3186565424723304, | |
| "learning_rate": 1.3116367230759415e-05, | |
| "loss": 1.7135, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.4631992149165849, | |
| "grad_norm": 1.865299751069251, | |
| "learning_rate": 1.2985148110016947e-05, | |
| "loss": 1.6394, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.46712463199214915, | |
| "grad_norm": 0.9855711070712846, | |
| "learning_rate": 1.2853362242491054e-05, | |
| "loss": 1.6934, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.47105004906771347, | |
| "grad_norm": 1.01597596740161, | |
| "learning_rate": 1.2721034648453353e-05, | |
| "loss": 1.7594, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.47497546614327774, | |
| "grad_norm": 1.1222245558941415, | |
| "learning_rate": 1.2588190451025209e-05, | |
| "loss": 1.7102, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.478900883218842, | |
| "grad_norm": 1.6892513075440292, | |
| "learning_rate": 1.2454854871407993e-05, | |
| "loss": 1.6797, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.48282630029440626, | |
| "grad_norm": 1.1638725988581853, | |
| "learning_rate": 1.2321053224094678e-05, | |
| "loss": 1.7002, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.4867517173699706, | |
| "grad_norm": 1.313282663510557, | |
| "learning_rate": 1.218681091206376e-05, | |
| "loss": 1.6767, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.49067713444553485, | |
| "grad_norm": 1.1382396359200373, | |
| "learning_rate": 1.2052153421956343e-05, | |
| "loss": 1.6805, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.4946025515210991, | |
| "grad_norm": 1.1544843677311407, | |
| "learning_rate": 1.1917106319237386e-05, | |
| "loss": 1.6995, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.4985279685966634, | |
| "grad_norm": 0.9538551705927951, | |
| "learning_rate": 1.1781695243341933e-05, | |
| "loss": 1.643, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5024533856722276, | |
| "grad_norm": 0.9507730642036515, | |
| "learning_rate": 1.164594590280734e-05, | |
| "loss": 1.6367, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5063788027477919, | |
| "grad_norm": 1.0417445095908215, | |
| "learning_rate": 1.1509884070392369e-05, | |
| "loss": 1.6949, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5103042198233563, | |
| "grad_norm": 1.026943416751704, | |
| "learning_rate": 1.1373535578184083e-05, | |
| "loss": 1.6824, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5142296368989205, | |
| "grad_norm": 1.003065669067786, | |
| "learning_rate": 1.123692631269348e-05, | |
| "loss": 1.6902, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5181550539744848, | |
| "grad_norm": 1.1835317624091395, | |
| "learning_rate": 1.1100082209940795e-05, | |
| "loss": 1.6927, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5220804710500491, | |
| "grad_norm": 1.3576591105617888, | |
| "learning_rate": 1.0963029250531418e-05, | |
| "loss": 1.6739, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5260058881256133, | |
| "grad_norm": 1.0113858517339607, | |
| "learning_rate": 1.0825793454723325e-05, | |
| "loss": 1.6723, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5299313052011776, | |
| "grad_norm": 0.9744800567978432, | |
| "learning_rate": 1.0688400877486978e-05, | |
| "loss": 1.655, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5338567222767419, | |
| "grad_norm": 0.9398947245879805, | |
| "learning_rate": 1.0550877603558656e-05, | |
| "loss": 1.6541, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.5377821393523062, | |
| "grad_norm": 0.9375124553680311, | |
| "learning_rate": 1.0413249742488132e-05, | |
| "loss": 1.664, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.5417075564278705, | |
| "grad_norm": 1.0133372395656326, | |
| "learning_rate": 1.0275543423681622e-05, | |
| "loss": 1.7141, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.5456329735034348, | |
| "grad_norm": 1.0303573991874475, | |
| "learning_rate": 1.0137784791440965e-05, | |
| "loss": 1.6964, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.549558390578999, | |
| "grad_norm": 0.9565693156778721, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6634, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5534838076545633, | |
| "grad_norm": 1.121421026361072, | |
| "learning_rate": 9.862215208559037e-06, | |
| "loss": 1.6666, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.5574092247301276, | |
| "grad_norm": 0.9667397885627346, | |
| "learning_rate": 9.724456576318383e-06, | |
| "loss": 1.5758, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.5613346418056918, | |
| "grad_norm": 0.9537255512602201, | |
| "learning_rate": 9.586750257511868e-06, | |
| "loss": 1.6376, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.5652600588812562, | |
| "grad_norm": 0.9438995534371774, | |
| "learning_rate": 9.449122396441344e-06, | |
| "loss": 1.6598, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.5691854759568205, | |
| "grad_norm": 0.9548167040445172, | |
| "learning_rate": 9.311599122513029e-06, | |
| "loss": 1.6417, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.5731108930323847, | |
| "grad_norm": 0.9049022575284126, | |
| "learning_rate": 9.174206545276678e-06, | |
| "loss": 1.7025, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.577036310107949, | |
| "grad_norm": 0.9432930890181294, | |
| "learning_rate": 9.036970749468585e-06, | |
| "loss": 1.6375, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.5809617271835132, | |
| "grad_norm": 1.958166606907053, | |
| "learning_rate": 8.899917790059208e-06, | |
| "loss": 1.6378, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.5848871442590775, | |
| "grad_norm": 1.0081677994924394, | |
| "learning_rate": 8.763073687306523e-06, | |
| "loss": 1.6151, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.5888125613346418, | |
| "grad_norm": 0.9877469922698178, | |
| "learning_rate": 8.626464421815919e-06, | |
| "loss": 1.6838, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.592737978410206, | |
| "grad_norm": 1.056961299931411, | |
| "learning_rate": 8.49011592960763e-06, | |
| "loss": 1.6561, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.5966633954857704, | |
| "grad_norm": 0.9567294624946977, | |
| "learning_rate": 8.35405409719266e-06, | |
| "loss": 1.6398, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6005888125613347, | |
| "grad_norm": 0.9449145391070992, | |
| "learning_rate": 8.218304756658072e-06, | |
| "loss": 1.6554, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6045142296368989, | |
| "grad_norm": 1.012295287909777, | |
| "learning_rate": 8.082893680762619e-06, | |
| "loss": 1.647, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6084396467124632, | |
| "grad_norm": 0.9795392518478, | |
| "learning_rate": 7.947846578043658e-06, | |
| "loss": 1.631, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6123650637880275, | |
| "grad_norm": 1.0023414998286204, | |
| "learning_rate": 7.813189087936243e-06, | |
| "loss": 1.5987, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6162904808635917, | |
| "grad_norm": 0.9756815792976389, | |
| "learning_rate": 7.678946775905323e-06, | |
| "loss": 1.6738, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.620215897939156, | |
| "grad_norm": 1.0045653822710507, | |
| "learning_rate": 7.545145128592009e-06, | |
| "loss": 1.6642, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.6241413150147204, | |
| "grad_norm": 0.911147130019971, | |
| "learning_rate": 7.411809548974792e-06, | |
| "loss": 1.5968, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.6280667320902846, | |
| "grad_norm": 0.9064965972522054, | |
| "learning_rate": 7.278965351546648e-06, | |
| "loss": 1.6273, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6319921491658489, | |
| "grad_norm": 0.9176975570356877, | |
| "learning_rate": 7.14663775750895e-06, | |
| "loss": 1.605, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.6359175662414132, | |
| "grad_norm": 0.8528776050815917, | |
| "learning_rate": 7.014851889983058e-06, | |
| "loss": 1.6116, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.6398429833169774, | |
| "grad_norm": 0.8589446735583834, | |
| "learning_rate": 6.883632769240589e-06, | |
| "loss": 1.6359, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.6437684003925417, | |
| "grad_norm": 0.8693598946335392, | |
| "learning_rate": 6.7530053079531664e-06, | |
| "loss": 1.6058, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.647693817468106, | |
| "grad_norm": 0.9799079815467372, | |
| "learning_rate": 6.6229943064626115e-06, | |
| "loss": 1.697, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6516192345436702, | |
| "grad_norm": 0.9008836905212236, | |
| "learning_rate": 6.4936244480724575e-06, | |
| "loss": 1.6473, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.6555446516192346, | |
| "grad_norm": 0.8838813402782661, | |
| "learning_rate": 6.364920294361701e-06, | |
| "loss": 1.633, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.6594700686947988, | |
| "grad_norm": 0.9559913531438309, | |
| "learning_rate": 6.236906280521646e-06, | |
| "loss": 1.6873, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.6633954857703631, | |
| "grad_norm": 1.0277028273153903, | |
| "learning_rate": 6.109606710716741e-06, | |
| "loss": 1.5752, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.6673209028459274, | |
| "grad_norm": 0.8938515793453153, | |
| "learning_rate": 5.983045753470308e-06, | |
| "loss": 1.6452, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6712463199214916, | |
| "grad_norm": 0.8556067546503252, | |
| "learning_rate": 5.857247437076012e-06, | |
| "loss": 1.6039, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.6751717369970559, | |
| "grad_norm": 0.8621858742594237, | |
| "learning_rate": 5.732235645035964e-06, | |
| "loss": 1.5726, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.6790971540726202, | |
| "grad_norm": 0.8567920663299861, | |
| "learning_rate": 5.608034111526298e-06, | |
| "loss": 1.6092, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.6830225711481845, | |
| "grad_norm": 0.9304541437780851, | |
| "learning_rate": 5.484666416891109e-06, | |
| "loss": 1.6294, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.6869479882237488, | |
| "grad_norm": 0.9799895019545034, | |
| "learning_rate": 5.362155983165594e-06, | |
| "loss": 1.6491, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.6908734052993131, | |
| "grad_norm": 0.8600307824342088, | |
| "learning_rate": 5.240526069629265e-06, | |
| "loss": 1.5957, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.6947988223748773, | |
| "grad_norm": 0.9072137683070356, | |
| "learning_rate": 5.119799768390021e-06, | |
| "loss": 1.6048, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.6987242394504416, | |
| "grad_norm": 0.9471535710462194, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 1.5996, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7026496565260059, | |
| "grad_norm": 0.8767147652530162, | |
| "learning_rate": 4.881149509103993e-06, | |
| "loss": 1.6235, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.7065750736015701, | |
| "grad_norm": 0.8551100522979943, | |
| "learning_rate": 4.763270860121222e-06, | |
| "loss": 1.5987, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7105004906771345, | |
| "grad_norm": 2.682630794646339, | |
| "learning_rate": 4.646386432961396e-06, | |
| "loss": 1.6288, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7144259077526988, | |
| "grad_norm": 0.9218355935778795, | |
| "learning_rate": 4.530518418775734e-06, | |
| "loss": 1.5845, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.718351324828263, | |
| "grad_norm": 0.8944990873390325, | |
| "learning_rate": 4.415688815743858e-06, | |
| "loss": 1.5499, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.7222767419038273, | |
| "grad_norm": 0.8642527045568604, | |
| "learning_rate": 4.301919424897339e-06, | |
| "loss": 1.5471, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.7262021589793916, | |
| "grad_norm": 0.847995978904247, | |
| "learning_rate": 4.189231845980618e-06, | |
| "loss": 1.5885, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7301275760549558, | |
| "grad_norm": 0.8569284771048433, | |
| "learning_rate": 4.077647473350201e-06, | |
| "loss": 1.6413, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.7340529931305201, | |
| "grad_norm": 0.8115797403733079, | |
| "learning_rate": 3.967187491912813e-06, | |
| "loss": 1.6192, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.7379784102060843, | |
| "grad_norm": 0.8459095182066169, | |
| "learning_rate": 3.857872873103322e-06, | |
| "loss": 1.6268, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.7419038272816487, | |
| "grad_norm": 0.8610021863012962, | |
| "learning_rate": 3.749724370903216e-06, | |
| "loss": 1.5548, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.745829244357213, | |
| "grad_norm": 0.8977310054539044, | |
| "learning_rate": 3.6427625179003223e-06, | |
| "loss": 1.5387, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7497546614327772, | |
| "grad_norm": 0.9793672572198016, | |
| "learning_rate": 3.5370076213905904e-06, | |
| "loss": 1.5878, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.7536800785083415, | |
| "grad_norm": 0.8481741676724341, | |
| "learning_rate": 3.4324797595226567e-06, | |
| "loss": 1.5028, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.7576054955839058, | |
| "grad_norm": 0.8597838552057963, | |
| "learning_rate": 3.329198777485869e-06, | |
| "loss": 1.6214, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.76153091265947, | |
| "grad_norm": 3.616697619018478, | |
| "learning_rate": 3.2271842837425917e-06, | |
| "loss": 1.6, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.7654563297350343, | |
| "grad_norm": 0.8269416415115425, | |
| "learning_rate": 3.1264556463054162e-06, | |
| "loss": 1.5626, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.7693817468105987, | |
| "grad_norm": 0.8339428761497759, | |
| "learning_rate": 3.0270319890600465e-06, | |
| "loss": 1.5993, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.7733071638861629, | |
| "grad_norm": 0.8586040015054238, | |
| "learning_rate": 2.9289321881345257e-06, | |
| "loss": 1.5738, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.7772325809617272, | |
| "grad_norm": 0.8917002874622041, | |
| "learning_rate": 2.8321748683154893e-06, | |
| "loss": 1.621, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.7811579980372915, | |
| "grad_norm": 0.8987794594287856, | |
| "learning_rate": 2.73677839951215e-06, | |
| "loss": 1.5869, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.7850834151128557, | |
| "grad_norm": 0.8283138049802639, | |
| "learning_rate": 2.642760893268684e-06, | |
| "loss": 1.5747, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.78900883218842, | |
| "grad_norm": 0.8236640627120524, | |
| "learning_rate": 2.55014019932563e-06, | |
| "loss": 1.6623, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.7929342492639843, | |
| "grad_norm": 0.8633741424003055, | |
| "learning_rate": 2.4589339022310386e-06, | |
| "loss": 1.6475, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.7968596663395485, | |
| "grad_norm": 0.8659380267098473, | |
| "learning_rate": 2.369159318001937e-06, | |
| "loss": 1.638, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.8007850834151129, | |
| "grad_norm": 0.8362436890758017, | |
| "learning_rate": 2.2808334908367914e-06, | |
| "loss": 1.5459, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.8047105004906772, | |
| "grad_norm": 0.8671870625149154, | |
| "learning_rate": 2.1939731898795803e-06, | |
| "loss": 1.6203, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8086359175662414, | |
| "grad_norm": 0.8387701873967306, | |
| "learning_rate": 2.1085949060360654e-06, | |
| "loss": 1.5794, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.8125613346418057, | |
| "grad_norm": 0.8272291995800398, | |
| "learning_rate": 2.0247148488429104e-06, | |
| "loss": 1.5846, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.81648675171737, | |
| "grad_norm": 0.9148983529895788, | |
| "learning_rate": 1.9423489433902186e-06, | |
| "loss": 1.627, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.8204121687929342, | |
| "grad_norm": 1.023887839871913, | |
| "learning_rate": 1.861512827298051e-06, | |
| "loss": 1.6436, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.8243375858684985, | |
| "grad_norm": 0.8668047143699129, | |
| "learning_rate": 1.7822218477475496e-06, | |
| "loss": 1.5111, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8282630029440629, | |
| "grad_norm": 0.8921013544076094, | |
| "learning_rate": 1.704491058567187e-06, | |
| "loss": 1.5929, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.8321884200196271, | |
| "grad_norm": 0.7827134172094803, | |
| "learning_rate": 1.6283352173747148e-06, | |
| "loss": 1.5684, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.8361138370951914, | |
| "grad_norm": 0.9151558285804295, | |
| "learning_rate": 1.5537687827753512e-06, | |
| "loss": 1.5954, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.8400392541707556, | |
| "grad_norm": 0.7806131759440897, | |
| "learning_rate": 1.4808059116167306e-06, | |
| "loss": 1.5959, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.8439646712463199, | |
| "grad_norm": 0.8071021031464382, | |
| "learning_rate": 1.409460456301147e-06, | |
| "loss": 1.5547, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.8478900883218842, | |
| "grad_norm": 0.9885392511275477, | |
| "learning_rate": 1.339745962155613e-06, | |
| "loss": 1.5806, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.8518155053974484, | |
| "grad_norm": 0.8447951427483683, | |
| "learning_rate": 1.2716756648601857e-06, | |
| "loss": 1.5858, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.8557409224730128, | |
| "grad_norm": 0.8442429240687109, | |
| "learning_rate": 1.2052624879351105e-06, | |
| "loss": 1.5602, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.8596663395485771, | |
| "grad_norm": 0.8304889902121484, | |
| "learning_rate": 1.1405190402872201e-06, | |
| "loss": 1.5274, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.8635917566241413, | |
| "grad_norm": 0.867275409980241, | |
| "learning_rate": 1.0774576138160596e-06, | |
| "loss": 1.5931, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.8675171736997056, | |
| "grad_norm": 5.025226481716912, | |
| "learning_rate": 1.0160901810802114e-06, | |
| "loss": 1.6555, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.8714425907752699, | |
| "grad_norm": 2.0164281778712465, | |
| "learning_rate": 9.564283930242258e-07, | |
| "loss": 1.661, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.8753680078508341, | |
| "grad_norm": 0.8447154641571145, | |
| "learning_rate": 8.984835767666311e-07, | |
| "loss": 1.6076, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.8792934249263984, | |
| "grad_norm": 0.8975390071105757, | |
| "learning_rate": 8.42266733449425e-07, | |
| "loss": 1.5106, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.8832188420019627, | |
| "grad_norm": 0.8174103028875408, | |
| "learning_rate": 7.877885361494353e-07, | |
| "loss": 1.6067, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.887144259077527, | |
| "grad_norm": 0.854406198730777, | |
| "learning_rate": 7.350593278519824e-07, | |
| "loss": 1.5743, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.8910696761530913, | |
| "grad_norm": 0.8218241526940049, | |
| "learning_rate": 6.840891194872112e-07, | |
| "loss": 1.5742, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.8949950932286556, | |
| "grad_norm": 0.8490420507418462, | |
| "learning_rate": 6.348875880294536e-07, | |
| "loss": 1.5941, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.8989205103042198, | |
| "grad_norm": 0.8319724448801171, | |
| "learning_rate": 5.874640746600047e-07, | |
| "loss": 1.5843, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.9028459273797841, | |
| "grad_norm": 0.7477454547881142, | |
| "learning_rate": 5.418275829936537e-07, | |
| "loss": 1.5841, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9067713444553483, | |
| "grad_norm": 0.8119670445159111, | |
| "learning_rate": 4.979867773692881e-07, | |
| "loss": 1.5129, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.9106967615309126, | |
| "grad_norm": 0.8063431890509406, | |
| "learning_rate": 4.5594998120492505e-07, | |
| "loss": 1.6281, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.914622178606477, | |
| "grad_norm": 0.8038685064920295, | |
| "learning_rate": 4.1572517541747294e-07, | |
| "loss": 1.5374, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.9185475956820413, | |
| "grad_norm": 0.7714276603717256, | |
| "learning_rate": 3.773199969074959e-07, | |
| "loss": 1.5733, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.9224730127576055, | |
| "grad_norm": 0.802623741175608, | |
| "learning_rate": 3.4074173710931804e-07, | |
| "loss": 1.5588, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.9263984298331698, | |
| "grad_norm": 0.8204240222973807, | |
| "learning_rate": 3.059973406066963e-07, | |
| "loss": 1.4883, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.930323846908734, | |
| "grad_norm": 0.8820271488142285, | |
| "learning_rate": 2.730934038143607e-07, | |
| "loss": 1.6086, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.9342492639842983, | |
| "grad_norm": 0.8275009337148957, | |
| "learning_rate": 2.420361737256438e-07, | |
| "loss": 1.5674, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.9381746810598626, | |
| "grad_norm": 0.8098954974154235, | |
| "learning_rate": 2.1283154672645522e-07, | |
| "loss": 1.5705, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.9421000981354269, | |
| "grad_norm": 0.8616587187287363, | |
| "learning_rate": 1.854850674758213e-07, | |
| "loss": 1.5822, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.9460255152109912, | |
| "grad_norm": 0.7757300122580277, | |
| "learning_rate": 1.6000192785320057e-07, | |
| "loss": 1.5585, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.9499509322865555, | |
| "grad_norm": 0.8825447741450724, | |
| "learning_rate": 1.3638696597277678e-07, | |
| "loss": 1.5802, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.9538763493621197, | |
| "grad_norm": 0.7909984644724999, | |
| "learning_rate": 1.1464466526491691e-07, | |
| "loss": 1.5111, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.957801766437684, | |
| "grad_norm": 0.9826658212623717, | |
| "learning_rate": 9.47791536249676e-08, | |
| "loss": 1.5619, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.9617271835132483, | |
| "grad_norm": 0.8676845965842969, | |
| "learning_rate": 7.679420262954984e-08, | |
| "loss": 1.6559, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.9656526005888125, | |
| "grad_norm": 0.8063317459427207, | |
| "learning_rate": 6.069322682050516e-08, | |
| "loss": 1.6133, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.9695780176643768, | |
| "grad_norm": 1.2596969588466849, | |
| "learning_rate": 4.647928305662852e-08, | |
| "loss": 1.6134, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.9735034347399412, | |
| "grad_norm": 0.7890445423722453, | |
| "learning_rate": 3.4155069933301535e-08, | |
| "loss": 1.6156, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.9774288518155054, | |
| "grad_norm": 0.9707160185876472, | |
| "learning_rate": 2.372292727015557e-08, | |
| "loss": 1.6215, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.9813542688910697, | |
| "grad_norm": 0.8378049298136845, | |
| "learning_rate": 1.518483566683826e-08, | |
| "loss": 1.6676, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.985279685966634, | |
| "grad_norm": 0.8694716533735262, | |
| "learning_rate": 8.542416126989805e-09, | |
| "loss": 1.6165, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.9892051030421982, | |
| "grad_norm": 0.7930214390540633, | |
| "learning_rate": 3.7969297504858445e-09, | |
| "loss": 1.6058, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.9931305201177625, | |
| "grad_norm": 2.859324121777814, | |
| "learning_rate": 9.49277494008971e-10, | |
| "loss": 1.5339, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.9970559371933267, | |
| "grad_norm": 0.8600778729962169, | |
| "learning_rate": 0.0, | |
| "loss": 1.6055, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.9970559371933267, | |
| "eval_loss": 1.5823049545288086, | |
| "eval_runtime": 51.7269, | |
| "eval_samples_per_second": 5.2, | |
| "eval_steps_per_second": 0.657, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.9970559371933267, | |
| "step": 254, | |
| "total_flos": 66244072439808.0, | |
| "train_loss": 2.2374156319250273, | |
| "train_runtime": 15313.1311, | |
| "train_samples_per_second": 1.065, | |
| "train_steps_per_second": 0.017 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 254, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 66244072439808.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |