| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9788972089857046, |
| "eval_steps": 500, |
| "global_step": 273, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.010891763104152484, |
| "grad_norm": 5.917168704218648, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.8738, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.021783526208304968, |
| "grad_norm": 5.848189651981173, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.8637, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.03267528931245745, |
| "grad_norm": 5.4410485869996865, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 0.8524, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.043567052416609936, |
| "grad_norm": 2.423684029964203, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": 0.783, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.05445881552076242, |
| "grad_norm": 4.0235503087845155, |
| "learning_rate": 1.4285714285714287e-05, |
| "loss": 0.7805, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0653505786249149, |
| "grad_norm": 4.14511666016107, |
| "learning_rate": 1.7142857142857142e-05, |
| "loss": 0.7643, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0762423417290674, |
| "grad_norm": 4.42039479030743, |
| "learning_rate": 2e-05, |
| "loss": 0.7512, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.08713410483321987, |
| "grad_norm": 2.857807259160155, |
| "learning_rate": 2.2857142857142858e-05, |
| "loss": 0.7182, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.09802586793737236, |
| "grad_norm": 2.690272592093101, |
| "learning_rate": 2.5714285714285718e-05, |
| "loss": 0.6835, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.10891763104152484, |
| "grad_norm": 2.0185553871545165, |
| "learning_rate": 2.8571428571428574e-05, |
| "loss": 0.666, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.11980939414567733, |
| "grad_norm": 1.6526659811622688, |
| "learning_rate": 3.142857142857143e-05, |
| "loss": 0.6551, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1307011572498298, |
| "grad_norm": 1.3684051055691528, |
| "learning_rate": 3.4285714285714284e-05, |
| "loss": 0.6374, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1415929203539823, |
| "grad_norm": 1.3936888061646935, |
| "learning_rate": 3.714285714285715e-05, |
| "loss": 0.6142, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.1524846834581348, |
| "grad_norm": 1.3347032728939032, |
| "learning_rate": 4e-05, |
| "loss": 0.6085, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.16337644656228728, |
| "grad_norm": 1.6041646981350726, |
| "learning_rate": 4.2857142857142856e-05, |
| "loss": 0.6168, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.17426820966643974, |
| "grad_norm": 1.250156217488812, |
| "learning_rate": 4.5714285714285716e-05, |
| "loss": 0.5904, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.18515997277059223, |
| "grad_norm": 1.363684815879282, |
| "learning_rate": 4.857142857142857e-05, |
| "loss": 0.5983, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.19605173587474473, |
| "grad_norm": 1.160866980807094, |
| "learning_rate": 5.1428571428571436e-05, |
| "loss": 0.5762, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.20694349897889722, |
| "grad_norm": 1.8320088500371636, |
| "learning_rate": 5.4285714285714295e-05, |
| "loss": 0.5794, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.21783526208304968, |
| "grad_norm": 1.310331415981151, |
| "learning_rate": 5.714285714285715e-05, |
| "loss": 0.5723, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.22872702518720217, |
| "grad_norm": 2.0403045692700275, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 0.576, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.23961878829135466, |
| "grad_norm": 1.0753217952290004, |
| "learning_rate": 6.285714285714286e-05, |
| "loss": 0.5657, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.25051055139550715, |
| "grad_norm": 2.0101370583197076, |
| "learning_rate": 6.571428571428571e-05, |
| "loss": 0.5751, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.2614023144996596, |
| "grad_norm": 1.8269660980505025, |
| "learning_rate": 6.857142857142857e-05, |
| "loss": 0.5622, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.27229407760381213, |
| "grad_norm": 1.333190530233097, |
| "learning_rate": 7.142857142857143e-05, |
| "loss": 0.553, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2831858407079646, |
| "grad_norm": 1.5278657539191043, |
| "learning_rate": 7.42857142857143e-05, |
| "loss": 0.5746, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.29407760381211706, |
| "grad_norm": 1.4753801604023495, |
| "learning_rate": 7.714285714285715e-05, |
| "loss": 0.5504, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.3049693669162696, |
| "grad_norm": 1.4736990838732313, |
| "learning_rate": 8e-05, |
| "loss": 0.5542, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.31586113002042204, |
| "grad_norm": 2.3486277040401893, |
| "learning_rate": 7.999671154713278e-05, |
| "loss": 0.5507, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.32675289312457456, |
| "grad_norm": 1.1844138236301467, |
| "learning_rate": 7.99868467292272e-05, |
| "loss": 0.5496, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.337644656228727, |
| "grad_norm": 2.4920955827056748, |
| "learning_rate": 7.997040716828271e-05, |
| "loss": 0.5575, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.3485364193328795, |
| "grad_norm": 1.7099423006847476, |
| "learning_rate": 7.994739556733538e-05, |
| "loss": 0.5596, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.359428182437032, |
| "grad_norm": 2.1342906902090504, |
| "learning_rate": 7.991781571001347e-05, |
| "loss": 0.5337, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.37031994554118447, |
| "grad_norm": 1.7483277996825424, |
| "learning_rate": 7.988167245991528e-05, |
| "loss": 0.5492, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.381211708645337, |
| "grad_norm": 1.7968259576328387, |
| "learning_rate": 7.983897175980957e-05, |
| "loss": 0.5314, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.39210347174948945, |
| "grad_norm": 1.559300515616932, |
| "learning_rate": 7.97897206306583e-05, |
| "loss": 0.5307, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.4029952348536419, |
| "grad_norm": 1.0889578452931736, |
| "learning_rate": 7.973392717046233e-05, |
| "loss": 0.5408, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.41388699795779443, |
| "grad_norm": 1.313871076023579, |
| "learning_rate": 7.967160055292984e-05, |
| "loss": 0.5147, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.4247787610619469, |
| "grad_norm": 1.583430729115433, |
| "learning_rate": 7.960275102596809e-05, |
| "loss": 0.5216, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.43567052416609936, |
| "grad_norm": 1.3933127292778362, |
| "learning_rate": 7.952738990999824e-05, |
| "loss": 0.5187, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.4465622872702519, |
| "grad_norm": 1.4991075926399846, |
| "learning_rate": 7.94455295960942e-05, |
| "loss": 0.5123, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.45745405037440434, |
| "grad_norm": 1.2790673350427315, |
| "learning_rate": 7.93571835439452e-05, |
| "loss": 0.5255, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.46834581347855686, |
| "grad_norm": 1.656174041977428, |
| "learning_rate": 7.926236627964262e-05, |
| "loss": 0.5224, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.4792375765827093, |
| "grad_norm": 1.3680863968048569, |
| "learning_rate": 7.916109339329173e-05, |
| "loss": 0.5104, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.4901293396868618, |
| "grad_norm": 1.1248700116366424, |
| "learning_rate": 7.905338153644818e-05, |
| "loss": 0.5036, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.5010211027910143, |
| "grad_norm": 1.4820737812996578, |
| "learning_rate": 7.89392484193802e-05, |
| "loss": 0.5169, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.5119128658951668, |
| "grad_norm": 1.1533791684607533, |
| "learning_rate": 7.881871280815659e-05, |
| "loss": 0.5071, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.5228046289993192, |
| "grad_norm": 1.9492363638377506, |
| "learning_rate": 7.869179452156118e-05, |
| "loss": 0.5093, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.5336963921034718, |
| "grad_norm": 0.9617512636578286, |
| "learning_rate": 7.855851442783414e-05, |
| "loss": 0.501, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.5445881552076243, |
| "grad_norm": 1.0817397757606, |
| "learning_rate": 7.841889444124078e-05, |
| "loss": 0.4997, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.5554799183117767, |
| "grad_norm": 2.1099922597533074, |
| "learning_rate": 7.827295751846836e-05, |
| "loss": 0.5151, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.5663716814159292, |
| "grad_norm": 1.0752679988230807, |
| "learning_rate": 7.81207276548515e-05, |
| "loss": 0.512, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.5772634445200817, |
| "grad_norm": 2.033228507248604, |
| "learning_rate": 7.796222988042676e-05, |
| "loss": 0.5247, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.5881552076242341, |
| "grad_norm": 1.4290947325043002, |
| "learning_rate": 7.779749025581717e-05, |
| "loss": 0.5161, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.5990469707283866, |
| "grad_norm": 1.6107474254316703, |
| "learning_rate": 7.762653586794731e-05, |
| "loss": 0.5113, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.6099387338325392, |
| "grad_norm": 1.7403793303180215, |
| "learning_rate": 7.74493948255895e-05, |
| "loss": 0.5104, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.6208304969366917, |
| "grad_norm": 1.3273260664695445, |
| "learning_rate": 7.726609625474218e-05, |
| "loss": 0.5149, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.6317222600408441, |
| "grad_norm": 1.2835468728323596, |
| "learning_rate": 7.707667029384088e-05, |
| "loss": 0.4985, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.6426140231449966, |
| "grad_norm": 1.3493586560972042, |
| "learning_rate": 7.688114808880283e-05, |
| "loss": 0.515, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.6535057862491491, |
| "grad_norm": 0.8317273919287447, |
| "learning_rate": 7.667956178790582e-05, |
| "loss": 0.5059, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.6643975493533015, |
| "grad_norm": 1.1983241891992835, |
| "learning_rate": 7.647194453650228e-05, |
| "loss": 0.4935, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.675289312457454, |
| "grad_norm": 1.1964694002526848, |
| "learning_rate": 7.625833047156953e-05, |
| "loss": 0.5058, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.6861810755616066, |
| "grad_norm": 1.1878899519661879, |
| "learning_rate": 7.603875471609677e-05, |
| "loss": 0.4903, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.697072838665759, |
| "grad_norm": 1.2583807783190593, |
| "learning_rate": 7.581325337331013e-05, |
| "loss": 0.497, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.7079646017699115, |
| "grad_norm": 0.9383067837973975, |
| "learning_rate": 7.558186352073648e-05, |
| "loss": 0.4874, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.718856364874064, |
| "grad_norm": 1.114271813149658, |
| "learning_rate": 7.534462320410702e-05, |
| "loss": 0.4957, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.7297481279782164, |
| "grad_norm": 0.9889746448795311, |
| "learning_rate": 7.510157143110172e-05, |
| "loss": 0.4976, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.7406398910823689, |
| "grad_norm": 1.5156805885931242, |
| "learning_rate": 7.485274816493558e-05, |
| "loss": 0.4959, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.7515316541865215, |
| "grad_norm": 0.9749471928652824, |
| "learning_rate": 7.459819431778775e-05, |
| "loss": 0.4796, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.762423417290674, |
| "grad_norm": 0.8404206560752534, |
| "learning_rate": 7.433795174407465e-05, |
| "loss": 0.4762, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.7733151803948264, |
| "grad_norm": 1.2857273953478379, |
| "learning_rate": 7.407206323356818e-05, |
| "loss": 0.4955, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.7842069434989789, |
| "grad_norm": 1.5254696719564749, |
| "learning_rate": 7.380057250436006e-05, |
| "loss": 0.4945, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.7950987066031314, |
| "grad_norm": 1.0225507119621404, |
| "learning_rate": 7.352352419567362e-05, |
| "loss": 0.4883, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.8059904697072838, |
| "grad_norm": 1.3824406570348136, |
| "learning_rate": 7.324096386052416e-05, |
| "loss": 0.4906, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.8168822328114363, |
| "grad_norm": 0.9568602258759329, |
| "learning_rate": 7.295293795822887e-05, |
| "loss": 0.4838, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.8277739959155889, |
| "grad_norm": 1.2765315330970022, |
| "learning_rate": 7.265949384676795e-05, |
| "loss": 0.4815, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.8386657590197413, |
| "grad_norm": 1.2279324061842216, |
| "learning_rate": 7.236067977499791e-05, |
| "loss": 0.4855, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.8495575221238938, |
| "grad_norm": 0.9629770077059606, |
| "learning_rate": 7.205654487471826e-05, |
| "loss": 0.4811, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.8604492852280463, |
| "grad_norm": 1.295116932433786, |
| "learning_rate": 7.174713915259331e-05, |
| "loss": 0.4889, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.8713410483321987, |
| "grad_norm": 0.9924719057218118, |
| "learning_rate": 7.143251348192971e-05, |
| "loss": 0.488, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.8822328114363512, |
| "grad_norm": 1.3773423039136135, |
| "learning_rate": 7.111271959431189e-05, |
| "loss": 0.4859, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.8931245745405038, |
| "grad_norm": 0.9416135398394887, |
| "learning_rate": 7.078781007109625e-05, |
| "loss": 0.4803, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.9040163376446563, |
| "grad_norm": 1.0674313281470116, |
| "learning_rate": 7.045783833476538e-05, |
| "loss": 0.4733, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.9149081007488087, |
| "grad_norm": 1.000464101793752, |
| "learning_rate": 7.012285864014445e-05, |
| "loss": 0.4726, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.9257998638529612, |
| "grad_norm": 1.3944228987734595, |
| "learning_rate": 6.978292606548029e-05, |
| "loss": 0.4817, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.9366916269571137, |
| "grad_norm": 0.9210477820477416, |
| "learning_rate": 6.943809650338541e-05, |
| "loss": 0.4856, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.9475833900612661, |
| "grad_norm": 1.4702724614133884, |
| "learning_rate": 6.908842665164789e-05, |
| "loss": 0.4893, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.9584751531654186, |
| "grad_norm": 0.9392028605617796, |
| "learning_rate": 6.873397400390911e-05, |
| "loss": 0.4749, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.9693669162695712, |
| "grad_norm": 1.33752177947439, |
| "learning_rate": 6.837479684021032e-05, |
| "loss": 0.4642, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.9802586793737236, |
| "grad_norm": 1.0674472905397991, |
| "learning_rate": 6.80109542174102e-05, |
| "loss": 0.4721, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.9911504424778761, |
| "grad_norm": 0.8855006430820443, |
| "learning_rate": 6.76425059594746e-05, |
| "loss": 0.471, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.0047651463580667, |
| "grad_norm": 0.7477549645216865, |
| "learning_rate": 6.726951264763998e-05, |
| "loss": 0.4723, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.0156569094622192, |
| "grad_norm": 0.8956363744921413, |
| "learning_rate": 6.689203561045268e-05, |
| "loss": 0.4562, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.0265486725663717, |
| "grad_norm": 1.060602408653099, |
| "learning_rate": 6.651013691368492e-05, |
| "loss": 0.4599, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.0374404356705242, |
| "grad_norm": 1.2755687426821307, |
| "learning_rate": 6.612387935012995e-05, |
| "loss": 0.4586, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.0483321987746765, |
| "grad_norm": 0.7866789226297869, |
| "learning_rate": 6.573332642927737e-05, |
| "loss": 0.4564, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.059223961878829, |
| "grad_norm": 1.0671580779917749, |
| "learning_rate": 6.53385423668708e-05, |
| "loss": 0.4532, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.0701157249829816, |
| "grad_norm": 1.8705582630594135, |
| "learning_rate": 6.493959207434934e-05, |
| "loss": 0.4518, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.081007488087134, |
| "grad_norm": 0.8747747648963227, |
| "learning_rate": 6.453654114817467e-05, |
| "loss": 0.4602, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.0918992511912866, |
| "grad_norm": 2.6708530213907915, |
| "learning_rate": 6.412945585904545e-05, |
| "loss": 0.4706, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.1027910142954391, |
| "grad_norm": 2.059630403847963, |
| "learning_rate": 6.371840314100104e-05, |
| "loss": 0.4746, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.1136827773995917, |
| "grad_norm": 2.100904334906369, |
| "learning_rate": 6.330345058041585e-05, |
| "loss": 0.4645, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.124574540503744, |
| "grad_norm": 1.9589727338791063, |
| "learning_rate": 6.288466640488679e-05, |
| "loss": 0.4574, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.1354663036078965, |
| "grad_norm": 1.250427733328813, |
| "learning_rate": 6.2462119472015e-05, |
| "loss": 0.453, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.146358066712049, |
| "grad_norm": 1.244933711184395, |
| "learning_rate": 6.20358792580841e-05, |
| "loss": 0.4562, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.1572498298162015, |
| "grad_norm": 0.978598635089721, |
| "learning_rate": 6.160601584663681e-05, |
| "loss": 0.4544, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.168141592920354, |
| "grad_norm": 1.0133074820438104, |
| "learning_rate": 6.11725999169515e-05, |
| "loss": 0.4567, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.1790333560245065, |
| "grad_norm": 0.9079803477398684, |
| "learning_rate": 6.0735702732421015e-05, |
| "loss": 0.4537, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.189925119128659, |
| "grad_norm": 0.47916750157430577, |
| "learning_rate": 6.029539612883529e-05, |
| "loss": 0.4514, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.2008168822328114, |
| "grad_norm": 0.737478359754882, |
| "learning_rate": 5.9851752502570015e-05, |
| "loss": 0.4462, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.2117086453369639, |
| "grad_norm": 0.5899481616784151, |
| "learning_rate": 5.940484479868288e-05, |
| "loss": 0.4456, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.2226004084411164, |
| "grad_norm": 0.766606804098348, |
| "learning_rate": 5.895474649891995e-05, |
| "loss": 0.4441, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.233492171545269, |
| "grad_norm": 0.6645207203467242, |
| "learning_rate": 5.8501531609633424e-05, |
| "loss": 0.4463, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.2443839346494214, |
| "grad_norm": 0.681340548554224, |
| "learning_rate": 5.8045274649613386e-05, |
| "loss": 0.4424, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.2552756977535737, |
| "grad_norm": 0.7119900691253739, |
| "learning_rate": 5.7586050637835295e-05, |
| "loss": 0.4459, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.2661674608577265, |
| "grad_norm": 0.49479084006465096, |
| "learning_rate": 5.7123935081125034e-05, |
| "loss": 0.4408, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.2770592239618788, |
| "grad_norm": 0.7116517379970301, |
| "learning_rate": 5.6659003961743965e-05, |
| "loss": 0.4455, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.2879509870660313, |
| "grad_norm": 0.6527860650845239, |
| "learning_rate": 5.619133372489575e-05, |
| "loss": 0.4362, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.2988427501701838, |
| "grad_norm": 0.5221316831765591, |
| "learning_rate": 5.572100126615695e-05, |
| "loss": 0.4483, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.3097345132743363, |
| "grad_norm": 0.36665207601249333, |
| "learning_rate": 5.524808391883367e-05, |
| "loss": 0.4488, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.3206262763784888, |
| "grad_norm": 0.41335991497113145, |
| "learning_rate": 5.477265944124626e-05, |
| "loss": 0.4466, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.3315180394826411, |
| "grad_norm": 0.4288901779920896, |
| "learning_rate": 5.429480600394405e-05, |
| "loss": 0.4455, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.3424098025867937, |
| "grad_norm": 0.4476404528186165, |
| "learning_rate": 5.381460217685231e-05, |
| "loss": 0.4409, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.3533015656909462, |
| "grad_norm": 0.3975466345144197, |
| "learning_rate": 5.333212691635368e-05, |
| "loss": 0.4368, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.3641933287950987, |
| "grad_norm": 0.3833210592542072, |
| "learning_rate": 5.2847459552305834e-05, |
| "loss": 0.4414, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.3750850918992512, |
| "grad_norm": 0.38028400797503575, |
| "learning_rate": 5.23606797749979e-05, |
| "loss": 0.4342, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.3859768550034037, |
| "grad_norm": 0.29336709544411704, |
| "learning_rate": 5.1871867622047624e-05, |
| "loss": 0.4349, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.3968686181075562, |
| "grad_norm": 0.4021055378411946, |
| "learning_rate": 5.13811034652413e-05, |
| "loss": 0.4359, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.4077603812117085, |
| "grad_norm": 0.36564910628203584, |
| "learning_rate": 5.088846799731885e-05, |
| "loss": 0.4449, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.418652144315861, |
| "grad_norm": 0.3687189406492797, |
| "learning_rate": 5.039404221870612e-05, |
| "loss": 0.4384, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.4295439074200136, |
| "grad_norm": 0.29150702350417407, |
| "learning_rate": 4.989790742419658e-05, |
| "loss": 0.4391, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.440435670524166, |
| "grad_norm": 0.28941702462294044, |
| "learning_rate": 4.940014518958461e-05, |
| "loss": 0.435, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.4513274336283186, |
| "grad_norm": 0.3046809035055879, |
| "learning_rate": 4.890083735825258e-05, |
| "loss": 0.4423, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.4622191967324711, |
| "grad_norm": 0.3901636771063769, |
| "learning_rate": 4.8400066027713974e-05, |
| "loss": 0.4373, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.4731109598366237, |
| "grad_norm": 0.2975253048975319, |
| "learning_rate": 4.789791353611469e-05, |
| "loss": 0.441, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.484002722940776, |
| "grad_norm": 0.2861058200191958, |
| "learning_rate": 4.7394462448694756e-05, |
| "loss": 0.4392, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.4948944860449285, |
| "grad_norm": 0.30291271659415053, |
| "learning_rate": 4.688979554421276e-05, |
| "loss": 0.4437, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.505786249149081, |
| "grad_norm": 0.27992103529188517, |
| "learning_rate": 4.6383995801335176e-05, |
| "loss": 0.4448, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.5166780122532335, |
| "grad_norm": 0.25747738933655456, |
| "learning_rate": 4.5877146384992725e-05, |
| "loss": 0.4431, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.527569775357386, |
| "grad_norm": 0.341071325514537, |
| "learning_rate": 4.5369330632706223e-05, |
| "loss": 0.4331, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 0.2602954860252923, |
| "learning_rate": 4.486063204088402e-05, |
| "loss": 0.4433, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.549353301565691, |
| "grad_norm": 0.23767705409963544, |
| "learning_rate": 4.435113425109324e-05, |
| "loss": 0.4337, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.5602450646698434, |
| "grad_norm": 0.29322114556856593, |
| "learning_rate": 4.3840921036307274e-05, |
| "loss": 0.4387, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.5711368277739959, |
| "grad_norm": 0.2171415398561475, |
| "learning_rate": 4.333007628713158e-05, |
| "loss": 0.4333, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.5820285908781484, |
| "grad_norm": 0.26753092128368694, |
| "learning_rate": 4.281868399801016e-05, |
| "loss": 0.4375, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.592920353982301, |
| "grad_norm": 0.2577848543389177, |
| "learning_rate": 4.230682825341498e-05, |
| "loss": 0.4331, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.6038121170864534, |
| "grad_norm": 0.23037795102518452, |
| "learning_rate": 4.17945932140206e-05, |
| "loss": 0.4432, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.6147038801906057, |
| "grad_norm": 0.22432493774064247, |
| "learning_rate": 4.128206310286622e-05, |
| "loss": 0.433, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.6255956432947585, |
| "grad_norm": 0.27660950965609943, |
| "learning_rate": 4.0769322191507485e-05, |
| "loss": 0.4291, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.6364874063989108, |
| "grad_norm": 0.29639803872823944, |
| "learning_rate": 4.025645478616045e-05, |
| "loss": 0.4479, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.6473791695030633, |
| "grad_norm": 0.22551842554518517, |
| "learning_rate": 3.974354521383956e-05, |
| "loss": 0.4305, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.6582709326072158, |
| "grad_norm": 0.21855552569143427, |
| "learning_rate": 3.923067780849252e-05, |
| "loss": 0.4341, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.6691626957113683, |
| "grad_norm": 0.25345471682479176, |
| "learning_rate": 3.87179368971338e-05, |
| "loss": 0.4332, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.6800544588155208, |
| "grad_norm": 0.3294043510791402, |
| "learning_rate": 3.820540678597942e-05, |
| "loss": 0.4324, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.6909462219196731, |
| "grad_norm": 0.2915052857817396, |
| "learning_rate": 3.769317174658503e-05, |
| "loss": 0.4289, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.7018379850238259, |
| "grad_norm": 0.20123186425930717, |
| "learning_rate": 3.718131600198984e-05, |
| "loss": 0.4414, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.7127297481279782, |
| "grad_norm": 0.23019840765145663, |
| "learning_rate": 3.666992371286843e-05, |
| "loss": 0.4341, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.7236215112321307, |
| "grad_norm": 0.2075296642104422, |
| "learning_rate": 3.615907896369273e-05, |
| "loss": 0.4391, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.7345132743362832, |
| "grad_norm": 0.20499145565748494, |
| "learning_rate": 3.564886574890677e-05, |
| "loss": 0.4368, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.7454050374404355, |
| "grad_norm": 0.2270936672913207, |
| "learning_rate": 3.5139367959115986e-05, |
| "loss": 0.4362, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.7562968005445883, |
| "grad_norm": 0.20695924414455472, |
| "learning_rate": 3.4630669367293797e-05, |
| "loss": 0.4321, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.7671885636487406, |
| "grad_norm": 0.20145634393976244, |
| "learning_rate": 3.412285361500729e-05, |
| "loss": 0.4327, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.778080326752893, |
| "grad_norm": 0.1954369301373391, |
| "learning_rate": 3.3616004198664845e-05, |
| "loss": 0.4324, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.7889720898570456, |
| "grad_norm": 0.20481400956545093, |
| "learning_rate": 3.311020445578725e-05, |
| "loss": 0.4377, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.799863852961198, |
| "grad_norm": 0.2162909320824147, |
| "learning_rate": 3.260553755130525e-05, |
| "loss": 0.4299, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.8107556160653506, |
| "grad_norm": 0.1811829339048361, |
| "learning_rate": 3.210208646388532e-05, |
| "loss": 0.43, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.821647379169503, |
| "grad_norm": 0.2086574503584923, |
| "learning_rate": 3.1599933972286026e-05, |
| "loss": 0.4346, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.8325391422736557, |
| "grad_norm": 0.22526815431719788, |
| "learning_rate": 3.109916264174743e-05, |
| "loss": 0.4288, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.843430905377808, |
| "grad_norm": 0.18015782113002662, |
| "learning_rate": 3.0599854810415393e-05, |
| "loss": 0.428, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.8543226684819605, |
| "grad_norm": 0.20990114701229817, |
| "learning_rate": 3.0102092575803435e-05, |
| "loss": 0.4333, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.865214431586113, |
| "grad_norm": 0.1988483391924283, |
| "learning_rate": 2.9605957781293893e-05, |
| "loss": 0.4346, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.8761061946902655, |
| "grad_norm": 0.19078378305907726, |
| "learning_rate": 2.911153200268116e-05, |
| "loss": 0.4251, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.886997957794418, |
| "grad_norm": 0.23446180498320415, |
| "learning_rate": 2.8618896534758707e-05, |
| "loss": 0.4377, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.8978897208985703, |
| "grad_norm": 0.1761870449513099, |
| "learning_rate": 2.8128132377952376e-05, |
| "loss": 0.4227, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.908781484002723, |
| "grad_norm": 0.2332557884998122, |
| "learning_rate": 2.7639320225002108e-05, |
| "loss": 0.4325, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.9196732471068754, |
| "grad_norm": 0.2211426717672741, |
| "learning_rate": 2.715254044769418e-05, |
| "loss": 0.4308, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.9305650102110279, |
| "grad_norm": 0.18328106510038045, |
| "learning_rate": 2.666787308364634e-05, |
| "loss": 0.4322, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.9414567733151804, |
| "grad_norm": 0.2288520554831422, |
| "learning_rate": 2.6185397823147703e-05, |
| "loss": 0.4263, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.952348536419333, |
| "grad_norm": 0.17271019835077814, |
| "learning_rate": 2.5705193996055977e-05, |
| "loss": 0.4231, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.9632402995234854, |
| "grad_norm": 0.21114296576465388, |
| "learning_rate": 2.5227340558753755e-05, |
| "loss": 0.4323, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.9741320626276377, |
| "grad_norm": 0.1708320475165614, |
| "learning_rate": 2.4751916081166336e-05, |
| "loss": 0.4318, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.9850238257317905, |
| "grad_norm": 0.20243633225986957, |
| "learning_rate": 2.427899873384306e-05, |
| "loss": 0.4284, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.9959155888359428, |
| "grad_norm": 0.17214298347005028, |
| "learning_rate": 2.3808666275104248e-05, |
| "loss": 0.4303, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.0095302927161334, |
| "grad_norm": 0.2371948537766053, |
| "learning_rate": 2.334099603825605e-05, |
| "loss": 0.4195, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.020422055820286, |
| "grad_norm": 0.20683435017217672, |
| "learning_rate": 2.2876064918874993e-05, |
| "loss": 0.4056, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.0313138189244384, |
| "grad_norm": 0.2132114122172093, |
| "learning_rate": 2.241394936216472e-05, |
| "loss": 0.4033, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.0422055820285907, |
| "grad_norm": 0.19087121395561507, |
| "learning_rate": 2.1954725350386614e-05, |
| "loss": 0.405, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.0530973451327434, |
| "grad_norm": 0.21040243392909405, |
| "learning_rate": 2.14984683903666e-05, |
| "loss": 0.4094, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.0639891082368957, |
| "grad_norm": 0.16468816702883052, |
| "learning_rate": 2.1045253501080058e-05, |
| "loss": 0.3988, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.0748808713410485, |
| "grad_norm": 0.17523456140653074, |
| "learning_rate": 2.0595155201317115e-05, |
| "loss": 0.4009, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.085772634445201, |
| "grad_norm": 0.19773800473297834, |
| "learning_rate": 2.0148247497430012e-05, |
| "loss": 0.4016, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.096664397549353, |
| "grad_norm": 0.1647636735264003, |
| "learning_rate": 1.970460387116472e-05, |
| "loss": 0.3948, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.107556160653506, |
| "grad_norm": 0.18353188744441096, |
| "learning_rate": 1.9264297267579e-05, |
| "loss": 0.3995, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.118447923757658, |
| "grad_norm": 0.1804931450578559, |
| "learning_rate": 1.8827400083048503e-05, |
| "loss": 0.404, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.129339686861811, |
| "grad_norm": 0.16921208996470288, |
| "learning_rate": 1.8393984153363203e-05, |
| "loss": 0.3886, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.140231449965963, |
| "grad_norm": 0.15129405097453685, |
| "learning_rate": 1.7964120741915905e-05, |
| "loss": 0.4082, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.151123213070116, |
| "grad_norm": 0.17224554916641183, |
| "learning_rate": 1.753788052798501e-05, |
| "loss": 0.3994, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.162014976174268, |
| "grad_norm": 0.15770956818834928, |
| "learning_rate": 1.7115333595113225e-05, |
| "loss": 0.403, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.172906739278421, |
| "grad_norm": 0.1716039599905281, |
| "learning_rate": 1.669654941958416e-05, |
| "loss": 0.4032, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.1837985023825732, |
| "grad_norm": 0.16454710212660165, |
| "learning_rate": 1.628159685899897e-05, |
| "loss": 0.405, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.1946902654867255, |
| "grad_norm": 0.13993922901377312, |
| "learning_rate": 1.5870544140954543e-05, |
| "loss": 0.4077, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.2055820285908783, |
| "grad_norm": 0.16301323668704157, |
| "learning_rate": 1.5463458851825345e-05, |
| "loss": 0.4053, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.2164737916950306, |
| "grad_norm": 0.15989174020128882, |
| "learning_rate": 1.5060407925650662e-05, |
| "loss": 0.4036, |
| "step": 203 |
| }, |
| { |
| "epoch": 2.2273655547991833, |
| "grad_norm": 0.14155165998806302, |
| "learning_rate": 1.466145763312922e-05, |
| "loss": 0.4021, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.2382573179033356, |
| "grad_norm": 0.13197910914100647, |
| "learning_rate": 1.426667357072265e-05, |
| "loss": 0.403, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.249149081007488, |
| "grad_norm": 0.1502501179431283, |
| "learning_rate": 1.3876120649870051e-05, |
| "loss": 0.4068, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.2600408441116406, |
| "grad_norm": 0.12632548576221497, |
| "learning_rate": 1.3489863086315085e-05, |
| "loss": 0.3954, |
| "step": 207 |
| }, |
| { |
| "epoch": 2.270932607215793, |
| "grad_norm": 0.12310652443499624, |
| "learning_rate": 1.3107964389547326e-05, |
| "loss": 0.4056, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.2818243703199457, |
| "grad_norm": 0.13232393309386553, |
| "learning_rate": 1.2730487352360026e-05, |
| "loss": 0.4028, |
| "step": 209 |
| }, |
| { |
| "epoch": 2.292716133424098, |
| "grad_norm": 0.12337150415039137, |
| "learning_rate": 1.2357494040525416e-05, |
| "loss": 0.4125, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.3036078965282503, |
| "grad_norm": 0.11483009808175632, |
| "learning_rate": 1.1989045782589815e-05, |
| "loss": 0.404, |
| "step": 211 |
| }, |
| { |
| "epoch": 2.314499659632403, |
| "grad_norm": 0.1213039797258653, |
| "learning_rate": 1.1625203159789686e-05, |
| "loss": 0.398, |
| "step": 212 |
| }, |
| { |
| "epoch": 2.3253914227365553, |
| "grad_norm": 0.1296712824505999, |
| "learning_rate": 1.1266025996090902e-05, |
| "loss": 0.4103, |
| "step": 213 |
| }, |
| { |
| "epoch": 2.336283185840708, |
| "grad_norm": 0.12739388021409695, |
| "learning_rate": 1.0911573348352107e-05, |
| "loss": 0.4054, |
| "step": 214 |
| }, |
| { |
| "epoch": 2.3471749489448603, |
| "grad_norm": 0.11766621480801555, |
| "learning_rate": 1.0561903496614603e-05, |
| "loss": 0.4036, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.358066712049013, |
| "grad_norm": 0.12339813858079209, |
| "learning_rate": 1.0217073934519726e-05, |
| "loss": 0.4012, |
| "step": 216 |
| }, |
| { |
| "epoch": 2.3689584751531654, |
| "grad_norm": 0.11647805921713034, |
| "learning_rate": 9.877141359855567e-06, |
| "loss": 0.4109, |
| "step": 217 |
| }, |
| { |
| "epoch": 2.379850238257318, |
| "grad_norm": 0.109735463051235, |
| "learning_rate": 9.542161665234623e-06, |
| "loss": 0.3991, |
| "step": 218 |
| }, |
| { |
| "epoch": 2.3907420013614704, |
| "grad_norm": 0.1181837766256319, |
| "learning_rate": 9.212189928903758e-06, |
| "loss": 0.4008, |
| "step": 219 |
| }, |
| { |
| "epoch": 2.4016337644656227, |
| "grad_norm": 0.12247494016760661, |
| "learning_rate": 8.887280405688106e-06, |
| "loss": 0.4044, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.4125255275697755, |
| "grad_norm": 0.11192215755173132, |
| "learning_rate": 8.567486518070306e-06, |
| "loss": 0.3992, |
| "step": 221 |
| }, |
| { |
| "epoch": 2.4234172906739277, |
| "grad_norm": 0.109297303198497, |
| "learning_rate": 8.252860847406712e-06, |
| "loss": 0.3948, |
| "step": 222 |
| }, |
| { |
| "epoch": 2.4343090537780805, |
| "grad_norm": 0.11284991227332902, |
| "learning_rate": 7.943455125281741e-06, |
| "loss": 0.3995, |
| "step": 223 |
| }, |
| { |
| "epoch": 2.445200816882233, |
| "grad_norm": 0.11548250618562617, |
| "learning_rate": 7.639320225002106e-06, |
| "loss": 0.407, |
| "step": 224 |
| }, |
| { |
| "epoch": 2.456092579986385, |
| "grad_norm": 0.116001123064691, |
| "learning_rate": 7.340506153232052e-06, |
| "loss": 0.4, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.466984343090538, |
| "grad_norm": 0.10992561869419722, |
| "learning_rate": 7.047062041771133e-06, |
| "loss": 0.4045, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.47787610619469, |
| "grad_norm": 0.10955112339301888, |
| "learning_rate": 6.759036139475843e-06, |
| "loss": 0.3964, |
| "step": 227 |
| }, |
| { |
| "epoch": 2.488767869298843, |
| "grad_norm": 0.11910747258190017, |
| "learning_rate": 6.476475804326377e-06, |
| "loss": 0.4025, |
| "step": 228 |
| }, |
| { |
| "epoch": 2.499659632402995, |
| "grad_norm": 0.116651055560925, |
| "learning_rate": 6.199427495639963e-06, |
| "loss": 0.3994, |
| "step": 229 |
| }, |
| { |
| "epoch": 2.5105513955071475, |
| "grad_norm": 0.1138861345434715, |
| "learning_rate": 5.927936766431836e-06, |
| "loss": 0.4022, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.5214431586113, |
| "grad_norm": 0.09848824385777599, |
| "learning_rate": 5.662048255925357e-06, |
| "loss": 0.3988, |
| "step": 231 |
| }, |
| { |
| "epoch": 2.532334921715453, |
| "grad_norm": 0.10150492531984864, |
| "learning_rate": 5.40180568221226e-06, |
| "loss": 0.4047, |
| "step": 232 |
| }, |
| { |
| "epoch": 2.5432266848196052, |
| "grad_norm": 0.11056977593577616, |
| "learning_rate": 5.147251835064424e-06, |
| "loss": 0.4057, |
| "step": 233 |
| }, |
| { |
| "epoch": 2.5541184479237575, |
| "grad_norm": 0.10655441819972683, |
| "learning_rate": 4.898428568898288e-06, |
| "loss": 0.3971, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.5650102110279103, |
| "grad_norm": 0.10580636847333873, |
| "learning_rate": 4.65537679589299e-06, |
| "loss": 0.4045, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.5759019741320626, |
| "grad_norm": 0.09908356652521799, |
| "learning_rate": 4.418136479263533e-06, |
| "loss": 0.4004, |
| "step": 236 |
| }, |
| { |
| "epoch": 2.5867937372362153, |
| "grad_norm": 0.09287282151185479, |
| "learning_rate": 4.186746626689879e-06, |
| "loss": 0.4006, |
| "step": 237 |
| }, |
| { |
| "epoch": 2.5976855003403676, |
| "grad_norm": 0.11618401418689138, |
| "learning_rate": 3.961245283903239e-06, |
| "loss": 0.4025, |
| "step": 238 |
| }, |
| { |
| "epoch": 2.60857726344452, |
| "grad_norm": 0.09813772046749265, |
| "learning_rate": 3.7416695284304737e-06, |
| "loss": 0.4051, |
| "step": 239 |
| }, |
| { |
| "epoch": 2.6194690265486726, |
| "grad_norm": 0.08852232603950476, |
| "learning_rate": 3.5280554634977217e-06, |
| "loss": 0.4013, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.630360789652825, |
| "grad_norm": 0.09394838490800748, |
| "learning_rate": 3.320438212094197e-06, |
| "loss": 0.4059, |
| "step": 241 |
| }, |
| { |
| "epoch": 2.6412525527569777, |
| "grad_norm": 0.09059231002024923, |
| "learning_rate": 3.1188519111971804e-06, |
| "loss": 0.4039, |
| "step": 242 |
| }, |
| { |
| "epoch": 2.65214431586113, |
| "grad_norm": 0.09411366384619929, |
| "learning_rate": 2.9233297061591346e-06, |
| "loss": 0.3973, |
| "step": 243 |
| }, |
| { |
| "epoch": 2.6630360789652823, |
| "grad_norm": 0.09939434350976686, |
| "learning_rate": 2.733903745257838e-06, |
| "loss": 0.4049, |
| "step": 244 |
| }, |
| { |
| "epoch": 2.673927842069435, |
| "grad_norm": 0.09359826061681908, |
| "learning_rate": 2.550605174410512e-06, |
| "loss": 0.4017, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.6848196051735873, |
| "grad_norm": 0.09354919223068338, |
| "learning_rate": 2.373464132052701e-06, |
| "loss": 0.4115, |
| "step": 246 |
| }, |
| { |
| "epoch": 2.69571136827774, |
| "grad_norm": 0.09746026612098238, |
| "learning_rate": 2.202509744182835e-06, |
| "loss": 0.3982, |
| "step": 247 |
| }, |
| { |
| "epoch": 2.7066031313818923, |
| "grad_norm": 0.09168148902821703, |
| "learning_rate": 2.0377701195732545e-06, |
| "loss": 0.4071, |
| "step": 248 |
| }, |
| { |
| "epoch": 2.717494894486045, |
| "grad_norm": 0.09074247743593482, |
| "learning_rate": 1.879272345148513e-06, |
| "loss": 0.4101, |
| "step": 249 |
| }, |
| { |
| "epoch": 2.7283866575901974, |
| "grad_norm": 0.09117996023348725, |
| "learning_rate": 1.727042481531651e-06, |
| "loss": 0.4035, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.73927842069435, |
| "grad_norm": 0.09180074130441847, |
| "learning_rate": 1.5811055587592283e-06, |
| "loss": 0.4009, |
| "step": 251 |
| }, |
| { |
| "epoch": 2.7501701837985024, |
| "grad_norm": 0.08657516729277243, |
| "learning_rate": 1.4414855721658705e-06, |
| "loss": 0.4041, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.7610619469026547, |
| "grad_norm": 0.09146051585946402, |
| "learning_rate": 1.3082054784388221e-06, |
| "loss": 0.4055, |
| "step": 253 |
| }, |
| { |
| "epoch": 2.7719537100068075, |
| "grad_norm": 0.09305195530663764, |
| "learning_rate": 1.1812871918434143e-06, |
| "loss": 0.4027, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.7828454731109598, |
| "grad_norm": 0.09703421946821408, |
| "learning_rate": 1.0607515806198142e-06, |
| "loss": 0.4079, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.7937372362151125, |
| "grad_norm": 0.08906174726324725, |
| "learning_rate": 9.466184635518361e-07, |
| "loss": 0.4017, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.804628999319265, |
| "grad_norm": 0.08574571986172967, |
| "learning_rate": 8.389066067082852e-07, |
| "loss": 0.3967, |
| "step": 257 |
| }, |
| { |
| "epoch": 2.815520762423417, |
| "grad_norm": 0.0880676756279348, |
| "learning_rate": 7.376337203573824e-07, |
| "loss": 0.406, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.82641252552757, |
| "grad_norm": 0.08789941779545143, |
| "learning_rate": 6.428164560548134e-07, |
| "loss": 0.4076, |
| "step": 259 |
| }, |
| { |
| "epoch": 2.837304288631722, |
| "grad_norm": 0.08553769033856681, |
| "learning_rate": 5.544704039058025e-07, |
| "loss": 0.3983, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.848196051735875, |
| "grad_norm": 0.08010149417951533, |
| "learning_rate": 4.7261009000177274e-07, |
| "loss": 0.3918, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.859087814840027, |
| "grad_norm": 0.07962434672916259, |
| "learning_rate": 3.972489740319274e-07, |
| "loss": 0.3924, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.8699795779441795, |
| "grad_norm": 0.08482433839475353, |
| "learning_rate": 3.283994470701579e-07, |
| "loss": 0.4057, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.880871341048332, |
| "grad_norm": 0.08350491927147591, |
| "learning_rate": 2.66072829537678e-07, |
| "loss": 0.4099, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.891763104152485, |
| "grad_norm": 0.0880658902471283, |
| "learning_rate": 2.102793693417038e-07, |
| "loss": 0.4073, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.9026548672566372, |
| "grad_norm": 0.08328293731348216, |
| "learning_rate": 1.6102824019043728e-07, |
| "loss": 0.4002, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.9135466303607895, |
| "grad_norm": 0.07841175895615185, |
| "learning_rate": 1.1832754008472614e-07, |
| "loss": 0.389, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.9244383934649423, |
| "grad_norm": 0.08287793053425582, |
| "learning_rate": 8.21842899865466e-08, |
| "loss": 0.3955, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.9353301565690946, |
| "grad_norm": 0.08049459789709469, |
| "learning_rate": 5.260443266462467e-08, |
| "loss": 0.3949, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.9462219196732473, |
| "grad_norm": 0.086710676105483, |
| "learning_rate": 2.9592831717293326e-08, |
| "loss": 0.4072, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.9571136827773996, |
| "grad_norm": 0.08154316295610879, |
| "learning_rate": 1.3153270772807702e-08, |
| "loss": 0.3967, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.968005445881552, |
| "grad_norm": 0.08705352570078856, |
| "learning_rate": 3.2884528672294523e-09, |
| "loss": 0.395, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.9788972089857046, |
| "grad_norm": 0.08297445165644558, |
| "learning_rate": 0.0, |
| "loss": 0.4097, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.9788972089857046, |
| "step": 273, |
| "total_flos": 7.02510214541004e+18, |
| "train_loss": 0.464617241979082, |
| "train_runtime": 57723.5744, |
| "train_samples_per_second": 2.443, |
| "train_steps_per_second": 0.005 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 273, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.02510214541004e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |