| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.6199095022624435, |
| "eval_steps": 1000, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01809954751131222, |
| "grad_norm": 4.442032814025879, |
| "learning_rate": 0.0, |
| "loss": 1.497, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.03619909502262444, |
| "grad_norm": 4.569097995758057, |
| "learning_rate": 1.5051499783199055e-07, |
| "loss": 1.4506, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.05429864253393665, |
| "grad_norm": 3.9136922359466553, |
| "learning_rate": 2.385606273598312e-07, |
| "loss": 1.2728, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.07239819004524888, |
| "grad_norm": 3.9136922359466553, |
| "learning_rate": 2.385606273598312e-07, |
| "loss": 1.587, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.09049773755656108, |
| "grad_norm": 3.9136922359466553, |
| "learning_rate": 2.385606273598312e-07, |
| "loss": 1.408, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.1085972850678733, |
| "grad_norm": 4.174969673156738, |
| "learning_rate": 3.010299956639811e-07, |
| "loss": 1.3948, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.12669683257918551, |
| "grad_norm": 4.174969673156738, |
| "learning_rate": 3.010299956639811e-07, |
| "loss": 1.515, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.14479638009049775, |
| "grad_norm": 4.5925774574279785, |
| "learning_rate": 3.494850021680093e-07, |
| "loss": 1.5149, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.16289592760180996, |
| "grad_norm": 4.349692344665527, |
| "learning_rate": 3.8907562519182173e-07, |
| "loss": 1.5055, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.18099547511312217, |
| "grad_norm": 5.4546027183532715, |
| "learning_rate": 4.2254902000712834e-07, |
| "loss": 1.4242, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.19909502262443438, |
| "grad_norm": 4.096962928771973, |
| "learning_rate": 4.5154499349597166e-07, |
| "loss": 1.4094, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.2171945701357466, |
| "grad_norm": 5.804754734039307, |
| "learning_rate": 4.771212547196623e-07, |
| "loss": 1.6699, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.23529411764705882, |
| "grad_norm": 4.950932502746582, |
| "learning_rate": 4.999999999999999e-07, |
| "loss": 1.4682, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.25339366515837103, |
| "grad_norm": 4.728433609008789, |
| "learning_rate": 5.206963425791124e-07, |
| "loss": 1.3831, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.27149321266968324, |
| "grad_norm": 4.042943000793457, |
| "learning_rate": 5.395906230238123e-07, |
| "loss": 1.4438, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.2895927601809955, |
| "grad_norm": 3.768977642059326, |
| "learning_rate": 5.569716761534182e-07, |
| "loss": 1.3996, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 4.375296115875244, |
| "learning_rate": 5.730640178391189e-07, |
| "loss": 1.3065, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.3257918552036199, |
| "grad_norm": 3.959031105041504, |
| "learning_rate": 5.880456295278405e-07, |
| "loss": 1.4533, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.3438914027149321, |
| "grad_norm": 4.150556564331055, |
| "learning_rate": 6.020599913279622e-07, |
| "loss": 1.4256, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.36199095022624433, |
| "grad_norm": 5.503087520599365, |
| "learning_rate": 6.15224460689137e-07, |
| "loss": 1.3212, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.38009049773755654, |
| "grad_norm": 5.915998935699463, |
| "learning_rate": 6.276362525516529e-07, |
| "loss": 1.4347, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.39819004524886875, |
| "grad_norm": 3.7581424713134766, |
| "learning_rate": 6.393768004764143e-07, |
| "loss": 1.3012, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.416289592760181, |
| "grad_norm": 3.728571653366089, |
| "learning_rate": 6.505149978319905e-07, |
| "loss": 1.3426, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.4343891402714932, |
| "grad_norm": 4.000448226928711, |
| "learning_rate": 6.611096473669595e-07, |
| "loss": 1.3064, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.45248868778280543, |
| "grad_norm": 3.8997511863708496, |
| "learning_rate": 6.712113404111031e-07, |
| "loss": 1.4217, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.47058823529411764, |
| "grad_norm": 3.4601259231567383, |
| "learning_rate": 6.808639180087963e-07, |
| "loss": 1.278, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.48868778280542985, |
| "grad_norm": 3.3370354175567627, |
| "learning_rate": 6.901056208558029e-07, |
| "loss": 1.402, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.5067873303167421, |
| "grad_norm": 4.348806381225586, |
| "learning_rate": 6.989700043360186e-07, |
| "loss": 1.3881, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.5248868778280543, |
| "grad_norm": 3.3785560131073, |
| "learning_rate": 7.074866739854088e-07, |
| "loss": 1.2949, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5429864253393665, |
| "grad_norm": 3.269310474395752, |
| "learning_rate": 7.156818820794935e-07, |
| "loss": 1.3877, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.5610859728506787, |
| "grad_norm": 3.2553048133850098, |
| "learning_rate": 7.235790156711094e-07, |
| "loss": 1.3625, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.579185520361991, |
| "grad_norm": 4.357567310333252, |
| "learning_rate": 7.311989989494779e-07, |
| "loss": 1.2856, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.5972850678733032, |
| "grad_norm": 3.816030263900757, |
| "learning_rate": 7.38560627359831e-07, |
| "loss": 1.2666, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 3.816030263900757, |
| "learning_rate": 7.38560627359831e-07, |
| "loss": 1.288, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.6334841628959276, |
| "grad_norm": 3.095264434814453, |
| "learning_rate": 7.456808469171361e-07, |
| "loss": 1.3014, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.6515837104072398, |
| "grad_norm": 5.308295726776123, |
| "learning_rate": 7.525749891599529e-07, |
| "loss": 1.3896, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.669683257918552, |
| "grad_norm": 3.2766010761260986, |
| "learning_rate": 7.592569699389436e-07, |
| "loss": 1.3191, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.6877828054298643, |
| "grad_norm": 3.6091532707214355, |
| "learning_rate": 7.657394585211274e-07, |
| "loss": 1.3227, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.7058823529411765, |
| "grad_norm": 4.882140159606934, |
| "learning_rate": 7.720340221751376e-07, |
| "loss": 1.2763, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.7239819004524887, |
| "grad_norm": 3.5777502059936523, |
| "learning_rate": 7.781512503836435e-07, |
| "loss": 1.2811, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7420814479638009, |
| "grad_norm": 3.520531415939331, |
| "learning_rate": 7.841008620334974e-07, |
| "loss": 1.3677, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.7601809954751131, |
| "grad_norm": 4.111598014831543, |
| "learning_rate": 7.89891798308405e-07, |
| "loss": 1.3009, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.7782805429864253, |
| "grad_norm": 3.3306169509887695, |
| "learning_rate": 7.955323035132494e-07, |
| "loss": 1.1605, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.7963800904977375, |
| "grad_norm": 3.112687349319458, |
| "learning_rate": 8.01029995663981e-07, |
| "loss": 1.2589, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.8144796380090498, |
| "grad_norm": 3.7117624282836914, |
| "learning_rate": 8.063919283598676e-07, |
| "loss": 1.2737, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.832579185520362, |
| "grad_norm": 4.166692733764648, |
| "learning_rate": 8.116246451989502e-07, |
| "loss": 1.2353, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.8506787330316742, |
| "grad_norm": 6.097745895385742, |
| "learning_rate": 8.16734227789793e-07, |
| "loss": 1.221, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.8687782805429864, |
| "grad_norm": 3.3959226608276367, |
| "learning_rate": 8.217263382430935e-07, |
| "loss": 1.2574, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.8868778280542986, |
| "grad_norm": 4.395689964294434, |
| "learning_rate": 8.266062568876716e-07, |
| "loss": 1.1438, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.9049773755656109, |
| "grad_norm": 3.479315757751465, |
| "learning_rate": 8.313789158407869e-07, |
| "loss": 1.2905, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 3.4640543460845947, |
| "learning_rate": 8.360489289678585e-07, |
| "loss": 1.2397, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.9411764705882353, |
| "grad_norm": 5.382558822631836, |
| "learning_rate": 8.406206186877934e-07, |
| "loss": 1.2131, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.9592760180995475, |
| "grad_norm": 5.364609718322754, |
| "learning_rate": 8.450980400142567e-07, |
| "loss": 1.2098, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.9773755656108597, |
| "grad_norm": 3.814605951309204, |
| "learning_rate": 8.494850021680092e-07, |
| "loss": 1.2415, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.995475113122172, |
| "grad_norm": 2.9692606925964355, |
| "learning_rate": 8.53785088048968e-07, |
| "loss": 1.1955, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0135746606334841, |
| "grad_norm": 3.130876302719116, |
| "learning_rate": 8.580016718173995e-07, |
| "loss": 1.158, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.0316742081447963, |
| "grad_norm": 3.4549965858459473, |
| "learning_rate": 8.621379348003944e-07, |
| "loss": 1.2838, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.0497737556561086, |
| "grad_norm": 3.4735677242279053, |
| "learning_rate": 8.661968799114842e-07, |
| "loss": 1.0405, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.0678733031674208, |
| "grad_norm": 4.529227256774902, |
| "learning_rate": 8.701813447471218e-07, |
| "loss": 1.1188, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.085972850678733, |
| "grad_norm": 3.53712797164917, |
| "learning_rate": 8.740940135031001e-07, |
| "loss": 1.2113, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.1040723981900453, |
| "grad_norm": 3.5053398609161377, |
| "learning_rate": 8.779374278362456e-07, |
| "loss": 1.197, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.1221719457013575, |
| "grad_norm": 2.57523512840271, |
| "learning_rate": 8.817139967814684e-07, |
| "loss": 1.0894, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.1402714932126696, |
| "grad_norm": 3.3335084915161133, |
| "learning_rate": 8.854260058210719e-07, |
| "loss": 1.026, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.1583710407239818, |
| "grad_norm": 3.1159112453460693, |
| "learning_rate": 8.890756251918216e-07, |
| "loss": 1.1336, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.1764705882352942, |
| "grad_norm": 4.9009294509887695, |
| "learning_rate": 8.926649175053833e-07, |
| "loss": 1.1167, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.1945701357466063, |
| "grad_norm": 3.9954421520233154, |
| "learning_rate": 8.961958447491268e-07, |
| "loss": 1.0955, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.2126696832579185, |
| "grad_norm": 4.976899147033691, |
| "learning_rate": 8.996702747267907e-07, |
| "loss": 1.1722, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.2307692307692308, |
| "grad_norm": 4.318509101867676, |
| "learning_rate": 9.030899869919433e-07, |
| "loss": 1.0919, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.248868778280543, |
| "grad_norm": 3.509249210357666, |
| "learning_rate": 9.064566783214276e-07, |
| "loss": 1.1013, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.2669683257918551, |
| "grad_norm": 2.86033034324646, |
| "learning_rate": 9.097719677709341e-07, |
| "loss": 1.0721, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.2850678733031673, |
| "grad_norm": 2.95745587348938, |
| "learning_rate": 9.13037401350413e-07, |
| "loss": 1.1235, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.3031674208144797, |
| "grad_norm": 3.4295952320098877, |
| "learning_rate": 9.162544563531181e-07, |
| "loss": 1.1075, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.3212669683257918, |
| "grad_norm": 4.3247575759887695, |
| "learning_rate": 9.194245453686276e-07, |
| "loss": 1.0976, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.3393665158371042, |
| "grad_norm": 3.853940010070801, |
| "learning_rate": 9.225490200071283e-07, |
| "loss": 0.997, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.3574660633484164, |
| "grad_norm": 3.1195785999298096, |
| "learning_rate": 9.256291743595375e-07, |
| "loss": 1.1499, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.3755656108597285, |
| "grad_norm": 3.3039655685424805, |
| "learning_rate": 9.28666248215634e-07, |
| "loss": 1.0822, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.3936651583710407, |
| "grad_norm": 2.9753613471984863, |
| "learning_rate": 9.316614300602277e-07, |
| "loss": 1.0524, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.4117647058823528, |
| "grad_norm": 2.7271316051483154, |
| "learning_rate": 9.346158598654879e-07, |
| "loss": 1.0233, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.4298642533936652, |
| "grad_norm": 5.75250244140625, |
| "learning_rate": 9.375306316958498e-07, |
| "loss": 1.0652, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.4479638009049773, |
| "grad_norm": 3.271881341934204, |
| "learning_rate": 9.404067961403955e-07, |
| "loss": 0.9659, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.4660633484162897, |
| "grad_norm": 3.8478872776031494, |
| "learning_rate": 9.432453625862408e-07, |
| "loss": 1.1446, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.4841628959276019, |
| "grad_norm": 2.925736904144287, |
| "learning_rate": 9.4604730134524e-07, |
| "loss": 0.9846, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.502262443438914, |
| "grad_norm": 2.8262815475463867, |
| "learning_rate": 9.488135456452205e-07, |
| "loss": 0.9395, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.5203619909502262, |
| "grad_norm": 4.063136100769043, |
| "learning_rate": 9.515449934959715e-07, |
| "loss": 1.0066, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 2.5695972442626953, |
| "learning_rate": 9.542425094393247e-07, |
| "loss": 0.9592, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.5565610859728507, |
| "grad_norm": 3.1798267364501953, |
| "learning_rate": 9.569069261918583e-07, |
| "loss": 0.9213, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.5746606334841629, |
| "grad_norm": 6.351828098297119, |
| "learning_rate": 9.59539046188037e-07, |
| "loss": 0.9416, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.5927601809954752, |
| "grad_norm": 2.9044878482818604, |
| "learning_rate": 9.621396430309406e-07, |
| "loss": 0.9626, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.6108597285067874, |
| "grad_norm": 3.130195140838623, |
| "learning_rate": 9.647094628571462e-07, |
| "loss": 0.9767, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.6289592760180995, |
| "grad_norm": 6.14766263961792, |
| "learning_rate": 9.672492256217836e-07, |
| "loss": 1.029, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.6470588235294117, |
| "grad_norm": 3.2492759227752686, |
| "learning_rate": 9.69759626309309e-07, |
| "loss": 0.9621, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.6651583710407238, |
| "grad_norm": 2.8760759830474854, |
| "learning_rate": 9.722413360750842e-07, |
| "loss": 1.0525, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.6832579185520362, |
| "grad_norm": 2.910680055618286, |
| "learning_rate": 9.74695003322456e-07, |
| "loss": 0.8777, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.7013574660633484, |
| "grad_norm": 3.2539334297180176, |
| "learning_rate": 9.771212547196622e-07, |
| "loss": 0.9407, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.7194570135746607, |
| "grad_norm": 2.7406673431396484, |
| "learning_rate": 9.795206961605466e-07, |
| "loss": 0.9077, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.737556561085973, |
| "grad_norm": 3.0724148750305176, |
| "learning_rate": 9.818939136727774e-07, |
| "loss": 0.8289, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.755656108597285, |
| "grad_norm": 3.2282042503356934, |
| "learning_rate": 9.842414742769674e-07, |
| "loss": 0.9638, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.7737556561085972, |
| "grad_norm": 6.185736656188965, |
| "learning_rate": 9.865639267998492e-07, |
| "loss": 0.9356, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.7918552036199094, |
| "grad_norm": 3.5250043869018555, |
| "learning_rate": 9.888618026444236e-07, |
| "loss": 0.8963, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.8099547511312217, |
| "grad_norm": 3.418933629989624, |
| "learning_rate": 9.91135616519784e-07, |
| "loss": 0.9555, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.8280542986425339, |
| "grad_norm": 2.943357467651367, |
| "learning_rate": 9.933858671331222e-07, |
| "loss": 0.8866, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.8461538461538463, |
| "grad_norm": 3.657294511795044, |
| "learning_rate": 9.956130378462473e-07, |
| "loss": 0.9626, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.8642533936651584, |
| "grad_norm": 3.3619449138641357, |
| "learning_rate": 9.978175972987748e-07, |
| "loss": 0.8601, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.8823529411764706, |
| "grad_norm": 3.6125662326812744, |
| "learning_rate": 9.999999999999997e-07, |
| "loss": 0.8527, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.9004524886877827, |
| "grad_norm": 3.5924038887023926, |
| "learning_rate": 1e-06, |
| "loss": 0.879, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.9185520361990949, |
| "grad_norm": 2.770998001098633, |
| "learning_rate": 1e-06, |
| "loss": 0.8364, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.9366515837104072, |
| "grad_norm": 3.2162160873413086, |
| "learning_rate": 1e-06, |
| "loss": 0.8591, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.9547511312217196, |
| "grad_norm": 5.3574910163879395, |
| "learning_rate": 1e-06, |
| "loss": 0.9122, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.9728506787330318, |
| "grad_norm": 3.564077138900757, |
| "learning_rate": 1e-06, |
| "loss": 0.9706, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.990950226244344, |
| "grad_norm": 3.347325086593628, |
| "learning_rate": 1e-06, |
| "loss": 0.7614, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.009049773755656, |
| "grad_norm": 2.967604637145996, |
| "learning_rate": 1e-06, |
| "loss": 0.8534, |
| "step": 111 |
| }, |
| { |
| "epoch": 2.0271493212669682, |
| "grad_norm": 2.7884786128997803, |
| "learning_rate": 1e-06, |
| "loss": 0.8786, |
| "step": 112 |
| }, |
| { |
| "epoch": 2.0452488687782804, |
| "grad_norm": 3.3380022048950195, |
| "learning_rate": 1e-06, |
| "loss": 0.7821, |
| "step": 113 |
| }, |
| { |
| "epoch": 2.0633484162895925, |
| "grad_norm": 2.7749805450439453, |
| "learning_rate": 1e-06, |
| "loss": 0.749, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.081447963800905, |
| "grad_norm": 2.6747641563415527, |
| "learning_rate": 1e-06, |
| "loss": 0.7501, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.0995475113122173, |
| "grad_norm": 3.078357458114624, |
| "learning_rate": 1e-06, |
| "loss": 0.7979, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.1176470588235294, |
| "grad_norm": 5.836832523345947, |
| "learning_rate": 1e-06, |
| "loss": 0.7917, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.1357466063348416, |
| "grad_norm": 6.190191745758057, |
| "learning_rate": 1e-06, |
| "loss": 0.8743, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.1538461538461537, |
| "grad_norm": 3.3209612369537354, |
| "learning_rate": 1e-06, |
| "loss": 0.7989, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.171945701357466, |
| "grad_norm": 3.5375821590423584, |
| "learning_rate": 1e-06, |
| "loss": 0.7356, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.1900452488687785, |
| "grad_norm": 2.635437488555908, |
| "learning_rate": 1e-06, |
| "loss": 0.7602, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.2081447963800906, |
| "grad_norm": 4.8566460609436035, |
| "learning_rate": 1e-06, |
| "loss": 0.7611, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.226244343891403, |
| "grad_norm": 3.1864778995513916, |
| "learning_rate": 1e-06, |
| "loss": 0.767, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.244343891402715, |
| "grad_norm": 4.00085973739624, |
| "learning_rate": 1e-06, |
| "loss": 0.7164, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.262443438914027, |
| "grad_norm": 3.608243465423584, |
| "learning_rate": 1e-06, |
| "loss": 0.8259, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.2805429864253393, |
| "grad_norm": 2.6522486209869385, |
| "learning_rate": 1e-06, |
| "loss": 0.6852, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.2986425339366514, |
| "grad_norm": 3.137711524963379, |
| "learning_rate": 1e-06, |
| "loss": 0.7703, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.3167420814479636, |
| "grad_norm": 3.867400884628296, |
| "learning_rate": 1e-06, |
| "loss": 0.7472, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.334841628959276, |
| "grad_norm": 3.147169351577759, |
| "learning_rate": 1e-06, |
| "loss": 0.6673, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.3529411764705883, |
| "grad_norm": 5.177737236022949, |
| "learning_rate": 1e-06, |
| "loss": 0.7478, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.3710407239819005, |
| "grad_norm": 4.302467346191406, |
| "learning_rate": 1e-06, |
| "loss": 0.7025, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.3891402714932126, |
| "grad_norm": 3.5397591590881348, |
| "learning_rate": 1e-06, |
| "loss": 0.6615, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.4072398190045248, |
| "grad_norm": 5.074939250946045, |
| "learning_rate": 1e-06, |
| "loss": 0.7043, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.425339366515837, |
| "grad_norm": 7.0850090980529785, |
| "learning_rate": 1e-06, |
| "loss": 0.6587, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.4434389140271495, |
| "grad_norm": 3.6032073497772217, |
| "learning_rate": 1e-06, |
| "loss": 0.6726, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.4615384615384617, |
| "grad_norm": 2.707559108734131, |
| "learning_rate": 1e-06, |
| "loss": 0.6529, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.479638009049774, |
| "grad_norm": 3.5912561416625977, |
| "learning_rate": 1e-06, |
| "loss": 0.6655, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.497737556561086, |
| "grad_norm": 3.537712335586548, |
| "learning_rate": 1e-06, |
| "loss": 0.6617, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.515837104072398, |
| "grad_norm": 8.936192512512207, |
| "learning_rate": 1e-06, |
| "loss": 0.6757, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.5339366515837103, |
| "grad_norm": 2.4215872287750244, |
| "learning_rate": 1e-06, |
| "loss": 0.6143, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.5520361990950224, |
| "grad_norm": 3.1526424884796143, |
| "learning_rate": 1e-06, |
| "loss": 0.6308, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.5701357466063346, |
| "grad_norm": 4.358147144317627, |
| "learning_rate": 1e-06, |
| "loss": 0.7556, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.588235294117647, |
| "grad_norm": 3.423694610595703, |
| "learning_rate": 1e-06, |
| "loss": 0.5853, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.6063348416289593, |
| "grad_norm": 3.792518138885498, |
| "learning_rate": 1e-06, |
| "loss": 0.6154, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.6244343891402715, |
| "grad_norm": 7.347434997558594, |
| "learning_rate": 1e-06, |
| "loss": 0.5983, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.6425339366515836, |
| "grad_norm": 3.5371973514556885, |
| "learning_rate": 1e-06, |
| "loss": 0.6173, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.660633484162896, |
| "grad_norm": 3.084972620010376, |
| "learning_rate": 1e-06, |
| "loss": 0.5556, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.6787330316742084, |
| "grad_norm": 5.569381237030029, |
| "learning_rate": 1e-06, |
| "loss": 0.6146, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.6968325791855206, |
| "grad_norm": 3.270057201385498, |
| "learning_rate": 1e-06, |
| "loss": 0.5938, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.7149321266968327, |
| "grad_norm": 3.9021122455596924, |
| "learning_rate": 1e-06, |
| "loss": 0.532, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.733031674208145, |
| "grad_norm": 3.702974319458008, |
| "learning_rate": 1e-06, |
| "loss": 0.5419, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.751131221719457, |
| "grad_norm": 3.640477418899536, |
| "learning_rate": 1e-06, |
| "loss": 0.5927, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.769230769230769, |
| "grad_norm": 4.119628429412842, |
| "learning_rate": 1e-06, |
| "loss": 0.6556, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.7873303167420813, |
| "grad_norm": 4.196484565734863, |
| "learning_rate": 1e-06, |
| "loss": 0.5473, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.8054298642533935, |
| "grad_norm": 3.049004316329956, |
| "learning_rate": 1e-06, |
| "loss": 0.5382, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.8235294117647056, |
| "grad_norm": 3.8105475902557373, |
| "learning_rate": 1e-06, |
| "loss": 0.5667, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.841628959276018, |
| "grad_norm": 7.120466232299805, |
| "learning_rate": 1e-06, |
| "loss": 0.5815, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.8597285067873304, |
| "grad_norm": 2.96946120262146, |
| "learning_rate": 1e-06, |
| "loss": 0.5488, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.8778280542986425, |
| "grad_norm": 3.927828073501587, |
| "learning_rate": 1e-06, |
| "loss": 0.5051, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.8959276018099547, |
| "grad_norm": 3.6861846446990967, |
| "learning_rate": 1e-06, |
| "loss": 0.5683, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.914027149321267, |
| "grad_norm": 3.383025646209717, |
| "learning_rate": 1e-06, |
| "loss": 0.4956, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.9321266968325794, |
| "grad_norm": 3.9769487380981445, |
| "learning_rate": 1e-06, |
| "loss": 0.5101, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.9502262443438916, |
| "grad_norm": 3.262488842010498, |
| "learning_rate": 1e-06, |
| "loss": 0.5589, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.9683257918552037, |
| "grad_norm": 3.582789182662964, |
| "learning_rate": 1e-06, |
| "loss": 0.5064, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.986425339366516, |
| "grad_norm": 3.441208839416504, |
| "learning_rate": 1e-06, |
| "loss": 0.4696, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.004524886877828, |
| "grad_norm": 4.338072299957275, |
| "learning_rate": 1e-06, |
| "loss": 0.5083, |
| "step": 166 |
| }, |
| { |
| "epoch": 3.02262443438914, |
| "grad_norm": 3.537062644958496, |
| "learning_rate": 1e-06, |
| "loss": 0.5328, |
| "step": 167 |
| }, |
| { |
| "epoch": 3.0407239819004523, |
| "grad_norm": 3.4206771850585938, |
| "learning_rate": 1e-06, |
| "loss": 0.4615, |
| "step": 168 |
| }, |
| { |
| "epoch": 3.0588235294117645, |
| "grad_norm": 3.4206771850585938, |
| "learning_rate": 1e-06, |
| "loss": 0.4958, |
| "step": 169 |
| }, |
| { |
| "epoch": 3.076923076923077, |
| "grad_norm": 3.015188694000244, |
| "learning_rate": 1e-06, |
| "loss": 0.5143, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.0950226244343892, |
| "grad_norm": 3.314436435699463, |
| "learning_rate": 1e-06, |
| "loss": 0.4972, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.1131221719457014, |
| "grad_norm": 4.298553466796875, |
| "learning_rate": 1e-06, |
| "loss": 0.4794, |
| "step": 172 |
| }, |
| { |
| "epoch": 3.1312217194570136, |
| "grad_norm": 4.354773044586182, |
| "learning_rate": 1e-06, |
| "loss": 0.4287, |
| "step": 173 |
| }, |
| { |
| "epoch": 3.1493212669683257, |
| "grad_norm": 3.558988332748413, |
| "learning_rate": 1e-06, |
| "loss": 0.4092, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.167420814479638, |
| "grad_norm": 5.127811431884766, |
| "learning_rate": 1e-06, |
| "loss": 0.4465, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.1855203619909505, |
| "grad_norm": 2.5903167724609375, |
| "learning_rate": 1e-06, |
| "loss": 0.4119, |
| "step": 176 |
| }, |
| { |
| "epoch": 3.2036199095022626, |
| "grad_norm": 3.8117196559906006, |
| "learning_rate": 1e-06, |
| "loss": 0.4529, |
| "step": 177 |
| }, |
| { |
| "epoch": 3.2217194570135748, |
| "grad_norm": 3.164416551589966, |
| "learning_rate": 1e-06, |
| "loss": 0.4856, |
| "step": 178 |
| }, |
| { |
| "epoch": 3.239819004524887, |
| "grad_norm": 4.106442451477051, |
| "learning_rate": 1e-06, |
| "loss": 0.4312, |
| "step": 179 |
| }, |
| { |
| "epoch": 3.257918552036199, |
| "grad_norm": 3.762195110321045, |
| "learning_rate": 1e-06, |
| "loss": 0.411, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.276018099547511, |
| "grad_norm": 3.0889034271240234, |
| "learning_rate": 1e-06, |
| "loss": 0.3977, |
| "step": 181 |
| }, |
| { |
| "epoch": 3.2941176470588234, |
| "grad_norm": 3.6932199001312256, |
| "learning_rate": 1e-06, |
| "loss": 0.4528, |
| "step": 182 |
| }, |
| { |
| "epoch": 3.3122171945701355, |
| "grad_norm": 3.405756711959839, |
| "learning_rate": 1e-06, |
| "loss": 0.4042, |
| "step": 183 |
| }, |
| { |
| "epoch": 3.330316742081448, |
| "grad_norm": 5.229561805725098, |
| "learning_rate": 1e-06, |
| "loss": 0.4171, |
| "step": 184 |
| }, |
| { |
| "epoch": 3.3484162895927603, |
| "grad_norm": 5.6756439208984375, |
| "learning_rate": 1e-06, |
| "loss": 0.3891, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.3665158371040724, |
| "grad_norm": 3.706697940826416, |
| "learning_rate": 1e-06, |
| "loss": 0.3673, |
| "step": 186 |
| }, |
| { |
| "epoch": 3.3846153846153846, |
| "grad_norm": 2.9856503009796143, |
| "learning_rate": 1e-06, |
| "loss": 0.3989, |
| "step": 187 |
| }, |
| { |
| "epoch": 3.4027149321266967, |
| "grad_norm": 3.5144922733306885, |
| "learning_rate": 1e-06, |
| "loss": 0.3885, |
| "step": 188 |
| }, |
| { |
| "epoch": 3.420814479638009, |
| "grad_norm": 5.046453475952148, |
| "learning_rate": 1e-06, |
| "loss": 0.402, |
| "step": 189 |
| }, |
| { |
| "epoch": 3.4389140271493215, |
| "grad_norm": 4.306224822998047, |
| "learning_rate": 1e-06, |
| "loss": 0.3568, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.4570135746606336, |
| "grad_norm": 2.7284157276153564, |
| "learning_rate": 1e-06, |
| "loss": 0.4109, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.475113122171946, |
| "grad_norm": 5.036966323852539, |
| "learning_rate": 1e-06, |
| "loss": 0.3958, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.493212669683258, |
| "grad_norm": 2.3863677978515625, |
| "learning_rate": 1e-06, |
| "loss": 0.3674, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.51131221719457, |
| "grad_norm": 3.3838043212890625, |
| "learning_rate": 1e-06, |
| "loss": 0.3627, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.5294117647058822, |
| "grad_norm": 6.964299201965332, |
| "learning_rate": 1e-06, |
| "loss": 0.405, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.5475113122171944, |
| "grad_norm": 2.7131292819976807, |
| "learning_rate": 1e-06, |
| "loss": 0.3599, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.5656108597285066, |
| "grad_norm": 2.6383941173553467, |
| "learning_rate": 1e-06, |
| "loss": 0.3335, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.583710407239819, |
| "grad_norm": 2.710670232772827, |
| "learning_rate": 1e-06, |
| "loss": 0.3979, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.6018099547511313, |
| "grad_norm": 2.6803207397460938, |
| "learning_rate": 1e-06, |
| "loss": 0.3695, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.6199095022624435, |
| "grad_norm": 3.22158145904541, |
| "learning_rate": 1e-06, |
| "loss": 0.3832, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 182, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1675256582811156e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|