| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9953521630318198, |
| "eval_steps": 500, |
| "global_step": 174, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01144082946013586, |
| "grad_norm": 4.3125, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 1.0962, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.02288165892027172, |
| "grad_norm": 40.25, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 1.0789, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.03432248838040758, |
| "grad_norm": 14.8125, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 1.0865, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.04576331784054344, |
| "grad_norm": 4.1875, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 1.0257, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0572041473006793, |
| "grad_norm": 5.09375, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.0766, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.06864497676081516, |
| "grad_norm": 27.375, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 1.0548, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.08008580622095102, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 1.0669, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.09152663568108688, |
| "grad_norm": 6.5, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 1.0566, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.10296746514122274, |
| "grad_norm": 17.875, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 1.2024, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.1144082946013586, |
| "grad_norm": 3.5, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.0421, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.12584912406149446, |
| "grad_norm": 3.5, |
| "learning_rate": 2.2e-06, |
| "loss": 1.1209, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.13728995352163031, |
| "grad_norm": 3.734375, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 1.1189, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.14873078298176617, |
| "grad_norm": 2.609375, |
| "learning_rate": 2.6e-06, |
| "loss": 1.1231, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.16017161244190203, |
| "grad_norm": 6.125, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 1.1184, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1716124419020379, |
| "grad_norm": 5.0, |
| "learning_rate": 3e-06, |
| "loss": 1.0939, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.18305327136217375, |
| "grad_norm": 55.5, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 1.121, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.1944941008223096, |
| "grad_norm": 2.96875, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 1.0549, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.20593493028244547, |
| "grad_norm": 2.65625, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 1.0634, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.21737575974258133, |
| "grad_norm": 2.640625, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 1.1134, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.2288165892027172, |
| "grad_norm": 3.328125, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.1133, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.24025741866285305, |
| "grad_norm": 2.96875, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 1.1252, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.2516982481229889, |
| "grad_norm": 4.21875, |
| "learning_rate": 4.4e-06, |
| "loss": 1.187, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.26313907758312477, |
| "grad_norm": 2.46875, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 1.103, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.27457990704326063, |
| "grad_norm": 3.625, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 1.1059, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.2860207365033965, |
| "grad_norm": 1.7265625, |
| "learning_rate": 5e-06, |
| "loss": 1.0725, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.29746156596353235, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.2e-06, |
| "loss": 1.0673, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.3089023954236682, |
| "grad_norm": 1.828125, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": 1.0517, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.32034322488380407, |
| "grad_norm": 1.9375, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 1.1265, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3317840543439399, |
| "grad_norm": 1.6875, |
| "learning_rate": 5.8e-06, |
| "loss": 1.0353, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.3432248838040758, |
| "grad_norm": 1.65625, |
| "learning_rate": 6e-06, |
| "loss": 1.1089, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.35466571326421165, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 1.0071, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.3661065427243475, |
| "grad_norm": 2.34375, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 1.129, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.37754737218448337, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 1.0258, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.3889882016446192, |
| "grad_norm": 2.890625, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 1.0924, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.4004290311047551, |
| "grad_norm": 4.4375, |
| "learning_rate": 7e-06, |
| "loss": 1.1275, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.41186986056489094, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 1.0683, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.4233106900250268, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.4e-06, |
| "loss": 1.1141, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.43475151948516266, |
| "grad_norm": 5.34375, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 1.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.4461923489452985, |
| "grad_norm": 4.09375, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 1.1454, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.4576331784054344, |
| "grad_norm": 3.3125, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.0984, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.46907400786557024, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.2e-06, |
| "loss": 1.0194, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.4805148373257061, |
| "grad_norm": 7.03125, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 1.122, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.49195566678584196, |
| "grad_norm": 9.0625, |
| "learning_rate": 8.6e-06, |
| "loss": 1.112, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.5033964962459778, |
| "grad_norm": 7.75, |
| "learning_rate": 8.8e-06, |
| "loss": 1.1003, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.5148373257061137, |
| "grad_norm": 27.375, |
| "learning_rate": 9e-06, |
| "loss": 1.1063, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.5262781551662495, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 1.0891, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.5377189846263855, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.4e-06, |
| "loss": 1.0974, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.5491598140865213, |
| "grad_norm": 1.1875, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 1.0568, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.5606006435466572, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 1.0272, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.572041473006793, |
| "grad_norm": 4.6875, |
| "learning_rate": 1e-05, |
| "loss": 1.0282, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.5834823024669289, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.998395376482152e-06, |
| "loss": 1.0128, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.5949231319270647, |
| "grad_norm": 2.5625, |
| "learning_rate": 9.993582535855265e-06, |
| "loss": 1.0956, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.6063639613872006, |
| "grad_norm": 3.59375, |
| "learning_rate": 9.985564567238237e-06, |
| "loss": 1.0804, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.6178047908473364, |
| "grad_norm": 10.4375, |
| "learning_rate": 9.974346616959476e-06, |
| "loss": 1.1, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.6292456203074723, |
| "grad_norm": 4.75, |
| "learning_rate": 9.959935885253715e-06, |
| "loss": 1.0653, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.6406864497676081, |
| "grad_norm": 1.875, |
| "learning_rate": 9.942341621640558e-06, |
| "loss": 1.0784, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.652127279227744, |
| "grad_norm": 3.265625, |
| "learning_rate": 9.921575118987672e-06, |
| "loss": 1.0597, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.6635681086878799, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.897649706262474e-06, |
| "loss": 1.0695, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.6750089381480158, |
| "grad_norm": 6.65625, |
| "learning_rate": 9.870580739976936e-06, |
| "loss": 1.0896, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.6864497676081516, |
| "grad_norm": 2.453125, |
| "learning_rate": 9.840385594331022e-06, |
| "loss": 1.0653, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.6978905970682875, |
| "grad_norm": 2.859375, |
| "learning_rate": 9.807083650061063e-06, |
| "loss": 1.0338, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.7093314265284233, |
| "grad_norm": 2.234375, |
| "learning_rate": 9.770696282000245e-06, |
| "loss": 1.0404, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.7207722559885592, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.731246845359187e-06, |
| "loss": 1.0658, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.732213085448695, |
| "grad_norm": 3.15625, |
| "learning_rate": 9.688760660735403e-06, |
| "loss": 1.0667, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.7436539149088309, |
| "grad_norm": 2.515625, |
| "learning_rate": 9.643264997861312e-06, |
| "loss": 0.9907, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.7550947443689667, |
| "grad_norm": 2.3125, |
| "learning_rate": 9.594789058101154e-06, |
| "loss": 1.0233, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.7665355738291026, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.543363955708124e-06, |
| "loss": 1.0892, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.7779764032892384, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.48902269785371e-06, |
| "loss": 1.0364, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.7894172327493744, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.431800163442043e-06, |
| "loss": 0.9652, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.8008580622095102, |
| "grad_norm": 36.0, |
| "learning_rate": 9.371733080722911e-06, |
| "loss": 1.0376, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8122988916696461, |
| "grad_norm": 3.890625, |
| "learning_rate": 9.308860003717748e-06, |
| "loss": 1.037, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.8237397211297819, |
| "grad_norm": 2.640625, |
| "learning_rate": 9.243221287473755e-06, |
| "loss": 1.0471, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.8351805505899178, |
| "grad_norm": 5.84375, |
| "learning_rate": 9.174859062162037e-06, |
| "loss": 1.0588, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.8466213800500536, |
| "grad_norm": 1.3359375, |
| "learning_rate": 9.103817206036383e-06, |
| "loss": 1.0636, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.8580622095101895, |
| "grad_norm": 39.25, |
| "learning_rate": 9.030141317270026e-06, |
| "loss": 1.0492, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.8695030389703253, |
| "grad_norm": 35.0, |
| "learning_rate": 8.953878684688492e-06, |
| "loss": 1.1284, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.8809438684304612, |
| "grad_norm": 101.0, |
| "learning_rate": 8.875078257417294e-06, |
| "loss": 1.0219, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.892384697890597, |
| "grad_norm": 4.4375, |
| "learning_rate": 8.793790613463956e-06, |
| "loss": 1.1058, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.903825527350733, |
| "grad_norm": 8.0625, |
| "learning_rate": 8.710067927254555e-06, |
| "loss": 1.0518, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.9152663568108688, |
| "grad_norm": 8.875, |
| "learning_rate": 8.6239639361456e-06, |
| "loss": 1.0308, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.9267071862710047, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.535533905932739e-06, |
| "loss": 1.0269, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.9381480157311405, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.444834595378434e-06, |
| "loss": 1.0251, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.9495888451912764, |
| "grad_norm": 2.765625, |
| "learning_rate": 8.351924219781393e-06, |
| "loss": 1.0167, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.9610296746514122, |
| "grad_norm": 1.3984375, |
| "learning_rate": 8.256862413611113e-06, |
| "loss": 1.0286, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.9724705041115481, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.15971019223152e-06, |
| "loss": 1.0564, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.9839113335716839, |
| "grad_norm": 5.3125, |
| "learning_rate": 8.060529912738316e-06, |
| "loss": 1.027, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.9953521630318198, |
| "grad_norm": 1.4296875, |
| "learning_rate": 7.959385233935087e-06, |
| "loss": 1.0048, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.0114408294601358, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.856341075473963e-06, |
| "loss": 1.4279, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.0228816589202716, |
| "grad_norm": 1.375, |
| "learning_rate": 7.751463576186957e-06, |
| "loss": 1.0041, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.0343224883804076, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.644820051634813e-06, |
| "loss": 1.0455, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.0457633178405434, |
| "grad_norm": 2.90625, |
| "learning_rate": 7.536478950900537e-06, |
| "loss": 1.0663, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.0572041473006792, |
| "grad_norm": 5.6875, |
| "learning_rate": 7.4265098126554065e-06, |
| "loss": 1.0023, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.0686449767608153, |
| "grad_norm": 1.2265625, |
| "learning_rate": 7.314983220525604e-06, |
| "loss": 1.0352, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.080085806220951, |
| "grad_norm": 3.28125, |
| "learning_rate": 7.201970757788172e-06, |
| "loss": 1.1086, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.0915266356810869, |
| "grad_norm": 2.8125, |
| "learning_rate": 7.087544961425317e-06, |
| "loss": 1.0692, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.1029674651412227, |
| "grad_norm": 4.40625, |
| "learning_rate": 6.971779275566593e-06, |
| "loss": 1.0666, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.1144082946013585, |
| "grad_norm": 4.21875, |
| "learning_rate": 6.85474800434884e-06, |
| "loss": 1.0401, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.1258491240614945, |
| "grad_norm": 1.671875, |
| "learning_rate": 6.736526264224101e-06, |
| "loss": 1.062, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.1372899535216303, |
| "grad_norm": 2.90625, |
| "learning_rate": 6.617189935746191e-06, |
| "loss": 1.1024, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.1487307829817661, |
| "grad_norm": 4.46875, |
| "learning_rate": 6.496815614866792e-06, |
| "loss": 1.1216, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.1601716124419021, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.375480563772391e-06, |
| "loss": 1.0347, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.171612441902038, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.2532626612936035e-06, |
| "loss": 1.1006, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.1830532713621738, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.130240352918675e-06, |
| "loss": 1.0257, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.1944941008223096, |
| "grad_norm": 3.53125, |
| "learning_rate": 6.006492600443301e-06, |
| "loss": 1.091, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.2059349302824454, |
| "grad_norm": 2.890625, |
| "learning_rate": 5.882098831289044e-06, |
| "loss": 1.0359, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.2173757597425814, |
| "grad_norm": 3.421875, |
| "learning_rate": 5.757138887522884e-06, |
| "loss": 1.1039, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.2288165892027172, |
| "grad_norm": 12.4375, |
| "learning_rate": 5.631692974610647e-06, |
| "loss": 1.1496, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.240257418662853, |
| "grad_norm": 16.25, |
| "learning_rate": 5.505841609937162e-06, |
| "loss": 1.2067, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.251698248122989, |
| "grad_norm": 8.1875, |
| "learning_rate": 5.379665571126232e-06, |
| "loss": 1.2643, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.2631390775831248, |
| "grad_norm": 6.03125, |
| "learning_rate": 5.253245844193564e-06, |
| "loss": 1.0771, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.2745799070432606, |
| "grad_norm": 3.03125, |
| "learning_rate": 5.12666357156594e-06, |
| "loss": 1.0801, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.2860207365033964, |
| "grad_norm": 4.0625, |
| "learning_rate": 5e-06, |
| "loss": 1.125, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.2974615659635322, |
| "grad_norm": 2.875, |
| "learning_rate": 4.873336428434062e-06, |
| "loss": 1.0785, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.3089023954236683, |
| "grad_norm": 4.65625, |
| "learning_rate": 4.746754155806437e-06, |
| "loss": 1.1186, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.320343224883804, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.62033442887377e-06, |
| "loss": 1.0442, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.3317840543439399, |
| "grad_norm": 13.125, |
| "learning_rate": 4.49415839006284e-06, |
| "loss": 0.992, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.343224883804076, |
| "grad_norm": 2.515625, |
| "learning_rate": 4.368307025389355e-06, |
| "loss": 1.0311, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.3546657132642117, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.2428611124771184e-06, |
| "loss": 1.1222, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.3661065427243475, |
| "grad_norm": 3.859375, |
| "learning_rate": 4.11790116871096e-06, |
| "loss": 1.0476, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.3775473721844833, |
| "grad_norm": 2.875, |
| "learning_rate": 3.993507399556699e-06, |
| "loss": 1.1549, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.3889882016446191, |
| "grad_norm": 3.203125, |
| "learning_rate": 3.869759647081326e-06, |
| "loss": 1.0162, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.4004290311047551, |
| "grad_norm": 2.515625, |
| "learning_rate": 3.7467373387063973e-06, |
| "loss": 1.0692, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.411869860564891, |
| "grad_norm": 9.5, |
| "learning_rate": 3.62451943622761e-06, |
| "loss": 1.1584, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.4233106900250267, |
| "grad_norm": 2.0625, |
| "learning_rate": 3.5031843851332105e-06, |
| "loss": 1.1119, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.4347515194851628, |
| "grad_norm": 3.453125, |
| "learning_rate": 3.3828100642538097e-06, |
| "loss": 1.0913, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.4461923489452986, |
| "grad_norm": 2.046875, |
| "learning_rate": 3.2634737357758994e-06, |
| "loss": 1.0886, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.4576331784054344, |
| "grad_norm": 2.453125, |
| "learning_rate": 3.145251995651162e-06, |
| "loss": 1.064, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.4690740078655702, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.0282207244334084e-06, |
| "loss": 1.029, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.480514837325706, |
| "grad_norm": 3.21875, |
| "learning_rate": 2.912455038574686e-06, |
| "loss": 1.0867, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.491955666785842, |
| "grad_norm": 2.828125, |
| "learning_rate": 2.7980292422118282e-06, |
| "loss": 1.1117, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.5033964962459778, |
| "grad_norm": 6.21875, |
| "learning_rate": 2.6850167794743966e-06, |
| "loss": 1.0614, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.5148373257061136, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.573490187344596e-06, |
| "loss": 1.016, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.5262781551662497, |
| "grad_norm": 5.65625, |
| "learning_rate": 2.4635210490994648e-06, |
| "loss": 1.0849, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.5377189846263855, |
| "grad_norm": 2.5625, |
| "learning_rate": 2.3551799483651894e-06, |
| "loss": 1.0158, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.5491598140865213, |
| "grad_norm": 0.9453125, |
| "learning_rate": 2.2485364238130435e-06, |
| "loss": 1.0059, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.5606006435466573, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.1436589245260375e-06, |
| "loss": 1.0081, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.5720414730067929, |
| "grad_norm": 2.09375, |
| "learning_rate": 2.040614766064913e-06, |
| "loss": 1.0354, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.583482302466929, |
| "grad_norm": 2.875, |
| "learning_rate": 1.9394700872616856e-06, |
| "loss": 1.0662, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.5949231319270647, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.8402898077684806e-06, |
| "loss": 1.1284, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.6063639613872005, |
| "grad_norm": 1.9609375, |
| "learning_rate": 1.74313758638889e-06, |
| "loss": 1.0253, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.6178047908473365, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.648075780218607e-06, |
| "loss": 1.1114, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.6292456203074723, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.555165404621567e-06, |
| "loss": 1.0888, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.6406864497676081, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.4644660940672628e-06, |
| "loss": 1.0505, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.6521272792277442, |
| "grad_norm": 6.0, |
| "learning_rate": 1.3760360638544012e-06, |
| "loss": 1.2607, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.6635681086878797, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.2899320727454472e-06, |
| "loss": 1.1066, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.6750089381480158, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.2062093865360458e-06, |
| "loss": 1.0927, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.6864497676081516, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.1249217425827063e-06, |
| "loss": 1.0638, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.6978905970682874, |
| "grad_norm": 2.125, |
| "learning_rate": 1.046121315311508e-06, |
| "loss": 1.0502, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.7093314265284234, |
| "grad_norm": 4.34375, |
| "learning_rate": 9.69858682729976e-07, |
| "loss": 1.0722, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.7207722559885592, |
| "grad_norm": 3.84375, |
| "learning_rate": 8.961827939636198e-07, |
| "loss": 1.1203, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.732213085448695, |
| "grad_norm": 4.09375, |
| "learning_rate": 8.251409378379638e-07, |
| "loss": 1.1094, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.743653914908831, |
| "grad_norm": 2.859375, |
| "learning_rate": 7.567787125262449e-07, |
| "loss": 1.0351, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.7550947443689666, |
| "grad_norm": 16.125, |
| "learning_rate": 6.911399962822518e-07, |
| "loss": 1.2069, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.7665355738291026, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.282669192770896e-07, |
| "loss": 1.0696, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.7779764032892384, |
| "grad_norm": 3.140625, |
| "learning_rate": 5.681998365579594e-07, |
| "loss": 1.117, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.7894172327493743, |
| "grad_norm": 1.609375, |
| "learning_rate": 5.109773021462921e-07, |
| "loss": 1.0426, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.8008580622095103, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.5663604429187547e-07, |
| "loss": 1.0841, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.812298891669646, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.05210941898847e-07, |
| "loss": 0.9866, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.8237397211297819, |
| "grad_norm": 3.25, |
| "learning_rate": 3.567350021386895e-07, |
| "loss": 0.9696, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.835180550589918, |
| "grad_norm": 1.8984375, |
| "learning_rate": 3.112393392645985e-07, |
| "loss": 1.0818, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.8466213800500535, |
| "grad_norm": 2.71875, |
| "learning_rate": 2.6875315464081566e-07, |
| "loss": 1.102, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.8580622095101895, |
| "grad_norm": 5.625, |
| "learning_rate": 2.2930371799975593e-07, |
| "loss": 1.0949, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.8695030389703253, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.9291634993893803e-07, |
| "loss": 1.0649, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.8809438684304611, |
| "grad_norm": 6.375, |
| "learning_rate": 1.5961440566897913e-07, |
| "loss": 1.1652, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.8923846978905972, |
| "grad_norm": 1.515625, |
| "learning_rate": 1.2941926002306536e-07, |
| "loss": 1.088, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.903825527350733, |
| "grad_norm": 2.875, |
| "learning_rate": 1.0235029373752758e-07, |
| "loss": 1.091, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.9152663568108688, |
| "grad_norm": 4.21875, |
| "learning_rate": 7.842488101232893e-08, |
| "loss": 1.0127, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.9267071862710048, |
| "grad_norm": 1.5, |
| "learning_rate": 5.7658378359443104e-08, |
| "loss": 1.0575, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.9381480157311404, |
| "grad_norm": 2.265625, |
| "learning_rate": 4.006411474628491e-08, |
| "loss": 1.0529, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.9495888451912764, |
| "grad_norm": 3.4375, |
| "learning_rate": 2.5653383040524228e-08, |
| "loss": 1.1027, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.9610296746514122, |
| "grad_norm": 4.96875, |
| "learning_rate": 1.4435432761762958e-08, |
| "loss": 1.1598, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.972470504111548, |
| "grad_norm": 2.5, |
| "learning_rate": 6.417464144736208e-09, |
| "loss": 1.001, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.983911333571684, |
| "grad_norm": 5.875, |
| "learning_rate": 1.6046235178474034e-09, |
| "loss": 1.161, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.9953521630318198, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0, |
| "loss": 1.0751, |
| "step": 174 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 174, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 44, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.2837861025752023e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |