{ "best_metric": 1.5770864486694336, "best_model_checkpoint": "miner_id_24/checkpoint-600", "epoch": 0.3236573278041873, "eval_steps": 200, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005394288796736456, "grad_norm": 18.71552085876465, "learning_rate": 2.0000000000000003e-06, "loss": 57.3329, "step": 1 }, { "epoch": 0.0005394288796736456, "eval_loss": 4.576467514038086, "eval_runtime": 141.0154, "eval_samples_per_second": 2.12, "eval_steps_per_second": 2.12, "step": 1 }, { "epoch": 0.0010788577593472911, "grad_norm": 40.17718505859375, "learning_rate": 4.000000000000001e-06, "loss": 111.721, "step": 2 }, { "epoch": 0.0016182866390209367, "grad_norm": 55.62163162231445, "learning_rate": 6e-06, "loss": 145.8098, "step": 3 }, { "epoch": 0.0021577155186945822, "grad_norm": 70.09906005859375, "learning_rate": 8.000000000000001e-06, "loss": 176.5399, "step": 4 }, { "epoch": 0.0026971443983682276, "grad_norm": 96.45822143554688, "learning_rate": 1e-05, "loss": 205.6804, "step": 5 }, { "epoch": 0.0032365732780418733, "grad_norm": 96.46897888183594, "learning_rate": 1.2e-05, "loss": 191.1242, "step": 6 }, { "epoch": 0.0037760021577155187, "grad_norm": 123.18101501464844, "learning_rate": 1.4000000000000001e-05, "loss": 200.1216, "step": 7 }, { "epoch": 0.0043154310373891645, "grad_norm": 112.75751495361328, "learning_rate": 1.6000000000000003e-05, "loss": 199.3468, "step": 8 }, { "epoch": 0.00485485991706281, "grad_norm": 105.84030151367188, "learning_rate": 1.8e-05, "loss": 197.7578, "step": 9 }, { "epoch": 0.005394288796736455, "grad_norm": 152.0435333251953, "learning_rate": 2e-05, "loss": 221.8745, "step": 10 }, { "epoch": 0.0059337176764101005, "grad_norm": 140.9628143310547, "learning_rate": 2.2000000000000003e-05, "loss": 202.1205, "step": 11 }, { "epoch": 0.006473146556083747, "grad_norm": 136.8531036376953, "learning_rate": 2.4e-05, "loss": 192.207, "step": 12 }, { "epoch": 0.007012575435757392, "grad_norm": 135.1580352783203, "learning_rate": 2.6000000000000002e-05, "loss": 188.5981, "step": 13 }, { "epoch": 0.007552004315431037, "grad_norm": 135.94815063476562, "learning_rate": 2.8000000000000003e-05, "loss": 182.9973, "step": 14 }, { "epoch": 0.008091433195104683, "grad_norm": 130.7935333251953, "learning_rate": 3e-05, "loss": 181.9996, "step": 15 }, { "epoch": 0.008630862074778329, "grad_norm": 135.71165466308594, "learning_rate": 3.2000000000000005e-05, "loss": 156.7745, "step": 16 }, { "epoch": 0.009170290954451973, "grad_norm": 80.55735778808594, "learning_rate": 3.4000000000000007e-05, "loss": 105.2249, "step": 17 }, { "epoch": 0.00970971983412562, "grad_norm": 78.56623840332031, "learning_rate": 3.6e-05, "loss": 93.8699, "step": 18 }, { "epoch": 0.010249148713799266, "grad_norm": 73.5405502319336, "learning_rate": 3.8e-05, "loss": 96.5256, "step": 19 }, { "epoch": 0.01078857759347291, "grad_norm": 66.16717529296875, "learning_rate": 4e-05, "loss": 79.6901, "step": 20 }, { "epoch": 0.011328006473146556, "grad_norm": 65.6923599243164, "learning_rate": 4.2e-05, "loss": 90.1881, "step": 21 }, { "epoch": 0.011867435352820201, "grad_norm": 77.53053283691406, "learning_rate": 4.4000000000000006e-05, "loss": 85.8049, "step": 22 }, { "epoch": 0.012406864232493847, "grad_norm": 71.17222595214844, "learning_rate": 4.600000000000001e-05, "loss": 64.6935, "step": 23 }, { "epoch": 0.012946293112167493, "grad_norm": 46.50193786621094, "learning_rate": 4.8e-05, "loss": 72.2138, "step": 24 }, { "epoch": 0.013485721991841138, "grad_norm": 45.66022491455078, "learning_rate": 5e-05, "loss": 71.2709, "step": 25 }, { "epoch": 0.014025150871514784, "grad_norm": 46.14365768432617, "learning_rate": 5.2000000000000004e-05, "loss": 59.7781, "step": 26 }, { "epoch": 0.014564579751188429, "grad_norm": 54.284664154052734, "learning_rate": 5.4000000000000005e-05, "loss": 64.8576, "step": 27 }, { "epoch": 0.015104008630862075, "grad_norm": 43.3782958984375, "learning_rate": 5.6000000000000006e-05, "loss": 68.6312, "step": 28 }, { "epoch": 0.01564343751053572, "grad_norm": 35.549217224121094, "learning_rate": 5.8e-05, "loss": 62.3088, "step": 29 }, { "epoch": 0.016182866390209365, "grad_norm": 42.21353530883789, "learning_rate": 6e-05, "loss": 65.5001, "step": 30 }, { "epoch": 0.01672229526988301, "grad_norm": 46.08031463623047, "learning_rate": 6.2e-05, "loss": 57.0952, "step": 31 }, { "epoch": 0.017261724149556658, "grad_norm": 38.45962905883789, "learning_rate": 6.400000000000001e-05, "loss": 65.1331, "step": 32 }, { "epoch": 0.017801153029230302, "grad_norm": 34.330406188964844, "learning_rate": 6.6e-05, "loss": 56.026, "step": 33 }, { "epoch": 0.018340581908903947, "grad_norm": 35.08675003051758, "learning_rate": 6.800000000000001e-05, "loss": 55.9277, "step": 34 }, { "epoch": 0.018880010788577595, "grad_norm": 37.337825775146484, "learning_rate": 7e-05, "loss": 53.9387, "step": 35 }, { "epoch": 0.01941943966825124, "grad_norm": 36.146873474121094, "learning_rate": 7.2e-05, "loss": 61.2999, "step": 36 }, { "epoch": 0.019958868547924884, "grad_norm": 41.229610443115234, "learning_rate": 7.4e-05, "loss": 70.8618, "step": 37 }, { "epoch": 0.02049829742759853, "grad_norm": 42.86275863647461, "learning_rate": 7.6e-05, "loss": 60.1886, "step": 38 }, { "epoch": 0.021037726307272176, "grad_norm": 36.5433235168457, "learning_rate": 7.800000000000001e-05, "loss": 61.5439, "step": 39 }, { "epoch": 0.02157715518694582, "grad_norm": 39.95774841308594, "learning_rate": 8e-05, "loss": 57.8462, "step": 40 }, { "epoch": 0.022116584066619465, "grad_norm": 38.86470413208008, "learning_rate": 8.2e-05, "loss": 55.4324, "step": 41 }, { "epoch": 0.022656012946293113, "grad_norm": 30.977352142333984, "learning_rate": 8.4e-05, "loss": 57.6402, "step": 42 }, { "epoch": 0.023195441825966757, "grad_norm": 38.25783157348633, "learning_rate": 8.6e-05, "loss": 50.586, "step": 43 }, { "epoch": 0.023734870705640402, "grad_norm": 37.11707305908203, "learning_rate": 8.800000000000001e-05, "loss": 36.7947, "step": 44 }, { "epoch": 0.02427429958531405, "grad_norm": 40.3302116394043, "learning_rate": 9e-05, "loss": 40.2388, "step": 45 }, { "epoch": 0.024813728464987694, "grad_norm": 42.60755920410156, "learning_rate": 9.200000000000001e-05, "loss": 58.9665, "step": 46 }, { "epoch": 0.02535315734466134, "grad_norm": 44.4195442199707, "learning_rate": 9.4e-05, "loss": 50.2349, "step": 47 }, { "epoch": 0.025892586224334987, "grad_norm": 37.404727935791016, "learning_rate": 9.6e-05, "loss": 48.7437, "step": 48 }, { "epoch": 0.02643201510400863, "grad_norm": 48.31377410888672, "learning_rate": 9.8e-05, "loss": 54.4895, "step": 49 }, { "epoch": 0.026971443983682276, "grad_norm": 51.360191345214844, "learning_rate": 0.0001, "loss": 57.4654, "step": 50 }, { "epoch": 0.02751087286335592, "grad_norm": 23.211647033691406, "learning_rate": 0.00010200000000000001, "loss": 36.3068, "step": 51 }, { "epoch": 0.028050301743029568, "grad_norm": 34.541805267333984, "learning_rate": 0.00010400000000000001, "loss": 75.6809, "step": 52 }, { "epoch": 0.028589730622703213, "grad_norm": 42.0761833190918, "learning_rate": 0.00010600000000000002, "loss": 96.818, "step": 53 }, { "epoch": 0.029129159502376857, "grad_norm": 43.26933670043945, "learning_rate": 0.00010800000000000001, "loss": 101.1451, "step": 54 }, { "epoch": 0.029668588382050505, "grad_norm": 51.45765686035156, "learning_rate": 0.00011000000000000002, "loss": 115.0704, "step": 55 }, { "epoch": 0.03020801726172415, "grad_norm": 43.3838005065918, "learning_rate": 0.00011200000000000001, "loss": 112.4747, "step": 56 }, { "epoch": 0.030747446141397794, "grad_norm": 59.83226013183594, "learning_rate": 0.00011399999999999999, "loss": 114.741, "step": 57 }, { "epoch": 0.03128687502107144, "grad_norm": 38.54649353027344, "learning_rate": 0.000116, "loss": 100.9508, "step": 58 }, { "epoch": 0.03182630390074508, "grad_norm": 34.7606086730957, "learning_rate": 0.000118, "loss": 87.4097, "step": 59 }, { "epoch": 0.03236573278041873, "grad_norm": 34.808265686035156, "learning_rate": 0.00012, "loss": 94.2411, "step": 60 }, { "epoch": 0.03290516166009238, "grad_norm": 33.40951156616211, "learning_rate": 0.000122, "loss": 85.6042, "step": 61 }, { "epoch": 0.03344459053976602, "grad_norm": 25.83111572265625, "learning_rate": 0.000124, "loss": 79.6747, "step": 62 }, { "epoch": 0.03398401941943967, "grad_norm": 51.73832321166992, "learning_rate": 0.000126, "loss": 65.953, "step": 63 }, { "epoch": 0.034523448299113316, "grad_norm": 38.63320541381836, "learning_rate": 0.00012800000000000002, "loss": 72.4182, "step": 64 }, { "epoch": 0.03506287717878696, "grad_norm": 20.10302734375, "learning_rate": 0.00013000000000000002, "loss": 61.2385, "step": 65 }, { "epoch": 0.035602306058460605, "grad_norm": 27.804248809814453, "learning_rate": 0.000132, "loss": 59.6029, "step": 66 }, { "epoch": 0.03614173493813425, "grad_norm": 30.542932510375977, "learning_rate": 0.000134, "loss": 51.2451, "step": 67 }, { "epoch": 0.036681163817807894, "grad_norm": 70.11331176757812, "learning_rate": 0.00013600000000000003, "loss": 51.116, "step": 68 }, { "epoch": 0.03722059269748154, "grad_norm": 155.8134307861328, "learning_rate": 0.000138, "loss": 76.3231, "step": 69 }, { "epoch": 0.03776002157715519, "grad_norm": 146.5844268798828, "learning_rate": 0.00014, "loss": 68.6173, "step": 70 }, { "epoch": 0.03829945045682883, "grad_norm": 102.16127014160156, "learning_rate": 0.000142, "loss": 72.0777, "step": 71 }, { "epoch": 0.03883887933650248, "grad_norm": 40.04204559326172, "learning_rate": 0.000144, "loss": 52.744, "step": 72 }, { "epoch": 0.039378308216176126, "grad_norm": 75.35163116455078, "learning_rate": 0.000146, "loss": 59.1683, "step": 73 }, { "epoch": 0.03991773709584977, "grad_norm": 77.30841827392578, "learning_rate": 0.000148, "loss": 51.3059, "step": 74 }, { "epoch": 0.040457165975523415, "grad_norm": 52.49984359741211, "learning_rate": 0.00015000000000000001, "loss": 52.407, "step": 75 }, { "epoch": 0.04099659485519706, "grad_norm": 35.61119842529297, "learning_rate": 0.000152, "loss": 50.8863, "step": 76 }, { "epoch": 0.041536023734870704, "grad_norm": 34.10403060913086, "learning_rate": 0.000154, "loss": 53.6901, "step": 77 }, { "epoch": 0.04207545261454435, "grad_norm": 39.79935836791992, "learning_rate": 0.00015600000000000002, "loss": 49.8857, "step": 78 }, { "epoch": 0.042614881494218, "grad_norm": 35.74922561645508, "learning_rate": 0.00015800000000000002, "loss": 62.577, "step": 79 }, { "epoch": 0.04315431037389164, "grad_norm": 31.491291046142578, "learning_rate": 0.00016, "loss": 52.0815, "step": 80 }, { "epoch": 0.04369373925356529, "grad_norm": 23.866592407226562, "learning_rate": 0.000162, "loss": 64.6077, "step": 81 }, { "epoch": 0.04423316813323893, "grad_norm": 28.5296688079834, "learning_rate": 0.000164, "loss": 59.5244, "step": 82 }, { "epoch": 0.04477259701291258, "grad_norm": 33.92407989501953, "learning_rate": 0.000166, "loss": 62.9956, "step": 83 }, { "epoch": 0.045312025892586226, "grad_norm": 27.05453109741211, "learning_rate": 0.000168, "loss": 52.9777, "step": 84 }, { "epoch": 0.04585145477225987, "grad_norm": 23.927709579467773, "learning_rate": 0.00017, "loss": 56.0232, "step": 85 }, { "epoch": 0.046390883651933515, "grad_norm": 31.250370025634766, "learning_rate": 0.000172, "loss": 55.1487, "step": 86 }, { "epoch": 0.04693031253160716, "grad_norm": 32.98558044433594, "learning_rate": 0.000174, "loss": 54.3132, "step": 87 }, { "epoch": 0.047469741411280804, "grad_norm": 39.15415954589844, "learning_rate": 0.00017600000000000002, "loss": 56.2989, "step": 88 }, { "epoch": 0.04800917029095445, "grad_norm": 32.42843246459961, "learning_rate": 0.00017800000000000002, "loss": 41.7672, "step": 89 }, { "epoch": 0.0485485991706281, "grad_norm": 42.03153610229492, "learning_rate": 0.00018, "loss": 50.3046, "step": 90 }, { "epoch": 0.04908802805030174, "grad_norm": 38.14472961425781, "learning_rate": 0.000182, "loss": 50.2817, "step": 91 }, { "epoch": 0.04962745692997539, "grad_norm": 32.74757385253906, "learning_rate": 0.00018400000000000003, "loss": 47.9721, "step": 92 }, { "epoch": 0.05016688580964904, "grad_norm": 41.20277404785156, "learning_rate": 0.00018600000000000002, "loss": 48.2985, "step": 93 }, { "epoch": 0.05070631468932268, "grad_norm": 42.31992721557617, "learning_rate": 0.000188, "loss": 58.7386, "step": 94 }, { "epoch": 0.051245743568996326, "grad_norm": 28.106618881225586, "learning_rate": 0.00019, "loss": 46.3057, "step": 95 }, { "epoch": 0.051785172448669974, "grad_norm": 37.70038604736328, "learning_rate": 0.000192, "loss": 35.5874, "step": 96 }, { "epoch": 0.052324601328343615, "grad_norm": 36.007530212402344, "learning_rate": 0.000194, "loss": 47.9065, "step": 97 }, { "epoch": 0.05286403020801726, "grad_norm": 29.738492965698242, "learning_rate": 0.000196, "loss": 49.6222, "step": 98 }, { "epoch": 0.05340345908769091, "grad_norm": 42.806785583496094, "learning_rate": 0.00019800000000000002, "loss": 44.6868, "step": 99 }, { "epoch": 0.05394288796736455, "grad_norm": 31.359643936157227, "learning_rate": 0.0002, "loss": 30.5743, "step": 100 }, { "epoch": 0.0544823168470382, "grad_norm": 24.176820755004883, "learning_rate": 0.00019999998344063995, "loss": 41.8829, "step": 101 }, { "epoch": 0.05502174572671184, "grad_norm": 43.5556755065918, "learning_rate": 0.00019999993376256528, "loss": 64.5931, "step": 102 }, { "epoch": 0.05556117460638549, "grad_norm": 35.98505401611328, "learning_rate": 0.00019999985096579245, "loss": 94.4231, "step": 103 }, { "epoch": 0.056100603486059136, "grad_norm": 35.83631134033203, "learning_rate": 0.00019999973505034887, "loss": 113.3877, "step": 104 }, { "epoch": 0.05664003236573278, "grad_norm": 30.29425621032715, "learning_rate": 0.00019999958601627296, "loss": 113.0325, "step": 105 }, { "epoch": 0.057179461245406425, "grad_norm": 27.389789581298828, "learning_rate": 0.000199999403863614, "loss": 111.3191, "step": 106 }, { "epoch": 0.05771889012508007, "grad_norm": 27.400251388549805, "learning_rate": 0.00019999918859243244, "loss": 97.0415, "step": 107 }, { "epoch": 0.058258319004753714, "grad_norm": 20.399946212768555, "learning_rate": 0.0001999989402027995, "loss": 90.2641, "step": 108 }, { "epoch": 0.05879774788442736, "grad_norm": 25.029308319091797, "learning_rate": 0.0001999986586947974, "loss": 94.4251, "step": 109 }, { "epoch": 0.05933717676410101, "grad_norm": 29.495418548583984, "learning_rate": 0.00019999834406851945, "loss": 94.9159, "step": 110 }, { "epoch": 0.05987660564377465, "grad_norm": 19.77571678161621, "learning_rate": 0.0001999979963240698, "loss": 75.4925, "step": 111 }, { "epoch": 0.0604160345234483, "grad_norm": 25.004566192626953, "learning_rate": 0.00019999761546156365, "loss": 71.3454, "step": 112 }, { "epoch": 0.06095546340312195, "grad_norm": 34.21379852294922, "learning_rate": 0.00019999720148112715, "loss": 66.511, "step": 113 }, { "epoch": 0.06149489228279559, "grad_norm": 22.71439552307129, "learning_rate": 0.00019999675438289738, "loss": 52.0498, "step": 114 }, { "epoch": 0.062034321162469236, "grad_norm": 24.381750106811523, "learning_rate": 0.0001999962741670224, "loss": 55.2827, "step": 115 }, { "epoch": 0.06257375004214288, "grad_norm": 37.246803283691406, "learning_rate": 0.00019999576083366125, "loss": 54.9355, "step": 116 }, { "epoch": 0.06311317892181653, "grad_norm": 81.53564453125, "learning_rate": 0.00019999521438298398, "loss": 59.4422, "step": 117 }, { "epoch": 0.06365260780149017, "grad_norm": 129.4823760986328, "learning_rate": 0.00019999463481517156, "loss": 67.393, "step": 118 }, { "epoch": 0.06419203668116381, "grad_norm": 77.96698760986328, "learning_rate": 0.00019999402213041588, "loss": 67.9443, "step": 119 }, { "epoch": 0.06473146556083746, "grad_norm": 53.094512939453125, "learning_rate": 0.0001999933763289199, "loss": 61.054, "step": 120 }, { "epoch": 0.06527089444051111, "grad_norm": 52.896366119384766, "learning_rate": 0.00019999269741089752, "loss": 62.3436, "step": 121 }, { "epoch": 0.06581032332018476, "grad_norm": 57.282318115234375, "learning_rate": 0.00019999198537657353, "loss": 56.6129, "step": 122 }, { "epoch": 0.0663497521998584, "grad_norm": 46.553062438964844, "learning_rate": 0.0001999912402261838, "loss": 55.701, "step": 123 }, { "epoch": 0.06688918107953204, "grad_norm": 28.822669982910156, "learning_rate": 0.00019999046195997512, "loss": 54.2102, "step": 124 }, { "epoch": 0.06742860995920569, "grad_norm": 28.726089477539062, "learning_rate": 0.00019998965057820516, "loss": 56.0332, "step": 125 }, { "epoch": 0.06796803883887934, "grad_norm": 26.886003494262695, "learning_rate": 0.0001999888060811427, "loss": 43.4516, "step": 126 }, { "epoch": 0.06850746771855298, "grad_norm": 31.9282169342041, "learning_rate": 0.00019998792846906747, "loss": 52.2149, "step": 127 }, { "epoch": 0.06904689659822663, "grad_norm": 38.317962646484375, "learning_rate": 0.00019998701774227005, "loss": 54.0044, "step": 128 }, { "epoch": 0.06958632547790028, "grad_norm": 31.158544540405273, "learning_rate": 0.00019998607390105209, "loss": 55.2255, "step": 129 }, { "epoch": 0.07012575435757391, "grad_norm": 33.239166259765625, "learning_rate": 0.00019998509694572615, "loss": 56.3811, "step": 130 }, { "epoch": 0.07066518323724756, "grad_norm": 30.34086799621582, "learning_rate": 0.00019998408687661582, "loss": 52.0529, "step": 131 }, { "epoch": 0.07120461211692121, "grad_norm": 24.05341911315918, "learning_rate": 0.00019998304369405563, "loss": 60.5602, "step": 132 }, { "epoch": 0.07174404099659486, "grad_norm": 26.90273094177246, "learning_rate": 0.00019998196739839103, "loss": 57.3375, "step": 133 }, { "epoch": 0.0722834698762685, "grad_norm": 24.157773971557617, "learning_rate": 0.0001999808579899785, "loss": 47.7251, "step": 134 }, { "epoch": 0.07282289875594215, "grad_norm": 28.088014602661133, "learning_rate": 0.00019997971546918545, "loss": 56.1037, "step": 135 }, { "epoch": 0.07336232763561579, "grad_norm": 32.39021682739258, "learning_rate": 0.00019997853983639029, "loss": 52.0922, "step": 136 }, { "epoch": 0.07390175651528944, "grad_norm": 29.597578048706055, "learning_rate": 0.0001999773310919824, "loss": 46.3537, "step": 137 }, { "epoch": 0.07444118539496308, "grad_norm": 38.31181335449219, "learning_rate": 0.000199976089236362, "loss": 46.8711, "step": 138 }, { "epoch": 0.07498061427463673, "grad_norm": 39.67713165283203, "learning_rate": 0.00019997481426994044, "loss": 45.0961, "step": 139 }, { "epoch": 0.07552004315431038, "grad_norm": 48.8436164855957, "learning_rate": 0.00019997350619314, "loss": 48.7547, "step": 140 }, { "epoch": 0.07605947203398401, "grad_norm": 88.95709991455078, "learning_rate": 0.00019997216500639383, "loss": 50.3681, "step": 141 }, { "epoch": 0.07659890091365766, "grad_norm": 34.2819938659668, "learning_rate": 0.0001999707907101462, "loss": 44.3903, "step": 142 }, { "epoch": 0.07713832979333131, "grad_norm": 42.79631042480469, "learning_rate": 0.00019996938330485217, "loss": 31.0566, "step": 143 }, { "epoch": 0.07767775867300496, "grad_norm": 37.28693389892578, "learning_rate": 0.00019996794279097791, "loss": 34.0999, "step": 144 }, { "epoch": 0.0782171875526786, "grad_norm": 43.65718460083008, "learning_rate": 0.00019996646916900051, "loss": 48.7369, "step": 145 }, { "epoch": 0.07875661643235225, "grad_norm": 39.86713409423828, "learning_rate": 0.00019996496243940794, "loss": 36.6841, "step": 146 }, { "epoch": 0.07929604531202589, "grad_norm": 32.35002899169922, "learning_rate": 0.0001999634226026993, "loss": 43.1344, "step": 147 }, { "epoch": 0.07983547419169953, "grad_norm": 36.14616775512695, "learning_rate": 0.0001999618496593845, "loss": 51.9779, "step": 148 }, { "epoch": 0.08037490307137318, "grad_norm": 31.071197509765625, "learning_rate": 0.00019996024360998456, "loss": 39.5621, "step": 149 }, { "epoch": 0.08091433195104683, "grad_norm": 33.61774444580078, "learning_rate": 0.00019995860445503127, "loss": 37.7614, "step": 150 }, { "epoch": 0.08145376083072048, "grad_norm": 22.93950653076172, "learning_rate": 0.00019995693219506758, "loss": 59.2331, "step": 151 }, { "epoch": 0.08199318971039413, "grad_norm": 31.307132720947266, "learning_rate": 0.00019995522683064726, "loss": 70.8054, "step": 152 }, { "epoch": 0.08253261859006776, "grad_norm": 28.894466400146484, "learning_rate": 0.00019995348836233516, "loss": 84.8097, "step": 153 }, { "epoch": 0.08307204746974141, "grad_norm": 26.76435661315918, "learning_rate": 0.000199951716790707, "loss": 101.4707, "step": 154 }, { "epoch": 0.08361147634941506, "grad_norm": 26.842918395996094, "learning_rate": 0.00019994991211634954, "loss": 107.518, "step": 155 }, { "epoch": 0.0841509052290887, "grad_norm": 25.251588821411133, "learning_rate": 0.00019994807433986047, "loss": 106.076, "step": 156 }, { "epoch": 0.08469033410876235, "grad_norm": 28.60271453857422, "learning_rate": 0.0001999462034618484, "loss": 96.3093, "step": 157 }, { "epoch": 0.085229762988436, "grad_norm": 22.537473678588867, "learning_rate": 0.00019994429948293291, "loss": 88.6475, "step": 158 }, { "epoch": 0.08576919186810963, "grad_norm": 18.868396759033203, "learning_rate": 0.00019994236240374465, "loss": 92.4222, "step": 159 }, { "epoch": 0.08630862074778328, "grad_norm": 21.84971046447754, "learning_rate": 0.00019994039222492513, "loss": 88.0079, "step": 160 }, { "epoch": 0.08684804962745693, "grad_norm": 23.634244918823242, "learning_rate": 0.00019993838894712682, "loss": 77.0574, "step": 161 }, { "epoch": 0.08738747850713058, "grad_norm": 18.22877311706543, "learning_rate": 0.00019993635257101322, "loss": 67.3958, "step": 162 }, { "epoch": 0.08792690738680423, "grad_norm": 21.62260627746582, "learning_rate": 0.00019993428309725872, "loss": 65.1832, "step": 163 }, { "epoch": 0.08846633626647786, "grad_norm": 18.148618698120117, "learning_rate": 0.0001999321805265487, "loss": 63.1231, "step": 164 }, { "epoch": 0.08900576514615151, "grad_norm": 20.20022201538086, "learning_rate": 0.00019993004485957956, "loss": 59.0852, "step": 165 }, { "epoch": 0.08954519402582516, "grad_norm": 28.2082576751709, "learning_rate": 0.00019992787609705853, "loss": 55.8505, "step": 166 }, { "epoch": 0.0900846229054988, "grad_norm": 43.48365020751953, "learning_rate": 0.00019992567423970394, "loss": 40.495, "step": 167 }, { "epoch": 0.09062405178517245, "grad_norm": 149.13955688476562, "learning_rate": 0.00019992343928824498, "loss": 91.8388, "step": 168 }, { "epoch": 0.0911634806648461, "grad_norm": 91.07251739501953, "learning_rate": 0.00019992117124342183, "loss": 61.9425, "step": 169 }, { "epoch": 0.09170290954451973, "grad_norm": 65.70806121826172, "learning_rate": 0.00019991887010598565, "loss": 59.7979, "step": 170 }, { "epoch": 0.09224233842419338, "grad_norm": 45.109580993652344, "learning_rate": 0.00019991653587669855, "loss": 63.235, "step": 171 }, { "epoch": 0.09278176730386703, "grad_norm": 49.24695587158203, "learning_rate": 0.00019991416855633364, "loss": 55.8371, "step": 172 }, { "epoch": 0.09332119618354068, "grad_norm": 44.50947952270508, "learning_rate": 0.0001999117681456749, "loss": 45.3712, "step": 173 }, { "epoch": 0.09386062506321433, "grad_norm": 45.105506896972656, "learning_rate": 0.00019990933464551728, "loss": 59.354, "step": 174 }, { "epoch": 0.09440005394288797, "grad_norm": 31.862106323242188, "learning_rate": 0.0001999068680566668, "loss": 49.2883, "step": 175 }, { "epoch": 0.09493948282256161, "grad_norm": 34.86188507080078, "learning_rate": 0.00019990436837994028, "loss": 40.9445, "step": 176 }, { "epoch": 0.09547891170223526, "grad_norm": 52.34774398803711, "learning_rate": 0.00019990183561616567, "loss": 54.3114, "step": 177 }, { "epoch": 0.0960183405819089, "grad_norm": 30.12732696533203, "learning_rate": 0.00019989926976618172, "loss": 44.8966, "step": 178 }, { "epoch": 0.09655776946158255, "grad_norm": 29.296287536621094, "learning_rate": 0.00019989667083083825, "loss": 47.5101, "step": 179 }, { "epoch": 0.0970971983412562, "grad_norm": 42.42873764038086, "learning_rate": 0.00019989403881099597, "loss": 48.2378, "step": 180 }, { "epoch": 0.09763662722092983, "grad_norm": 31.62274742126465, "learning_rate": 0.00019989137370752657, "loss": 42.1564, "step": 181 }, { "epoch": 0.09817605610060348, "grad_norm": 30.754499435424805, "learning_rate": 0.00019988867552131275, "loss": 52.2929, "step": 182 }, { "epoch": 0.09871548498027713, "grad_norm": 31.932157516479492, "learning_rate": 0.000199885944253248, "loss": 45.6226, "step": 183 }, { "epoch": 0.09925491385995078, "grad_norm": 33.754722595214844, "learning_rate": 0.00019988317990423703, "loss": 39.9572, "step": 184 }, { "epoch": 0.09979434273962443, "grad_norm": 33.33165740966797, "learning_rate": 0.00019988038247519522, "loss": 52.7357, "step": 185 }, { "epoch": 0.10033377161929807, "grad_norm": 28.355619430541992, "learning_rate": 0.0001998775519670491, "loss": 39.8865, "step": 186 }, { "epoch": 0.10087320049897171, "grad_norm": 60.16803741455078, "learning_rate": 0.00019987468838073613, "loss": 48.3595, "step": 187 }, { "epoch": 0.10141262937864536, "grad_norm": 33.5135498046875, "learning_rate": 0.00019987179171720464, "loss": 34.3803, "step": 188 }, { "epoch": 0.101952058258319, "grad_norm": 33.8374137878418, "learning_rate": 0.00019986886197741403, "loss": 46.4517, "step": 189 }, { "epoch": 0.10249148713799265, "grad_norm": 26.143709182739258, "learning_rate": 0.0001998658991623345, "loss": 30.6351, "step": 190 }, { "epoch": 0.1030309160176663, "grad_norm": 28.791723251342773, "learning_rate": 0.0001998629032729474, "loss": 44.2275, "step": 191 }, { "epoch": 0.10357034489733995, "grad_norm": 33.818931579589844, "learning_rate": 0.00019985987431024485, "loss": 43.5677, "step": 192 }, { "epoch": 0.10410977377701358, "grad_norm": 40.07392883300781, "learning_rate": 0.00019985681227523006, "loss": 34.5844, "step": 193 }, { "epoch": 0.10464920265668723, "grad_norm": 30.963062286376953, "learning_rate": 0.00019985371716891708, "loss": 44.1099, "step": 194 }, { "epoch": 0.10518863153636088, "grad_norm": 31.774293899536133, "learning_rate": 0.000199850588992331, "loss": 36.4496, "step": 195 }, { "epoch": 0.10572806041603452, "grad_norm": 47.396575927734375, "learning_rate": 0.00019984742774650785, "loss": 50.9736, "step": 196 }, { "epoch": 0.10626748929570817, "grad_norm": 58.573341369628906, "learning_rate": 0.00019984423343249457, "loss": 44.6643, "step": 197 }, { "epoch": 0.10680691817538182, "grad_norm": 33.57207107543945, "learning_rate": 0.00019984100605134906, "loss": 36.4154, "step": 198 }, { "epoch": 0.10734634705505545, "grad_norm": 33.817752838134766, "learning_rate": 0.00019983774560414027, "loss": 38.8474, "step": 199 }, { "epoch": 0.1078857759347291, "grad_norm": 34.572608947753906, "learning_rate": 0.00019983445209194791, "loss": 30.1009, "step": 200 }, { "epoch": 0.1078857759347291, "eval_loss": 1.836081624031067, "eval_runtime": 141.0356, "eval_samples_per_second": 2.12, "eval_steps_per_second": 2.12, "step": 200 }, { "epoch": 0.10842520481440275, "grad_norm": 23.590002059936523, "learning_rate": 0.0001998311255158628, "loss": 53.9458, "step": 201 }, { "epoch": 0.1089646336940764, "grad_norm": 39.737159729003906, "learning_rate": 0.00019982776587698666, "loss": 85.7514, "step": 202 }, { "epoch": 0.10950406257375005, "grad_norm": 35.41561508178711, "learning_rate": 0.00019982437317643217, "loss": 84.9662, "step": 203 }, { "epoch": 0.11004349145342368, "grad_norm": 31.39605140686035, "learning_rate": 0.0001998209474153229, "loss": 110.0561, "step": 204 }, { "epoch": 0.11058292033309733, "grad_norm": 30.160261154174805, "learning_rate": 0.00019981748859479348, "loss": 101.1574, "step": 205 }, { "epoch": 0.11112234921277098, "grad_norm": 33.4417724609375, "learning_rate": 0.00019981399671598939, "loss": 116.0456, "step": 206 }, { "epoch": 0.11166177809244462, "grad_norm": 34.16884994506836, "learning_rate": 0.0001998104717800671, "loss": 103.0287, "step": 207 }, { "epoch": 0.11220120697211827, "grad_norm": 33.58393859863281, "learning_rate": 0.00019980691378819406, "loss": 95.5024, "step": 208 }, { "epoch": 0.11274063585179192, "grad_norm": 29.785871505737305, "learning_rate": 0.00019980332274154857, "loss": 91.5854, "step": 209 }, { "epoch": 0.11328006473146555, "grad_norm": 29.184667587280273, "learning_rate": 0.00019979969864131997, "loss": 86.9138, "step": 210 }, { "epoch": 0.1138194936111392, "grad_norm": 25.164024353027344, "learning_rate": 0.00019979604148870854, "loss": 72.7827, "step": 211 }, { "epoch": 0.11435892249081285, "grad_norm": 18.179292678833008, "learning_rate": 0.00019979235128492545, "loss": 67.364, "step": 212 }, { "epoch": 0.1148983513704865, "grad_norm": 20.353260040283203, "learning_rate": 0.00019978862803119284, "loss": 60.0141, "step": 213 }, { "epoch": 0.11543778025016015, "grad_norm": 27.25603485107422, "learning_rate": 0.00019978487172874382, "loss": 61.8063, "step": 214 }, { "epoch": 0.1159772091298338, "grad_norm": 40.56468963623047, "learning_rate": 0.00019978108237882244, "loss": 51.2483, "step": 215 }, { "epoch": 0.11651663800950743, "grad_norm": 64.65696716308594, "learning_rate": 0.00019977725998268365, "loss": 37.8312, "step": 216 }, { "epoch": 0.11705606688918108, "grad_norm": 80.94468688964844, "learning_rate": 0.00019977340454159343, "loss": 55.2775, "step": 217 }, { "epoch": 0.11759549576885472, "grad_norm": 100.61930084228516, "learning_rate": 0.00019976951605682862, "loss": 65.5767, "step": 218 }, { "epoch": 0.11813492464852837, "grad_norm": 71.5768051147461, "learning_rate": 0.00019976559452967703, "loss": 57.5296, "step": 219 }, { "epoch": 0.11867435352820202, "grad_norm": 37.10725021362305, "learning_rate": 0.00019976163996143745, "loss": 48.8497, "step": 220 }, { "epoch": 0.11921378240787567, "grad_norm": 40.85627746582031, "learning_rate": 0.00019975765235341955, "loss": 47.6466, "step": 221 }, { "epoch": 0.1197532112875493, "grad_norm": 55.1395263671875, "learning_rate": 0.000199753631706944, "loss": 60.2519, "step": 222 }, { "epoch": 0.12029264016722295, "grad_norm": 42.060585021972656, "learning_rate": 0.00019974957802334234, "loss": 48.1031, "step": 223 }, { "epoch": 0.1208320690468966, "grad_norm": 36.57340621948242, "learning_rate": 0.00019974549130395713, "loss": 43.3995, "step": 224 }, { "epoch": 0.12137149792657025, "grad_norm": 31.497970581054688, "learning_rate": 0.0001997413715501419, "loss": 41.1591, "step": 225 }, { "epoch": 0.1219109268062439, "grad_norm": 30.481502532958984, "learning_rate": 0.00019973721876326094, "loss": 38.0712, "step": 226 }, { "epoch": 0.12245035568591753, "grad_norm": 38.2381477355957, "learning_rate": 0.00019973303294468968, "loss": 46.3861, "step": 227 }, { "epoch": 0.12298978456559118, "grad_norm": 37.4508171081543, "learning_rate": 0.0001997288140958144, "loss": 49.3107, "step": 228 }, { "epoch": 0.12352921344526482, "grad_norm": 37.3139533996582, "learning_rate": 0.0001997245622180323, "loss": 43.1914, "step": 229 }, { "epoch": 0.12406864232493847, "grad_norm": 35.13384246826172, "learning_rate": 0.0001997202773127516, "loss": 45.7228, "step": 230 }, { "epoch": 0.12460807120461212, "grad_norm": 37.45779037475586, "learning_rate": 0.00019971595938139135, "loss": 45.0848, "step": 231 }, { "epoch": 0.12514750008428577, "grad_norm": 37.03962707519531, "learning_rate": 0.00019971160842538162, "loss": 46.3705, "step": 232 }, { "epoch": 0.12568692896395942, "grad_norm": 30.98250389099121, "learning_rate": 0.0001997072244461634, "loss": 41.1065, "step": 233 }, { "epoch": 0.12622635784363306, "grad_norm": 33.62482833862305, "learning_rate": 0.00019970280744518854, "loss": 41.8594, "step": 234 }, { "epoch": 0.1267657867233067, "grad_norm": 45.488739013671875, "learning_rate": 0.00019969835742392, "loss": 38.6525, "step": 235 }, { "epoch": 0.12730521560298033, "grad_norm": 43.84321594238281, "learning_rate": 0.0001996938743838315, "loss": 53.2114, "step": 236 }, { "epoch": 0.12784464448265398, "grad_norm": 40.51958084106445, "learning_rate": 0.00019968935832640782, "loss": 50.4725, "step": 237 }, { "epoch": 0.12838407336232763, "grad_norm": 35.1596794128418, "learning_rate": 0.00019968480925314458, "loss": 45.1618, "step": 238 }, { "epoch": 0.12892350224200128, "grad_norm": 32.27614974975586, "learning_rate": 0.00019968022716554832, "loss": 38.2164, "step": 239 }, { "epoch": 0.12946293112167492, "grad_norm": 33.67794418334961, "learning_rate": 0.00019967561206513668, "loss": 43.3203, "step": 240 }, { "epoch": 0.13000236000134857, "grad_norm": 26.34979820251465, "learning_rate": 0.00019967096395343806, "loss": 32.1165, "step": 241 }, { "epoch": 0.13054178888102222, "grad_norm": 33.10830307006836, "learning_rate": 0.00019966628283199186, "loss": 45.5207, "step": 242 }, { "epoch": 0.13108121776069587, "grad_norm": 47.04872131347656, "learning_rate": 0.00019966156870234844, "loss": 44.7497, "step": 243 }, { "epoch": 0.13162064664036952, "grad_norm": 38.99346160888672, "learning_rate": 0.000199656821566069, "loss": 43.9255, "step": 244 }, { "epoch": 0.13216007552004316, "grad_norm": 29.892854690551758, "learning_rate": 0.00019965204142472574, "loss": 48.4896, "step": 245 }, { "epoch": 0.1326995043997168, "grad_norm": 37.65726089477539, "learning_rate": 0.00019964722827990185, "loss": 37.7987, "step": 246 }, { "epoch": 0.13323893327939046, "grad_norm": 41.673274993896484, "learning_rate": 0.00019964238213319134, "loss": 48.4095, "step": 247 }, { "epoch": 0.13377836215906408, "grad_norm": 37.152793884277344, "learning_rate": 0.00019963750298619917, "loss": 33.8212, "step": 248 }, { "epoch": 0.13431779103873773, "grad_norm": 43.92071533203125, "learning_rate": 0.00019963259084054128, "loss": 35.554, "step": 249 }, { "epoch": 0.13485721991841138, "grad_norm": 39.161903381347656, "learning_rate": 0.0001996276456978445, "loss": 33.8096, "step": 250 }, { "epoch": 0.13539664879808502, "grad_norm": 24.633363723754883, "learning_rate": 0.00019962266755974657, "loss": 46.0338, "step": 251 }, { "epoch": 0.13593607767775867, "grad_norm": 54.83051300048828, "learning_rate": 0.00019961765642789625, "loss": 80.4599, "step": 252 }, { "epoch": 0.13647550655743232, "grad_norm": 43.1768684387207, "learning_rate": 0.0001996126123039531, "loss": 84.3379, "step": 253 }, { "epoch": 0.13701493543710597, "grad_norm": 24.49346160888672, "learning_rate": 0.00019960753518958772, "loss": 100.9898, "step": 254 }, { "epoch": 0.13755436431677961, "grad_norm": 38.09309768676758, "learning_rate": 0.00019960242508648154, "loss": 101.0717, "step": 255 }, { "epoch": 0.13809379319645326, "grad_norm": 40.072296142578125, "learning_rate": 0.00019959728199632699, "loss": 108.2131, "step": 256 }, { "epoch": 0.1386332220761269, "grad_norm": 43.77210235595703, "learning_rate": 0.0001995921059208274, "loss": 111.636, "step": 257 }, { "epoch": 0.13917265095580056, "grad_norm": 42.023155212402344, "learning_rate": 0.00019958689686169697, "loss": 90.4911, "step": 258 }, { "epoch": 0.13971207983547418, "grad_norm": 27.917343139648438, "learning_rate": 0.00019958165482066094, "loss": 92.3676, "step": 259 }, { "epoch": 0.14025150871514783, "grad_norm": 19.174135208129883, "learning_rate": 0.00019957637979945537, "loss": 88.4276, "step": 260 }, { "epoch": 0.14079093759482147, "grad_norm": 22.779672622680664, "learning_rate": 0.0001995710717998273, "loss": 88.3991, "step": 261 }, { "epoch": 0.14133036647449512, "grad_norm": 17.607568740844727, "learning_rate": 0.00019956573082353463, "loss": 77.4426, "step": 262 }, { "epoch": 0.14186979535416877, "grad_norm": 22.228328704833984, "learning_rate": 0.00019956035687234626, "loss": 68.3415, "step": 263 }, { "epoch": 0.14240922423384242, "grad_norm": 21.00279998779297, "learning_rate": 0.00019955494994804198, "loss": 70.7203, "step": 264 }, { "epoch": 0.14294865311351607, "grad_norm": 27.789443969726562, "learning_rate": 0.00019954951005241248, "loss": 62.4471, "step": 265 }, { "epoch": 0.14348808199318971, "grad_norm": 21.813310623168945, "learning_rate": 0.0001995440371872594, "loss": 65.5364, "step": 266 }, { "epoch": 0.14402751087286336, "grad_norm": 22.338788986206055, "learning_rate": 0.00019953853135439522, "loss": 53.7872, "step": 267 }, { "epoch": 0.144566939752537, "grad_norm": 17.053470611572266, "learning_rate": 0.00019953299255564346, "loss": 46.6823, "step": 268 }, { "epoch": 0.14510636863221066, "grad_norm": 34.75794219970703, "learning_rate": 0.0001995274207928385, "loss": 32.208, "step": 269 }, { "epoch": 0.1456457975118843, "grad_norm": 76.52667236328125, "learning_rate": 0.00019952181606782565, "loss": 52.4054, "step": 270 }, { "epoch": 0.14618522639155793, "grad_norm": 71.48796844482422, "learning_rate": 0.00019951617838246107, "loss": 48.9668, "step": 271 }, { "epoch": 0.14672465527123157, "grad_norm": 79.96577453613281, "learning_rate": 0.00019951050773861192, "loss": 61.6082, "step": 272 }, { "epoch": 0.14726408415090522, "grad_norm": 42.05474090576172, "learning_rate": 0.0001995048041381562, "loss": 50.8627, "step": 273 }, { "epoch": 0.14780351303057887, "grad_norm": 43.19125747680664, "learning_rate": 0.00019949906758298295, "loss": 45.519, "step": 274 }, { "epoch": 0.14834294191025252, "grad_norm": 47.39426040649414, "learning_rate": 0.00019949329807499198, "loss": 51.654, "step": 275 }, { "epoch": 0.14888237078992617, "grad_norm": 36.0722770690918, "learning_rate": 0.00019948749561609415, "loss": 46.8854, "step": 276 }, { "epoch": 0.14942179966959981, "grad_norm": 33.252742767333984, "learning_rate": 0.00019948166020821107, "loss": 46.7532, "step": 277 }, { "epoch": 0.14996122854927346, "grad_norm": 33.89019012451172, "learning_rate": 0.0001994757918532754, "loss": 49.6403, "step": 278 }, { "epoch": 0.1505006574289471, "grad_norm": 37.914676666259766, "learning_rate": 0.00019946989055323066, "loss": 54.5018, "step": 279 }, { "epoch": 0.15104008630862076, "grad_norm": 37.611061096191406, "learning_rate": 0.00019946395631003128, "loss": 50.6423, "step": 280 }, { "epoch": 0.1515795151882944, "grad_norm": 36.489723205566406, "learning_rate": 0.00019945798912564264, "loss": 45.9299, "step": 281 }, { "epoch": 0.15211894406796803, "grad_norm": 31.33220100402832, "learning_rate": 0.00019945198900204095, "loss": 47.4519, "step": 282 }, { "epoch": 0.15265837294764167, "grad_norm": 32.4266242980957, "learning_rate": 0.00019944595594121337, "loss": 40.0806, "step": 283 }, { "epoch": 0.15319780182731532, "grad_norm": 38.17313003540039, "learning_rate": 0.00019943988994515797, "loss": 39.9765, "step": 284 }, { "epoch": 0.15373723070698897, "grad_norm": 40.299354553222656, "learning_rate": 0.00019943379101588376, "loss": 40.7812, "step": 285 }, { "epoch": 0.15427665958666262, "grad_norm": 42.34661102294922, "learning_rate": 0.00019942765915541063, "loss": 31.2513, "step": 286 }, { "epoch": 0.15481608846633627, "grad_norm": 46.61203384399414, "learning_rate": 0.00019942149436576938, "loss": 41.5619, "step": 287 }, { "epoch": 0.1553555173460099, "grad_norm": 39.79526901245117, "learning_rate": 0.00019941529664900168, "loss": 38.13, "step": 288 }, { "epoch": 0.15589494622568356, "grad_norm": 42.995567321777344, "learning_rate": 0.0001994090660071601, "loss": 41.3515, "step": 289 }, { "epoch": 0.1564343751053572, "grad_norm": 34.27892303466797, "learning_rate": 0.00019940280244230824, "loss": 41.1277, "step": 290 }, { "epoch": 0.15697380398503086, "grad_norm": 29.622488021850586, "learning_rate": 0.00019939650595652045, "loss": 49.2284, "step": 291 }, { "epoch": 0.1575132328647045, "grad_norm": 36.693119049072266, "learning_rate": 0.00019939017655188206, "loss": 35.5444, "step": 292 }, { "epoch": 0.15805266174437815, "grad_norm": 30.75679588317871, "learning_rate": 0.00019938381423048932, "loss": 34.9666, "step": 293 }, { "epoch": 0.15859209062405177, "grad_norm": 35.84019088745117, "learning_rate": 0.00019937741899444928, "loss": 39.4625, "step": 294 }, { "epoch": 0.15913151950372542, "grad_norm": 35.854496002197266, "learning_rate": 0.00019937099084588002, "loss": 37.2887, "step": 295 }, { "epoch": 0.15967094838339907, "grad_norm": 33.07613754272461, "learning_rate": 0.00019936452978691044, "loss": 34.5375, "step": 296 }, { "epoch": 0.16021037726307272, "grad_norm": 43.46371078491211, "learning_rate": 0.00019935803581968035, "loss": 30.3173, "step": 297 }, { "epoch": 0.16074980614274637, "grad_norm": 52.03241729736328, "learning_rate": 0.00019935150894634046, "loss": 42.4725, "step": 298 }, { "epoch": 0.16128923502242, "grad_norm": 50.36249542236328, "learning_rate": 0.00019934494916905245, "loss": 37.3647, "step": 299 }, { "epoch": 0.16182866390209366, "grad_norm": 41.50126647949219, "learning_rate": 0.00019933835648998875, "loss": 24.2931, "step": 300 }, { "epoch": 0.1623680927817673, "grad_norm": 31.253141403198242, "learning_rate": 0.00019933173091133286, "loss": 44.7853, "step": 301 }, { "epoch": 0.16290752166144096, "grad_norm": 96.83972930908203, "learning_rate": 0.000199325072435279, "loss": 84.9808, "step": 302 }, { "epoch": 0.1634469505411146, "grad_norm": 91.9966049194336, "learning_rate": 0.0001993183810640324, "loss": 99.5531, "step": 303 }, { "epoch": 0.16398637942078825, "grad_norm": 66.43877410888672, "learning_rate": 0.00019931165679980918, "loss": 105.7665, "step": 304 }, { "epoch": 0.16452580830046187, "grad_norm": 35.26411056518555, "learning_rate": 0.00019930489964483633, "loss": 109.6819, "step": 305 }, { "epoch": 0.16506523718013552, "grad_norm": 47.18457794189453, "learning_rate": 0.00019929810960135172, "loss": 113.4221, "step": 306 }, { "epoch": 0.16560466605980917, "grad_norm": 49.24475860595703, "learning_rate": 0.00019929128667160408, "loss": 108.0158, "step": 307 }, { "epoch": 0.16614409493948282, "grad_norm": 45.63924026489258, "learning_rate": 0.00019928443085785318, "loss": 94.1414, "step": 308 }, { "epoch": 0.16668352381915646, "grad_norm": 46.688350677490234, "learning_rate": 0.00019927754216236948, "loss": 87.8688, "step": 309 }, { "epoch": 0.1672229526988301, "grad_norm": 39.54045486450195, "learning_rate": 0.00019927062058743448, "loss": 92.6019, "step": 310 }, { "epoch": 0.16776238157850376, "grad_norm": 29.866121292114258, "learning_rate": 0.0001992636661353405, "loss": 81.9024, "step": 311 }, { "epoch": 0.1683018104581774, "grad_norm": 22.350112915039062, "learning_rate": 0.0001992566788083908, "loss": 68.4321, "step": 312 }, { "epoch": 0.16884123933785106, "grad_norm": 21.657258987426758, "learning_rate": 0.00019924965860889944, "loss": 65.7434, "step": 313 }, { "epoch": 0.1693806682175247, "grad_norm": 18.347572326660156, "learning_rate": 0.00019924260553919146, "loss": 62.485, "step": 314 }, { "epoch": 0.16992009709719835, "grad_norm": 28.368114471435547, "learning_rate": 0.00019923551960160268, "loss": 53.7759, "step": 315 }, { "epoch": 0.170459525976872, "grad_norm": 35.214988708496094, "learning_rate": 0.00019922840079848, "loss": 45.4414, "step": 316 }, { "epoch": 0.17099895485654562, "grad_norm": 38.698760986328125, "learning_rate": 0.00019922124913218094, "loss": 37.665, "step": 317 }, { "epoch": 0.17153838373621927, "grad_norm": 43.39471435546875, "learning_rate": 0.0001992140646050741, "loss": 51.4899, "step": 318 }, { "epoch": 0.17207781261589292, "grad_norm": 43.52251434326172, "learning_rate": 0.00019920684721953894, "loss": 48.5712, "step": 319 }, { "epoch": 0.17261724149556656, "grad_norm": 60.897579193115234, "learning_rate": 0.00019919959697796568, "loss": 59.9231, "step": 320 }, { "epoch": 0.1731566703752402, "grad_norm": 37.93972396850586, "learning_rate": 0.0001991923138827556, "loss": 47.906, "step": 321 }, { "epoch": 0.17369609925491386, "grad_norm": 44.32222366333008, "learning_rate": 0.0001991849979363207, "loss": 54.5404, "step": 322 }, { "epoch": 0.1742355281345875, "grad_norm": 37.367671966552734, "learning_rate": 0.00019917764914108394, "loss": 49.3113, "step": 323 }, { "epoch": 0.17477495701426116, "grad_norm": 43.20479965209961, "learning_rate": 0.00019917026749947917, "loss": 41.9015, "step": 324 }, { "epoch": 0.1753143858939348, "grad_norm": 36.7598991394043, "learning_rate": 0.0001991628530139511, "loss": 43.7222, "step": 325 }, { "epoch": 0.17585381477360845, "grad_norm": 33.30655288696289, "learning_rate": 0.0001991554056869553, "loss": 48.4387, "step": 326 }, { "epoch": 0.1763932436532821, "grad_norm": 32.89339828491211, "learning_rate": 0.00019914792552095818, "loss": 51.108, "step": 327 }, { "epoch": 0.17693267253295572, "grad_norm": 31.422489166259766, "learning_rate": 0.00019914041251843716, "loss": 42.9287, "step": 328 }, { "epoch": 0.17747210141262937, "grad_norm": 33.38264465332031, "learning_rate": 0.00019913286668188037, "loss": 47.0867, "step": 329 }, { "epoch": 0.17801153029230302, "grad_norm": 37.976837158203125, "learning_rate": 0.00019912528801378698, "loss": 38.2593, "step": 330 }, { "epoch": 0.17855095917197666, "grad_norm": 35.707054138183594, "learning_rate": 0.0001991176765166669, "loss": 44.5348, "step": 331 }, { "epoch": 0.1790903880516503, "grad_norm": 43.86237335205078, "learning_rate": 0.00019911003219304094, "loss": 40.4868, "step": 332 }, { "epoch": 0.17962981693132396, "grad_norm": 54.88194274902344, "learning_rate": 0.00019910235504544082, "loss": 38.935, "step": 333 }, { "epoch": 0.1801692458109976, "grad_norm": 43.87349319458008, "learning_rate": 0.00019909464507640915, "loss": 43.0978, "step": 334 }, { "epoch": 0.18070867469067126, "grad_norm": 43.421932220458984, "learning_rate": 0.0001990869022884993, "loss": 39.2888, "step": 335 }, { "epoch": 0.1812481035703449, "grad_norm": 41.14269256591797, "learning_rate": 0.00019907912668427566, "loss": 42.6139, "step": 336 }, { "epoch": 0.18178753245001855, "grad_norm": 38.619380950927734, "learning_rate": 0.00019907131826631336, "loss": 40.0248, "step": 337 }, { "epoch": 0.1823269613296922, "grad_norm": 33.65724563598633, "learning_rate": 0.00019906347703719845, "loss": 38.7406, "step": 338 }, { "epoch": 0.18286639020936585, "grad_norm": 35.25956344604492, "learning_rate": 0.0001990556029995279, "loss": 39.2734, "step": 339 }, { "epoch": 0.18340581908903947, "grad_norm": 36.87468719482422, "learning_rate": 0.00019904769615590942, "loss": 40.6619, "step": 340 }, { "epoch": 0.18394524796871312, "grad_norm": 32.0380973815918, "learning_rate": 0.00019903975650896168, "loss": 39.8376, "step": 341 }, { "epoch": 0.18448467684838676, "grad_norm": 33.44660949707031, "learning_rate": 0.0001990317840613142, "loss": 33.338, "step": 342 }, { "epoch": 0.1850241057280604, "grad_norm": 36.242523193359375, "learning_rate": 0.00019902377881560735, "loss": 35.0493, "step": 343 }, { "epoch": 0.18556353460773406, "grad_norm": 37.39813232421875, "learning_rate": 0.00019901574077449232, "loss": 26.9563, "step": 344 }, { "epoch": 0.1861029634874077, "grad_norm": 35.84196472167969, "learning_rate": 0.0001990076699406313, "loss": 33.7825, "step": 345 }, { "epoch": 0.18664239236708136, "grad_norm": 38.69563293457031, "learning_rate": 0.00019899956631669717, "loss": 29.9582, "step": 346 }, { "epoch": 0.187181821246755, "grad_norm": 47.82805633544922, "learning_rate": 0.00019899142990537376, "loss": 33.9471, "step": 347 }, { "epoch": 0.18772125012642865, "grad_norm": 36.29233169555664, "learning_rate": 0.00019898326070935579, "loss": 28.1711, "step": 348 }, { "epoch": 0.1882606790061023, "grad_norm": 45.26416015625, "learning_rate": 0.00019897505873134872, "loss": 33.76, "step": 349 }, { "epoch": 0.18880010788577595, "grad_norm": 39.766441345214844, "learning_rate": 0.000198966823974069, "loss": 25.629, "step": 350 }, { "epoch": 0.18933953676544957, "grad_norm": 30.092906951904297, "learning_rate": 0.00019895855644024387, "loss": 45.1687, "step": 351 }, { "epoch": 0.18987896564512322, "grad_norm": 61.02379608154297, "learning_rate": 0.00019895025613261136, "loss": 77.4727, "step": 352 }, { "epoch": 0.19041839452479686, "grad_norm": 51.788063049316406, "learning_rate": 0.00019894192305392055, "loss": 82.3816, "step": 353 }, { "epoch": 0.1909578234044705, "grad_norm": 72.1239242553711, "learning_rate": 0.0001989335572069311, "loss": 103.2545, "step": 354 }, { "epoch": 0.19149725228414416, "grad_norm": 29.279748916625977, "learning_rate": 0.00019892515859441383, "loss": 113.7908, "step": 355 }, { "epoch": 0.1920366811638178, "grad_norm": 43.08776092529297, "learning_rate": 0.00019891672721915015, "loss": 107.6541, "step": 356 }, { "epoch": 0.19257611004349146, "grad_norm": 54.121192932128906, "learning_rate": 0.00019890826308393243, "loss": 102.3774, "step": 357 }, { "epoch": 0.1931155389231651, "grad_norm": 52.771793365478516, "learning_rate": 0.0001988997661915639, "loss": 87.3872, "step": 358 }, { "epoch": 0.19365496780283875, "grad_norm": 58.10847854614258, "learning_rate": 0.00019889123654485866, "loss": 97.106, "step": 359 }, { "epoch": 0.1941943966825124, "grad_norm": 52.38351058959961, "learning_rate": 0.00019888267414664156, "loss": 91.256, "step": 360 }, { "epoch": 0.19473382556218605, "grad_norm": 48.153804779052734, "learning_rate": 0.0001988740789997484, "loss": 81.894, "step": 361 }, { "epoch": 0.19527325444185967, "grad_norm": 25.811304092407227, "learning_rate": 0.00019886545110702576, "loss": 69.6325, "step": 362 }, { "epoch": 0.19581268332153332, "grad_norm": 22.911964416503906, "learning_rate": 0.00019885679047133107, "loss": 65.5302, "step": 363 }, { "epoch": 0.19635211220120696, "grad_norm": 37.54278564453125, "learning_rate": 0.00019884809709553265, "loss": 60.65, "step": 364 }, { "epoch": 0.1968915410808806, "grad_norm": 20.303857803344727, "learning_rate": 0.00019883937098250963, "loss": 44.1299, "step": 365 }, { "epoch": 0.19743096996055426, "grad_norm": 31.87704849243164, "learning_rate": 0.00019883061213515197, "loss": 34.1489, "step": 366 }, { "epoch": 0.1979703988402279, "grad_norm": 39.10615539550781, "learning_rate": 0.00019882182055636053, "loss": 37.5989, "step": 367 }, { "epoch": 0.19850982771990155, "grad_norm": 41.10018539428711, "learning_rate": 0.00019881299624904692, "loss": 48.6169, "step": 368 }, { "epoch": 0.1990492565995752, "grad_norm": 34.8628044128418, "learning_rate": 0.00019880413921613367, "loss": 51.3889, "step": 369 }, { "epoch": 0.19958868547924885, "grad_norm": 41.81850051879883, "learning_rate": 0.0001987952494605541, "loss": 46.2857, "step": 370 }, { "epoch": 0.2001281143589225, "grad_norm": 46.00803756713867, "learning_rate": 0.00019878632698525238, "loss": 42.1201, "step": 371 }, { "epoch": 0.20066754323859615, "grad_norm": 37.3172492980957, "learning_rate": 0.00019877737179318353, "loss": 44.8517, "step": 372 }, { "epoch": 0.2012069721182698, "grad_norm": 30.38181495666504, "learning_rate": 0.0001987683838873134, "loss": 30.3321, "step": 373 }, { "epoch": 0.20174640099794341, "grad_norm": 36.00757598876953, "learning_rate": 0.00019875936327061865, "loss": 41.3805, "step": 374 }, { "epoch": 0.20228582987761706, "grad_norm": 36.742733001708984, "learning_rate": 0.00019875030994608684, "loss": 48.6651, "step": 375 }, { "epoch": 0.2028252587572907, "grad_norm": 42.53518295288086, "learning_rate": 0.00019874122391671622, "loss": 32.5649, "step": 376 }, { "epoch": 0.20336468763696436, "grad_norm": 35.77900314331055, "learning_rate": 0.00019873210518551608, "loss": 46.6955, "step": 377 }, { "epoch": 0.203904116516638, "grad_norm": 44.95616149902344, "learning_rate": 0.00019872295375550635, "loss": 41.271, "step": 378 }, { "epoch": 0.20444354539631165, "grad_norm": 34.28546142578125, "learning_rate": 0.00019871376962971789, "loss": 41.4059, "step": 379 }, { "epoch": 0.2049829742759853, "grad_norm": 35.807682037353516, "learning_rate": 0.00019870455281119237, "loss": 45.8892, "step": 380 }, { "epoch": 0.20552240315565895, "grad_norm": 30.27015495300293, "learning_rate": 0.00019869530330298227, "loss": 34.013, "step": 381 }, { "epoch": 0.2060618320353326, "grad_norm": 38.26789093017578, "learning_rate": 0.00019868602110815093, "loss": 42.6953, "step": 382 }, { "epoch": 0.20660126091500625, "grad_norm": 39.61716079711914, "learning_rate": 0.00019867670622977248, "loss": 40.4979, "step": 383 }, { "epoch": 0.2071406897946799, "grad_norm": 35.717227935791016, "learning_rate": 0.00019866735867093188, "loss": 31.5146, "step": 384 }, { "epoch": 0.20768011867435351, "grad_norm": 43.41541290283203, "learning_rate": 0.0001986579784347249, "loss": 37.5416, "step": 385 }, { "epoch": 0.20821954755402716, "grad_norm": 40.18928146362305, "learning_rate": 0.0001986485655242582, "loss": 39.0367, "step": 386 }, { "epoch": 0.2087589764337008, "grad_norm": 35.295291900634766, "learning_rate": 0.00019863911994264926, "loss": 36.8243, "step": 387 }, { "epoch": 0.20929840531337446, "grad_norm": 52.24161148071289, "learning_rate": 0.00019862964169302621, "loss": 41.7241, "step": 388 }, { "epoch": 0.2098378341930481, "grad_norm": 53.32133483886719, "learning_rate": 0.00019862013077852822, "loss": 38.7999, "step": 389 }, { "epoch": 0.21037726307272175, "grad_norm": 42.945804595947266, "learning_rate": 0.00019861058720230514, "loss": 34.0199, "step": 390 }, { "epoch": 0.2109166919523954, "grad_norm": 38.77582931518555, "learning_rate": 0.00019860101096751768, "loss": 33.4203, "step": 391 }, { "epoch": 0.21145612083206905, "grad_norm": 30.80617332458496, "learning_rate": 0.0001985914020773374, "loss": 27.0483, "step": 392 }, { "epoch": 0.2119955497117427, "grad_norm": 43.676090240478516, "learning_rate": 0.00019858176053494663, "loss": 33.954, "step": 393 }, { "epoch": 0.21253497859141635, "grad_norm": 38.32650375366211, "learning_rate": 0.00019857208634353852, "loss": 29.378, "step": 394 }, { "epoch": 0.21307440747109, "grad_norm": 39.12830352783203, "learning_rate": 0.000198562379506317, "loss": 27.9634, "step": 395 }, { "epoch": 0.21361383635076364, "grad_norm": 47.39609909057617, "learning_rate": 0.00019855264002649692, "loss": 34.1847, "step": 396 }, { "epoch": 0.21415326523043726, "grad_norm": 38.62258529663086, "learning_rate": 0.00019854286790730384, "loss": 26.0765, "step": 397 }, { "epoch": 0.2146926941101109, "grad_norm": 42.81424331665039, "learning_rate": 0.00019853306315197413, "loss": 34.1509, "step": 398 }, { "epoch": 0.21523212298978456, "grad_norm": 45.57196807861328, "learning_rate": 0.00019852322576375503, "loss": 32.0371, "step": 399 }, { "epoch": 0.2157715518694582, "grad_norm": 35.20758819580078, "learning_rate": 0.0001985133557459046, "loss": 20.3634, "step": 400 }, { "epoch": 0.2157715518694582, "eval_loss": 1.6627388000488281, "eval_runtime": 141.0153, "eval_samples_per_second": 2.12, "eval_steps_per_second": 2.12, "step": 400 }, { "epoch": 0.21631098074913185, "grad_norm": 24.1074161529541, "learning_rate": 0.00019850345310169155, "loss": 37.3797, "step": 401 }, { "epoch": 0.2168504096288055, "grad_norm": 62.604949951171875, "learning_rate": 0.00019849351783439561, "loss": 78.7953, "step": 402 }, { "epoch": 0.21738983850847915, "grad_norm": 43.36476135253906, "learning_rate": 0.0001984835499473072, "loss": 82.645, "step": 403 }, { "epoch": 0.2179292673881528, "grad_norm": 52.12046432495117, "learning_rate": 0.0001984735494437275, "loss": 87.0839, "step": 404 }, { "epoch": 0.21846869626782645, "grad_norm": 34.333431243896484, "learning_rate": 0.00019846351632696863, "loss": 105.6289, "step": 405 }, { "epoch": 0.2190081251475001, "grad_norm": 41.665771484375, "learning_rate": 0.00019845345060035335, "loss": 112.3874, "step": 406 }, { "epoch": 0.21954755402717374, "grad_norm": 58.79914093017578, "learning_rate": 0.00019844335226721537, "loss": 114.2657, "step": 407 }, { "epoch": 0.22008698290684736, "grad_norm": 52.85742950439453, "learning_rate": 0.00019843322133089906, "loss": 98.4778, "step": 408 }, { "epoch": 0.220626411786521, "grad_norm": 53.792476654052734, "learning_rate": 0.00019842305779475968, "loss": 94.7811, "step": 409 }, { "epoch": 0.22116584066619466, "grad_norm": 49.56667709350586, "learning_rate": 0.0001984128616621633, "loss": 92.4516, "step": 410 }, { "epoch": 0.2217052695458683, "grad_norm": 38.96401596069336, "learning_rate": 0.0001984026329364867, "loss": 78.0561, "step": 411 }, { "epoch": 0.22224469842554195, "grad_norm": 35.649200439453125, "learning_rate": 0.00019839237162111757, "loss": 66.0612, "step": 412 }, { "epoch": 0.2227841273052156, "grad_norm": 22.54837989807129, "learning_rate": 0.00019838207771945426, "loss": 59.3091, "step": 413 }, { "epoch": 0.22332355618488925, "grad_norm": 16.843589782714844, "learning_rate": 0.00019837175123490596, "loss": 62.8711, "step": 414 }, { "epoch": 0.2238629850645629, "grad_norm": 18.909435272216797, "learning_rate": 0.00019836139217089275, "loss": 55.3784, "step": 415 }, { "epoch": 0.22440241394423655, "grad_norm": 25.120887756347656, "learning_rate": 0.0001983510005308454, "loss": 51.9063, "step": 416 }, { "epoch": 0.2249418428239102, "grad_norm": 30.78650665283203, "learning_rate": 0.00019834057631820543, "loss": 32.4726, "step": 417 }, { "epoch": 0.22548127170358384, "grad_norm": 72.46208953857422, "learning_rate": 0.00019833011953642525, "loss": 44.1452, "step": 418 }, { "epoch": 0.2260207005832575, "grad_norm": 45.94267654418945, "learning_rate": 0.000198319630188968, "loss": 50.9596, "step": 419 }, { "epoch": 0.2265601294629311, "grad_norm": 47.52016067504883, "learning_rate": 0.00019830910827930764, "loss": 44.8286, "step": 420 }, { "epoch": 0.22709955834260476, "grad_norm": 40.93891525268555, "learning_rate": 0.00019829855381092886, "loss": 56.7985, "step": 421 }, { "epoch": 0.2276389872222784, "grad_norm": 36.567108154296875, "learning_rate": 0.0001982879667873272, "loss": 35.7161, "step": 422 }, { "epoch": 0.22817841610195205, "grad_norm": 31.908977508544922, "learning_rate": 0.0001982773472120089, "loss": 42.8407, "step": 423 }, { "epoch": 0.2287178449816257, "grad_norm": 37.47427749633789, "learning_rate": 0.00019826669508849108, "loss": 39.5264, "step": 424 }, { "epoch": 0.22925727386129935, "grad_norm": 43.83090591430664, "learning_rate": 0.00019825601042030156, "loss": 48.5415, "step": 425 }, { "epoch": 0.229796702740973, "grad_norm": 42.004425048828125, "learning_rate": 0.00019824529321097893, "loss": 39.4127, "step": 426 }, { "epoch": 0.23033613162064664, "grad_norm": 38.282066345214844, "learning_rate": 0.00019823454346407267, "loss": 40.8499, "step": 427 }, { "epoch": 0.2308755605003203, "grad_norm": 33.92627716064453, "learning_rate": 0.0001982237611831429, "loss": 35.4472, "step": 428 }, { "epoch": 0.23141498937999394, "grad_norm": 53.361106872558594, "learning_rate": 0.00019821294637176057, "loss": 43.1921, "step": 429 }, { "epoch": 0.2319544182596676, "grad_norm": 40.92842102050781, "learning_rate": 0.00019820209903350744, "loss": 36.5019, "step": 430 }, { "epoch": 0.2324938471393412, "grad_norm": 35.71042251586914, "learning_rate": 0.00019819121917197602, "loss": 36.598, "step": 431 }, { "epoch": 0.23303327601901486, "grad_norm": 35.10508728027344, "learning_rate": 0.00019818030679076952, "loss": 31.6675, "step": 432 }, { "epoch": 0.2335727048986885, "grad_norm": 31.885364532470703, "learning_rate": 0.00019816936189350206, "loss": 34.3554, "step": 433 }, { "epoch": 0.23411213377836215, "grad_norm": 42.998878479003906, "learning_rate": 0.0001981583844837984, "loss": 28.1099, "step": 434 }, { "epoch": 0.2346515626580358, "grad_norm": 38.70567321777344, "learning_rate": 0.00019814737456529412, "loss": 42.3567, "step": 435 }, { "epoch": 0.23519099153770945, "grad_norm": 34.43855285644531, "learning_rate": 0.00019813633214163555, "loss": 22.8285, "step": 436 }, { "epoch": 0.2357304204173831, "grad_norm": 33.38055419921875, "learning_rate": 0.00019812525721647986, "loss": 36.1465, "step": 437 }, { "epoch": 0.23626984929705674, "grad_norm": 42.98970413208008, "learning_rate": 0.00019811414979349485, "loss": 34.8416, "step": 438 }, { "epoch": 0.2368092781767304, "grad_norm": 37.12187957763672, "learning_rate": 0.0001981030098763592, "loss": 34.276, "step": 439 }, { "epoch": 0.23734870705640404, "grad_norm": 44.36403274536133, "learning_rate": 0.00019809183746876232, "loss": 30.3544, "step": 440 }, { "epoch": 0.2378881359360777, "grad_norm": 46.281654357910156, "learning_rate": 0.00019808063257440432, "loss": 27.8803, "step": 441 }, { "epoch": 0.23842756481575134, "grad_norm": 49.94664001464844, "learning_rate": 0.00019806939519699613, "loss": 31.0358, "step": 442 }, { "epoch": 0.23896699369542496, "grad_norm": 42.308616638183594, "learning_rate": 0.0001980581253402595, "loss": 29.4053, "step": 443 }, { "epoch": 0.2395064225750986, "grad_norm": 51.36742401123047, "learning_rate": 0.00019804682300792674, "loss": 31.0947, "step": 444 }, { "epoch": 0.24004585145477225, "grad_norm": 40.25013732910156, "learning_rate": 0.00019803548820374113, "loss": 26.6703, "step": 445 }, { "epoch": 0.2405852803344459, "grad_norm": 53.013710021972656, "learning_rate": 0.00019802412093145657, "loss": 35.5286, "step": 446 }, { "epoch": 0.24112470921411955, "grad_norm": 41.21833038330078, "learning_rate": 0.00019801272119483775, "loss": 25.3315, "step": 447 }, { "epoch": 0.2416641380937932, "grad_norm": 61.56970977783203, "learning_rate": 0.00019800128899766017, "loss": 27.589, "step": 448 }, { "epoch": 0.24220356697346684, "grad_norm": 58.22453308105469, "learning_rate": 0.00019798982434371, "loss": 37.2235, "step": 449 }, { "epoch": 0.2427429958531405, "grad_norm": 36.04716110229492, "learning_rate": 0.00019797832723678413, "loss": 28.1485, "step": 450 }, { "epoch": 0.24328242473281414, "grad_norm": 50.804813385009766, "learning_rate": 0.00019796679768069032, "loss": 49.1471, "step": 451 }, { "epoch": 0.2438218536124878, "grad_norm": 91.2785873413086, "learning_rate": 0.00019795523567924702, "loss": 72.8998, "step": 452 }, { "epoch": 0.24436128249216144, "grad_norm": 110.37539672851562, "learning_rate": 0.00019794364123628335, "loss": 98.2308, "step": 453 }, { "epoch": 0.24490071137183506, "grad_norm": 79.3825912475586, "learning_rate": 0.00019793201435563932, "loss": 109.7274, "step": 454 }, { "epoch": 0.2454401402515087, "grad_norm": 36.62171173095703, "learning_rate": 0.00019792035504116555, "loss": 107.5116, "step": 455 }, { "epoch": 0.24597956913118235, "grad_norm": 57.664146423339844, "learning_rate": 0.00019790866329672346, "loss": 113.5622, "step": 456 }, { "epoch": 0.246518998010856, "grad_norm": 57.12027359008789, "learning_rate": 0.00019789693912618524, "loss": 102.4627, "step": 457 }, { "epoch": 0.24705842689052965, "grad_norm": 67.92241668701172, "learning_rate": 0.00019788518253343376, "loss": 90.2483, "step": 458 }, { "epoch": 0.2475978557702033, "grad_norm": 63.95331573486328, "learning_rate": 0.00019787339352236264, "loss": 94.7671, "step": 459 }, { "epoch": 0.24813728464987694, "grad_norm": 55.70960235595703, "learning_rate": 0.00019786157209687627, "loss": 92.1523, "step": 460 }, { "epoch": 0.2486767135295506, "grad_norm": 44.270233154296875, "learning_rate": 0.00019784971826088973, "loss": 82.3084, "step": 461 }, { "epoch": 0.24921614240922424, "grad_norm": 35.74955749511719, "learning_rate": 0.0001978378320183289, "loss": 71.401, "step": 462 }, { "epoch": 0.2497555712888979, "grad_norm": 26.20838165283203, "learning_rate": 0.00019782591337313035, "loss": 68.6018, "step": 463 }, { "epoch": 0.25029500016857154, "grad_norm": 20.70208740234375, "learning_rate": 0.00019781396232924133, "loss": 62.6257, "step": 464 }, { "epoch": 0.25083442904824516, "grad_norm": 17.804771423339844, "learning_rate": 0.00019780197889061993, "loss": 54.6564, "step": 465 }, { "epoch": 0.25137385792791883, "grad_norm": 24.327360153198242, "learning_rate": 0.0001977899630612349, "loss": 50.7451, "step": 466 }, { "epoch": 0.25191328680759245, "grad_norm": 29.580142974853516, "learning_rate": 0.00019777791484506567, "loss": 34.4045, "step": 467 }, { "epoch": 0.2524527156872661, "grad_norm": 30.99888801574707, "learning_rate": 0.00019776583424610254, "loss": 41.2975, "step": 468 }, { "epoch": 0.25299214456693975, "grad_norm": 40.59465408325195, "learning_rate": 0.0001977537212683464, "loss": 56.0607, "step": 469 }, { "epoch": 0.2535315734466134, "grad_norm": 42.85790252685547, "learning_rate": 0.00019774157591580894, "loss": 40.9168, "step": 470 }, { "epoch": 0.25407100232628704, "grad_norm": 38.090885162353516, "learning_rate": 0.0001977293981925125, "loss": 49.6262, "step": 471 }, { "epoch": 0.25461043120596066, "grad_norm": 33.007991790771484, "learning_rate": 0.0001977171881024902, "loss": 44.5241, "step": 472 }, { "epoch": 0.25514986008563434, "grad_norm": 39.41592025756836, "learning_rate": 0.00019770494564978595, "loss": 38.185, "step": 473 }, { "epoch": 0.25568928896530796, "grad_norm": 33.008148193359375, "learning_rate": 0.00019769267083845417, "loss": 42.3843, "step": 474 }, { "epoch": 0.25622871784498163, "grad_norm": 27.917991638183594, "learning_rate": 0.0001976803636725602, "loss": 33.7216, "step": 475 }, { "epoch": 0.25676814672465526, "grad_norm": 29.870256423950195, "learning_rate": 0.00019766802415617998, "loss": 35.7963, "step": 476 }, { "epoch": 0.25730757560432893, "grad_norm": 44.98633575439453, "learning_rate": 0.0001976556522934002, "loss": 35.8127, "step": 477 }, { "epoch": 0.25784700448400255, "grad_norm": 43.03909683227539, "learning_rate": 0.0001976432480883183, "loss": 35.4111, "step": 478 }, { "epoch": 0.2583864333636762, "grad_norm": 47.32424545288086, "learning_rate": 0.00019763081154504234, "loss": 41.8895, "step": 479 }, { "epoch": 0.25892586224334985, "grad_norm": 49.7735595703125, "learning_rate": 0.0001976183426676912, "loss": 32.9801, "step": 480 }, { "epoch": 0.2594652911230235, "grad_norm": 44.57673645019531, "learning_rate": 0.0001976058414603944, "loss": 36.089, "step": 481 }, { "epoch": 0.26000472000269714, "grad_norm": 36.22349548339844, "learning_rate": 0.00019759330792729212, "loss": 47.0487, "step": 482 }, { "epoch": 0.26054414888237076, "grad_norm": 38.58706283569336, "learning_rate": 0.00019758074207253535, "loss": 34.3672, "step": 483 }, { "epoch": 0.26108357776204444, "grad_norm": 40.61176300048828, "learning_rate": 0.00019756814390028575, "loss": 39.7468, "step": 484 }, { "epoch": 0.26162300664171806, "grad_norm": 29.439836502075195, "learning_rate": 0.00019755551341471566, "loss": 34.1449, "step": 485 }, { "epoch": 0.26216243552139173, "grad_norm": 35.68241882324219, "learning_rate": 0.00019754285062000815, "loss": 31.6102, "step": 486 }, { "epoch": 0.26270186440106535, "grad_norm": 44.2021598815918, "learning_rate": 0.000197530155520357, "loss": 31.8889, "step": 487 }, { "epoch": 0.26324129328073903, "grad_norm": 53.82715606689453, "learning_rate": 0.00019751742811996656, "loss": 31.6853, "step": 488 }, { "epoch": 0.26378072216041265, "grad_norm": 41.77256774902344, "learning_rate": 0.00019750466842305208, "loss": 39.1939, "step": 489 }, { "epoch": 0.2643201510400863, "grad_norm": 36.42414093017578, "learning_rate": 0.00019749187643383937, "loss": 26.3978, "step": 490 }, { "epoch": 0.26485957991975995, "grad_norm": 49.238014221191406, "learning_rate": 0.00019747905215656498, "loss": 33.8181, "step": 491 }, { "epoch": 0.2653990087994336, "grad_norm": 37.46484375, "learning_rate": 0.00019746619559547619, "loss": 32.0879, "step": 492 }, { "epoch": 0.26593843767910724, "grad_norm": 29.428075790405273, "learning_rate": 0.00019745330675483084, "loss": 22.5194, "step": 493 }, { "epoch": 0.2664778665587809, "grad_norm": 42.24260330200195, "learning_rate": 0.00019744038563889764, "loss": 34.5577, "step": 494 }, { "epoch": 0.26701729543845454, "grad_norm": 43.271976470947266, "learning_rate": 0.00019742743225195582, "loss": 25.107, "step": 495 }, { "epoch": 0.26755672431812816, "grad_norm": 41.1341667175293, "learning_rate": 0.00019741444659829543, "loss": 24.4596, "step": 496 }, { "epoch": 0.26809615319780183, "grad_norm": 35.3587760925293, "learning_rate": 0.00019740142868221713, "loss": 21.1434, "step": 497 }, { "epoch": 0.26863558207747545, "grad_norm": 47.48214340209961, "learning_rate": 0.00019738837850803226, "loss": 23.4752, "step": 498 }, { "epoch": 0.26917501095714913, "grad_norm": 44.637882232666016, "learning_rate": 0.00019737529608006293, "loss": 21.9525, "step": 499 }, { "epoch": 0.26971443983682275, "grad_norm": 31.005287170410156, "learning_rate": 0.00019736218140264185, "loss": 19.1622, "step": 500 }, { "epoch": 0.2702538687164964, "grad_norm": 32.10681915283203, "learning_rate": 0.0001973490344801124, "loss": 44.8021, "step": 501 }, { "epoch": 0.27079329759617005, "grad_norm": 67.818603515625, "learning_rate": 0.0001973358553168287, "loss": 90.5945, "step": 502 }, { "epoch": 0.2713327264758437, "grad_norm": 78.30387115478516, "learning_rate": 0.00019732264391715556, "loss": 101.037, "step": 503 }, { "epoch": 0.27187215535551734, "grad_norm": 92.50519561767578, "learning_rate": 0.00019730940028546835, "loss": 124.3723, "step": 504 }, { "epoch": 0.272411584235191, "grad_norm": 38.794246673583984, "learning_rate": 0.0001972961244261532, "loss": 105.1317, "step": 505 }, { "epoch": 0.27295101311486464, "grad_norm": 34.56374740600586, "learning_rate": 0.00019728281634360698, "loss": 101.3536, "step": 506 }, { "epoch": 0.27349044199453826, "grad_norm": 33.79701614379883, "learning_rate": 0.00019726947604223712, "loss": 105.4946, "step": 507 }, { "epoch": 0.27402987087421193, "grad_norm": 39.242740631103516, "learning_rate": 0.00019725610352646172, "loss": 82.6645, "step": 508 }, { "epoch": 0.27456929975388555, "grad_norm": 41.144683837890625, "learning_rate": 0.0001972426988007096, "loss": 99.5104, "step": 509 }, { "epoch": 0.27510872863355923, "grad_norm": 43.32292175292969, "learning_rate": 0.00019722926186942026, "loss": 90.6068, "step": 510 }, { "epoch": 0.27564815751323285, "grad_norm": 40.97383117675781, "learning_rate": 0.0001972157927370438, "loss": 71.8933, "step": 511 }, { "epoch": 0.2761875863929065, "grad_norm": 27.89875602722168, "learning_rate": 0.0001972022914080411, "loss": 66.0499, "step": 512 }, { "epoch": 0.27672701527258015, "grad_norm": 23.75403594970703, "learning_rate": 0.00019718875788688354, "loss": 59.9798, "step": 513 }, { "epoch": 0.2772664441522538, "grad_norm": 18.101530075073242, "learning_rate": 0.0001971751921780533, "loss": 55.1379, "step": 514 }, { "epoch": 0.27780587303192744, "grad_norm": 24.123146057128906, "learning_rate": 0.00019716159428604315, "loss": 51.0036, "step": 515 }, { "epoch": 0.2783453019116011, "grad_norm": 29.12915802001953, "learning_rate": 0.00019714796421535654, "loss": 35.74, "step": 516 }, { "epoch": 0.27888473079127474, "grad_norm": 41.40327072143555, "learning_rate": 0.00019713430197050756, "loss": 34.8342, "step": 517 }, { "epoch": 0.27942415967094836, "grad_norm": 65.70941162109375, "learning_rate": 0.00019712060755602102, "loss": 45.6267, "step": 518 }, { "epoch": 0.27996358855062203, "grad_norm": 37.733158111572266, "learning_rate": 0.00019710688097643227, "loss": 40.7, "step": 519 }, { "epoch": 0.28050301743029565, "grad_norm": 39.90540313720703, "learning_rate": 0.0001970931222362874, "loss": 52.105, "step": 520 }, { "epoch": 0.28104244630996933, "grad_norm": 41.023155212402344, "learning_rate": 0.0001970793313401432, "loss": 47.4019, "step": 521 }, { "epoch": 0.28158187518964295, "grad_norm": 39.340972900390625, "learning_rate": 0.00019706550829256693, "loss": 36.3784, "step": 522 }, { "epoch": 0.2821213040693166, "grad_norm": 31.36964988708496, "learning_rate": 0.0001970516530981367, "loss": 32.5883, "step": 523 }, { "epoch": 0.28266073294899025, "grad_norm": 31.426342010498047, "learning_rate": 0.00019703776576144105, "loss": 37.0281, "step": 524 }, { "epoch": 0.2832001618286639, "grad_norm": 48.170589447021484, "learning_rate": 0.00019702384628707945, "loss": 50.0541, "step": 525 }, { "epoch": 0.28373959070833754, "grad_norm": 58.017845153808594, "learning_rate": 0.0001970098946796617, "loss": 35.1185, "step": 526 }, { "epoch": 0.2842790195880112, "grad_norm": 44.51712417602539, "learning_rate": 0.0001969959109438085, "loss": 30.6861, "step": 527 }, { "epoch": 0.28481844846768484, "grad_norm": 38.26441955566406, "learning_rate": 0.00019698189508415102, "loss": 42.7979, "step": 528 }, { "epoch": 0.28535787734735846, "grad_norm": 33.41388702392578, "learning_rate": 0.00019696784710533115, "loss": 31.6934, "step": 529 }, { "epoch": 0.28589730622703213, "grad_norm": 39.14249038696289, "learning_rate": 0.00019695376701200145, "loss": 31.4034, "step": 530 }, { "epoch": 0.28643673510670575, "grad_norm": 38.64737319946289, "learning_rate": 0.000196939654808825, "loss": 35.3318, "step": 531 }, { "epoch": 0.28697616398637943, "grad_norm": 32.65852355957031, "learning_rate": 0.0001969255105004756, "loss": 33.1427, "step": 532 }, { "epoch": 0.28751559286605305, "grad_norm": 33.65852355957031, "learning_rate": 0.0001969113340916377, "loss": 31.0407, "step": 533 }, { "epoch": 0.2880550217457267, "grad_norm": 31.496322631835938, "learning_rate": 0.00019689712558700628, "loss": 32.1776, "step": 534 }, { "epoch": 0.28859445062540034, "grad_norm": 37.255680084228516, "learning_rate": 0.00019688288499128707, "loss": 32.4352, "step": 535 }, { "epoch": 0.289133879505074, "grad_norm": 35.74131774902344, "learning_rate": 0.00019686861230919635, "loss": 39.0239, "step": 536 }, { "epoch": 0.28967330838474764, "grad_norm": 62.805694580078125, "learning_rate": 0.00019685430754546107, "loss": 39.168, "step": 537 }, { "epoch": 0.2902127372644213, "grad_norm": 32.74406814575195, "learning_rate": 0.00019683997070481875, "loss": 27.3064, "step": 538 }, { "epoch": 0.29075216614409494, "grad_norm": 60.63595199584961, "learning_rate": 0.00019682560179201759, "loss": 37.3217, "step": 539 }, { "epoch": 0.2912915950237686, "grad_norm": 49.350975036621094, "learning_rate": 0.00019681120081181636, "loss": 32.6254, "step": 540 }, { "epoch": 0.29183102390344223, "grad_norm": 33.03507614135742, "learning_rate": 0.00019679676776898454, "loss": 23.6142, "step": 541 }, { "epoch": 0.29237045278311585, "grad_norm": 46.380985260009766, "learning_rate": 0.00019678230266830212, "loss": 26.1048, "step": 542 }, { "epoch": 0.29290988166278953, "grad_norm": 44.384132385253906, "learning_rate": 0.00019676780551455977, "loss": 19.0745, "step": 543 }, { "epoch": 0.29344931054246315, "grad_norm": 32.757320404052734, "learning_rate": 0.0001967532763125588, "loss": 33.5921, "step": 544 }, { "epoch": 0.2939887394221368, "grad_norm": 40.512939453125, "learning_rate": 0.000196738715067111, "loss": 23.9648, "step": 545 }, { "epoch": 0.29452816830181044, "grad_norm": 36.085330963134766, "learning_rate": 0.00019672412178303898, "loss": 25.8736, "step": 546 }, { "epoch": 0.2950675971814841, "grad_norm": 39.4991340637207, "learning_rate": 0.00019670949646517576, "loss": 35.8085, "step": 547 }, { "epoch": 0.29560702606115774, "grad_norm": 56.80205535888672, "learning_rate": 0.0001966948391183651, "loss": 21.2566, "step": 548 }, { "epoch": 0.2961464549408314, "grad_norm": 51.80792999267578, "learning_rate": 0.00019668014974746133, "loss": 19.3891, "step": 549 }, { "epoch": 0.29668588382050504, "grad_norm": 40.740726470947266, "learning_rate": 0.00019666542835732937, "loss": 17.442, "step": 550 }, { "epoch": 0.2972253127001787, "grad_norm": 43.78228759765625, "learning_rate": 0.00019665067495284476, "loss": 53.1444, "step": 551 }, { "epoch": 0.29776474157985233, "grad_norm": 68.15139770507812, "learning_rate": 0.00019663588953889363, "loss": 83.8455, "step": 552 }, { "epoch": 0.29830417045952595, "grad_norm": 57.72416305541992, "learning_rate": 0.00019662107212037273, "loss": 91.3314, "step": 553 }, { "epoch": 0.29884359933919963, "grad_norm": 70.40361785888672, "learning_rate": 0.0001966062227021894, "loss": 115.1381, "step": 554 }, { "epoch": 0.29938302821887325, "grad_norm": 33.6906623840332, "learning_rate": 0.00019659134128926156, "loss": 96.5649, "step": 555 }, { "epoch": 0.2999224570985469, "grad_norm": 41.24090576171875, "learning_rate": 0.00019657642788651776, "loss": 104.8012, "step": 556 }, { "epoch": 0.30046188597822054, "grad_norm": 62.62508773803711, "learning_rate": 0.00019656148249889714, "loss": 89.1584, "step": 557 }, { "epoch": 0.3010013148578942, "grad_norm": 54.20726013183594, "learning_rate": 0.00019654650513134937, "loss": 102.4601, "step": 558 }, { "epoch": 0.30154074373756784, "grad_norm": 51.19554138183594, "learning_rate": 0.00019653149578883482, "loss": 94.7273, "step": 559 }, { "epoch": 0.3020801726172415, "grad_norm": 50.297447204589844, "learning_rate": 0.00019651645447632437, "loss": 85.4999, "step": 560 }, { "epoch": 0.30261960149691514, "grad_norm": 43.541648864746094, "learning_rate": 0.00019650138119879952, "loss": 84.9936, "step": 561 }, { "epoch": 0.3031590303765888, "grad_norm": 30.611860275268555, "learning_rate": 0.00019648627596125233, "loss": 68.3871, "step": 562 }, { "epoch": 0.30369845925626243, "grad_norm": 18.373859405517578, "learning_rate": 0.00019647113876868546, "loss": 64.1806, "step": 563 }, { "epoch": 0.30423788813593605, "grad_norm": 17.967041015625, "learning_rate": 0.00019645596962611218, "loss": 58.1967, "step": 564 }, { "epoch": 0.30477731701560973, "grad_norm": 17.57683563232422, "learning_rate": 0.00019644076853855626, "loss": 48.7426, "step": 565 }, { "epoch": 0.30531674589528335, "grad_norm": 24.4635066986084, "learning_rate": 0.00019642553551105219, "loss": 45.5702, "step": 566 }, { "epoch": 0.305856174774957, "grad_norm": 44.31038284301758, "learning_rate": 0.0001964102705486449, "loss": 36.4538, "step": 567 }, { "epoch": 0.30639560365463064, "grad_norm": 45.66762924194336, "learning_rate": 0.00019639497365638993, "loss": 37.6228, "step": 568 }, { "epoch": 0.3069350325343043, "grad_norm": 45.2806282043457, "learning_rate": 0.00019637964483935346, "loss": 47.7514, "step": 569 }, { "epoch": 0.30747446141397794, "grad_norm": 44.627296447753906, "learning_rate": 0.00019636428410261218, "loss": 50.5934, "step": 570 }, { "epoch": 0.3080138902936516, "grad_norm": 39.8631706237793, "learning_rate": 0.00019634889145125336, "loss": 33.2035, "step": 571 }, { "epoch": 0.30855331917332524, "grad_norm": 43.88326644897461, "learning_rate": 0.00019633346689037486, "loss": 44.4418, "step": 572 }, { "epoch": 0.3090927480529989, "grad_norm": 31.599515914916992, "learning_rate": 0.0001963180104250851, "loss": 29.8656, "step": 573 }, { "epoch": 0.30963217693267253, "grad_norm": 29.062061309814453, "learning_rate": 0.00019630252206050307, "loss": 29.4416, "step": 574 }, { "epoch": 0.31017160581234615, "grad_norm": 35.07856750488281, "learning_rate": 0.00019628700180175833, "loss": 33.663, "step": 575 }, { "epoch": 0.3107110346920198, "grad_norm": 38.65933609008789, "learning_rate": 0.00019627144965399094, "loss": 43.6982, "step": 576 }, { "epoch": 0.31125046357169345, "grad_norm": 36.53346252441406, "learning_rate": 0.0001962558656223516, "loss": 41.9741, "step": 577 }, { "epoch": 0.3117898924513671, "grad_norm": 50.61214065551758, "learning_rate": 0.00019624024971200154, "loss": 31.3103, "step": 578 }, { "epoch": 0.31232932133104074, "grad_norm": 39.70477294921875, "learning_rate": 0.00019622460192811255, "loss": 40.1001, "step": 579 }, { "epoch": 0.3128687502107144, "grad_norm": 43.24115753173828, "learning_rate": 0.000196208922275867, "loss": 38.9648, "step": 580 }, { "epoch": 0.31340817909038804, "grad_norm": 49.614410400390625, "learning_rate": 0.00019619321076045778, "loss": 38.396, "step": 581 }, { "epoch": 0.3139476079700617, "grad_norm": 38.65335464477539, "learning_rate": 0.0001961774673870883, "loss": 33.8401, "step": 582 }, { "epoch": 0.31448703684973534, "grad_norm": 36.919837951660156, "learning_rate": 0.00019616169216097262, "loss": 40.8598, "step": 583 }, { "epoch": 0.315026465729409, "grad_norm": 34.90658187866211, "learning_rate": 0.00019614588508733524, "loss": 26.7875, "step": 584 }, { "epoch": 0.31556589460908263, "grad_norm": 36.6773796081543, "learning_rate": 0.00019613004617141132, "loss": 38.7512, "step": 585 }, { "epoch": 0.3161053234887563, "grad_norm": 38.80603790283203, "learning_rate": 0.00019611417541844645, "loss": 22.4567, "step": 586 }, { "epoch": 0.3166447523684299, "grad_norm": 39.85905838012695, "learning_rate": 0.00019609827283369687, "loss": 34.7722, "step": 587 }, { "epoch": 0.31718418124810355, "grad_norm": 42.714210510253906, "learning_rate": 0.00019608233842242925, "loss": 29.6514, "step": 588 }, { "epoch": 0.3177236101277772, "grad_norm": 28.49331283569336, "learning_rate": 0.00019606637218992092, "loss": 32.2811, "step": 589 }, { "epoch": 0.31826303900745084, "grad_norm": 38.48284912109375, "learning_rate": 0.0001960503741414597, "loss": 19.4347, "step": 590 }, { "epoch": 0.3188024678871245, "grad_norm": 40.46686553955078, "learning_rate": 0.00019603434428234389, "loss": 36.0755, "step": 591 }, { "epoch": 0.31934189676679814, "grad_norm": 33.52849578857422, "learning_rate": 0.00019601828261788236, "loss": 23.4967, "step": 592 }, { "epoch": 0.3198813256464718, "grad_norm": 36.89003372192383, "learning_rate": 0.0001960021891533946, "loss": 17.4822, "step": 593 }, { "epoch": 0.32042075452614543, "grad_norm": 47.023624420166016, "learning_rate": 0.00019598606389421055, "loss": 26.3533, "step": 594 }, { "epoch": 0.3209601834058191, "grad_norm": 53.969627380371094, "learning_rate": 0.00019596990684567063, "loss": 36.3338, "step": 595 }, { "epoch": 0.32149961228549273, "grad_norm": 31.71206283569336, "learning_rate": 0.00019595371801312588, "loss": 23.1099, "step": 596 }, { "epoch": 0.3220390411651664, "grad_norm": 34.602901458740234, "learning_rate": 0.00019593749740193784, "loss": 20.7281, "step": 597 }, { "epoch": 0.32257847004484, "grad_norm": 32.23836135864258, "learning_rate": 0.00019592124501747855, "loss": 19.1565, "step": 598 }, { "epoch": 0.32311789892451365, "grad_norm": 31.762807846069336, "learning_rate": 0.00019590496086513063, "loss": 20.822, "step": 599 }, { "epoch": 0.3236573278041873, "grad_norm": 38.77958297729492, "learning_rate": 0.00019588864495028712, "loss": 20.7172, "step": 600 }, { "epoch": 0.3236573278041873, "eval_loss": 1.5770864486694336, "eval_runtime": 140.3936, "eval_samples_per_second": 2.13, "eval_steps_per_second": 2.13, "step": 600 } ], "logging_steps": 1, "max_steps": 5559, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0696873835715625e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }