| { | |
| "best_metric": 1.5770864486694336, | |
| "best_model_checkpoint": "miner_id_24/checkpoint-600", | |
| "epoch": 0.3236573278041873, | |
| "eval_steps": 200, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005394288796736456, | |
| "grad_norm": 18.71552085876465, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 57.3329, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0005394288796736456, | |
| "eval_loss": 4.576467514038086, | |
| "eval_runtime": 141.0154, | |
| "eval_samples_per_second": 2.12, | |
| "eval_steps_per_second": 2.12, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0010788577593472911, | |
| "grad_norm": 40.17718505859375, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 111.721, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0016182866390209367, | |
| "grad_norm": 55.62163162231445, | |
| "learning_rate": 6e-06, | |
| "loss": 145.8098, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0021577155186945822, | |
| "grad_norm": 70.09906005859375, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 176.5399, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0026971443983682276, | |
| "grad_norm": 96.45822143554688, | |
| "learning_rate": 1e-05, | |
| "loss": 205.6804, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0032365732780418733, | |
| "grad_norm": 96.46897888183594, | |
| "learning_rate": 1.2e-05, | |
| "loss": 191.1242, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0037760021577155187, | |
| "grad_norm": 123.18101501464844, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 200.1216, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0043154310373891645, | |
| "grad_norm": 112.75751495361328, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 199.3468, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.00485485991706281, | |
| "grad_norm": 105.84030151367188, | |
| "learning_rate": 1.8e-05, | |
| "loss": 197.7578, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.005394288796736455, | |
| "grad_norm": 152.0435333251953, | |
| "learning_rate": 2e-05, | |
| "loss": 221.8745, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0059337176764101005, | |
| "grad_norm": 140.9628143310547, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 202.1205, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.006473146556083747, | |
| "grad_norm": 136.8531036376953, | |
| "learning_rate": 2.4e-05, | |
| "loss": 192.207, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.007012575435757392, | |
| "grad_norm": 135.1580352783203, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 188.5981, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.007552004315431037, | |
| "grad_norm": 135.94815063476562, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 182.9973, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.008091433195104683, | |
| "grad_norm": 130.7935333251953, | |
| "learning_rate": 3e-05, | |
| "loss": 181.9996, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.008630862074778329, | |
| "grad_norm": 135.71165466308594, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 156.7745, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.009170290954451973, | |
| "grad_norm": 80.55735778808594, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 105.2249, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.00970971983412562, | |
| "grad_norm": 78.56623840332031, | |
| "learning_rate": 3.6e-05, | |
| "loss": 93.8699, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.010249148713799266, | |
| "grad_norm": 73.5405502319336, | |
| "learning_rate": 3.8e-05, | |
| "loss": 96.5256, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.01078857759347291, | |
| "grad_norm": 66.16717529296875, | |
| "learning_rate": 4e-05, | |
| "loss": 79.6901, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.011328006473146556, | |
| "grad_norm": 65.6923599243164, | |
| "learning_rate": 4.2e-05, | |
| "loss": 90.1881, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.011867435352820201, | |
| "grad_norm": 77.53053283691406, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 85.8049, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.012406864232493847, | |
| "grad_norm": 71.17222595214844, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 64.6935, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.012946293112167493, | |
| "grad_norm": 46.50193786621094, | |
| "learning_rate": 4.8e-05, | |
| "loss": 72.2138, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.013485721991841138, | |
| "grad_norm": 45.66022491455078, | |
| "learning_rate": 5e-05, | |
| "loss": 71.2709, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.014025150871514784, | |
| "grad_norm": 46.14365768432617, | |
| "learning_rate": 5.2000000000000004e-05, | |
| "loss": 59.7781, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.014564579751188429, | |
| "grad_norm": 54.284664154052734, | |
| "learning_rate": 5.4000000000000005e-05, | |
| "loss": 64.8576, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.015104008630862075, | |
| "grad_norm": 43.3782958984375, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 68.6312, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.01564343751053572, | |
| "grad_norm": 35.549217224121094, | |
| "learning_rate": 5.8e-05, | |
| "loss": 62.3088, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.016182866390209365, | |
| "grad_norm": 42.21353530883789, | |
| "learning_rate": 6e-05, | |
| "loss": 65.5001, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01672229526988301, | |
| "grad_norm": 46.08031463623047, | |
| "learning_rate": 6.2e-05, | |
| "loss": 57.0952, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.017261724149556658, | |
| "grad_norm": 38.45962905883789, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 65.1331, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.017801153029230302, | |
| "grad_norm": 34.330406188964844, | |
| "learning_rate": 6.6e-05, | |
| "loss": 56.026, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.018340581908903947, | |
| "grad_norm": 35.08675003051758, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 55.9277, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.018880010788577595, | |
| "grad_norm": 37.337825775146484, | |
| "learning_rate": 7e-05, | |
| "loss": 53.9387, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.01941943966825124, | |
| "grad_norm": 36.146873474121094, | |
| "learning_rate": 7.2e-05, | |
| "loss": 61.2999, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.019958868547924884, | |
| "grad_norm": 41.229610443115234, | |
| "learning_rate": 7.4e-05, | |
| "loss": 70.8618, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.02049829742759853, | |
| "grad_norm": 42.86275863647461, | |
| "learning_rate": 7.6e-05, | |
| "loss": 60.1886, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.021037726307272176, | |
| "grad_norm": 36.5433235168457, | |
| "learning_rate": 7.800000000000001e-05, | |
| "loss": 61.5439, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.02157715518694582, | |
| "grad_norm": 39.95774841308594, | |
| "learning_rate": 8e-05, | |
| "loss": 57.8462, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.022116584066619465, | |
| "grad_norm": 38.86470413208008, | |
| "learning_rate": 8.2e-05, | |
| "loss": 55.4324, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.022656012946293113, | |
| "grad_norm": 30.977352142333984, | |
| "learning_rate": 8.4e-05, | |
| "loss": 57.6402, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.023195441825966757, | |
| "grad_norm": 38.25783157348633, | |
| "learning_rate": 8.6e-05, | |
| "loss": 50.586, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.023734870705640402, | |
| "grad_norm": 37.11707305908203, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 36.7947, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.02427429958531405, | |
| "grad_norm": 40.3302116394043, | |
| "learning_rate": 9e-05, | |
| "loss": 40.2388, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.024813728464987694, | |
| "grad_norm": 42.60755920410156, | |
| "learning_rate": 9.200000000000001e-05, | |
| "loss": 58.9665, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.02535315734466134, | |
| "grad_norm": 44.4195442199707, | |
| "learning_rate": 9.4e-05, | |
| "loss": 50.2349, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.025892586224334987, | |
| "grad_norm": 37.404727935791016, | |
| "learning_rate": 9.6e-05, | |
| "loss": 48.7437, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.02643201510400863, | |
| "grad_norm": 48.31377410888672, | |
| "learning_rate": 9.8e-05, | |
| "loss": 54.4895, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.026971443983682276, | |
| "grad_norm": 51.360191345214844, | |
| "learning_rate": 0.0001, | |
| "loss": 57.4654, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02751087286335592, | |
| "grad_norm": 23.211647033691406, | |
| "learning_rate": 0.00010200000000000001, | |
| "loss": 36.3068, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.028050301743029568, | |
| "grad_norm": 34.541805267333984, | |
| "learning_rate": 0.00010400000000000001, | |
| "loss": 75.6809, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.028589730622703213, | |
| "grad_norm": 42.0761833190918, | |
| "learning_rate": 0.00010600000000000002, | |
| "loss": 96.818, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.029129159502376857, | |
| "grad_norm": 43.26933670043945, | |
| "learning_rate": 0.00010800000000000001, | |
| "loss": 101.1451, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.029668588382050505, | |
| "grad_norm": 51.45765686035156, | |
| "learning_rate": 0.00011000000000000002, | |
| "loss": 115.0704, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.03020801726172415, | |
| "grad_norm": 43.3838005065918, | |
| "learning_rate": 0.00011200000000000001, | |
| "loss": 112.4747, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.030747446141397794, | |
| "grad_norm": 59.83226013183594, | |
| "learning_rate": 0.00011399999999999999, | |
| "loss": 114.741, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.03128687502107144, | |
| "grad_norm": 38.54649353027344, | |
| "learning_rate": 0.000116, | |
| "loss": 100.9508, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.03182630390074508, | |
| "grad_norm": 34.7606086730957, | |
| "learning_rate": 0.000118, | |
| "loss": 87.4097, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.03236573278041873, | |
| "grad_norm": 34.808265686035156, | |
| "learning_rate": 0.00012, | |
| "loss": 94.2411, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03290516166009238, | |
| "grad_norm": 33.40951156616211, | |
| "learning_rate": 0.000122, | |
| "loss": 85.6042, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.03344459053976602, | |
| "grad_norm": 25.83111572265625, | |
| "learning_rate": 0.000124, | |
| "loss": 79.6747, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.03398401941943967, | |
| "grad_norm": 51.73832321166992, | |
| "learning_rate": 0.000126, | |
| "loss": 65.953, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.034523448299113316, | |
| "grad_norm": 38.63320541381836, | |
| "learning_rate": 0.00012800000000000002, | |
| "loss": 72.4182, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.03506287717878696, | |
| "grad_norm": 20.10302734375, | |
| "learning_rate": 0.00013000000000000002, | |
| "loss": 61.2385, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.035602306058460605, | |
| "grad_norm": 27.804248809814453, | |
| "learning_rate": 0.000132, | |
| "loss": 59.6029, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.03614173493813425, | |
| "grad_norm": 30.542932510375977, | |
| "learning_rate": 0.000134, | |
| "loss": 51.2451, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.036681163817807894, | |
| "grad_norm": 70.11331176757812, | |
| "learning_rate": 0.00013600000000000003, | |
| "loss": 51.116, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.03722059269748154, | |
| "grad_norm": 155.8134307861328, | |
| "learning_rate": 0.000138, | |
| "loss": 76.3231, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.03776002157715519, | |
| "grad_norm": 146.5844268798828, | |
| "learning_rate": 0.00014, | |
| "loss": 68.6173, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03829945045682883, | |
| "grad_norm": 102.16127014160156, | |
| "learning_rate": 0.000142, | |
| "loss": 72.0777, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.03883887933650248, | |
| "grad_norm": 40.04204559326172, | |
| "learning_rate": 0.000144, | |
| "loss": 52.744, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.039378308216176126, | |
| "grad_norm": 75.35163116455078, | |
| "learning_rate": 0.000146, | |
| "loss": 59.1683, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.03991773709584977, | |
| "grad_norm": 77.30841827392578, | |
| "learning_rate": 0.000148, | |
| "loss": 51.3059, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.040457165975523415, | |
| "grad_norm": 52.49984359741211, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 52.407, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04099659485519706, | |
| "grad_norm": 35.61119842529297, | |
| "learning_rate": 0.000152, | |
| "loss": 50.8863, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.041536023734870704, | |
| "grad_norm": 34.10403060913086, | |
| "learning_rate": 0.000154, | |
| "loss": 53.6901, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.04207545261454435, | |
| "grad_norm": 39.79935836791992, | |
| "learning_rate": 0.00015600000000000002, | |
| "loss": 49.8857, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.042614881494218, | |
| "grad_norm": 35.74922561645508, | |
| "learning_rate": 0.00015800000000000002, | |
| "loss": 62.577, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.04315431037389164, | |
| "grad_norm": 31.491291046142578, | |
| "learning_rate": 0.00016, | |
| "loss": 52.0815, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04369373925356529, | |
| "grad_norm": 23.866592407226562, | |
| "learning_rate": 0.000162, | |
| "loss": 64.6077, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.04423316813323893, | |
| "grad_norm": 28.5296688079834, | |
| "learning_rate": 0.000164, | |
| "loss": 59.5244, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.04477259701291258, | |
| "grad_norm": 33.92407989501953, | |
| "learning_rate": 0.000166, | |
| "loss": 62.9956, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.045312025892586226, | |
| "grad_norm": 27.05453109741211, | |
| "learning_rate": 0.000168, | |
| "loss": 52.9777, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.04585145477225987, | |
| "grad_norm": 23.927709579467773, | |
| "learning_rate": 0.00017, | |
| "loss": 56.0232, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.046390883651933515, | |
| "grad_norm": 31.250370025634766, | |
| "learning_rate": 0.000172, | |
| "loss": 55.1487, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.04693031253160716, | |
| "grad_norm": 32.98558044433594, | |
| "learning_rate": 0.000174, | |
| "loss": 54.3132, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.047469741411280804, | |
| "grad_norm": 39.15415954589844, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 56.2989, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.04800917029095445, | |
| "grad_norm": 32.42843246459961, | |
| "learning_rate": 0.00017800000000000002, | |
| "loss": 41.7672, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.0485485991706281, | |
| "grad_norm": 42.03153610229492, | |
| "learning_rate": 0.00018, | |
| "loss": 50.3046, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04908802805030174, | |
| "grad_norm": 38.14472961425781, | |
| "learning_rate": 0.000182, | |
| "loss": 50.2817, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.04962745692997539, | |
| "grad_norm": 32.74757385253906, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 47.9721, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.05016688580964904, | |
| "grad_norm": 41.20277404785156, | |
| "learning_rate": 0.00018600000000000002, | |
| "loss": 48.2985, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.05070631468932268, | |
| "grad_norm": 42.31992721557617, | |
| "learning_rate": 0.000188, | |
| "loss": 58.7386, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.051245743568996326, | |
| "grad_norm": 28.106618881225586, | |
| "learning_rate": 0.00019, | |
| "loss": 46.3057, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.051785172448669974, | |
| "grad_norm": 37.70038604736328, | |
| "learning_rate": 0.000192, | |
| "loss": 35.5874, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.052324601328343615, | |
| "grad_norm": 36.007530212402344, | |
| "learning_rate": 0.000194, | |
| "loss": 47.9065, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.05286403020801726, | |
| "grad_norm": 29.738492965698242, | |
| "learning_rate": 0.000196, | |
| "loss": 49.6222, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.05340345908769091, | |
| "grad_norm": 42.806785583496094, | |
| "learning_rate": 0.00019800000000000002, | |
| "loss": 44.6868, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.05394288796736455, | |
| "grad_norm": 31.359643936157227, | |
| "learning_rate": 0.0002, | |
| "loss": 30.5743, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0544823168470382, | |
| "grad_norm": 24.176820755004883, | |
| "learning_rate": 0.00019999998344063995, | |
| "loss": 41.8829, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.05502174572671184, | |
| "grad_norm": 43.5556755065918, | |
| "learning_rate": 0.00019999993376256528, | |
| "loss": 64.5931, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.05556117460638549, | |
| "grad_norm": 35.98505401611328, | |
| "learning_rate": 0.00019999985096579245, | |
| "loss": 94.4231, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.056100603486059136, | |
| "grad_norm": 35.83631134033203, | |
| "learning_rate": 0.00019999973505034887, | |
| "loss": 113.3877, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.05664003236573278, | |
| "grad_norm": 30.29425621032715, | |
| "learning_rate": 0.00019999958601627296, | |
| "loss": 113.0325, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.057179461245406425, | |
| "grad_norm": 27.389789581298828, | |
| "learning_rate": 0.000199999403863614, | |
| "loss": 111.3191, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.05771889012508007, | |
| "grad_norm": 27.400251388549805, | |
| "learning_rate": 0.00019999918859243244, | |
| "loss": 97.0415, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.058258319004753714, | |
| "grad_norm": 20.399946212768555, | |
| "learning_rate": 0.0001999989402027995, | |
| "loss": 90.2641, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.05879774788442736, | |
| "grad_norm": 25.029308319091797, | |
| "learning_rate": 0.0001999986586947974, | |
| "loss": 94.4251, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.05933717676410101, | |
| "grad_norm": 29.495418548583984, | |
| "learning_rate": 0.00019999834406851945, | |
| "loss": 94.9159, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.05987660564377465, | |
| "grad_norm": 19.77571678161621, | |
| "learning_rate": 0.0001999979963240698, | |
| "loss": 75.4925, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.0604160345234483, | |
| "grad_norm": 25.004566192626953, | |
| "learning_rate": 0.00019999761546156365, | |
| "loss": 71.3454, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.06095546340312195, | |
| "grad_norm": 34.21379852294922, | |
| "learning_rate": 0.00019999720148112715, | |
| "loss": 66.511, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.06149489228279559, | |
| "grad_norm": 22.71439552307129, | |
| "learning_rate": 0.00019999675438289738, | |
| "loss": 52.0498, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.062034321162469236, | |
| "grad_norm": 24.381750106811523, | |
| "learning_rate": 0.0001999962741670224, | |
| "loss": 55.2827, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.06257375004214288, | |
| "grad_norm": 37.246803283691406, | |
| "learning_rate": 0.00019999576083366125, | |
| "loss": 54.9355, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.06311317892181653, | |
| "grad_norm": 81.53564453125, | |
| "learning_rate": 0.00019999521438298398, | |
| "loss": 59.4422, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.06365260780149017, | |
| "grad_norm": 129.4823760986328, | |
| "learning_rate": 0.00019999463481517156, | |
| "loss": 67.393, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.06419203668116381, | |
| "grad_norm": 77.96698760986328, | |
| "learning_rate": 0.00019999402213041588, | |
| "loss": 67.9443, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.06473146556083746, | |
| "grad_norm": 53.094512939453125, | |
| "learning_rate": 0.0001999933763289199, | |
| "loss": 61.054, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06527089444051111, | |
| "grad_norm": 52.896366119384766, | |
| "learning_rate": 0.00019999269741089752, | |
| "loss": 62.3436, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.06581032332018476, | |
| "grad_norm": 57.282318115234375, | |
| "learning_rate": 0.00019999198537657353, | |
| "loss": 56.6129, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.0663497521998584, | |
| "grad_norm": 46.553062438964844, | |
| "learning_rate": 0.0001999912402261838, | |
| "loss": 55.701, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.06688918107953204, | |
| "grad_norm": 28.822669982910156, | |
| "learning_rate": 0.00019999046195997512, | |
| "loss": 54.2102, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.06742860995920569, | |
| "grad_norm": 28.726089477539062, | |
| "learning_rate": 0.00019998965057820516, | |
| "loss": 56.0332, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.06796803883887934, | |
| "grad_norm": 26.886003494262695, | |
| "learning_rate": 0.0001999888060811427, | |
| "loss": 43.4516, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.06850746771855298, | |
| "grad_norm": 31.9282169342041, | |
| "learning_rate": 0.00019998792846906747, | |
| "loss": 52.2149, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.06904689659822663, | |
| "grad_norm": 38.317962646484375, | |
| "learning_rate": 0.00019998701774227005, | |
| "loss": 54.0044, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.06958632547790028, | |
| "grad_norm": 31.158544540405273, | |
| "learning_rate": 0.00019998607390105209, | |
| "loss": 55.2255, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.07012575435757391, | |
| "grad_norm": 33.239166259765625, | |
| "learning_rate": 0.00019998509694572615, | |
| "loss": 56.3811, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07066518323724756, | |
| "grad_norm": 30.34086799621582, | |
| "learning_rate": 0.00019998408687661582, | |
| "loss": 52.0529, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.07120461211692121, | |
| "grad_norm": 24.05341911315918, | |
| "learning_rate": 0.00019998304369405563, | |
| "loss": 60.5602, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.07174404099659486, | |
| "grad_norm": 26.90273094177246, | |
| "learning_rate": 0.00019998196739839103, | |
| "loss": 57.3375, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.0722834698762685, | |
| "grad_norm": 24.157773971557617, | |
| "learning_rate": 0.0001999808579899785, | |
| "loss": 47.7251, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.07282289875594215, | |
| "grad_norm": 28.088014602661133, | |
| "learning_rate": 0.00019997971546918545, | |
| "loss": 56.1037, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.07336232763561579, | |
| "grad_norm": 32.39021682739258, | |
| "learning_rate": 0.00019997853983639029, | |
| "loss": 52.0922, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.07390175651528944, | |
| "grad_norm": 29.597578048706055, | |
| "learning_rate": 0.0001999773310919824, | |
| "loss": 46.3537, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.07444118539496308, | |
| "grad_norm": 38.31181335449219, | |
| "learning_rate": 0.000199976089236362, | |
| "loss": 46.8711, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.07498061427463673, | |
| "grad_norm": 39.67713165283203, | |
| "learning_rate": 0.00019997481426994044, | |
| "loss": 45.0961, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.07552004315431038, | |
| "grad_norm": 48.8436164855957, | |
| "learning_rate": 0.00019997350619314, | |
| "loss": 48.7547, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.07605947203398401, | |
| "grad_norm": 88.95709991455078, | |
| "learning_rate": 0.00019997216500639383, | |
| "loss": 50.3681, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.07659890091365766, | |
| "grad_norm": 34.2819938659668, | |
| "learning_rate": 0.0001999707907101462, | |
| "loss": 44.3903, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.07713832979333131, | |
| "grad_norm": 42.79631042480469, | |
| "learning_rate": 0.00019996938330485217, | |
| "loss": 31.0566, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.07767775867300496, | |
| "grad_norm": 37.28693389892578, | |
| "learning_rate": 0.00019996794279097791, | |
| "loss": 34.0999, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.0782171875526786, | |
| "grad_norm": 43.65718460083008, | |
| "learning_rate": 0.00019996646916900051, | |
| "loss": 48.7369, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.07875661643235225, | |
| "grad_norm": 39.86713409423828, | |
| "learning_rate": 0.00019996496243940794, | |
| "loss": 36.6841, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.07929604531202589, | |
| "grad_norm": 32.35002899169922, | |
| "learning_rate": 0.0001999634226026993, | |
| "loss": 43.1344, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.07983547419169953, | |
| "grad_norm": 36.14616775512695, | |
| "learning_rate": 0.0001999618496593845, | |
| "loss": 51.9779, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.08037490307137318, | |
| "grad_norm": 31.071197509765625, | |
| "learning_rate": 0.00019996024360998456, | |
| "loss": 39.5621, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.08091433195104683, | |
| "grad_norm": 33.61774444580078, | |
| "learning_rate": 0.00019995860445503127, | |
| "loss": 37.7614, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08145376083072048, | |
| "grad_norm": 22.93950653076172, | |
| "learning_rate": 0.00019995693219506758, | |
| "loss": 59.2331, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.08199318971039413, | |
| "grad_norm": 31.307132720947266, | |
| "learning_rate": 0.00019995522683064726, | |
| "loss": 70.8054, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.08253261859006776, | |
| "grad_norm": 28.894466400146484, | |
| "learning_rate": 0.00019995348836233516, | |
| "loss": 84.8097, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.08307204746974141, | |
| "grad_norm": 26.76435661315918, | |
| "learning_rate": 0.000199951716790707, | |
| "loss": 101.4707, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.08361147634941506, | |
| "grad_norm": 26.842918395996094, | |
| "learning_rate": 0.00019994991211634954, | |
| "loss": 107.518, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.0841509052290887, | |
| "grad_norm": 25.251588821411133, | |
| "learning_rate": 0.00019994807433986047, | |
| "loss": 106.076, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.08469033410876235, | |
| "grad_norm": 28.60271453857422, | |
| "learning_rate": 0.0001999462034618484, | |
| "loss": 96.3093, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.085229762988436, | |
| "grad_norm": 22.537473678588867, | |
| "learning_rate": 0.00019994429948293291, | |
| "loss": 88.6475, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.08576919186810963, | |
| "grad_norm": 18.868396759033203, | |
| "learning_rate": 0.00019994236240374465, | |
| "loss": 92.4222, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.08630862074778328, | |
| "grad_norm": 21.84971046447754, | |
| "learning_rate": 0.00019994039222492513, | |
| "loss": 88.0079, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.08684804962745693, | |
| "grad_norm": 23.634244918823242, | |
| "learning_rate": 0.00019993838894712682, | |
| "loss": 77.0574, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.08738747850713058, | |
| "grad_norm": 18.22877311706543, | |
| "learning_rate": 0.00019993635257101322, | |
| "loss": 67.3958, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.08792690738680423, | |
| "grad_norm": 21.62260627746582, | |
| "learning_rate": 0.00019993428309725872, | |
| "loss": 65.1832, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.08846633626647786, | |
| "grad_norm": 18.148618698120117, | |
| "learning_rate": 0.0001999321805265487, | |
| "loss": 63.1231, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.08900576514615151, | |
| "grad_norm": 20.20022201538086, | |
| "learning_rate": 0.00019993004485957956, | |
| "loss": 59.0852, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.08954519402582516, | |
| "grad_norm": 28.2082576751709, | |
| "learning_rate": 0.00019992787609705853, | |
| "loss": 55.8505, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.0900846229054988, | |
| "grad_norm": 43.48365020751953, | |
| "learning_rate": 0.00019992567423970394, | |
| "loss": 40.495, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.09062405178517245, | |
| "grad_norm": 149.13955688476562, | |
| "learning_rate": 0.00019992343928824498, | |
| "loss": 91.8388, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.0911634806648461, | |
| "grad_norm": 91.07251739501953, | |
| "learning_rate": 0.00019992117124342183, | |
| "loss": 61.9425, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.09170290954451973, | |
| "grad_norm": 65.70806121826172, | |
| "learning_rate": 0.00019991887010598565, | |
| "loss": 59.7979, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09224233842419338, | |
| "grad_norm": 45.109580993652344, | |
| "learning_rate": 0.00019991653587669855, | |
| "loss": 63.235, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.09278176730386703, | |
| "grad_norm": 49.24695587158203, | |
| "learning_rate": 0.00019991416855633364, | |
| "loss": 55.8371, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.09332119618354068, | |
| "grad_norm": 44.50947952270508, | |
| "learning_rate": 0.0001999117681456749, | |
| "loss": 45.3712, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.09386062506321433, | |
| "grad_norm": 45.105506896972656, | |
| "learning_rate": 0.00019990933464551728, | |
| "loss": 59.354, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.09440005394288797, | |
| "grad_norm": 31.862106323242188, | |
| "learning_rate": 0.0001999068680566668, | |
| "loss": 49.2883, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.09493948282256161, | |
| "grad_norm": 34.86188507080078, | |
| "learning_rate": 0.00019990436837994028, | |
| "loss": 40.9445, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.09547891170223526, | |
| "grad_norm": 52.34774398803711, | |
| "learning_rate": 0.00019990183561616567, | |
| "loss": 54.3114, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.0960183405819089, | |
| "grad_norm": 30.12732696533203, | |
| "learning_rate": 0.00019989926976618172, | |
| "loss": 44.8966, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.09655776946158255, | |
| "grad_norm": 29.296287536621094, | |
| "learning_rate": 0.00019989667083083825, | |
| "loss": 47.5101, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.0970971983412562, | |
| "grad_norm": 42.42873764038086, | |
| "learning_rate": 0.00019989403881099597, | |
| "loss": 48.2378, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.09763662722092983, | |
| "grad_norm": 31.62274742126465, | |
| "learning_rate": 0.00019989137370752657, | |
| "loss": 42.1564, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.09817605610060348, | |
| "grad_norm": 30.754499435424805, | |
| "learning_rate": 0.00019988867552131275, | |
| "loss": 52.2929, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.09871548498027713, | |
| "grad_norm": 31.932157516479492, | |
| "learning_rate": 0.000199885944253248, | |
| "loss": 45.6226, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.09925491385995078, | |
| "grad_norm": 33.754722595214844, | |
| "learning_rate": 0.00019988317990423703, | |
| "loss": 39.9572, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.09979434273962443, | |
| "grad_norm": 33.33165740966797, | |
| "learning_rate": 0.00019988038247519522, | |
| "loss": 52.7357, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.10033377161929807, | |
| "grad_norm": 28.355619430541992, | |
| "learning_rate": 0.0001998775519670491, | |
| "loss": 39.8865, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.10087320049897171, | |
| "grad_norm": 60.16803741455078, | |
| "learning_rate": 0.00019987468838073613, | |
| "loss": 48.3595, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.10141262937864536, | |
| "grad_norm": 33.5135498046875, | |
| "learning_rate": 0.00019987179171720464, | |
| "loss": 34.3803, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.101952058258319, | |
| "grad_norm": 33.8374137878418, | |
| "learning_rate": 0.00019986886197741403, | |
| "loss": 46.4517, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.10249148713799265, | |
| "grad_norm": 26.143709182739258, | |
| "learning_rate": 0.0001998658991623345, | |
| "loss": 30.6351, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1030309160176663, | |
| "grad_norm": 28.791723251342773, | |
| "learning_rate": 0.0001998629032729474, | |
| "loss": 44.2275, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.10357034489733995, | |
| "grad_norm": 33.818931579589844, | |
| "learning_rate": 0.00019985987431024485, | |
| "loss": 43.5677, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.10410977377701358, | |
| "grad_norm": 40.07392883300781, | |
| "learning_rate": 0.00019985681227523006, | |
| "loss": 34.5844, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.10464920265668723, | |
| "grad_norm": 30.963062286376953, | |
| "learning_rate": 0.00019985371716891708, | |
| "loss": 44.1099, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.10518863153636088, | |
| "grad_norm": 31.774293899536133, | |
| "learning_rate": 0.000199850588992331, | |
| "loss": 36.4496, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.10572806041603452, | |
| "grad_norm": 47.396575927734375, | |
| "learning_rate": 0.00019984742774650785, | |
| "loss": 50.9736, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.10626748929570817, | |
| "grad_norm": 58.573341369628906, | |
| "learning_rate": 0.00019984423343249457, | |
| "loss": 44.6643, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.10680691817538182, | |
| "grad_norm": 33.57207107543945, | |
| "learning_rate": 0.00019984100605134906, | |
| "loss": 36.4154, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.10734634705505545, | |
| "grad_norm": 33.817752838134766, | |
| "learning_rate": 0.00019983774560414027, | |
| "loss": 38.8474, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.1078857759347291, | |
| "grad_norm": 34.572608947753906, | |
| "learning_rate": 0.00019983445209194791, | |
| "loss": 30.1009, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1078857759347291, | |
| "eval_loss": 1.836081624031067, | |
| "eval_runtime": 141.0356, | |
| "eval_samples_per_second": 2.12, | |
| "eval_steps_per_second": 2.12, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10842520481440275, | |
| "grad_norm": 23.590002059936523, | |
| "learning_rate": 0.0001998311255158628, | |
| "loss": 53.9458, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.1089646336940764, | |
| "grad_norm": 39.737159729003906, | |
| "learning_rate": 0.00019982776587698666, | |
| "loss": 85.7514, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.10950406257375005, | |
| "grad_norm": 35.41561508178711, | |
| "learning_rate": 0.00019982437317643217, | |
| "loss": 84.9662, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.11004349145342368, | |
| "grad_norm": 31.39605140686035, | |
| "learning_rate": 0.0001998209474153229, | |
| "loss": 110.0561, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.11058292033309733, | |
| "grad_norm": 30.160261154174805, | |
| "learning_rate": 0.00019981748859479348, | |
| "loss": 101.1574, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.11112234921277098, | |
| "grad_norm": 33.4417724609375, | |
| "learning_rate": 0.00019981399671598939, | |
| "loss": 116.0456, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.11166177809244462, | |
| "grad_norm": 34.16884994506836, | |
| "learning_rate": 0.0001998104717800671, | |
| "loss": 103.0287, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.11220120697211827, | |
| "grad_norm": 33.58393859863281, | |
| "learning_rate": 0.00019980691378819406, | |
| "loss": 95.5024, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.11274063585179192, | |
| "grad_norm": 29.785871505737305, | |
| "learning_rate": 0.00019980332274154857, | |
| "loss": 91.5854, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.11328006473146555, | |
| "grad_norm": 29.184667587280273, | |
| "learning_rate": 0.00019979969864131997, | |
| "loss": 86.9138, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1138194936111392, | |
| "grad_norm": 25.164024353027344, | |
| "learning_rate": 0.00019979604148870854, | |
| "loss": 72.7827, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.11435892249081285, | |
| "grad_norm": 18.179292678833008, | |
| "learning_rate": 0.00019979235128492545, | |
| "loss": 67.364, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.1148983513704865, | |
| "grad_norm": 20.353260040283203, | |
| "learning_rate": 0.00019978862803119284, | |
| "loss": 60.0141, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.11543778025016015, | |
| "grad_norm": 27.25603485107422, | |
| "learning_rate": 0.00019978487172874382, | |
| "loss": 61.8063, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.1159772091298338, | |
| "grad_norm": 40.56468963623047, | |
| "learning_rate": 0.00019978108237882244, | |
| "loss": 51.2483, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.11651663800950743, | |
| "grad_norm": 64.65696716308594, | |
| "learning_rate": 0.00019977725998268365, | |
| "loss": 37.8312, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.11705606688918108, | |
| "grad_norm": 80.94468688964844, | |
| "learning_rate": 0.00019977340454159343, | |
| "loss": 55.2775, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.11759549576885472, | |
| "grad_norm": 100.61930084228516, | |
| "learning_rate": 0.00019976951605682862, | |
| "loss": 65.5767, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.11813492464852837, | |
| "grad_norm": 71.5768051147461, | |
| "learning_rate": 0.00019976559452967703, | |
| "loss": 57.5296, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.11867435352820202, | |
| "grad_norm": 37.10725021362305, | |
| "learning_rate": 0.00019976163996143745, | |
| "loss": 48.8497, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.11921378240787567, | |
| "grad_norm": 40.85627746582031, | |
| "learning_rate": 0.00019975765235341955, | |
| "loss": 47.6466, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.1197532112875493, | |
| "grad_norm": 55.1395263671875, | |
| "learning_rate": 0.000199753631706944, | |
| "loss": 60.2519, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.12029264016722295, | |
| "grad_norm": 42.060585021972656, | |
| "learning_rate": 0.00019974957802334234, | |
| "loss": 48.1031, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.1208320690468966, | |
| "grad_norm": 36.57340621948242, | |
| "learning_rate": 0.00019974549130395713, | |
| "loss": 43.3995, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.12137149792657025, | |
| "grad_norm": 31.497970581054688, | |
| "learning_rate": 0.0001997413715501419, | |
| "loss": 41.1591, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.1219109268062439, | |
| "grad_norm": 30.481502532958984, | |
| "learning_rate": 0.00019973721876326094, | |
| "loss": 38.0712, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.12245035568591753, | |
| "grad_norm": 38.2381477355957, | |
| "learning_rate": 0.00019973303294468968, | |
| "loss": 46.3861, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.12298978456559118, | |
| "grad_norm": 37.4508171081543, | |
| "learning_rate": 0.0001997288140958144, | |
| "loss": 49.3107, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.12352921344526482, | |
| "grad_norm": 37.3139533996582, | |
| "learning_rate": 0.0001997245622180323, | |
| "loss": 43.1914, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.12406864232493847, | |
| "grad_norm": 35.13384246826172, | |
| "learning_rate": 0.0001997202773127516, | |
| "loss": 45.7228, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.12460807120461212, | |
| "grad_norm": 37.45779037475586, | |
| "learning_rate": 0.00019971595938139135, | |
| "loss": 45.0848, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.12514750008428577, | |
| "grad_norm": 37.03962707519531, | |
| "learning_rate": 0.00019971160842538162, | |
| "loss": 46.3705, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.12568692896395942, | |
| "grad_norm": 30.98250389099121, | |
| "learning_rate": 0.0001997072244461634, | |
| "loss": 41.1065, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.12622635784363306, | |
| "grad_norm": 33.62482833862305, | |
| "learning_rate": 0.00019970280744518854, | |
| "loss": 41.8594, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.1267657867233067, | |
| "grad_norm": 45.488739013671875, | |
| "learning_rate": 0.00019969835742392, | |
| "loss": 38.6525, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.12730521560298033, | |
| "grad_norm": 43.84321594238281, | |
| "learning_rate": 0.0001996938743838315, | |
| "loss": 53.2114, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.12784464448265398, | |
| "grad_norm": 40.51958084106445, | |
| "learning_rate": 0.00019968935832640782, | |
| "loss": 50.4725, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.12838407336232763, | |
| "grad_norm": 35.1596794128418, | |
| "learning_rate": 0.00019968480925314458, | |
| "loss": 45.1618, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.12892350224200128, | |
| "grad_norm": 32.27614974975586, | |
| "learning_rate": 0.00019968022716554832, | |
| "loss": 38.2164, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.12946293112167492, | |
| "grad_norm": 33.67794418334961, | |
| "learning_rate": 0.00019967561206513668, | |
| "loss": 43.3203, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.13000236000134857, | |
| "grad_norm": 26.34979820251465, | |
| "learning_rate": 0.00019967096395343806, | |
| "loss": 32.1165, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.13054178888102222, | |
| "grad_norm": 33.10830307006836, | |
| "learning_rate": 0.00019966628283199186, | |
| "loss": 45.5207, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.13108121776069587, | |
| "grad_norm": 47.04872131347656, | |
| "learning_rate": 0.00019966156870234844, | |
| "loss": 44.7497, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.13162064664036952, | |
| "grad_norm": 38.99346160888672, | |
| "learning_rate": 0.000199656821566069, | |
| "loss": 43.9255, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.13216007552004316, | |
| "grad_norm": 29.892854690551758, | |
| "learning_rate": 0.00019965204142472574, | |
| "loss": 48.4896, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.1326995043997168, | |
| "grad_norm": 37.65726089477539, | |
| "learning_rate": 0.00019964722827990185, | |
| "loss": 37.7987, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.13323893327939046, | |
| "grad_norm": 41.673274993896484, | |
| "learning_rate": 0.00019964238213319134, | |
| "loss": 48.4095, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.13377836215906408, | |
| "grad_norm": 37.152793884277344, | |
| "learning_rate": 0.00019963750298619917, | |
| "loss": 33.8212, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.13431779103873773, | |
| "grad_norm": 43.92071533203125, | |
| "learning_rate": 0.00019963259084054128, | |
| "loss": 35.554, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.13485721991841138, | |
| "grad_norm": 39.161903381347656, | |
| "learning_rate": 0.0001996276456978445, | |
| "loss": 33.8096, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.13539664879808502, | |
| "grad_norm": 24.633363723754883, | |
| "learning_rate": 0.00019962266755974657, | |
| "loss": 46.0338, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.13593607767775867, | |
| "grad_norm": 54.83051300048828, | |
| "learning_rate": 0.00019961765642789625, | |
| "loss": 80.4599, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.13647550655743232, | |
| "grad_norm": 43.1768684387207, | |
| "learning_rate": 0.0001996126123039531, | |
| "loss": 84.3379, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.13701493543710597, | |
| "grad_norm": 24.49346160888672, | |
| "learning_rate": 0.00019960753518958772, | |
| "loss": 100.9898, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.13755436431677961, | |
| "grad_norm": 38.09309768676758, | |
| "learning_rate": 0.00019960242508648154, | |
| "loss": 101.0717, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.13809379319645326, | |
| "grad_norm": 40.072296142578125, | |
| "learning_rate": 0.00019959728199632699, | |
| "loss": 108.2131, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.1386332220761269, | |
| "grad_norm": 43.77210235595703, | |
| "learning_rate": 0.0001995921059208274, | |
| "loss": 111.636, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.13917265095580056, | |
| "grad_norm": 42.023155212402344, | |
| "learning_rate": 0.00019958689686169697, | |
| "loss": 90.4911, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.13971207983547418, | |
| "grad_norm": 27.917343139648438, | |
| "learning_rate": 0.00019958165482066094, | |
| "loss": 92.3676, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.14025150871514783, | |
| "grad_norm": 19.174135208129883, | |
| "learning_rate": 0.00019957637979945537, | |
| "loss": 88.4276, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.14079093759482147, | |
| "grad_norm": 22.779672622680664, | |
| "learning_rate": 0.0001995710717998273, | |
| "loss": 88.3991, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.14133036647449512, | |
| "grad_norm": 17.607568740844727, | |
| "learning_rate": 0.00019956573082353463, | |
| "loss": 77.4426, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.14186979535416877, | |
| "grad_norm": 22.228328704833984, | |
| "learning_rate": 0.00019956035687234626, | |
| "loss": 68.3415, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.14240922423384242, | |
| "grad_norm": 21.00279998779297, | |
| "learning_rate": 0.00019955494994804198, | |
| "loss": 70.7203, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.14294865311351607, | |
| "grad_norm": 27.789443969726562, | |
| "learning_rate": 0.00019954951005241248, | |
| "loss": 62.4471, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.14348808199318971, | |
| "grad_norm": 21.813310623168945, | |
| "learning_rate": 0.0001995440371872594, | |
| "loss": 65.5364, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.14402751087286336, | |
| "grad_norm": 22.338788986206055, | |
| "learning_rate": 0.00019953853135439522, | |
| "loss": 53.7872, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.144566939752537, | |
| "grad_norm": 17.053470611572266, | |
| "learning_rate": 0.00019953299255564346, | |
| "loss": 46.6823, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.14510636863221066, | |
| "grad_norm": 34.75794219970703, | |
| "learning_rate": 0.0001995274207928385, | |
| "loss": 32.208, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.1456457975118843, | |
| "grad_norm": 76.52667236328125, | |
| "learning_rate": 0.00019952181606782565, | |
| "loss": 52.4054, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.14618522639155793, | |
| "grad_norm": 71.48796844482422, | |
| "learning_rate": 0.00019951617838246107, | |
| "loss": 48.9668, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.14672465527123157, | |
| "grad_norm": 79.96577453613281, | |
| "learning_rate": 0.00019951050773861192, | |
| "loss": 61.6082, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.14726408415090522, | |
| "grad_norm": 42.05474090576172, | |
| "learning_rate": 0.0001995048041381562, | |
| "loss": 50.8627, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.14780351303057887, | |
| "grad_norm": 43.19125747680664, | |
| "learning_rate": 0.00019949906758298295, | |
| "loss": 45.519, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.14834294191025252, | |
| "grad_norm": 47.39426040649414, | |
| "learning_rate": 0.00019949329807499198, | |
| "loss": 51.654, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.14888237078992617, | |
| "grad_norm": 36.0722770690918, | |
| "learning_rate": 0.00019948749561609415, | |
| "loss": 46.8854, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.14942179966959981, | |
| "grad_norm": 33.252742767333984, | |
| "learning_rate": 0.00019948166020821107, | |
| "loss": 46.7532, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.14996122854927346, | |
| "grad_norm": 33.89019012451172, | |
| "learning_rate": 0.0001994757918532754, | |
| "loss": 49.6403, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.1505006574289471, | |
| "grad_norm": 37.914676666259766, | |
| "learning_rate": 0.00019946989055323066, | |
| "loss": 54.5018, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.15104008630862076, | |
| "grad_norm": 37.611061096191406, | |
| "learning_rate": 0.00019946395631003128, | |
| "loss": 50.6423, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1515795151882944, | |
| "grad_norm": 36.489723205566406, | |
| "learning_rate": 0.00019945798912564264, | |
| "loss": 45.9299, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.15211894406796803, | |
| "grad_norm": 31.33220100402832, | |
| "learning_rate": 0.00019945198900204095, | |
| "loss": 47.4519, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.15265837294764167, | |
| "grad_norm": 32.4266242980957, | |
| "learning_rate": 0.00019944595594121337, | |
| "loss": 40.0806, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.15319780182731532, | |
| "grad_norm": 38.17313003540039, | |
| "learning_rate": 0.00019943988994515797, | |
| "loss": 39.9765, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.15373723070698897, | |
| "grad_norm": 40.299354553222656, | |
| "learning_rate": 0.00019943379101588376, | |
| "loss": 40.7812, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.15427665958666262, | |
| "grad_norm": 42.34661102294922, | |
| "learning_rate": 0.00019942765915541063, | |
| "loss": 31.2513, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.15481608846633627, | |
| "grad_norm": 46.61203384399414, | |
| "learning_rate": 0.00019942149436576938, | |
| "loss": 41.5619, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.1553555173460099, | |
| "grad_norm": 39.79526901245117, | |
| "learning_rate": 0.00019941529664900168, | |
| "loss": 38.13, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.15589494622568356, | |
| "grad_norm": 42.995567321777344, | |
| "learning_rate": 0.0001994090660071601, | |
| "loss": 41.3515, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.1564343751053572, | |
| "grad_norm": 34.27892303466797, | |
| "learning_rate": 0.00019940280244230824, | |
| "loss": 41.1277, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.15697380398503086, | |
| "grad_norm": 29.622488021850586, | |
| "learning_rate": 0.00019939650595652045, | |
| "loss": 49.2284, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.1575132328647045, | |
| "grad_norm": 36.693119049072266, | |
| "learning_rate": 0.00019939017655188206, | |
| "loss": 35.5444, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.15805266174437815, | |
| "grad_norm": 30.75679588317871, | |
| "learning_rate": 0.00019938381423048932, | |
| "loss": 34.9666, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.15859209062405177, | |
| "grad_norm": 35.84019088745117, | |
| "learning_rate": 0.00019937741899444928, | |
| "loss": 39.4625, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.15913151950372542, | |
| "grad_norm": 35.854496002197266, | |
| "learning_rate": 0.00019937099084588002, | |
| "loss": 37.2887, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.15967094838339907, | |
| "grad_norm": 33.07613754272461, | |
| "learning_rate": 0.00019936452978691044, | |
| "loss": 34.5375, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.16021037726307272, | |
| "grad_norm": 43.46371078491211, | |
| "learning_rate": 0.00019935803581968035, | |
| "loss": 30.3173, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.16074980614274637, | |
| "grad_norm": 52.03241729736328, | |
| "learning_rate": 0.00019935150894634046, | |
| "loss": 42.4725, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.16128923502242, | |
| "grad_norm": 50.36249542236328, | |
| "learning_rate": 0.00019934494916905245, | |
| "loss": 37.3647, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.16182866390209366, | |
| "grad_norm": 41.50126647949219, | |
| "learning_rate": 0.00019933835648998875, | |
| "loss": 24.2931, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1623680927817673, | |
| "grad_norm": 31.253141403198242, | |
| "learning_rate": 0.00019933173091133286, | |
| "loss": 44.7853, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.16290752166144096, | |
| "grad_norm": 96.83972930908203, | |
| "learning_rate": 0.000199325072435279, | |
| "loss": 84.9808, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.1634469505411146, | |
| "grad_norm": 91.9966049194336, | |
| "learning_rate": 0.0001993183810640324, | |
| "loss": 99.5531, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.16398637942078825, | |
| "grad_norm": 66.43877410888672, | |
| "learning_rate": 0.00019931165679980918, | |
| "loss": 105.7665, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.16452580830046187, | |
| "grad_norm": 35.26411056518555, | |
| "learning_rate": 0.00019930489964483633, | |
| "loss": 109.6819, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.16506523718013552, | |
| "grad_norm": 47.18457794189453, | |
| "learning_rate": 0.00019929810960135172, | |
| "loss": 113.4221, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.16560466605980917, | |
| "grad_norm": 49.24475860595703, | |
| "learning_rate": 0.00019929128667160408, | |
| "loss": 108.0158, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.16614409493948282, | |
| "grad_norm": 45.63924026489258, | |
| "learning_rate": 0.00019928443085785318, | |
| "loss": 94.1414, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.16668352381915646, | |
| "grad_norm": 46.688350677490234, | |
| "learning_rate": 0.00019927754216236948, | |
| "loss": 87.8688, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.1672229526988301, | |
| "grad_norm": 39.54045486450195, | |
| "learning_rate": 0.00019927062058743448, | |
| "loss": 92.6019, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.16776238157850376, | |
| "grad_norm": 29.866121292114258, | |
| "learning_rate": 0.0001992636661353405, | |
| "loss": 81.9024, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.1683018104581774, | |
| "grad_norm": 22.350112915039062, | |
| "learning_rate": 0.0001992566788083908, | |
| "loss": 68.4321, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.16884123933785106, | |
| "grad_norm": 21.657258987426758, | |
| "learning_rate": 0.00019924965860889944, | |
| "loss": 65.7434, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.1693806682175247, | |
| "grad_norm": 18.347572326660156, | |
| "learning_rate": 0.00019924260553919146, | |
| "loss": 62.485, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.16992009709719835, | |
| "grad_norm": 28.368114471435547, | |
| "learning_rate": 0.00019923551960160268, | |
| "loss": 53.7759, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.170459525976872, | |
| "grad_norm": 35.214988708496094, | |
| "learning_rate": 0.00019922840079848, | |
| "loss": 45.4414, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.17099895485654562, | |
| "grad_norm": 38.698760986328125, | |
| "learning_rate": 0.00019922124913218094, | |
| "loss": 37.665, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.17153838373621927, | |
| "grad_norm": 43.39471435546875, | |
| "learning_rate": 0.0001992140646050741, | |
| "loss": 51.4899, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.17207781261589292, | |
| "grad_norm": 43.52251434326172, | |
| "learning_rate": 0.00019920684721953894, | |
| "loss": 48.5712, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.17261724149556656, | |
| "grad_norm": 60.897579193115234, | |
| "learning_rate": 0.00019919959697796568, | |
| "loss": 59.9231, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1731566703752402, | |
| "grad_norm": 37.93972396850586, | |
| "learning_rate": 0.0001991923138827556, | |
| "loss": 47.906, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.17369609925491386, | |
| "grad_norm": 44.32222366333008, | |
| "learning_rate": 0.0001991849979363207, | |
| "loss": 54.5404, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.1742355281345875, | |
| "grad_norm": 37.367671966552734, | |
| "learning_rate": 0.00019917764914108394, | |
| "loss": 49.3113, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.17477495701426116, | |
| "grad_norm": 43.20479965209961, | |
| "learning_rate": 0.00019917026749947917, | |
| "loss": 41.9015, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.1753143858939348, | |
| "grad_norm": 36.7598991394043, | |
| "learning_rate": 0.0001991628530139511, | |
| "loss": 43.7222, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.17585381477360845, | |
| "grad_norm": 33.30655288696289, | |
| "learning_rate": 0.0001991554056869553, | |
| "loss": 48.4387, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.1763932436532821, | |
| "grad_norm": 32.89339828491211, | |
| "learning_rate": 0.00019914792552095818, | |
| "loss": 51.108, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.17693267253295572, | |
| "grad_norm": 31.422489166259766, | |
| "learning_rate": 0.00019914041251843716, | |
| "loss": 42.9287, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.17747210141262937, | |
| "grad_norm": 33.38264465332031, | |
| "learning_rate": 0.00019913286668188037, | |
| "loss": 47.0867, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.17801153029230302, | |
| "grad_norm": 37.976837158203125, | |
| "learning_rate": 0.00019912528801378698, | |
| "loss": 38.2593, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.17855095917197666, | |
| "grad_norm": 35.707054138183594, | |
| "learning_rate": 0.0001991176765166669, | |
| "loss": 44.5348, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.1790903880516503, | |
| "grad_norm": 43.86237335205078, | |
| "learning_rate": 0.00019911003219304094, | |
| "loss": 40.4868, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.17962981693132396, | |
| "grad_norm": 54.88194274902344, | |
| "learning_rate": 0.00019910235504544082, | |
| "loss": 38.935, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.1801692458109976, | |
| "grad_norm": 43.87349319458008, | |
| "learning_rate": 0.00019909464507640915, | |
| "loss": 43.0978, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.18070867469067126, | |
| "grad_norm": 43.421932220458984, | |
| "learning_rate": 0.0001990869022884993, | |
| "loss": 39.2888, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.1812481035703449, | |
| "grad_norm": 41.14269256591797, | |
| "learning_rate": 0.00019907912668427566, | |
| "loss": 42.6139, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.18178753245001855, | |
| "grad_norm": 38.619380950927734, | |
| "learning_rate": 0.00019907131826631336, | |
| "loss": 40.0248, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.1823269613296922, | |
| "grad_norm": 33.65724563598633, | |
| "learning_rate": 0.00019906347703719845, | |
| "loss": 38.7406, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.18286639020936585, | |
| "grad_norm": 35.25956344604492, | |
| "learning_rate": 0.0001990556029995279, | |
| "loss": 39.2734, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.18340581908903947, | |
| "grad_norm": 36.87468719482422, | |
| "learning_rate": 0.00019904769615590942, | |
| "loss": 40.6619, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.18394524796871312, | |
| "grad_norm": 32.0380973815918, | |
| "learning_rate": 0.00019903975650896168, | |
| "loss": 39.8376, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.18448467684838676, | |
| "grad_norm": 33.44660949707031, | |
| "learning_rate": 0.0001990317840613142, | |
| "loss": 33.338, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.1850241057280604, | |
| "grad_norm": 36.242523193359375, | |
| "learning_rate": 0.00019902377881560735, | |
| "loss": 35.0493, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.18556353460773406, | |
| "grad_norm": 37.39813232421875, | |
| "learning_rate": 0.00019901574077449232, | |
| "loss": 26.9563, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.1861029634874077, | |
| "grad_norm": 35.84196472167969, | |
| "learning_rate": 0.0001990076699406313, | |
| "loss": 33.7825, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.18664239236708136, | |
| "grad_norm": 38.69563293457031, | |
| "learning_rate": 0.00019899956631669717, | |
| "loss": 29.9582, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.187181821246755, | |
| "grad_norm": 47.82805633544922, | |
| "learning_rate": 0.00019899142990537376, | |
| "loss": 33.9471, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.18772125012642865, | |
| "grad_norm": 36.29233169555664, | |
| "learning_rate": 0.00019898326070935579, | |
| "loss": 28.1711, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.1882606790061023, | |
| "grad_norm": 45.26416015625, | |
| "learning_rate": 0.00019897505873134872, | |
| "loss": 33.76, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.18880010788577595, | |
| "grad_norm": 39.766441345214844, | |
| "learning_rate": 0.000198966823974069, | |
| "loss": 25.629, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.18933953676544957, | |
| "grad_norm": 30.092906951904297, | |
| "learning_rate": 0.00019895855644024387, | |
| "loss": 45.1687, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.18987896564512322, | |
| "grad_norm": 61.02379608154297, | |
| "learning_rate": 0.00019895025613261136, | |
| "loss": 77.4727, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.19041839452479686, | |
| "grad_norm": 51.788063049316406, | |
| "learning_rate": 0.00019894192305392055, | |
| "loss": 82.3816, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.1909578234044705, | |
| "grad_norm": 72.1239242553711, | |
| "learning_rate": 0.0001989335572069311, | |
| "loss": 103.2545, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.19149725228414416, | |
| "grad_norm": 29.279748916625977, | |
| "learning_rate": 0.00019892515859441383, | |
| "loss": 113.7908, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.1920366811638178, | |
| "grad_norm": 43.08776092529297, | |
| "learning_rate": 0.00019891672721915015, | |
| "loss": 107.6541, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.19257611004349146, | |
| "grad_norm": 54.121192932128906, | |
| "learning_rate": 0.00019890826308393243, | |
| "loss": 102.3774, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.1931155389231651, | |
| "grad_norm": 52.771793365478516, | |
| "learning_rate": 0.0001988997661915639, | |
| "loss": 87.3872, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.19365496780283875, | |
| "grad_norm": 58.10847854614258, | |
| "learning_rate": 0.00019889123654485866, | |
| "loss": 97.106, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.1941943966825124, | |
| "grad_norm": 52.38351058959961, | |
| "learning_rate": 0.00019888267414664156, | |
| "loss": 91.256, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.19473382556218605, | |
| "grad_norm": 48.153804779052734, | |
| "learning_rate": 0.0001988740789997484, | |
| "loss": 81.894, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.19527325444185967, | |
| "grad_norm": 25.811304092407227, | |
| "learning_rate": 0.00019886545110702576, | |
| "loss": 69.6325, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.19581268332153332, | |
| "grad_norm": 22.911964416503906, | |
| "learning_rate": 0.00019885679047133107, | |
| "loss": 65.5302, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.19635211220120696, | |
| "grad_norm": 37.54278564453125, | |
| "learning_rate": 0.00019884809709553265, | |
| "loss": 60.65, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.1968915410808806, | |
| "grad_norm": 20.303857803344727, | |
| "learning_rate": 0.00019883937098250963, | |
| "loss": 44.1299, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.19743096996055426, | |
| "grad_norm": 31.87704849243164, | |
| "learning_rate": 0.00019883061213515197, | |
| "loss": 34.1489, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.1979703988402279, | |
| "grad_norm": 39.10615539550781, | |
| "learning_rate": 0.00019882182055636053, | |
| "loss": 37.5989, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.19850982771990155, | |
| "grad_norm": 41.10018539428711, | |
| "learning_rate": 0.00019881299624904692, | |
| "loss": 48.6169, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.1990492565995752, | |
| "grad_norm": 34.8628044128418, | |
| "learning_rate": 0.00019880413921613367, | |
| "loss": 51.3889, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.19958868547924885, | |
| "grad_norm": 41.81850051879883, | |
| "learning_rate": 0.0001987952494605541, | |
| "loss": 46.2857, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2001281143589225, | |
| "grad_norm": 46.00803756713867, | |
| "learning_rate": 0.00019878632698525238, | |
| "loss": 42.1201, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.20066754323859615, | |
| "grad_norm": 37.3172492980957, | |
| "learning_rate": 0.00019877737179318353, | |
| "loss": 44.8517, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.2012069721182698, | |
| "grad_norm": 30.38181495666504, | |
| "learning_rate": 0.0001987683838873134, | |
| "loss": 30.3321, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.20174640099794341, | |
| "grad_norm": 36.00757598876953, | |
| "learning_rate": 0.00019875936327061865, | |
| "loss": 41.3805, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.20228582987761706, | |
| "grad_norm": 36.742733001708984, | |
| "learning_rate": 0.00019875030994608684, | |
| "loss": 48.6651, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.2028252587572907, | |
| "grad_norm": 42.53518295288086, | |
| "learning_rate": 0.00019874122391671622, | |
| "loss": 32.5649, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.20336468763696436, | |
| "grad_norm": 35.77900314331055, | |
| "learning_rate": 0.00019873210518551608, | |
| "loss": 46.6955, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.203904116516638, | |
| "grad_norm": 44.95616149902344, | |
| "learning_rate": 0.00019872295375550635, | |
| "loss": 41.271, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.20444354539631165, | |
| "grad_norm": 34.28546142578125, | |
| "learning_rate": 0.00019871376962971789, | |
| "loss": 41.4059, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.2049829742759853, | |
| "grad_norm": 35.807682037353516, | |
| "learning_rate": 0.00019870455281119237, | |
| "loss": 45.8892, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.20552240315565895, | |
| "grad_norm": 30.27015495300293, | |
| "learning_rate": 0.00019869530330298227, | |
| "loss": 34.013, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.2060618320353326, | |
| "grad_norm": 38.26789093017578, | |
| "learning_rate": 0.00019868602110815093, | |
| "loss": 42.6953, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.20660126091500625, | |
| "grad_norm": 39.61716079711914, | |
| "learning_rate": 0.00019867670622977248, | |
| "loss": 40.4979, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.2071406897946799, | |
| "grad_norm": 35.717227935791016, | |
| "learning_rate": 0.00019866735867093188, | |
| "loss": 31.5146, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.20768011867435351, | |
| "grad_norm": 43.41541290283203, | |
| "learning_rate": 0.0001986579784347249, | |
| "loss": 37.5416, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.20821954755402716, | |
| "grad_norm": 40.18928146362305, | |
| "learning_rate": 0.0001986485655242582, | |
| "loss": 39.0367, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.2087589764337008, | |
| "grad_norm": 35.295291900634766, | |
| "learning_rate": 0.00019863911994264926, | |
| "loss": 36.8243, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.20929840531337446, | |
| "grad_norm": 52.24161148071289, | |
| "learning_rate": 0.00019862964169302621, | |
| "loss": 41.7241, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.2098378341930481, | |
| "grad_norm": 53.32133483886719, | |
| "learning_rate": 0.00019862013077852822, | |
| "loss": 38.7999, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.21037726307272175, | |
| "grad_norm": 42.945804595947266, | |
| "learning_rate": 0.00019861058720230514, | |
| "loss": 34.0199, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2109166919523954, | |
| "grad_norm": 38.77582931518555, | |
| "learning_rate": 0.00019860101096751768, | |
| "loss": 33.4203, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.21145612083206905, | |
| "grad_norm": 30.80617332458496, | |
| "learning_rate": 0.0001985914020773374, | |
| "loss": 27.0483, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.2119955497117427, | |
| "grad_norm": 43.676090240478516, | |
| "learning_rate": 0.00019858176053494663, | |
| "loss": 33.954, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.21253497859141635, | |
| "grad_norm": 38.32650375366211, | |
| "learning_rate": 0.00019857208634353852, | |
| "loss": 29.378, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.21307440747109, | |
| "grad_norm": 39.12830352783203, | |
| "learning_rate": 0.000198562379506317, | |
| "loss": 27.9634, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.21361383635076364, | |
| "grad_norm": 47.39609909057617, | |
| "learning_rate": 0.00019855264002649692, | |
| "loss": 34.1847, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.21415326523043726, | |
| "grad_norm": 38.62258529663086, | |
| "learning_rate": 0.00019854286790730384, | |
| "loss": 26.0765, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.2146926941101109, | |
| "grad_norm": 42.81424331665039, | |
| "learning_rate": 0.00019853306315197413, | |
| "loss": 34.1509, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.21523212298978456, | |
| "grad_norm": 45.57196807861328, | |
| "learning_rate": 0.00019852322576375503, | |
| "loss": 32.0371, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.2157715518694582, | |
| "grad_norm": 35.20758819580078, | |
| "learning_rate": 0.0001985133557459046, | |
| "loss": 20.3634, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2157715518694582, | |
| "eval_loss": 1.6627388000488281, | |
| "eval_runtime": 141.0153, | |
| "eval_samples_per_second": 2.12, | |
| "eval_steps_per_second": 2.12, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.21631098074913185, | |
| "grad_norm": 24.1074161529541, | |
| "learning_rate": 0.00019850345310169155, | |
| "loss": 37.3797, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.2168504096288055, | |
| "grad_norm": 62.604949951171875, | |
| "learning_rate": 0.00019849351783439561, | |
| "loss": 78.7953, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.21738983850847915, | |
| "grad_norm": 43.36476135253906, | |
| "learning_rate": 0.0001984835499473072, | |
| "loss": 82.645, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.2179292673881528, | |
| "grad_norm": 52.12046432495117, | |
| "learning_rate": 0.0001984735494437275, | |
| "loss": 87.0839, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.21846869626782645, | |
| "grad_norm": 34.333431243896484, | |
| "learning_rate": 0.00019846351632696863, | |
| "loss": 105.6289, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.2190081251475001, | |
| "grad_norm": 41.665771484375, | |
| "learning_rate": 0.00019845345060035335, | |
| "loss": 112.3874, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.21954755402717374, | |
| "grad_norm": 58.79914093017578, | |
| "learning_rate": 0.00019844335226721537, | |
| "loss": 114.2657, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.22008698290684736, | |
| "grad_norm": 52.85742950439453, | |
| "learning_rate": 0.00019843322133089906, | |
| "loss": 98.4778, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.220626411786521, | |
| "grad_norm": 53.792476654052734, | |
| "learning_rate": 0.00019842305779475968, | |
| "loss": 94.7811, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.22116584066619466, | |
| "grad_norm": 49.56667709350586, | |
| "learning_rate": 0.0001984128616621633, | |
| "loss": 92.4516, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2217052695458683, | |
| "grad_norm": 38.96401596069336, | |
| "learning_rate": 0.0001984026329364867, | |
| "loss": 78.0561, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.22224469842554195, | |
| "grad_norm": 35.649200439453125, | |
| "learning_rate": 0.00019839237162111757, | |
| "loss": 66.0612, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.2227841273052156, | |
| "grad_norm": 22.54837989807129, | |
| "learning_rate": 0.00019838207771945426, | |
| "loss": 59.3091, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.22332355618488925, | |
| "grad_norm": 16.843589782714844, | |
| "learning_rate": 0.00019837175123490596, | |
| "loss": 62.8711, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.2238629850645629, | |
| "grad_norm": 18.909435272216797, | |
| "learning_rate": 0.00019836139217089275, | |
| "loss": 55.3784, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.22440241394423655, | |
| "grad_norm": 25.120887756347656, | |
| "learning_rate": 0.0001983510005308454, | |
| "loss": 51.9063, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.2249418428239102, | |
| "grad_norm": 30.78650665283203, | |
| "learning_rate": 0.00019834057631820543, | |
| "loss": 32.4726, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.22548127170358384, | |
| "grad_norm": 72.46208953857422, | |
| "learning_rate": 0.00019833011953642525, | |
| "loss": 44.1452, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.2260207005832575, | |
| "grad_norm": 45.94267654418945, | |
| "learning_rate": 0.000198319630188968, | |
| "loss": 50.9596, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.2265601294629311, | |
| "grad_norm": 47.52016067504883, | |
| "learning_rate": 0.00019830910827930764, | |
| "loss": 44.8286, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.22709955834260476, | |
| "grad_norm": 40.93891525268555, | |
| "learning_rate": 0.00019829855381092886, | |
| "loss": 56.7985, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.2276389872222784, | |
| "grad_norm": 36.567108154296875, | |
| "learning_rate": 0.0001982879667873272, | |
| "loss": 35.7161, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.22817841610195205, | |
| "grad_norm": 31.908977508544922, | |
| "learning_rate": 0.0001982773472120089, | |
| "loss": 42.8407, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.2287178449816257, | |
| "grad_norm": 37.47427749633789, | |
| "learning_rate": 0.00019826669508849108, | |
| "loss": 39.5264, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.22925727386129935, | |
| "grad_norm": 43.83090591430664, | |
| "learning_rate": 0.00019825601042030156, | |
| "loss": 48.5415, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.229796702740973, | |
| "grad_norm": 42.004425048828125, | |
| "learning_rate": 0.00019824529321097893, | |
| "loss": 39.4127, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.23033613162064664, | |
| "grad_norm": 38.282066345214844, | |
| "learning_rate": 0.00019823454346407267, | |
| "loss": 40.8499, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.2308755605003203, | |
| "grad_norm": 33.92627716064453, | |
| "learning_rate": 0.0001982237611831429, | |
| "loss": 35.4472, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.23141498937999394, | |
| "grad_norm": 53.361106872558594, | |
| "learning_rate": 0.00019821294637176057, | |
| "loss": 43.1921, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.2319544182596676, | |
| "grad_norm": 40.92842102050781, | |
| "learning_rate": 0.00019820209903350744, | |
| "loss": 36.5019, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2324938471393412, | |
| "grad_norm": 35.71042251586914, | |
| "learning_rate": 0.00019819121917197602, | |
| "loss": 36.598, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.23303327601901486, | |
| "grad_norm": 35.10508728027344, | |
| "learning_rate": 0.00019818030679076952, | |
| "loss": 31.6675, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.2335727048986885, | |
| "grad_norm": 31.885364532470703, | |
| "learning_rate": 0.00019816936189350206, | |
| "loss": 34.3554, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.23411213377836215, | |
| "grad_norm": 42.998878479003906, | |
| "learning_rate": 0.0001981583844837984, | |
| "loss": 28.1099, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.2346515626580358, | |
| "grad_norm": 38.70567321777344, | |
| "learning_rate": 0.00019814737456529412, | |
| "loss": 42.3567, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.23519099153770945, | |
| "grad_norm": 34.43855285644531, | |
| "learning_rate": 0.00019813633214163555, | |
| "loss": 22.8285, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.2357304204173831, | |
| "grad_norm": 33.38055419921875, | |
| "learning_rate": 0.00019812525721647986, | |
| "loss": 36.1465, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.23626984929705674, | |
| "grad_norm": 42.98970413208008, | |
| "learning_rate": 0.00019811414979349485, | |
| "loss": 34.8416, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.2368092781767304, | |
| "grad_norm": 37.12187957763672, | |
| "learning_rate": 0.0001981030098763592, | |
| "loss": 34.276, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.23734870705640404, | |
| "grad_norm": 44.36403274536133, | |
| "learning_rate": 0.00019809183746876232, | |
| "loss": 30.3544, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2378881359360777, | |
| "grad_norm": 46.281654357910156, | |
| "learning_rate": 0.00019808063257440432, | |
| "loss": 27.8803, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.23842756481575134, | |
| "grad_norm": 49.94664001464844, | |
| "learning_rate": 0.00019806939519699613, | |
| "loss": 31.0358, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.23896699369542496, | |
| "grad_norm": 42.308616638183594, | |
| "learning_rate": 0.0001980581253402595, | |
| "loss": 29.4053, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.2395064225750986, | |
| "grad_norm": 51.36742401123047, | |
| "learning_rate": 0.00019804682300792674, | |
| "loss": 31.0947, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.24004585145477225, | |
| "grad_norm": 40.25013732910156, | |
| "learning_rate": 0.00019803548820374113, | |
| "loss": 26.6703, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.2405852803344459, | |
| "grad_norm": 53.013710021972656, | |
| "learning_rate": 0.00019802412093145657, | |
| "loss": 35.5286, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.24112470921411955, | |
| "grad_norm": 41.21833038330078, | |
| "learning_rate": 0.00019801272119483775, | |
| "loss": 25.3315, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.2416641380937932, | |
| "grad_norm": 61.56970977783203, | |
| "learning_rate": 0.00019800128899766017, | |
| "loss": 27.589, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.24220356697346684, | |
| "grad_norm": 58.22453308105469, | |
| "learning_rate": 0.00019798982434371, | |
| "loss": 37.2235, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.2427429958531405, | |
| "grad_norm": 36.04716110229492, | |
| "learning_rate": 0.00019797832723678413, | |
| "loss": 28.1485, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.24328242473281414, | |
| "grad_norm": 50.804813385009766, | |
| "learning_rate": 0.00019796679768069032, | |
| "loss": 49.1471, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.2438218536124878, | |
| "grad_norm": 91.2785873413086, | |
| "learning_rate": 0.00019795523567924702, | |
| "loss": 72.8998, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.24436128249216144, | |
| "grad_norm": 110.37539672851562, | |
| "learning_rate": 0.00019794364123628335, | |
| "loss": 98.2308, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.24490071137183506, | |
| "grad_norm": 79.3825912475586, | |
| "learning_rate": 0.00019793201435563932, | |
| "loss": 109.7274, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.2454401402515087, | |
| "grad_norm": 36.62171173095703, | |
| "learning_rate": 0.00019792035504116555, | |
| "loss": 107.5116, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.24597956913118235, | |
| "grad_norm": 57.664146423339844, | |
| "learning_rate": 0.00019790866329672346, | |
| "loss": 113.5622, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.246518998010856, | |
| "grad_norm": 57.12027359008789, | |
| "learning_rate": 0.00019789693912618524, | |
| "loss": 102.4627, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.24705842689052965, | |
| "grad_norm": 67.92241668701172, | |
| "learning_rate": 0.00019788518253343376, | |
| "loss": 90.2483, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.2475978557702033, | |
| "grad_norm": 63.95331573486328, | |
| "learning_rate": 0.00019787339352236264, | |
| "loss": 94.7671, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.24813728464987694, | |
| "grad_norm": 55.70960235595703, | |
| "learning_rate": 0.00019786157209687627, | |
| "loss": 92.1523, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2486767135295506, | |
| "grad_norm": 44.270233154296875, | |
| "learning_rate": 0.00019784971826088973, | |
| "loss": 82.3084, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.24921614240922424, | |
| "grad_norm": 35.74955749511719, | |
| "learning_rate": 0.0001978378320183289, | |
| "loss": 71.401, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.2497555712888979, | |
| "grad_norm": 26.20838165283203, | |
| "learning_rate": 0.00019782591337313035, | |
| "loss": 68.6018, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.25029500016857154, | |
| "grad_norm": 20.70208740234375, | |
| "learning_rate": 0.00019781396232924133, | |
| "loss": 62.6257, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.25083442904824516, | |
| "grad_norm": 17.804771423339844, | |
| "learning_rate": 0.00019780197889061993, | |
| "loss": 54.6564, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.25137385792791883, | |
| "grad_norm": 24.327360153198242, | |
| "learning_rate": 0.0001977899630612349, | |
| "loss": 50.7451, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.25191328680759245, | |
| "grad_norm": 29.580142974853516, | |
| "learning_rate": 0.00019777791484506567, | |
| "loss": 34.4045, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.2524527156872661, | |
| "grad_norm": 30.99888801574707, | |
| "learning_rate": 0.00019776583424610254, | |
| "loss": 41.2975, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.25299214456693975, | |
| "grad_norm": 40.59465408325195, | |
| "learning_rate": 0.0001977537212683464, | |
| "loss": 56.0607, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.2535315734466134, | |
| "grad_norm": 42.85790252685547, | |
| "learning_rate": 0.00019774157591580894, | |
| "loss": 40.9168, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.25407100232628704, | |
| "grad_norm": 38.090885162353516, | |
| "learning_rate": 0.0001977293981925125, | |
| "loss": 49.6262, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.25461043120596066, | |
| "grad_norm": 33.007991790771484, | |
| "learning_rate": 0.0001977171881024902, | |
| "loss": 44.5241, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.25514986008563434, | |
| "grad_norm": 39.41592025756836, | |
| "learning_rate": 0.00019770494564978595, | |
| "loss": 38.185, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.25568928896530796, | |
| "grad_norm": 33.008148193359375, | |
| "learning_rate": 0.00019769267083845417, | |
| "loss": 42.3843, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.25622871784498163, | |
| "grad_norm": 27.917991638183594, | |
| "learning_rate": 0.0001976803636725602, | |
| "loss": 33.7216, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.25676814672465526, | |
| "grad_norm": 29.870256423950195, | |
| "learning_rate": 0.00019766802415617998, | |
| "loss": 35.7963, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.25730757560432893, | |
| "grad_norm": 44.98633575439453, | |
| "learning_rate": 0.0001976556522934002, | |
| "loss": 35.8127, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.25784700448400255, | |
| "grad_norm": 43.03909683227539, | |
| "learning_rate": 0.0001976432480883183, | |
| "loss": 35.4111, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.2583864333636762, | |
| "grad_norm": 47.32424545288086, | |
| "learning_rate": 0.00019763081154504234, | |
| "loss": 41.8895, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.25892586224334985, | |
| "grad_norm": 49.7735595703125, | |
| "learning_rate": 0.0001976183426676912, | |
| "loss": 32.9801, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.2594652911230235, | |
| "grad_norm": 44.57673645019531, | |
| "learning_rate": 0.0001976058414603944, | |
| "loss": 36.089, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.26000472000269714, | |
| "grad_norm": 36.22349548339844, | |
| "learning_rate": 0.00019759330792729212, | |
| "loss": 47.0487, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.26054414888237076, | |
| "grad_norm": 38.58706283569336, | |
| "learning_rate": 0.00019758074207253535, | |
| "loss": 34.3672, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.26108357776204444, | |
| "grad_norm": 40.61176300048828, | |
| "learning_rate": 0.00019756814390028575, | |
| "loss": 39.7468, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.26162300664171806, | |
| "grad_norm": 29.439836502075195, | |
| "learning_rate": 0.00019755551341471566, | |
| "loss": 34.1449, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.26216243552139173, | |
| "grad_norm": 35.68241882324219, | |
| "learning_rate": 0.00019754285062000815, | |
| "loss": 31.6102, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.26270186440106535, | |
| "grad_norm": 44.2021598815918, | |
| "learning_rate": 0.000197530155520357, | |
| "loss": 31.8889, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.26324129328073903, | |
| "grad_norm": 53.82715606689453, | |
| "learning_rate": 0.00019751742811996656, | |
| "loss": 31.6853, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.26378072216041265, | |
| "grad_norm": 41.77256774902344, | |
| "learning_rate": 0.00019750466842305208, | |
| "loss": 39.1939, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.2643201510400863, | |
| "grad_norm": 36.42414093017578, | |
| "learning_rate": 0.00019749187643383937, | |
| "loss": 26.3978, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.26485957991975995, | |
| "grad_norm": 49.238014221191406, | |
| "learning_rate": 0.00019747905215656498, | |
| "loss": 33.8181, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.2653990087994336, | |
| "grad_norm": 37.46484375, | |
| "learning_rate": 0.00019746619559547619, | |
| "loss": 32.0879, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.26593843767910724, | |
| "grad_norm": 29.428075790405273, | |
| "learning_rate": 0.00019745330675483084, | |
| "loss": 22.5194, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.2664778665587809, | |
| "grad_norm": 42.24260330200195, | |
| "learning_rate": 0.00019744038563889764, | |
| "loss": 34.5577, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.26701729543845454, | |
| "grad_norm": 43.271976470947266, | |
| "learning_rate": 0.00019742743225195582, | |
| "loss": 25.107, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.26755672431812816, | |
| "grad_norm": 41.1341667175293, | |
| "learning_rate": 0.00019741444659829543, | |
| "loss": 24.4596, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.26809615319780183, | |
| "grad_norm": 35.3587760925293, | |
| "learning_rate": 0.00019740142868221713, | |
| "loss": 21.1434, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.26863558207747545, | |
| "grad_norm": 47.48214340209961, | |
| "learning_rate": 0.00019738837850803226, | |
| "loss": 23.4752, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.26917501095714913, | |
| "grad_norm": 44.637882232666016, | |
| "learning_rate": 0.00019737529608006293, | |
| "loss": 21.9525, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.26971443983682275, | |
| "grad_norm": 31.005287170410156, | |
| "learning_rate": 0.00019736218140264185, | |
| "loss": 19.1622, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2702538687164964, | |
| "grad_norm": 32.10681915283203, | |
| "learning_rate": 0.0001973490344801124, | |
| "loss": 44.8021, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.27079329759617005, | |
| "grad_norm": 67.818603515625, | |
| "learning_rate": 0.0001973358553168287, | |
| "loss": 90.5945, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.2713327264758437, | |
| "grad_norm": 78.30387115478516, | |
| "learning_rate": 0.00019732264391715556, | |
| "loss": 101.037, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.27187215535551734, | |
| "grad_norm": 92.50519561767578, | |
| "learning_rate": 0.00019730940028546835, | |
| "loss": 124.3723, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.272411584235191, | |
| "grad_norm": 38.794246673583984, | |
| "learning_rate": 0.0001972961244261532, | |
| "loss": 105.1317, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.27295101311486464, | |
| "grad_norm": 34.56374740600586, | |
| "learning_rate": 0.00019728281634360698, | |
| "loss": 101.3536, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.27349044199453826, | |
| "grad_norm": 33.79701614379883, | |
| "learning_rate": 0.00019726947604223712, | |
| "loss": 105.4946, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.27402987087421193, | |
| "grad_norm": 39.242740631103516, | |
| "learning_rate": 0.00019725610352646172, | |
| "loss": 82.6645, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.27456929975388555, | |
| "grad_norm": 41.144683837890625, | |
| "learning_rate": 0.0001972426988007096, | |
| "loss": 99.5104, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.27510872863355923, | |
| "grad_norm": 43.32292175292969, | |
| "learning_rate": 0.00019722926186942026, | |
| "loss": 90.6068, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.27564815751323285, | |
| "grad_norm": 40.97383117675781, | |
| "learning_rate": 0.0001972157927370438, | |
| "loss": 71.8933, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.2761875863929065, | |
| "grad_norm": 27.89875602722168, | |
| "learning_rate": 0.0001972022914080411, | |
| "loss": 66.0499, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.27672701527258015, | |
| "grad_norm": 23.75403594970703, | |
| "learning_rate": 0.00019718875788688354, | |
| "loss": 59.9798, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.2772664441522538, | |
| "grad_norm": 18.101530075073242, | |
| "learning_rate": 0.0001971751921780533, | |
| "loss": 55.1379, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.27780587303192744, | |
| "grad_norm": 24.123146057128906, | |
| "learning_rate": 0.00019716159428604315, | |
| "loss": 51.0036, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.2783453019116011, | |
| "grad_norm": 29.12915802001953, | |
| "learning_rate": 0.00019714796421535654, | |
| "loss": 35.74, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.27888473079127474, | |
| "grad_norm": 41.40327072143555, | |
| "learning_rate": 0.00019713430197050756, | |
| "loss": 34.8342, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.27942415967094836, | |
| "grad_norm": 65.70941162109375, | |
| "learning_rate": 0.00019712060755602102, | |
| "loss": 45.6267, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.27996358855062203, | |
| "grad_norm": 37.733158111572266, | |
| "learning_rate": 0.00019710688097643227, | |
| "loss": 40.7, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.28050301743029565, | |
| "grad_norm": 39.90540313720703, | |
| "learning_rate": 0.0001970931222362874, | |
| "loss": 52.105, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.28104244630996933, | |
| "grad_norm": 41.023155212402344, | |
| "learning_rate": 0.0001970793313401432, | |
| "loss": 47.4019, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.28158187518964295, | |
| "grad_norm": 39.340972900390625, | |
| "learning_rate": 0.00019706550829256693, | |
| "loss": 36.3784, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.2821213040693166, | |
| "grad_norm": 31.36964988708496, | |
| "learning_rate": 0.0001970516530981367, | |
| "loss": 32.5883, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.28266073294899025, | |
| "grad_norm": 31.426342010498047, | |
| "learning_rate": 0.00019703776576144105, | |
| "loss": 37.0281, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.2832001618286639, | |
| "grad_norm": 48.170589447021484, | |
| "learning_rate": 0.00019702384628707945, | |
| "loss": 50.0541, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.28373959070833754, | |
| "grad_norm": 58.017845153808594, | |
| "learning_rate": 0.0001970098946796617, | |
| "loss": 35.1185, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.2842790195880112, | |
| "grad_norm": 44.51712417602539, | |
| "learning_rate": 0.0001969959109438085, | |
| "loss": 30.6861, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.28481844846768484, | |
| "grad_norm": 38.26441955566406, | |
| "learning_rate": 0.00019698189508415102, | |
| "loss": 42.7979, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.28535787734735846, | |
| "grad_norm": 33.41388702392578, | |
| "learning_rate": 0.00019696784710533115, | |
| "loss": 31.6934, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.28589730622703213, | |
| "grad_norm": 39.14249038696289, | |
| "learning_rate": 0.00019695376701200145, | |
| "loss": 31.4034, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.28643673510670575, | |
| "grad_norm": 38.64737319946289, | |
| "learning_rate": 0.000196939654808825, | |
| "loss": 35.3318, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.28697616398637943, | |
| "grad_norm": 32.65852355957031, | |
| "learning_rate": 0.0001969255105004756, | |
| "loss": 33.1427, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.28751559286605305, | |
| "grad_norm": 33.65852355957031, | |
| "learning_rate": 0.0001969113340916377, | |
| "loss": 31.0407, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.2880550217457267, | |
| "grad_norm": 31.496322631835938, | |
| "learning_rate": 0.00019689712558700628, | |
| "loss": 32.1776, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.28859445062540034, | |
| "grad_norm": 37.255680084228516, | |
| "learning_rate": 0.00019688288499128707, | |
| "loss": 32.4352, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.289133879505074, | |
| "grad_norm": 35.74131774902344, | |
| "learning_rate": 0.00019686861230919635, | |
| "loss": 39.0239, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.28967330838474764, | |
| "grad_norm": 62.805694580078125, | |
| "learning_rate": 0.00019685430754546107, | |
| "loss": 39.168, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.2902127372644213, | |
| "grad_norm": 32.74406814575195, | |
| "learning_rate": 0.00019683997070481875, | |
| "loss": 27.3064, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.29075216614409494, | |
| "grad_norm": 60.63595199584961, | |
| "learning_rate": 0.00019682560179201759, | |
| "loss": 37.3217, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.2912915950237686, | |
| "grad_norm": 49.350975036621094, | |
| "learning_rate": 0.00019681120081181636, | |
| "loss": 32.6254, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.29183102390344223, | |
| "grad_norm": 33.03507614135742, | |
| "learning_rate": 0.00019679676776898454, | |
| "loss": 23.6142, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.29237045278311585, | |
| "grad_norm": 46.380985260009766, | |
| "learning_rate": 0.00019678230266830212, | |
| "loss": 26.1048, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.29290988166278953, | |
| "grad_norm": 44.384132385253906, | |
| "learning_rate": 0.00019676780551455977, | |
| "loss": 19.0745, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.29344931054246315, | |
| "grad_norm": 32.757320404052734, | |
| "learning_rate": 0.0001967532763125588, | |
| "loss": 33.5921, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.2939887394221368, | |
| "grad_norm": 40.512939453125, | |
| "learning_rate": 0.000196738715067111, | |
| "loss": 23.9648, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.29452816830181044, | |
| "grad_norm": 36.085330963134766, | |
| "learning_rate": 0.00019672412178303898, | |
| "loss": 25.8736, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.2950675971814841, | |
| "grad_norm": 39.4991340637207, | |
| "learning_rate": 0.00019670949646517576, | |
| "loss": 35.8085, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.29560702606115774, | |
| "grad_norm": 56.80205535888672, | |
| "learning_rate": 0.0001966948391183651, | |
| "loss": 21.2566, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.2961464549408314, | |
| "grad_norm": 51.80792999267578, | |
| "learning_rate": 0.00019668014974746133, | |
| "loss": 19.3891, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.29668588382050504, | |
| "grad_norm": 40.740726470947266, | |
| "learning_rate": 0.00019666542835732937, | |
| "loss": 17.442, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.2972253127001787, | |
| "grad_norm": 43.78228759765625, | |
| "learning_rate": 0.00019665067495284476, | |
| "loss": 53.1444, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.29776474157985233, | |
| "grad_norm": 68.15139770507812, | |
| "learning_rate": 0.00019663588953889363, | |
| "loss": 83.8455, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.29830417045952595, | |
| "grad_norm": 57.72416305541992, | |
| "learning_rate": 0.00019662107212037273, | |
| "loss": 91.3314, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.29884359933919963, | |
| "grad_norm": 70.40361785888672, | |
| "learning_rate": 0.0001966062227021894, | |
| "loss": 115.1381, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.29938302821887325, | |
| "grad_norm": 33.6906623840332, | |
| "learning_rate": 0.00019659134128926156, | |
| "loss": 96.5649, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.2999224570985469, | |
| "grad_norm": 41.24090576171875, | |
| "learning_rate": 0.00019657642788651776, | |
| "loss": 104.8012, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.30046188597822054, | |
| "grad_norm": 62.62508773803711, | |
| "learning_rate": 0.00019656148249889714, | |
| "loss": 89.1584, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.3010013148578942, | |
| "grad_norm": 54.20726013183594, | |
| "learning_rate": 0.00019654650513134937, | |
| "loss": 102.4601, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.30154074373756784, | |
| "grad_norm": 51.19554138183594, | |
| "learning_rate": 0.00019653149578883482, | |
| "loss": 94.7273, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.3020801726172415, | |
| "grad_norm": 50.297447204589844, | |
| "learning_rate": 0.00019651645447632437, | |
| "loss": 85.4999, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.30261960149691514, | |
| "grad_norm": 43.541648864746094, | |
| "learning_rate": 0.00019650138119879952, | |
| "loss": 84.9936, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.3031590303765888, | |
| "grad_norm": 30.611860275268555, | |
| "learning_rate": 0.00019648627596125233, | |
| "loss": 68.3871, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.30369845925626243, | |
| "grad_norm": 18.373859405517578, | |
| "learning_rate": 0.00019647113876868546, | |
| "loss": 64.1806, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.30423788813593605, | |
| "grad_norm": 17.967041015625, | |
| "learning_rate": 0.00019645596962611218, | |
| "loss": 58.1967, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.30477731701560973, | |
| "grad_norm": 17.57683563232422, | |
| "learning_rate": 0.00019644076853855626, | |
| "loss": 48.7426, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.30531674589528335, | |
| "grad_norm": 24.4635066986084, | |
| "learning_rate": 0.00019642553551105219, | |
| "loss": 45.5702, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.305856174774957, | |
| "grad_norm": 44.31038284301758, | |
| "learning_rate": 0.0001964102705486449, | |
| "loss": 36.4538, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.30639560365463064, | |
| "grad_norm": 45.66762924194336, | |
| "learning_rate": 0.00019639497365638993, | |
| "loss": 37.6228, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.3069350325343043, | |
| "grad_norm": 45.2806282043457, | |
| "learning_rate": 0.00019637964483935346, | |
| "loss": 47.7514, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.30747446141397794, | |
| "grad_norm": 44.627296447753906, | |
| "learning_rate": 0.00019636428410261218, | |
| "loss": 50.5934, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3080138902936516, | |
| "grad_norm": 39.8631706237793, | |
| "learning_rate": 0.00019634889145125336, | |
| "loss": 33.2035, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.30855331917332524, | |
| "grad_norm": 43.88326644897461, | |
| "learning_rate": 0.00019633346689037486, | |
| "loss": 44.4418, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.3090927480529989, | |
| "grad_norm": 31.599515914916992, | |
| "learning_rate": 0.0001963180104250851, | |
| "loss": 29.8656, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.30963217693267253, | |
| "grad_norm": 29.062061309814453, | |
| "learning_rate": 0.00019630252206050307, | |
| "loss": 29.4416, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.31017160581234615, | |
| "grad_norm": 35.07856750488281, | |
| "learning_rate": 0.00019628700180175833, | |
| "loss": 33.663, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.3107110346920198, | |
| "grad_norm": 38.65933609008789, | |
| "learning_rate": 0.00019627144965399094, | |
| "loss": 43.6982, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.31125046357169345, | |
| "grad_norm": 36.53346252441406, | |
| "learning_rate": 0.0001962558656223516, | |
| "loss": 41.9741, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.3117898924513671, | |
| "grad_norm": 50.61214065551758, | |
| "learning_rate": 0.00019624024971200154, | |
| "loss": 31.3103, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.31232932133104074, | |
| "grad_norm": 39.70477294921875, | |
| "learning_rate": 0.00019622460192811255, | |
| "loss": 40.1001, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.3128687502107144, | |
| "grad_norm": 43.24115753173828, | |
| "learning_rate": 0.000196208922275867, | |
| "loss": 38.9648, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.31340817909038804, | |
| "grad_norm": 49.614410400390625, | |
| "learning_rate": 0.00019619321076045778, | |
| "loss": 38.396, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.3139476079700617, | |
| "grad_norm": 38.65335464477539, | |
| "learning_rate": 0.0001961774673870883, | |
| "loss": 33.8401, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.31448703684973534, | |
| "grad_norm": 36.919837951660156, | |
| "learning_rate": 0.00019616169216097262, | |
| "loss": 40.8598, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.315026465729409, | |
| "grad_norm": 34.90658187866211, | |
| "learning_rate": 0.00019614588508733524, | |
| "loss": 26.7875, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.31556589460908263, | |
| "grad_norm": 36.6773796081543, | |
| "learning_rate": 0.00019613004617141132, | |
| "loss": 38.7512, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.3161053234887563, | |
| "grad_norm": 38.80603790283203, | |
| "learning_rate": 0.00019611417541844645, | |
| "loss": 22.4567, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.3166447523684299, | |
| "grad_norm": 39.85905838012695, | |
| "learning_rate": 0.00019609827283369687, | |
| "loss": 34.7722, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.31718418124810355, | |
| "grad_norm": 42.714210510253906, | |
| "learning_rate": 0.00019608233842242925, | |
| "loss": 29.6514, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.3177236101277772, | |
| "grad_norm": 28.49331283569336, | |
| "learning_rate": 0.00019606637218992092, | |
| "loss": 32.2811, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.31826303900745084, | |
| "grad_norm": 38.48284912109375, | |
| "learning_rate": 0.0001960503741414597, | |
| "loss": 19.4347, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3188024678871245, | |
| "grad_norm": 40.46686553955078, | |
| "learning_rate": 0.00019603434428234389, | |
| "loss": 36.0755, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.31934189676679814, | |
| "grad_norm": 33.52849578857422, | |
| "learning_rate": 0.00019601828261788236, | |
| "loss": 23.4967, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.3198813256464718, | |
| "grad_norm": 36.89003372192383, | |
| "learning_rate": 0.0001960021891533946, | |
| "loss": 17.4822, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.32042075452614543, | |
| "grad_norm": 47.023624420166016, | |
| "learning_rate": 0.00019598606389421055, | |
| "loss": 26.3533, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.3209601834058191, | |
| "grad_norm": 53.969627380371094, | |
| "learning_rate": 0.00019596990684567063, | |
| "loss": 36.3338, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.32149961228549273, | |
| "grad_norm": 31.71206283569336, | |
| "learning_rate": 0.00019595371801312588, | |
| "loss": 23.1099, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.3220390411651664, | |
| "grad_norm": 34.602901458740234, | |
| "learning_rate": 0.00019593749740193784, | |
| "loss": 20.7281, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.32257847004484, | |
| "grad_norm": 32.23836135864258, | |
| "learning_rate": 0.00019592124501747855, | |
| "loss": 19.1565, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.32311789892451365, | |
| "grad_norm": 31.762807846069336, | |
| "learning_rate": 0.00019590496086513063, | |
| "loss": 20.822, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.3236573278041873, | |
| "grad_norm": 38.77958297729492, | |
| "learning_rate": 0.00019588864495028712, | |
| "loss": 20.7172, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3236573278041873, | |
| "eval_loss": 1.5770864486694336, | |
| "eval_runtime": 140.3936, | |
| "eval_samples_per_second": 2.13, | |
| "eval_steps_per_second": 2.13, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 5559, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0696873835715625e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |