diff --git "a/checkpoint-10260/trainer_state.json" "b/checkpoint-10260/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10260/trainer_state.json" @@ -0,0 +1,71853 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.997726534589153, + "eval_steps": 500, + "global_step": 10260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004871711594673595, + "grad_norm": 5.946425914764404, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.5593, + "step": 1 + }, + { + "epoch": 0.000974342318934719, + "grad_norm": 5.445633411407471, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.4606, + "step": 2 + }, + { + "epoch": 0.0014615134784020786, + "grad_norm": 5.560287952423096, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4527, + "step": 3 + }, + { + "epoch": 0.001948684637869438, + "grad_norm": 5.473464012145996, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.5261, + "step": 4 + }, + { + "epoch": 0.0024358557973367975, + "grad_norm": 5.982491970062256, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.523, + "step": 5 + }, + { + "epoch": 0.002923026956804157, + "grad_norm": 5.706677436828613, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4752, + "step": 6 + }, + { + "epoch": 0.003410198116271517, + "grad_norm": 5.859375476837158, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4534, + "step": 7 + }, + { + "epoch": 0.003897369275738876, + "grad_norm": 5.490967750549316, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.5146, + "step": 8 + }, + { + "epoch": 0.004384540435206236, + "grad_norm": 5.191905498504639, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.535, + "step": 9 + }, + { + "epoch": 0.004871711594673595, + "grad_norm": 5.527923107147217, + "learning_rate": 5.000000000000001e-07, + "loss": 1.5085, + "step": 10 + }, + { + "epoch": 0.005358882754140955, + "grad_norm": 5.797060012817383, + "learning_rate": 5.5e-07, + "loss": 1.5359, + "step": 11 + }, + { + "epoch": 0.005846053913608314, + "grad_norm": 5.807928085327148, + "learning_rate": 6.000000000000001e-07, + "loss": 1.5697, + "step": 12 + }, + { + "epoch": 0.006333225073075674, + "grad_norm": 5.768320560455322, + "learning_rate": 6.5e-07, + "loss": 1.4542, + "step": 13 + }, + { + "epoch": 0.006820396232543034, + "grad_norm": 5.768290042877197, + "learning_rate": 7.000000000000001e-07, + "loss": 1.5397, + "step": 14 + }, + { + "epoch": 0.007307567392010393, + "grad_norm": 5.508993148803711, + "learning_rate": 7.5e-07, + "loss": 1.6136, + "step": 15 + }, + { + "epoch": 0.007794738551477752, + "grad_norm": 5.312325477600098, + "learning_rate": 8.000000000000001e-07, + "loss": 1.4704, + "step": 16 + }, + { + "epoch": 0.008281909710945112, + "grad_norm": 5.432854175567627, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4709, + "step": 17 + }, + { + "epoch": 0.008769080870412472, + "grad_norm": 5.390946865081787, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4136, + "step": 18 + }, + { + "epoch": 0.00925625202987983, + "grad_norm": 5.483653545379639, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5113, + "step": 19 + }, + { + "epoch": 0.00974342318934719, + "grad_norm": 5.077030658721924, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4578, + "step": 20 + }, + { + "epoch": 0.010230594348814551, + "grad_norm": 5.132854461669922, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4998, + "step": 21 + }, + { + "epoch": 0.01071776550828191, + "grad_norm": 4.747710704803467, + "learning_rate": 1.1e-06, + "loss": 1.5041, + "step": 22 + }, + { + "epoch": 0.01120493666774927, + "grad_norm": 4.940703868865967, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.4211, + "step": 23 + }, + { + "epoch": 0.011692107827216629, + "grad_norm": 5.1911773681640625, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.4115, + "step": 24 + }, + { + "epoch": 0.012179278986683988, + "grad_norm": 4.377867221832275, + "learning_rate": 1.25e-06, + "loss": 1.3275, + "step": 25 + }, + { + "epoch": 0.012666450146151347, + "grad_norm": 4.47191858291626, + "learning_rate": 1.3e-06, + "loss": 1.4002, + "step": 26 + }, + { + "epoch": 0.013153621305618708, + "grad_norm": 6.754418849945068, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.5266, + "step": 27 + }, + { + "epoch": 0.013640792465086067, + "grad_norm": 4.353283405303955, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3703, + "step": 28 + }, + { + "epoch": 0.014127963624553427, + "grad_norm": 5.387121200561523, + "learning_rate": 1.45e-06, + "loss": 1.381, + "step": 29 + }, + { + "epoch": 0.014615134784020786, + "grad_norm": 4.188816070556641, + "learning_rate": 1.5e-06, + "loss": 1.3162, + "step": 30 + }, + { + "epoch": 0.015102305943488145, + "grad_norm": 4.31308126449585, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.2944, + "step": 31 + }, + { + "epoch": 0.015589477102955504, + "grad_norm": 4.509305000305176, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.3902, + "step": 32 + }, + { + "epoch": 0.016076648262422864, + "grad_norm": 4.434366226196289, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.191, + "step": 33 + }, + { + "epoch": 0.016563819421890225, + "grad_norm": 4.176339626312256, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2362, + "step": 34 + }, + { + "epoch": 0.017050990581357582, + "grad_norm": 4.190090656280518, + "learning_rate": 1.75e-06, + "loss": 1.2479, + "step": 35 + }, + { + "epoch": 0.017538161740824943, + "grad_norm": 3.9618754386901855, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.1941, + "step": 36 + }, + { + "epoch": 0.018025332900292304, + "grad_norm": 3.46006178855896, + "learning_rate": 1.85e-06, + "loss": 1.1604, + "step": 37 + }, + { + "epoch": 0.01851250405975966, + "grad_norm": 3.583003520965576, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.2056, + "step": 38 + }, + { + "epoch": 0.018999675219227023, + "grad_norm": 3.5080394744873047, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.2165, + "step": 39 + }, + { + "epoch": 0.01948684637869438, + "grad_norm": 3.5452187061309814, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.083, + "step": 40 + }, + { + "epoch": 0.01997401753816174, + "grad_norm": 3.866508722305298, + "learning_rate": 2.05e-06, + "loss": 1.0542, + "step": 41 + }, + { + "epoch": 0.020461188697629102, + "grad_norm": 4.184366703033447, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.2397, + "step": 42 + }, + { + "epoch": 0.02094835985709646, + "grad_norm": 3.9701080322265625, + "learning_rate": 2.15e-06, + "loss": 1.2578, + "step": 43 + }, + { + "epoch": 0.02143553101656382, + "grad_norm": 3.9603922367095947, + "learning_rate": 2.2e-06, + "loss": 1.1443, + "step": 44 + }, + { + "epoch": 0.021922702176031178, + "grad_norm": 4.203791618347168, + "learning_rate": 2.25e-06, + "loss": 1.0998, + "step": 45 + }, + { + "epoch": 0.02240987333549854, + "grad_norm": 3.8709661960601807, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.2485, + "step": 46 + }, + { + "epoch": 0.022897044494965896, + "grad_norm": 3.652940273284912, + "learning_rate": 2.35e-06, + "loss": 1.0723, + "step": 47 + }, + { + "epoch": 0.023384215654433257, + "grad_norm": 4.0225510597229, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0658, + "step": 48 + }, + { + "epoch": 0.02387138681390062, + "grad_norm": 3.3929052352905273, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0853, + "step": 49 + }, + { + "epoch": 0.024358557973367976, + "grad_norm": 3.0837128162384033, + "learning_rate": 2.5e-06, + "loss": 0.9808, + "step": 50 + }, + { + "epoch": 0.024845729132835337, + "grad_norm": 3.8023931980133057, + "learning_rate": 2.55e-06, + "loss": 1.0352, + "step": 51 + }, + { + "epoch": 0.025332900292302694, + "grad_norm": 3.1048154830932617, + "learning_rate": 2.6e-06, + "loss": 1.0028, + "step": 52 + }, + { + "epoch": 0.025820071451770055, + "grad_norm": 3.6437830924987793, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0345, + "step": 53 + }, + { + "epoch": 0.026307242611237416, + "grad_norm": 3.299567461013794, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9814, + "step": 54 + }, + { + "epoch": 0.026794413770704774, + "grad_norm": 3.2365713119506836, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0671, + "step": 55 + }, + { + "epoch": 0.027281584930172135, + "grad_norm": 3.1641845703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9618, + "step": 56 + }, + { + "epoch": 0.027768756089639492, + "grad_norm": 3.265040397644043, + "learning_rate": 2.85e-06, + "loss": 1.0449, + "step": 57 + }, + { + "epoch": 0.028255927249106853, + "grad_norm": 2.8479838371276855, + "learning_rate": 2.9e-06, + "loss": 1.0409, + "step": 58 + }, + { + "epoch": 0.02874309840857421, + "grad_norm": 3.100071668624878, + "learning_rate": 2.95e-06, + "loss": 1.0306, + "step": 59 + }, + { + "epoch": 0.029230269568041572, + "grad_norm": 3.631537675857544, + "learning_rate": 3e-06, + "loss": 1.0105, + "step": 60 + }, + { + "epoch": 0.029717440727508933, + "grad_norm": 3.702788829803467, + "learning_rate": 3.05e-06, + "loss": 1.0938, + "step": 61 + }, + { + "epoch": 0.03020461188697629, + "grad_norm": 2.996607542037964, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.9835, + "step": 62 + }, + { + "epoch": 0.03069178304644365, + "grad_norm": 3.7307591438293457, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.0041, + "step": 63 + }, + { + "epoch": 0.03117895420591101, + "grad_norm": 4.302492618560791, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1092, + "step": 64 + }, + { + "epoch": 0.03166612536537837, + "grad_norm": 3.4153385162353516, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0701, + "step": 65 + }, + { + "epoch": 0.03215329652484573, + "grad_norm": 3.4296133518218994, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.957, + "step": 66 + }, + { + "epoch": 0.03264046768431309, + "grad_norm": 3.2810206413269043, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.9495, + "step": 67 + }, + { + "epoch": 0.03312763884378045, + "grad_norm": 2.8856170177459717, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.0142, + "step": 68 + }, + { + "epoch": 0.03361481000324781, + "grad_norm": 3.3225011825561523, + "learning_rate": 3.45e-06, + "loss": 1.0929, + "step": 69 + }, + { + "epoch": 0.034101981162715164, + "grad_norm": 4.14732027053833, + "learning_rate": 3.5e-06, + "loss": 0.9973, + "step": 70 + }, + { + "epoch": 0.03458915232218253, + "grad_norm": 3.167543888092041, + "learning_rate": 3.5500000000000003e-06, + "loss": 1.0188, + "step": 71 + }, + { + "epoch": 0.035076323481649886, + "grad_norm": 3.4668428897857666, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.0102, + "step": 72 + }, + { + "epoch": 0.035563494641117244, + "grad_norm": 3.363309383392334, + "learning_rate": 3.65e-06, + "loss": 0.9051, + "step": 73 + }, + { + "epoch": 0.03605066580058461, + "grad_norm": 3.634291410446167, + "learning_rate": 3.7e-06, + "loss": 1.0927, + "step": 74 + }, + { + "epoch": 0.036537836960051966, + "grad_norm": 3.1377744674682617, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9, + "step": 75 + }, + { + "epoch": 0.03702500811951932, + "grad_norm": 3.007258653640747, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.9638, + "step": 76 + }, + { + "epoch": 0.03751217927898668, + "grad_norm": 3.4406495094299316, + "learning_rate": 3.85e-06, + "loss": 1.0382, + "step": 77 + }, + { + "epoch": 0.037999350438454045, + "grad_norm": 3.4302561283111572, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9945, + "step": 78 + }, + { + "epoch": 0.0384865215979214, + "grad_norm": 2.9176478385925293, + "learning_rate": 3.95e-06, + "loss": 0.9163, + "step": 79 + }, + { + "epoch": 0.03897369275738876, + "grad_norm": 3.527169942855835, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0216, + "step": 80 + }, + { + "epoch": 0.039460863916856125, + "grad_norm": 3.102243185043335, + "learning_rate": 4.05e-06, + "loss": 1.0982, + "step": 81 + }, + { + "epoch": 0.03994803507632348, + "grad_norm": 3.011439085006714, + "learning_rate": 4.1e-06, + "loss": 0.9466, + "step": 82 + }, + { + "epoch": 0.04043520623579084, + "grad_norm": 3.754337787628174, + "learning_rate": 4.15e-06, + "loss": 0.9937, + "step": 83 + }, + { + "epoch": 0.040922377395258204, + "grad_norm": 3.3973500728607178, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.9962, + "step": 84 + }, + { + "epoch": 0.04140954855472556, + "grad_norm": 3.5010437965393066, + "learning_rate": 4.25e-06, + "loss": 0.9936, + "step": 85 + }, + { + "epoch": 0.04189671971419292, + "grad_norm": 3.094677209854126, + "learning_rate": 4.3e-06, + "loss": 0.9255, + "step": 86 + }, + { + "epoch": 0.042383890873660277, + "grad_norm": 3.2721681594848633, + "learning_rate": 4.350000000000001e-06, + "loss": 0.9276, + "step": 87 + }, + { + "epoch": 0.04287106203312764, + "grad_norm": 3.1438446044921875, + "learning_rate": 4.4e-06, + "loss": 0.9035, + "step": 88 + }, + { + "epoch": 0.043358233192595, + "grad_norm": 3.0705089569091797, + "learning_rate": 4.450000000000001e-06, + "loss": 0.957, + "step": 89 + }, + { + "epoch": 0.043845404352062356, + "grad_norm": 3.9328744411468506, + "learning_rate": 4.5e-06, + "loss": 0.985, + "step": 90 + }, + { + "epoch": 0.04433257551152972, + "grad_norm": 3.7138543128967285, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.8824, + "step": 91 + }, + { + "epoch": 0.04481974667099708, + "grad_norm": 3.5230917930603027, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9753, + "step": 92 + }, + { + "epoch": 0.045306917830464435, + "grad_norm": 3.260451555252075, + "learning_rate": 4.65e-06, + "loss": 0.9122, + "step": 93 + }, + { + "epoch": 0.04579408898993179, + "grad_norm": 3.270282030105591, + "learning_rate": 4.7e-06, + "loss": 0.9122, + "step": 94 + }, + { + "epoch": 0.04628126014939916, + "grad_norm": 3.2026121616363525, + "learning_rate": 4.75e-06, + "loss": 0.9256, + "step": 95 + }, + { + "epoch": 0.046768431308866515, + "grad_norm": 2.956109046936035, + "learning_rate": 4.800000000000001e-06, + "loss": 0.848, + "step": 96 + }, + { + "epoch": 0.04725560246833387, + "grad_norm": 3.5045127868652344, + "learning_rate": 4.85e-06, + "loss": 0.9216, + "step": 97 + }, + { + "epoch": 0.04774277362780124, + "grad_norm": 3.467245578765869, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.9826, + "step": 98 + }, + { + "epoch": 0.048229944787268594, + "grad_norm": 3.6045069694519043, + "learning_rate": 4.95e-06, + "loss": 0.8863, + "step": 99 + }, + { + "epoch": 0.04871711594673595, + "grad_norm": 2.9389102458953857, + "learning_rate": 5e-06, + "loss": 0.8814, + "step": 100 + }, + { + "epoch": 0.04920428710620331, + "grad_norm": 3.482452869415283, + "learning_rate": 4.999999917275117e-06, + "loss": 0.9817, + "step": 101 + }, + { + "epoch": 0.049691458265670674, + "grad_norm": 3.0891847610473633, + "learning_rate": 4.999999669100472e-06, + "loss": 0.8796, + "step": 102 + }, + { + "epoch": 0.05017862942513803, + "grad_norm": 3.5698392391204834, + "learning_rate": 4.9999992554760816e-06, + "loss": 0.8558, + "step": 103 + }, + { + "epoch": 0.05066580058460539, + "grad_norm": 2.8370072841644287, + "learning_rate": 4.9999986764019725e-06, + "loss": 0.8922, + "step": 104 + }, + { + "epoch": 0.05115297174407275, + "grad_norm": 4.343459129333496, + "learning_rate": 4.999997931878184e-06, + "loss": 1.0405, + "step": 105 + }, + { + "epoch": 0.05164014290354011, + "grad_norm": 3.8089804649353027, + "learning_rate": 4.9999970219047655e-06, + "loss": 0.9619, + "step": 106 + }, + { + "epoch": 0.05212731406300747, + "grad_norm": 3.000192880630493, + "learning_rate": 4.999995946481777e-06, + "loss": 0.8708, + "step": 107 + }, + { + "epoch": 0.05261448522247483, + "grad_norm": 3.886390447616577, + "learning_rate": 4.99999470560929e-06, + "loss": 0.9912, + "step": 108 + }, + { + "epoch": 0.05310165638194219, + "grad_norm": 3.1797807216644287, + "learning_rate": 4.9999932992873855e-06, + "loss": 0.8967, + "step": 109 + }, + { + "epoch": 0.05358882754140955, + "grad_norm": 3.0956058502197266, + "learning_rate": 4.999991727516157e-06, + "loss": 0.9248, + "step": 110 + }, + { + "epoch": 0.054075998700876905, + "grad_norm": 3.2430050373077393, + "learning_rate": 4.999989990295709e-06, + "loss": 0.8996, + "step": 111 + }, + { + "epoch": 0.05456316986034427, + "grad_norm": 3.1134862899780273, + "learning_rate": 4.9999880876261565e-06, + "loss": 0.8843, + "step": 112 + }, + { + "epoch": 0.05505034101981163, + "grad_norm": 3.037472724914551, + "learning_rate": 4.999986019507626e-06, + "loss": 0.8518, + "step": 113 + }, + { + "epoch": 0.055537512179278985, + "grad_norm": 3.3779070377349854, + "learning_rate": 4.999983785940252e-06, + "loss": 0.8862, + "step": 114 + }, + { + "epoch": 0.05602468333874635, + "grad_norm": 3.2379722595214844, + "learning_rate": 4.9999813869241845e-06, + "loss": 0.9583, + "step": 115 + }, + { + "epoch": 0.05651185449821371, + "grad_norm": 3.217869281768799, + "learning_rate": 4.9999788224595815e-06, + "loss": 0.9687, + "step": 116 + }, + { + "epoch": 0.056999025657681064, + "grad_norm": 3.246709108352661, + "learning_rate": 4.999976092546612e-06, + "loss": 0.9085, + "step": 117 + }, + { + "epoch": 0.05748619681714842, + "grad_norm": 3.3147687911987305, + "learning_rate": 4.999973197185459e-06, + "loss": 0.9696, + "step": 118 + }, + { + "epoch": 0.057973367976615786, + "grad_norm": 3.2359671592712402, + "learning_rate": 4.999970136376312e-06, + "loss": 0.839, + "step": 119 + }, + { + "epoch": 0.058460539136083144, + "grad_norm": 9.301551818847656, + "learning_rate": 4.999966910119374e-06, + "loss": 0.9454, + "step": 120 + }, + { + "epoch": 0.0589477102955505, + "grad_norm": 3.589207649230957, + "learning_rate": 4.9999635184148585e-06, + "loss": 0.9087, + "step": 121 + }, + { + "epoch": 0.059434881455017866, + "grad_norm": 3.8146495819091797, + "learning_rate": 4.999959961262991e-06, + "loss": 0.9507, + "step": 122 + }, + { + "epoch": 0.05992205261448522, + "grad_norm": 3.379023313522339, + "learning_rate": 4.9999562386640055e-06, + "loss": 0.8315, + "step": 123 + }, + { + "epoch": 0.06040922377395258, + "grad_norm": 3.059197187423706, + "learning_rate": 4.999952350618149e-06, + "loss": 0.887, + "step": 124 + }, + { + "epoch": 0.06089639493341994, + "grad_norm": 3.259502410888672, + "learning_rate": 4.999948297125679e-06, + "loss": 0.8793, + "step": 125 + }, + { + "epoch": 0.0613835660928873, + "grad_norm": 3.2446255683898926, + "learning_rate": 4.999944078186863e-06, + "loss": 0.8398, + "step": 126 + }, + { + "epoch": 0.06187073725235466, + "grad_norm": 3.5450680255889893, + "learning_rate": 4.999939693801981e-06, + "loss": 0.9517, + "step": 127 + }, + { + "epoch": 0.06235790841182202, + "grad_norm": 3.7202939987182617, + "learning_rate": 4.999935143971323e-06, + "loss": 0.9883, + "step": 128 + }, + { + "epoch": 0.06284507957128938, + "grad_norm": 3.31451416015625, + "learning_rate": 4.99993042869519e-06, + "loss": 0.8509, + "step": 129 + }, + { + "epoch": 0.06333225073075674, + "grad_norm": 3.430546522140503, + "learning_rate": 4.999925547973893e-06, + "loss": 0.9789, + "step": 130 + }, + { + "epoch": 0.0638194218902241, + "grad_norm": 3.173535108566284, + "learning_rate": 4.999920501807757e-06, + "loss": 0.9049, + "step": 131 + }, + { + "epoch": 0.06430659304969145, + "grad_norm": 3.350964069366455, + "learning_rate": 4.999915290197115e-06, + "loss": 0.8863, + "step": 132 + }, + { + "epoch": 0.06479376420915882, + "grad_norm": 3.065786600112915, + "learning_rate": 4.999909913142311e-06, + "loss": 0.8421, + "step": 133 + }, + { + "epoch": 0.06528093536862618, + "grad_norm": 2.947803497314453, + "learning_rate": 4.9999043706437024e-06, + "loss": 0.8088, + "step": 134 + }, + { + "epoch": 0.06576810652809353, + "grad_norm": 2.96244478225708, + "learning_rate": 4.999898662701656e-06, + "loss": 0.8284, + "step": 135 + }, + { + "epoch": 0.0662552776875609, + "grad_norm": 3.0162854194641113, + "learning_rate": 4.999892789316547e-06, + "loss": 0.8754, + "step": 136 + }, + { + "epoch": 0.06674244884702826, + "grad_norm": 3.3704018592834473, + "learning_rate": 4.999886750488768e-06, + "loss": 0.8693, + "step": 137 + }, + { + "epoch": 0.06722962000649561, + "grad_norm": 3.7147414684295654, + "learning_rate": 4.999880546218716e-06, + "loss": 0.9688, + "step": 138 + }, + { + "epoch": 0.06771679116596298, + "grad_norm": 3.636547327041626, + "learning_rate": 4.9998741765068015e-06, + "loss": 0.8006, + "step": 139 + }, + { + "epoch": 0.06820396232543033, + "grad_norm": 3.60339617729187, + "learning_rate": 4.999867641353447e-06, + "loss": 0.8935, + "step": 140 + }, + { + "epoch": 0.06869113348489769, + "grad_norm": 3.6125295162200928, + "learning_rate": 4.999860940759086e-06, + "loss": 0.8341, + "step": 141 + }, + { + "epoch": 0.06917830464436506, + "grad_norm": 2.730962038040161, + "learning_rate": 4.99985407472416e-06, + "loss": 0.7529, + "step": 142 + }, + { + "epoch": 0.06966547580383241, + "grad_norm": 3.239326000213623, + "learning_rate": 4.999847043249124e-06, + "loss": 0.8857, + "step": 143 + }, + { + "epoch": 0.07015264696329977, + "grad_norm": 3.3659021854400635, + "learning_rate": 4.999839846334443e-06, + "loss": 0.8159, + "step": 144 + }, + { + "epoch": 0.07063981812276714, + "grad_norm": 3.816138505935669, + "learning_rate": 4.999832483980594e-06, + "loss": 0.9351, + "step": 145 + }, + { + "epoch": 0.07112698928223449, + "grad_norm": 3.3815481662750244, + "learning_rate": 4.9998249561880645e-06, + "loss": 0.8318, + "step": 146 + }, + { + "epoch": 0.07161416044170185, + "grad_norm": 3.119555950164795, + "learning_rate": 4.999817262957352e-06, + "loss": 0.7837, + "step": 147 + }, + { + "epoch": 0.07210133160116922, + "grad_norm": 2.9735164642333984, + "learning_rate": 4.999809404288966e-06, + "loss": 0.9028, + "step": 148 + }, + { + "epoch": 0.07258850276063657, + "grad_norm": 3.825808048248291, + "learning_rate": 4.999801380183425e-06, + "loss": 0.8913, + "step": 149 + }, + { + "epoch": 0.07307567392010393, + "grad_norm": 3.3037729263305664, + "learning_rate": 4.999793190641263e-06, + "loss": 0.9509, + "step": 150 + }, + { + "epoch": 0.0735628450795713, + "grad_norm": 3.735775947570801, + "learning_rate": 4.999784835663021e-06, + "loss": 0.8384, + "step": 151 + }, + { + "epoch": 0.07405001623903865, + "grad_norm": 3.867992877960205, + "learning_rate": 4.999776315249249e-06, + "loss": 0.9515, + "step": 152 + }, + { + "epoch": 0.07453718739850601, + "grad_norm": 3.104728937149048, + "learning_rate": 4.9997676294005155e-06, + "loss": 0.8611, + "step": 153 + }, + { + "epoch": 0.07502435855797336, + "grad_norm": 3.158883571624756, + "learning_rate": 4.999758778117392e-06, + "loss": 0.866, + "step": 154 + }, + { + "epoch": 0.07551152971744073, + "grad_norm": 3.0615832805633545, + "learning_rate": 4.9997497614004655e-06, + "loss": 0.8774, + "step": 155 + }, + { + "epoch": 0.07599870087690809, + "grad_norm": 3.321518898010254, + "learning_rate": 4.999740579250333e-06, + "loss": 0.7898, + "step": 156 + }, + { + "epoch": 0.07648587203637544, + "grad_norm": 3.861950635910034, + "learning_rate": 4.999731231667601e-06, + "loss": 0.927, + "step": 157 + }, + { + "epoch": 0.0769730431958428, + "grad_norm": 3.4500153064727783, + "learning_rate": 4.999721718652889e-06, + "loss": 0.9171, + "step": 158 + }, + { + "epoch": 0.07746021435531017, + "grad_norm": 3.480621576309204, + "learning_rate": 4.999712040206827e-06, + "loss": 0.8601, + "step": 159 + }, + { + "epoch": 0.07794738551477752, + "grad_norm": 3.465864419937134, + "learning_rate": 4.999702196330054e-06, + "loss": 0.8179, + "step": 160 + }, + { + "epoch": 0.07843455667424488, + "grad_norm": 3.1263835430145264, + "learning_rate": 4.9996921870232226e-06, + "loss": 0.77, + "step": 161 + }, + { + "epoch": 0.07892172783371225, + "grad_norm": 3.227463960647583, + "learning_rate": 4.999682012286996e-06, + "loss": 0.8341, + "step": 162 + }, + { + "epoch": 0.0794088989931796, + "grad_norm": 3.2053098678588867, + "learning_rate": 4.999671672122046e-06, + "loss": 0.8105, + "step": 163 + }, + { + "epoch": 0.07989607015264696, + "grad_norm": 3.161653518676758, + "learning_rate": 4.999661166529057e-06, + "loss": 0.8085, + "step": 164 + }, + { + "epoch": 0.08038324131211433, + "grad_norm": 3.4612393379211426, + "learning_rate": 4.999650495508726e-06, + "loss": 0.8015, + "step": 165 + }, + { + "epoch": 0.08087041247158168, + "grad_norm": 2.978531837463379, + "learning_rate": 4.999639659061757e-06, + "loss": 0.7324, + "step": 166 + }, + { + "epoch": 0.08135758363104904, + "grad_norm": 3.628941774368286, + "learning_rate": 4.9996286571888685e-06, + "loss": 0.9406, + "step": 167 + }, + { + "epoch": 0.08184475479051641, + "grad_norm": 4.08802604675293, + "learning_rate": 4.999617489890788e-06, + "loss": 0.801, + "step": 168 + }, + { + "epoch": 0.08233192594998376, + "grad_norm": 3.6072585582733154, + "learning_rate": 4.999606157168255e-06, + "loss": 0.8419, + "step": 169 + }, + { + "epoch": 0.08281909710945112, + "grad_norm": 3.3501553535461426, + "learning_rate": 4.999594659022019e-06, + "loss": 0.8329, + "step": 170 + }, + { + "epoch": 0.08330626826891847, + "grad_norm": 2.943817138671875, + "learning_rate": 4.999582995452842e-06, + "loss": 0.7088, + "step": 171 + }, + { + "epoch": 0.08379343942838584, + "grad_norm": 3.2661526203155518, + "learning_rate": 4.999571166461494e-06, + "loss": 0.7739, + "step": 172 + }, + { + "epoch": 0.0842806105878532, + "grad_norm": 3.6151397228240967, + "learning_rate": 4.9995591720487595e-06, + "loss": 0.8574, + "step": 173 + }, + { + "epoch": 0.08476778174732055, + "grad_norm": 3.7293925285339355, + "learning_rate": 4.999547012215432e-06, + "loss": 0.794, + "step": 174 + }, + { + "epoch": 0.08525495290678792, + "grad_norm": 3.2431507110595703, + "learning_rate": 4.999534686962315e-06, + "loss": 0.8934, + "step": 175 + }, + { + "epoch": 0.08574212406625528, + "grad_norm": 2.846268653869629, + "learning_rate": 4.999522196290226e-06, + "loss": 0.841, + "step": 176 + }, + { + "epoch": 0.08622929522572263, + "grad_norm": 2.8477535247802734, + "learning_rate": 4.999509540199992e-06, + "loss": 0.7858, + "step": 177 + }, + { + "epoch": 0.08671646638519, + "grad_norm": 3.25290584564209, + "learning_rate": 4.999496718692447e-06, + "loss": 0.8242, + "step": 178 + }, + { + "epoch": 0.08720363754465736, + "grad_norm": 3.2920644283294678, + "learning_rate": 4.999483731768443e-06, + "loss": 0.794, + "step": 179 + }, + { + "epoch": 0.08769080870412471, + "grad_norm": 3.036412239074707, + "learning_rate": 4.999470579428839e-06, + "loss": 0.8293, + "step": 180 + }, + { + "epoch": 0.08817797986359208, + "grad_norm": 3.367645263671875, + "learning_rate": 4.999457261674504e-06, + "loss": 0.7962, + "step": 181 + }, + { + "epoch": 0.08866515102305944, + "grad_norm": 3.6690030097961426, + "learning_rate": 4.99944377850632e-06, + "loss": 0.9328, + "step": 182 + }, + { + "epoch": 0.08915232218252679, + "grad_norm": 3.3169894218444824, + "learning_rate": 4.9994301299251805e-06, + "loss": 0.8179, + "step": 183 + }, + { + "epoch": 0.08963949334199416, + "grad_norm": 3.437082529067993, + "learning_rate": 4.999416315931986e-06, + "loss": 0.7742, + "step": 184 + }, + { + "epoch": 0.09012666450146152, + "grad_norm": 3.3023359775543213, + "learning_rate": 4.999402336527655e-06, + "loss": 0.8402, + "step": 185 + }, + { + "epoch": 0.09061383566092887, + "grad_norm": 3.340036392211914, + "learning_rate": 4.9993881917131086e-06, + "loss": 0.8292, + "step": 186 + }, + { + "epoch": 0.09110100682039624, + "grad_norm": 3.1481611728668213, + "learning_rate": 4.999373881489284e-06, + "loss": 0.7699, + "step": 187 + }, + { + "epoch": 0.09158817797986359, + "grad_norm": 3.1452813148498535, + "learning_rate": 4.9993594058571295e-06, + "loss": 0.7712, + "step": 188 + }, + { + "epoch": 0.09207534913933095, + "grad_norm": 3.7541093826293945, + "learning_rate": 4.999344764817601e-06, + "loss": 0.855, + "step": 189 + }, + { + "epoch": 0.09256252029879831, + "grad_norm": 2.9269936084747314, + "learning_rate": 4.999329958371671e-06, + "loss": 0.7544, + "step": 190 + }, + { + "epoch": 0.09304969145826567, + "grad_norm": 3.1032509803771973, + "learning_rate": 4.999314986520315e-06, + "loss": 0.8016, + "step": 191 + }, + { + "epoch": 0.09353686261773303, + "grad_norm": 3.6601321697235107, + "learning_rate": 4.999299849264526e-06, + "loss": 0.8528, + "step": 192 + }, + { + "epoch": 0.0940240337772004, + "grad_norm": 3.2727208137512207, + "learning_rate": 4.999284546605306e-06, + "loss": 0.8547, + "step": 193 + }, + { + "epoch": 0.09451120493666774, + "grad_norm": 3.0187244415283203, + "learning_rate": 4.9992690785436685e-06, + "loss": 0.7563, + "step": 194 + }, + { + "epoch": 0.09499837609613511, + "grad_norm": 3.6217947006225586, + "learning_rate": 4.999253445080635e-06, + "loss": 0.829, + "step": 195 + }, + { + "epoch": 0.09548554725560247, + "grad_norm": 3.018751621246338, + "learning_rate": 4.999237646217242e-06, + "loss": 0.7659, + "step": 196 + }, + { + "epoch": 0.09597271841506982, + "grad_norm": 4.121409893035889, + "learning_rate": 4.999221681954534e-06, + "loss": 0.9549, + "step": 197 + }, + { + "epoch": 0.09645988957453719, + "grad_norm": 2.9107954502105713, + "learning_rate": 4.999205552293568e-06, + "loss": 0.7599, + "step": 198 + }, + { + "epoch": 0.09694706073400455, + "grad_norm": 3.1102752685546875, + "learning_rate": 4.999189257235411e-06, + "loss": 0.7374, + "step": 199 + }, + { + "epoch": 0.0974342318934719, + "grad_norm": 3.860538959503174, + "learning_rate": 4.999172796781142e-06, + "loss": 0.8679, + "step": 200 + }, + { + "epoch": 0.09792140305293927, + "grad_norm": 3.5360591411590576, + "learning_rate": 4.999156170931849e-06, + "loss": 0.8137, + "step": 201 + }, + { + "epoch": 0.09840857421240662, + "grad_norm": 3.048168420791626, + "learning_rate": 4.999139379688634e-06, + "loss": 0.856, + "step": 202 + }, + { + "epoch": 0.09889574537187398, + "grad_norm": 3.749922513961792, + "learning_rate": 4.999122423052608e-06, + "loss": 0.8591, + "step": 203 + }, + { + "epoch": 0.09938291653134135, + "grad_norm": 3.2922539710998535, + "learning_rate": 4.999105301024892e-06, + "loss": 0.6767, + "step": 204 + }, + { + "epoch": 0.0998700876908087, + "grad_norm": 3.1477956771850586, + "learning_rate": 4.99908801360662e-06, + "loss": 0.8567, + "step": 205 + }, + { + "epoch": 0.10035725885027606, + "grad_norm": 3.110806465148926, + "learning_rate": 4.999070560798937e-06, + "loss": 0.7942, + "step": 206 + }, + { + "epoch": 0.10084443000974343, + "grad_norm": 3.3488848209381104, + "learning_rate": 4.999052942602997e-06, + "loss": 0.8489, + "step": 207 + }, + { + "epoch": 0.10133160116921078, + "grad_norm": 2.957911252975464, + "learning_rate": 4.999035159019965e-06, + "loss": 0.7421, + "step": 208 + }, + { + "epoch": 0.10181877232867814, + "grad_norm": 3.266347885131836, + "learning_rate": 4.999017210051019e-06, + "loss": 0.8407, + "step": 209 + }, + { + "epoch": 0.1023059434881455, + "grad_norm": 2.947484254837036, + "learning_rate": 4.998999095697347e-06, + "loss": 0.7946, + "step": 210 + }, + { + "epoch": 0.10279311464761286, + "grad_norm": 3.4025514125823975, + "learning_rate": 4.998980815960147e-06, + "loss": 0.8949, + "step": 211 + }, + { + "epoch": 0.10328028580708022, + "grad_norm": 3.513850450515747, + "learning_rate": 4.99896237084063e-06, + "loss": 0.8383, + "step": 212 + }, + { + "epoch": 0.10376745696654759, + "grad_norm": 4.565975189208984, + "learning_rate": 4.998943760340015e-06, + "loss": 0.7771, + "step": 213 + }, + { + "epoch": 0.10425462812601494, + "grad_norm": 3.561448574066162, + "learning_rate": 4.998924984459535e-06, + "loss": 0.8395, + "step": 214 + }, + { + "epoch": 0.1047417992854823, + "grad_norm": 3.4219553470611572, + "learning_rate": 4.998906043200433e-06, + "loss": 0.8809, + "step": 215 + }, + { + "epoch": 0.10522897044494967, + "grad_norm": 3.5774660110473633, + "learning_rate": 4.998886936563961e-06, + "loss": 0.8283, + "step": 216 + }, + { + "epoch": 0.10571614160441702, + "grad_norm": 3.0910964012145996, + "learning_rate": 4.9988676645513855e-06, + "loss": 0.91, + "step": 217 + }, + { + "epoch": 0.10620331276388438, + "grad_norm": 3.2896907329559326, + "learning_rate": 4.99884822716398e-06, + "loss": 0.8618, + "step": 218 + }, + { + "epoch": 0.10669048392335173, + "grad_norm": 3.287799119949341, + "learning_rate": 4.998828624403031e-06, + "loss": 0.749, + "step": 219 + }, + { + "epoch": 0.1071776550828191, + "grad_norm": 3.7398433685302734, + "learning_rate": 4.998808856269837e-06, + "loss": 0.8221, + "step": 220 + }, + { + "epoch": 0.10766482624228646, + "grad_norm": 4.054739475250244, + "learning_rate": 4.998788922765705e-06, + "loss": 0.8182, + "step": 221 + }, + { + "epoch": 0.10815199740175381, + "grad_norm": 3.2944395542144775, + "learning_rate": 4.998768823891955e-06, + "loss": 0.7374, + "step": 222 + }, + { + "epoch": 0.10863916856122117, + "grad_norm": 3.289983034133911, + "learning_rate": 4.998748559649916e-06, + "loss": 0.7237, + "step": 223 + }, + { + "epoch": 0.10912633972068854, + "grad_norm": 3.2896676063537598, + "learning_rate": 4.9987281300409315e-06, + "loss": 0.8339, + "step": 224 + }, + { + "epoch": 0.10961351088015589, + "grad_norm": 2.745098352432251, + "learning_rate": 4.9987075350663515e-06, + "loss": 0.8352, + "step": 225 + }, + { + "epoch": 0.11010068203962325, + "grad_norm": 3.7420592308044434, + "learning_rate": 4.998686774727539e-06, + "loss": 0.9096, + "step": 226 + }, + { + "epoch": 0.11058785319909062, + "grad_norm": 3.1445212364196777, + "learning_rate": 4.998665849025869e-06, + "loss": 0.8261, + "step": 227 + }, + { + "epoch": 0.11107502435855797, + "grad_norm": 3.26448392868042, + "learning_rate": 4.998644757962725e-06, + "loss": 0.8015, + "step": 228 + }, + { + "epoch": 0.11156219551802533, + "grad_norm": 3.8933675289154053, + "learning_rate": 4.998623501539504e-06, + "loss": 0.8853, + "step": 229 + }, + { + "epoch": 0.1120493666774927, + "grad_norm": 3.027663230895996, + "learning_rate": 4.998602079757614e-06, + "loss": 0.7388, + "step": 230 + }, + { + "epoch": 0.11253653783696005, + "grad_norm": 3.061408758163452, + "learning_rate": 4.9985804926184686e-06, + "loss": 0.865, + "step": 231 + }, + { + "epoch": 0.11302370899642741, + "grad_norm": 3.0314342975616455, + "learning_rate": 4.9985587401235005e-06, + "loss": 0.8702, + "step": 232 + }, + { + "epoch": 0.11351088015589478, + "grad_norm": 3.3834195137023926, + "learning_rate": 4.998536822274147e-06, + "loss": 0.7455, + "step": 233 + }, + { + "epoch": 0.11399805131536213, + "grad_norm": 3.5031490325927734, + "learning_rate": 4.998514739071859e-06, + "loss": 0.9494, + "step": 234 + }, + { + "epoch": 0.11448522247482949, + "grad_norm": 2.9739365577697754, + "learning_rate": 4.998492490518099e-06, + "loss": 0.8263, + "step": 235 + }, + { + "epoch": 0.11497239363429684, + "grad_norm": 3.3670201301574707, + "learning_rate": 4.998470076614338e-06, + "loss": 0.8552, + "step": 236 + }, + { + "epoch": 0.11545956479376421, + "grad_norm": 3.184229612350464, + "learning_rate": 4.99844749736206e-06, + "loss": 0.8975, + "step": 237 + }, + { + "epoch": 0.11594673595323157, + "grad_norm": 3.5422258377075195, + "learning_rate": 4.998424752762759e-06, + "loss": 0.8657, + "step": 238 + }, + { + "epoch": 0.11643390711269892, + "grad_norm": 3.1927084922790527, + "learning_rate": 4.998401842817942e-06, + "loss": 0.7653, + "step": 239 + }, + { + "epoch": 0.11692107827216629, + "grad_norm": 3.1916470527648926, + "learning_rate": 4.998378767529122e-06, + "loss": 0.7904, + "step": 240 + }, + { + "epoch": 0.11740824943163365, + "grad_norm": 3.3462588787078857, + "learning_rate": 4.998355526897829e-06, + "loss": 0.8391, + "step": 241 + }, + { + "epoch": 0.117895420591101, + "grad_norm": 3.3479907512664795, + "learning_rate": 4.998332120925598e-06, + "loss": 0.7772, + "step": 242 + }, + { + "epoch": 0.11838259175056837, + "grad_norm": 3.004615068435669, + "learning_rate": 4.9983085496139825e-06, + "loss": 0.8042, + "step": 243 + }, + { + "epoch": 0.11886976291003573, + "grad_norm": 3.531154155731201, + "learning_rate": 4.998284812964538e-06, + "loss": 0.83, + "step": 244 + }, + { + "epoch": 0.11935693406950308, + "grad_norm": 3.5921900272369385, + "learning_rate": 4.998260910978838e-06, + "loss": 0.9098, + "step": 245 + }, + { + "epoch": 0.11984410522897045, + "grad_norm": 3.5157384872436523, + "learning_rate": 4.998236843658463e-06, + "loss": 0.8417, + "step": 246 + }, + { + "epoch": 0.12033127638843781, + "grad_norm": 3.0616397857666016, + "learning_rate": 4.998212611005007e-06, + "loss": 0.7913, + "step": 247 + }, + { + "epoch": 0.12081844754790516, + "grad_norm": 3.4472525119781494, + "learning_rate": 4.998188213020072e-06, + "loss": 0.8195, + "step": 248 + }, + { + "epoch": 0.12130561870737253, + "grad_norm": 3.559234619140625, + "learning_rate": 4.998163649705275e-06, + "loss": 0.7715, + "step": 249 + }, + { + "epoch": 0.12179278986683988, + "grad_norm": 3.267324447631836, + "learning_rate": 4.99813892106224e-06, + "loss": 0.757, + "step": 250 + }, + { + "epoch": 0.12227996102630724, + "grad_norm": 3.920325756072998, + "learning_rate": 4.998114027092603e-06, + "loss": 0.9173, + "step": 251 + }, + { + "epoch": 0.1227671321857746, + "grad_norm": 3.029587507247925, + "learning_rate": 4.9980889677980126e-06, + "loss": 0.7601, + "step": 252 + }, + { + "epoch": 0.12325430334524196, + "grad_norm": 3.4762823581695557, + "learning_rate": 4.998063743180127e-06, + "loss": 0.8591, + "step": 253 + }, + { + "epoch": 0.12374147450470932, + "grad_norm": 3.5568935871124268, + "learning_rate": 4.998038353240616e-06, + "loss": 0.7907, + "step": 254 + }, + { + "epoch": 0.12422864566417668, + "grad_norm": 3.217931032180786, + "learning_rate": 4.998012797981159e-06, + "loss": 0.7855, + "step": 255 + }, + { + "epoch": 0.12471581682364404, + "grad_norm": 3.1351888179779053, + "learning_rate": 4.997987077403448e-06, + "loss": 0.8601, + "step": 256 + }, + { + "epoch": 0.1252029879831114, + "grad_norm": 3.1680617332458496, + "learning_rate": 4.997961191509184e-06, + "loss": 0.7912, + "step": 257 + }, + { + "epoch": 0.12569015914257875, + "grad_norm": 2.889327049255371, + "learning_rate": 4.997935140300082e-06, + "loss": 0.7997, + "step": 258 + }, + { + "epoch": 0.12617733030204611, + "grad_norm": 3.288731098175049, + "learning_rate": 4.997908923777864e-06, + "loss": 0.8886, + "step": 259 + }, + { + "epoch": 0.12666450146151348, + "grad_norm": 3.093944787979126, + "learning_rate": 4.9978825419442665e-06, + "loss": 0.7515, + "step": 260 + }, + { + "epoch": 0.12715167262098084, + "grad_norm": 3.2139713764190674, + "learning_rate": 4.997855994801035e-06, + "loss": 0.7533, + "step": 261 + }, + { + "epoch": 0.1276388437804482, + "grad_norm": 3.3837687969207764, + "learning_rate": 4.997829282349926e-06, + "loss": 0.8529, + "step": 262 + }, + { + "epoch": 0.12812601493991554, + "grad_norm": 3.012937068939209, + "learning_rate": 4.997802404592708e-06, + "loss": 0.7932, + "step": 263 + }, + { + "epoch": 0.1286131860993829, + "grad_norm": 3.3094675540924072, + "learning_rate": 4.99777536153116e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12910035725885027, + "grad_norm": 3.3423569202423096, + "learning_rate": 4.997748153167071e-06, + "loss": 0.7681, + "step": 265 + }, + { + "epoch": 0.12958752841831764, + "grad_norm": 3.2247314453125, + "learning_rate": 4.997720779502241e-06, + "loss": 0.8674, + "step": 266 + }, + { + "epoch": 0.130074699577785, + "grad_norm": 3.0396835803985596, + "learning_rate": 4.997693240538482e-06, + "loss": 0.7071, + "step": 267 + }, + { + "epoch": 0.13056187073725237, + "grad_norm": 2.7752859592437744, + "learning_rate": 4.997665536277618e-06, + "loss": 0.7827, + "step": 268 + }, + { + "epoch": 0.1310490418967197, + "grad_norm": 3.464545965194702, + "learning_rate": 4.997637666721482e-06, + "loss": 0.7775, + "step": 269 + }, + { + "epoch": 0.13153621305618707, + "grad_norm": 3.3566205501556396, + "learning_rate": 4.997609631871917e-06, + "loss": 0.793, + "step": 270 + }, + { + "epoch": 0.13202338421565443, + "grad_norm": 2.7811262607574463, + "learning_rate": 4.997581431730779e-06, + "loss": 0.7477, + "step": 271 + }, + { + "epoch": 0.1325105553751218, + "grad_norm": 3.4187402725219727, + "learning_rate": 4.997553066299934e-06, + "loss": 0.8409, + "step": 272 + }, + { + "epoch": 0.13299772653458916, + "grad_norm": 3.104055881500244, + "learning_rate": 4.997524535581261e-06, + "loss": 0.8347, + "step": 273 + }, + { + "epoch": 0.13348489769405653, + "grad_norm": 3.425652027130127, + "learning_rate": 4.997495839576646e-06, + "loss": 0.831, + "step": 274 + }, + { + "epoch": 0.13397206885352386, + "grad_norm": 3.528977632522583, + "learning_rate": 4.997466978287989e-06, + "loss": 0.7412, + "step": 275 + }, + { + "epoch": 0.13445924001299123, + "grad_norm": 3.7779102325439453, + "learning_rate": 4.9974379517172e-06, + "loss": 0.8176, + "step": 276 + }, + { + "epoch": 0.1349464111724586, + "grad_norm": 3.1169090270996094, + "learning_rate": 4.9974087598661995e-06, + "loss": 0.7315, + "step": 277 + }, + { + "epoch": 0.13543358233192596, + "grad_norm": 3.2752583026885986, + "learning_rate": 4.99737940273692e-06, + "loss": 0.848, + "step": 278 + }, + { + "epoch": 0.13592075349139332, + "grad_norm": 4.25148344039917, + "learning_rate": 4.9973498803313035e-06, + "loss": 0.8675, + "step": 279 + }, + { + "epoch": 0.13640792465086066, + "grad_norm": 3.8973281383514404, + "learning_rate": 4.997320192651306e-06, + "loss": 0.8091, + "step": 280 + }, + { + "epoch": 0.13689509581032802, + "grad_norm": 3.348989248275757, + "learning_rate": 4.99729033969889e-06, + "loss": 0.716, + "step": 281 + }, + { + "epoch": 0.13738226696979539, + "grad_norm": 3.4347431659698486, + "learning_rate": 4.997260321476032e-06, + "loss": 0.7296, + "step": 282 + }, + { + "epoch": 0.13786943812926275, + "grad_norm": 3.344578742980957, + "learning_rate": 4.997230137984718e-06, + "loss": 0.7359, + "step": 283 + }, + { + "epoch": 0.13835660928873011, + "grad_norm": 3.155571460723877, + "learning_rate": 4.9971997892269466e-06, + "loss": 0.8017, + "step": 284 + }, + { + "epoch": 0.13884378044819748, + "grad_norm": 3.4627034664154053, + "learning_rate": 4.997169275204725e-06, + "loss": 0.7657, + "step": 285 + }, + { + "epoch": 0.13933095160766482, + "grad_norm": 3.3656647205352783, + "learning_rate": 4.997138595920074e-06, + "loss": 0.8668, + "step": 286 + }, + { + "epoch": 0.13981812276713218, + "grad_norm": 3.2416927814483643, + "learning_rate": 4.997107751375022e-06, + "loss": 0.8169, + "step": 287 + }, + { + "epoch": 0.14030529392659954, + "grad_norm": 2.910968065261841, + "learning_rate": 4.997076741571613e-06, + "loss": 0.8211, + "step": 288 + }, + { + "epoch": 0.1407924650860669, + "grad_norm": 2.715421438217163, + "learning_rate": 4.997045566511897e-06, + "loss": 0.7913, + "step": 289 + }, + { + "epoch": 0.14127963624553427, + "grad_norm": 3.455901861190796, + "learning_rate": 4.997014226197938e-06, + "loss": 0.8271, + "step": 290 + }, + { + "epoch": 0.1417668074050016, + "grad_norm": 3.061500072479248, + "learning_rate": 4.996982720631811e-06, + "loss": 0.782, + "step": 291 + }, + { + "epoch": 0.14225397856446897, + "grad_norm": 2.823646068572998, + "learning_rate": 4.996951049815599e-06, + "loss": 0.8159, + "step": 292 + }, + { + "epoch": 0.14274114972393634, + "grad_norm": 2.9467599391937256, + "learning_rate": 4.996919213751399e-06, + "loss": 0.7673, + "step": 293 + }, + { + "epoch": 0.1432283208834037, + "grad_norm": 2.97687029838562, + "learning_rate": 4.996887212441319e-06, + "loss": 0.827, + "step": 294 + }, + { + "epoch": 0.14371549204287107, + "grad_norm": 3.3996310234069824, + "learning_rate": 4.996855045887474e-06, + "loss": 0.7561, + "step": 295 + }, + { + "epoch": 0.14420266320233843, + "grad_norm": 3.3233866691589355, + "learning_rate": 4.9968227140919964e-06, + "loss": 0.7981, + "step": 296 + }, + { + "epoch": 0.14468983436180577, + "grad_norm": 3.1843292713165283, + "learning_rate": 4.996790217057023e-06, + "loss": 0.8685, + "step": 297 + }, + { + "epoch": 0.14517700552127313, + "grad_norm": 3.338949680328369, + "learning_rate": 4.996757554784706e-06, + "loss": 0.9283, + "step": 298 + }, + { + "epoch": 0.1456641766807405, + "grad_norm": 3.257240056991577, + "learning_rate": 4.996724727277207e-06, + "loss": 0.7982, + "step": 299 + }, + { + "epoch": 0.14615134784020786, + "grad_norm": 3.4961326122283936, + "learning_rate": 4.996691734536697e-06, + "loss": 0.7586, + "step": 300 + }, + { + "epoch": 0.14663851899967523, + "grad_norm": 3.750736713409424, + "learning_rate": 4.996658576565361e-06, + "loss": 0.8281, + "step": 301 + }, + { + "epoch": 0.1471256901591426, + "grad_norm": 3.052049398422241, + "learning_rate": 4.996625253365393e-06, + "loss": 0.7807, + "step": 302 + }, + { + "epoch": 0.14761286131860993, + "grad_norm": 3.296076774597168, + "learning_rate": 4.996591764938998e-06, + "loss": 0.7859, + "step": 303 + }, + { + "epoch": 0.1481000324780773, + "grad_norm": 3.441911220550537, + "learning_rate": 4.996558111288392e-06, + "loss": 0.729, + "step": 304 + }, + { + "epoch": 0.14858720363754466, + "grad_norm": 3.3899362087249756, + "learning_rate": 4.996524292415803e-06, + "loss": 0.783, + "step": 305 + }, + { + "epoch": 0.14907437479701202, + "grad_norm": 3.2165610790252686, + "learning_rate": 4.9964903083234695e-06, + "loss": 0.7958, + "step": 306 + }, + { + "epoch": 0.14956154595647939, + "grad_norm": 3.414353370666504, + "learning_rate": 4.996456159013639e-06, + "loss": 0.7531, + "step": 307 + }, + { + "epoch": 0.15004871711594672, + "grad_norm": 3.131279706954956, + "learning_rate": 4.996421844488572e-06, + "loss": 0.731, + "step": 308 + }, + { + "epoch": 0.1505358882754141, + "grad_norm": 3.11667537689209, + "learning_rate": 4.996387364750541e-06, + "loss": 0.7912, + "step": 309 + }, + { + "epoch": 0.15102305943488145, + "grad_norm": 3.271320343017578, + "learning_rate": 4.996352719801827e-06, + "loss": 0.7759, + "step": 310 + }, + { + "epoch": 0.15151023059434882, + "grad_norm": 3.525672435760498, + "learning_rate": 4.996317909644721e-06, + "loss": 0.8233, + "step": 311 + }, + { + "epoch": 0.15199740175381618, + "grad_norm": 3.441464424133301, + "learning_rate": 4.996282934281529e-06, + "loss": 0.7504, + "step": 312 + }, + { + "epoch": 0.15248457291328354, + "grad_norm": 3.1751627922058105, + "learning_rate": 4.996247793714565e-06, + "loss": 0.8712, + "step": 313 + }, + { + "epoch": 0.15297174407275088, + "grad_norm": 3.126542091369629, + "learning_rate": 4.996212487946153e-06, + "loss": 0.7701, + "step": 314 + }, + { + "epoch": 0.15345891523221825, + "grad_norm": 3.1096179485321045, + "learning_rate": 4.996177016978633e-06, + "loss": 0.7654, + "step": 315 + }, + { + "epoch": 0.1539460863916856, + "grad_norm": 3.4651455879211426, + "learning_rate": 4.99614138081435e-06, + "loss": 0.7749, + "step": 316 + }, + { + "epoch": 0.15443325755115297, + "grad_norm": 3.798231363296509, + "learning_rate": 4.9961055794556626e-06, + "loss": 0.8709, + "step": 317 + }, + { + "epoch": 0.15492042871062034, + "grad_norm": 3.2954092025756836, + "learning_rate": 4.99606961290494e-06, + "loss": 0.7578, + "step": 318 + }, + { + "epoch": 0.1554075998700877, + "grad_norm": 2.795562982559204, + "learning_rate": 4.996033481164563e-06, + "loss": 0.653, + "step": 319 + }, + { + "epoch": 0.15589477102955504, + "grad_norm": 2.730229616165161, + "learning_rate": 4.995997184236923e-06, + "loss": 0.7195, + "step": 320 + }, + { + "epoch": 0.1563819421890224, + "grad_norm": 3.0185186862945557, + "learning_rate": 4.995960722124421e-06, + "loss": 0.8741, + "step": 321 + }, + { + "epoch": 0.15686911334848977, + "grad_norm": 3.5037736892700195, + "learning_rate": 4.995924094829472e-06, + "loss": 0.8812, + "step": 322 + }, + { + "epoch": 0.15735628450795713, + "grad_norm": 3.2769973278045654, + "learning_rate": 4.995887302354498e-06, + "loss": 0.7838, + "step": 323 + }, + { + "epoch": 0.1578434556674245, + "grad_norm": 3.51176381111145, + "learning_rate": 4.9958503447019355e-06, + "loss": 0.7598, + "step": 324 + }, + { + "epoch": 0.15833062682689183, + "grad_norm": 2.9758284091949463, + "learning_rate": 4.995813221874229e-06, + "loss": 0.7967, + "step": 325 + }, + { + "epoch": 0.1588177979863592, + "grad_norm": 3.658317804336548, + "learning_rate": 4.995775933873835e-06, + "loss": 0.8319, + "step": 326 + }, + { + "epoch": 0.15930496914582656, + "grad_norm": 3.0576634407043457, + "learning_rate": 4.995738480703224e-06, + "loss": 0.7624, + "step": 327 + }, + { + "epoch": 0.15979214030529393, + "grad_norm": 3.2392992973327637, + "learning_rate": 4.9957008623648715e-06, + "loss": 0.7179, + "step": 328 + }, + { + "epoch": 0.1602793114647613, + "grad_norm": 3.1752021312713623, + "learning_rate": 4.995663078861269e-06, + "loss": 0.7729, + "step": 329 + }, + { + "epoch": 0.16076648262422866, + "grad_norm": 3.4821293354034424, + "learning_rate": 4.995625130194915e-06, + "loss": 0.7456, + "step": 330 + }, + { + "epoch": 0.161253653783696, + "grad_norm": 3.2283756732940674, + "learning_rate": 4.995587016368324e-06, + "loss": 0.6972, + "step": 331 + }, + { + "epoch": 0.16174082494316336, + "grad_norm": 4.575419902801514, + "learning_rate": 4.995548737384016e-06, + "loss": 0.763, + "step": 332 + }, + { + "epoch": 0.16222799610263072, + "grad_norm": 3.302342176437378, + "learning_rate": 4.995510293244525e-06, + "loss": 0.7492, + "step": 333 + }, + { + "epoch": 0.1627151672620981, + "grad_norm": 2.717120885848999, + "learning_rate": 4.995471683952394e-06, + "loss": 0.6926, + "step": 334 + }, + { + "epoch": 0.16320233842156545, + "grad_norm": 3.604538679122925, + "learning_rate": 4.995432909510181e-06, + "loss": 0.844, + "step": 335 + }, + { + "epoch": 0.16368950958103282, + "grad_norm": 3.112515687942505, + "learning_rate": 4.99539396992045e-06, + "loss": 0.7245, + "step": 336 + }, + { + "epoch": 0.16417668074050015, + "grad_norm": 3.071145534515381, + "learning_rate": 4.995354865185778e-06, + "loss": 0.8425, + "step": 337 + }, + { + "epoch": 0.16466385189996752, + "grad_norm": 3.2264928817749023, + "learning_rate": 4.995315595308753e-06, + "loss": 0.8398, + "step": 338 + }, + { + "epoch": 0.16515102305943488, + "grad_norm": 3.007964849472046, + "learning_rate": 4.995276160291975e-06, + "loss": 0.6357, + "step": 339 + }, + { + "epoch": 0.16563819421890225, + "grad_norm": 3.0440287590026855, + "learning_rate": 4.995236560138053e-06, + "loss": 0.7607, + "step": 340 + }, + { + "epoch": 0.1661253653783696, + "grad_norm": 3.7261955738067627, + "learning_rate": 4.995196794849608e-06, + "loss": 0.823, + "step": 341 + }, + { + "epoch": 0.16661253653783695, + "grad_norm": 3.6539804935455322, + "learning_rate": 4.9951568644292715e-06, + "loss": 0.845, + "step": 342 + }, + { + "epoch": 0.1670997076973043, + "grad_norm": 2.9929962158203125, + "learning_rate": 4.995116768879686e-06, + "loss": 0.7633, + "step": 343 + }, + { + "epoch": 0.16758687885677168, + "grad_norm": 4.820560455322266, + "learning_rate": 4.9950765082035045e-06, + "loss": 0.7743, + "step": 344 + }, + { + "epoch": 0.16807405001623904, + "grad_norm": 3.492880344390869, + "learning_rate": 4.9950360824033925e-06, + "loss": 0.6976, + "step": 345 + }, + { + "epoch": 0.1685612211757064, + "grad_norm": 3.163750410079956, + "learning_rate": 4.994995491482024e-06, + "loss": 0.7673, + "step": 346 + }, + { + "epoch": 0.16904839233517377, + "grad_norm": 3.1745569705963135, + "learning_rate": 4.994954735442087e-06, + "loss": 0.84, + "step": 347 + }, + { + "epoch": 0.1695355634946411, + "grad_norm": 3.244572162628174, + "learning_rate": 4.994913814286278e-06, + "loss": 0.8504, + "step": 348 + }, + { + "epoch": 0.17002273465410847, + "grad_norm": 3.0497121810913086, + "learning_rate": 4.994872728017306e-06, + "loss": 0.7712, + "step": 349 + }, + { + "epoch": 0.17050990581357583, + "grad_norm": 3.384981870651245, + "learning_rate": 4.994831476637888e-06, + "loss": 0.8415, + "step": 350 + }, + { + "epoch": 0.1709970769730432, + "grad_norm": 2.8780624866485596, + "learning_rate": 4.994790060150755e-06, + "loss": 0.7139, + "step": 351 + }, + { + "epoch": 0.17148424813251056, + "grad_norm": 3.063708543777466, + "learning_rate": 4.994748478558648e-06, + "loss": 0.6946, + "step": 352 + }, + { + "epoch": 0.17197141929197793, + "grad_norm": 4.257659435272217, + "learning_rate": 4.994706731864321e-06, + "loss": 0.7263, + "step": 353 + }, + { + "epoch": 0.17245859045144527, + "grad_norm": 3.384066343307495, + "learning_rate": 4.994664820070533e-06, + "loss": 0.7621, + "step": 354 + }, + { + "epoch": 0.17294576161091263, + "grad_norm": 3.989457368850708, + "learning_rate": 4.99462274318006e-06, + "loss": 0.9232, + "step": 355 + }, + { + "epoch": 0.17343293277038, + "grad_norm": 3.276092052459717, + "learning_rate": 4.994580501195686e-06, + "loss": 0.807, + "step": 356 + }, + { + "epoch": 0.17392010392984736, + "grad_norm": 2.703735113143921, + "learning_rate": 4.994538094120206e-06, + "loss": 0.7353, + "step": 357 + }, + { + "epoch": 0.17440727508931472, + "grad_norm": 3.561746597290039, + "learning_rate": 4.9944955219564285e-06, + "loss": 0.7439, + "step": 358 + }, + { + "epoch": 0.17489444624878206, + "grad_norm": 2.8842926025390625, + "learning_rate": 4.994452784707169e-06, + "loss": 0.6772, + "step": 359 + }, + { + "epoch": 0.17538161740824942, + "grad_norm": 3.5043461322784424, + "learning_rate": 4.994409882375256e-06, + "loss": 0.7206, + "step": 360 + }, + { + "epoch": 0.1758687885677168, + "grad_norm": 2.7805278301239014, + "learning_rate": 4.9943668149635306e-06, + "loss": 0.715, + "step": 361 + }, + { + "epoch": 0.17635595972718415, + "grad_norm": 2.8721609115600586, + "learning_rate": 4.994323582474841e-06, + "loss": 0.7324, + "step": 362 + }, + { + "epoch": 0.17684313088665152, + "grad_norm": 3.001356363296509, + "learning_rate": 4.9942801849120485e-06, + "loss": 0.7433, + "step": 363 + }, + { + "epoch": 0.17733030204611888, + "grad_norm": 3.1706466674804688, + "learning_rate": 4.9942366222780255e-06, + "loss": 0.8484, + "step": 364 + }, + { + "epoch": 0.17781747320558622, + "grad_norm": 2.9669687747955322, + "learning_rate": 4.994192894575656e-06, + "loss": 0.773, + "step": 365 + }, + { + "epoch": 0.17830464436505358, + "grad_norm": 2.9264001846313477, + "learning_rate": 4.994149001807833e-06, + "loss": 0.7071, + "step": 366 + }, + { + "epoch": 0.17879181552452095, + "grad_norm": 3.248176097869873, + "learning_rate": 4.99410494397746e-06, + "loss": 0.7811, + "step": 367 + }, + { + "epoch": 0.1792789866839883, + "grad_norm": 3.27270245552063, + "learning_rate": 4.994060721087456e-06, + "loss": 0.6994, + "step": 368 + }, + { + "epoch": 0.17976615784345568, + "grad_norm": 3.7114439010620117, + "learning_rate": 4.9940163331407445e-06, + "loss": 0.7466, + "step": 369 + }, + { + "epoch": 0.18025332900292304, + "grad_norm": 3.325208902359009, + "learning_rate": 4.993971780140264e-06, + "loss": 0.6709, + "step": 370 + }, + { + "epoch": 0.18074050016239038, + "grad_norm": 2.984499216079712, + "learning_rate": 4.993927062088964e-06, + "loss": 0.6618, + "step": 371 + }, + { + "epoch": 0.18122767132185774, + "grad_norm": 3.26320219039917, + "learning_rate": 4.993882178989804e-06, + "loss": 0.7474, + "step": 372 + }, + { + "epoch": 0.1817148424813251, + "grad_norm": 2.811854839324951, + "learning_rate": 4.993837130845752e-06, + "loss": 0.7341, + "step": 373 + }, + { + "epoch": 0.18220201364079247, + "grad_norm": 2.788036346435547, + "learning_rate": 4.993791917659792e-06, + "loss": 0.7006, + "step": 374 + }, + { + "epoch": 0.18268918480025984, + "grad_norm": 3.263852119445801, + "learning_rate": 4.993746539434915e-06, + "loss": 0.8261, + "step": 375 + }, + { + "epoch": 0.18317635595972717, + "grad_norm": 2.754335641860962, + "learning_rate": 4.993700996174124e-06, + "loss": 0.634, + "step": 376 + }, + { + "epoch": 0.18366352711919454, + "grad_norm": 3.29323148727417, + "learning_rate": 4.9936552878804326e-06, + "loss": 0.8155, + "step": 377 + }, + { + "epoch": 0.1841506982786619, + "grad_norm": 3.282834768295288, + "learning_rate": 4.993609414556867e-06, + "loss": 0.7751, + "step": 378 + }, + { + "epoch": 0.18463786943812927, + "grad_norm": 3.116795778274536, + "learning_rate": 4.993563376206463e-06, + "loss": 0.7709, + "step": 379 + }, + { + "epoch": 0.18512504059759663, + "grad_norm": 3.109051465988159, + "learning_rate": 4.993517172832266e-06, + "loss": 0.7459, + "step": 380 + }, + { + "epoch": 0.185612211757064, + "grad_norm": 3.1151955127716064, + "learning_rate": 4.993470804437336e-06, + "loss": 0.7836, + "step": 381 + }, + { + "epoch": 0.18609938291653133, + "grad_norm": 3.975283145904541, + "learning_rate": 4.99342427102474e-06, + "loss": 0.7338, + "step": 382 + }, + { + "epoch": 0.1865865540759987, + "grad_norm": 3.0632002353668213, + "learning_rate": 4.993377572597558e-06, + "loss": 0.7861, + "step": 383 + }, + { + "epoch": 0.18707372523546606, + "grad_norm": 2.751840353012085, + "learning_rate": 4.993330709158879e-06, + "loss": 0.7077, + "step": 384 + }, + { + "epoch": 0.18756089639493342, + "grad_norm": 3.2106361389160156, + "learning_rate": 4.993283680711808e-06, + "loss": 0.7948, + "step": 385 + }, + { + "epoch": 0.1880480675544008, + "grad_norm": 2.9412217140197754, + "learning_rate": 4.993236487259454e-06, + "loss": 0.8237, + "step": 386 + }, + { + "epoch": 0.18853523871386813, + "grad_norm": 2.8641064167022705, + "learning_rate": 4.993189128804941e-06, + "loss": 0.766, + "step": 387 + }, + { + "epoch": 0.1890224098733355, + "grad_norm": 3.9343535900115967, + "learning_rate": 4.993141605351404e-06, + "loss": 0.8024, + "step": 388 + }, + { + "epoch": 0.18950958103280285, + "grad_norm": 2.8462302684783936, + "learning_rate": 4.9930939169019885e-06, + "loss": 0.7456, + "step": 389 + }, + { + "epoch": 0.18999675219227022, + "grad_norm": 3.1572110652923584, + "learning_rate": 4.9930460634598485e-06, + "loss": 0.7448, + "step": 390 + }, + { + "epoch": 0.19048392335173758, + "grad_norm": 3.121652603149414, + "learning_rate": 4.9929980450281536e-06, + "loss": 0.763, + "step": 391 + }, + { + "epoch": 0.19097109451120495, + "grad_norm": 3.0173838138580322, + "learning_rate": 4.992949861610079e-06, + "loss": 0.6917, + "step": 392 + }, + { + "epoch": 0.19145826567067228, + "grad_norm": 3.6358203887939453, + "learning_rate": 4.992901513208816e-06, + "loss": 0.8207, + "step": 393 + }, + { + "epoch": 0.19194543683013965, + "grad_norm": 3.4599456787109375, + "learning_rate": 4.992852999827561e-06, + "loss": 0.8576, + "step": 394 + }, + { + "epoch": 0.192432607989607, + "grad_norm": 2.8600244522094727, + "learning_rate": 4.992804321469528e-06, + "loss": 0.6446, + "step": 395 + }, + { + "epoch": 0.19291977914907438, + "grad_norm": 3.140765428543091, + "learning_rate": 4.992755478137937e-06, + "loss": 0.77, + "step": 396 + }, + { + "epoch": 0.19340695030854174, + "grad_norm": 3.26961612701416, + "learning_rate": 4.992706469836021e-06, + "loss": 0.8129, + "step": 397 + }, + { + "epoch": 0.1938941214680091, + "grad_norm": 3.0631744861602783, + "learning_rate": 4.992657296567021e-06, + "loss": 0.7202, + "step": 398 + }, + { + "epoch": 0.19438129262747644, + "grad_norm": 3.2196922302246094, + "learning_rate": 4.992607958334196e-06, + "loss": 0.7739, + "step": 399 + }, + { + "epoch": 0.1948684637869438, + "grad_norm": 3.2724921703338623, + "learning_rate": 4.9925584551408065e-06, + "loss": 0.7879, + "step": 400 + }, + { + "epoch": 0.19535563494641117, + "grad_norm": 3.7339844703674316, + "learning_rate": 4.992508786990131e-06, + "loss": 0.7152, + "step": 401 + }, + { + "epoch": 0.19584280610587854, + "grad_norm": 3.510960340499878, + "learning_rate": 4.992458953885457e-06, + "loss": 0.7465, + "step": 402 + }, + { + "epoch": 0.1963299772653459, + "grad_norm": 2.9735565185546875, + "learning_rate": 4.992408955830081e-06, + "loss": 0.7365, + "step": 403 + }, + { + "epoch": 0.19681714842481324, + "grad_norm": 3.1451427936553955, + "learning_rate": 4.992358792827311e-06, + "loss": 0.7275, + "step": 404 + }, + { + "epoch": 0.1973043195842806, + "grad_norm": 3.1571359634399414, + "learning_rate": 4.992308464880469e-06, + "loss": 0.7071, + "step": 405 + }, + { + "epoch": 0.19779149074374797, + "grad_norm": 3.3366994857788086, + "learning_rate": 4.992257971992886e-06, + "loss": 0.7584, + "step": 406 + }, + { + "epoch": 0.19827866190321533, + "grad_norm": 3.129516839981079, + "learning_rate": 4.992207314167901e-06, + "loss": 0.7764, + "step": 407 + }, + { + "epoch": 0.1987658330626827, + "grad_norm": 3.06172776222229, + "learning_rate": 4.992156491408869e-06, + "loss": 0.7324, + "step": 408 + }, + { + "epoch": 0.19925300422215006, + "grad_norm": 3.081023931503296, + "learning_rate": 4.992105503719152e-06, + "loss": 0.719, + "step": 409 + }, + { + "epoch": 0.1997401753816174, + "grad_norm": 2.7442142963409424, + "learning_rate": 4.992054351102124e-06, + "loss": 0.6964, + "step": 410 + }, + { + "epoch": 0.20022734654108476, + "grad_norm": 3.0648653507232666, + "learning_rate": 4.992003033561174e-06, + "loss": 0.8442, + "step": 411 + }, + { + "epoch": 0.20071451770055213, + "grad_norm": 2.87860369682312, + "learning_rate": 4.991951551099692e-06, + "loss": 0.7039, + "step": 412 + }, + { + "epoch": 0.2012016888600195, + "grad_norm": 2.9739737510681152, + "learning_rate": 4.99189990372109e-06, + "loss": 0.7024, + "step": 413 + }, + { + "epoch": 0.20168886001948685, + "grad_norm": 2.900397777557373, + "learning_rate": 4.9918480914287845e-06, + "loss": 0.6459, + "step": 414 + }, + { + "epoch": 0.20217603117895422, + "grad_norm": 3.8083720207214355, + "learning_rate": 4.991796114226204e-06, + "loss": 0.7959, + "step": 415 + }, + { + "epoch": 0.20266320233842156, + "grad_norm": 2.7646360397338867, + "learning_rate": 4.991743972116789e-06, + "loss": 0.7128, + "step": 416 + }, + { + "epoch": 0.20315037349788892, + "grad_norm": 3.1812686920166016, + "learning_rate": 4.991691665103989e-06, + "loss": 0.7331, + "step": 417 + }, + { + "epoch": 0.20363754465735628, + "grad_norm": 2.8281126022338867, + "learning_rate": 4.991639193191268e-06, + "loss": 0.669, + "step": 418 + }, + { + "epoch": 0.20412471581682365, + "grad_norm": 2.9445865154266357, + "learning_rate": 4.991586556382096e-06, + "loss": 0.7449, + "step": 419 + }, + { + "epoch": 0.204611886976291, + "grad_norm": 3.1232261657714844, + "learning_rate": 4.991533754679959e-06, + "loss": 0.785, + "step": 420 + }, + { + "epoch": 0.20509905813575835, + "grad_norm": 3.19449520111084, + "learning_rate": 4.991480788088349e-06, + "loss": 0.7677, + "step": 421 + }, + { + "epoch": 0.20558622929522571, + "grad_norm": 3.073873996734619, + "learning_rate": 4.991427656610773e-06, + "loss": 0.7517, + "step": 422 + }, + { + "epoch": 0.20607340045469308, + "grad_norm": 3.3958497047424316, + "learning_rate": 4.991374360250747e-06, + "loss": 0.7651, + "step": 423 + }, + { + "epoch": 0.20656057161416044, + "grad_norm": 3.0608630180358887, + "learning_rate": 4.991320899011797e-06, + "loss": 0.7275, + "step": 424 + }, + { + "epoch": 0.2070477427736278, + "grad_norm": 3.2098941802978516, + "learning_rate": 4.991267272897462e-06, + "loss": 0.6957, + "step": 425 + }, + { + "epoch": 0.20753491393309517, + "grad_norm": 3.044623613357544, + "learning_rate": 4.991213481911291e-06, + "loss": 0.7378, + "step": 426 + }, + { + "epoch": 0.2080220850925625, + "grad_norm": 3.127129554748535, + "learning_rate": 4.991159526056844e-06, + "loss": 0.7506, + "step": 427 + }, + { + "epoch": 0.20850925625202987, + "grad_norm": 3.434532642364502, + "learning_rate": 4.991105405337692e-06, + "loss": 0.8966, + "step": 428 + }, + { + "epoch": 0.20899642741149724, + "grad_norm": 3.0520131587982178, + "learning_rate": 4.991051119757415e-06, + "loss": 0.7355, + "step": 429 + }, + { + "epoch": 0.2094835985709646, + "grad_norm": 3.4622936248779297, + "learning_rate": 4.990996669319607e-06, + "loss": 0.7537, + "step": 430 + }, + { + "epoch": 0.20997076973043197, + "grad_norm": 3.609536647796631, + "learning_rate": 4.990942054027873e-06, + "loss": 0.6778, + "step": 431 + }, + { + "epoch": 0.21045794088989933, + "grad_norm": 2.987701416015625, + "learning_rate": 4.990887273885824e-06, + "loss": 0.7958, + "step": 432 + }, + { + "epoch": 0.21094511204936667, + "grad_norm": 3.0531394481658936, + "learning_rate": 4.9908323288970885e-06, + "loss": 0.6786, + "step": 433 + }, + { + "epoch": 0.21143228320883403, + "grad_norm": 3.2400259971618652, + "learning_rate": 4.9907772190653005e-06, + "loss": 0.7607, + "step": 434 + }, + { + "epoch": 0.2119194543683014, + "grad_norm": 3.0143797397613525, + "learning_rate": 4.990721944394108e-06, + "loss": 0.7803, + "step": 435 + }, + { + "epoch": 0.21240662552776876, + "grad_norm": 3.000843048095703, + "learning_rate": 4.9906665048871696e-06, + "loss": 0.6941, + "step": 436 + }, + { + "epoch": 0.21289379668723613, + "grad_norm": 3.1390156745910645, + "learning_rate": 4.990610900548154e-06, + "loss": 0.7934, + "step": 437 + }, + { + "epoch": 0.21338096784670346, + "grad_norm": 3.069143533706665, + "learning_rate": 4.990555131380741e-06, + "loss": 0.6586, + "step": 438 + }, + { + "epoch": 0.21386813900617083, + "grad_norm": 3.627168655395508, + "learning_rate": 4.990499197388621e-06, + "loss": 0.8715, + "step": 439 + }, + { + "epoch": 0.2143553101656382, + "grad_norm": 2.990938425064087, + "learning_rate": 4.990443098575496e-06, + "loss": 0.7348, + "step": 440 + }, + { + "epoch": 0.21484248132510556, + "grad_norm": 2.9687612056732178, + "learning_rate": 4.990386834945079e-06, + "loss": 0.8184, + "step": 441 + }, + { + "epoch": 0.21532965248457292, + "grad_norm": 3.42073655128479, + "learning_rate": 4.990330406501093e-06, + "loss": 0.7647, + "step": 442 + }, + { + "epoch": 0.21581682364404028, + "grad_norm": 3.539968729019165, + "learning_rate": 4.990273813247272e-06, + "loss": 0.8891, + "step": 443 + }, + { + "epoch": 0.21630399480350762, + "grad_norm": 2.9788267612457275, + "learning_rate": 4.990217055187363e-06, + "loss": 0.7024, + "step": 444 + }, + { + "epoch": 0.21679116596297499, + "grad_norm": 3.274918794631958, + "learning_rate": 4.99016013232512e-06, + "loss": 0.6978, + "step": 445 + }, + { + "epoch": 0.21727833712244235, + "grad_norm": 2.945810556411743, + "learning_rate": 4.990103044664311e-06, + "loss": 0.7113, + "step": 446 + }, + { + "epoch": 0.21776550828190971, + "grad_norm": 2.995314598083496, + "learning_rate": 4.990045792208716e-06, + "loss": 0.8148, + "step": 447 + }, + { + "epoch": 0.21825267944137708, + "grad_norm": 3.056879758834839, + "learning_rate": 4.989988374962122e-06, + "loss": 0.7335, + "step": 448 + }, + { + "epoch": 0.21873985060084444, + "grad_norm": 2.8488824367523193, + "learning_rate": 4.989930792928329e-06, + "loss": 0.7408, + "step": 449 + }, + { + "epoch": 0.21922702176031178, + "grad_norm": 3.809095859527588, + "learning_rate": 4.989873046111148e-06, + "loss": 0.6742, + "step": 450 + }, + { + "epoch": 0.21971419291977914, + "grad_norm": 3.1999945640563965, + "learning_rate": 4.9898151345144e-06, + "loss": 0.8187, + "step": 451 + }, + { + "epoch": 0.2202013640792465, + "grad_norm": 3.043477773666382, + "learning_rate": 4.989757058141919e-06, + "loss": 0.7769, + "step": 452 + }, + { + "epoch": 0.22068853523871387, + "grad_norm": 2.95509934425354, + "learning_rate": 4.989698816997547e-06, + "loss": 0.6234, + "step": 453 + }, + { + "epoch": 0.22117570639818124, + "grad_norm": 3.3677315711975098, + "learning_rate": 4.98964041108514e-06, + "loss": 0.7986, + "step": 454 + }, + { + "epoch": 0.22166287755764857, + "grad_norm": 3.304133415222168, + "learning_rate": 4.989581840408562e-06, + "loss": 0.6978, + "step": 455 + }, + { + "epoch": 0.22215004871711594, + "grad_norm": 2.8625566959381104, + "learning_rate": 4.989523104971689e-06, + "loss": 0.7272, + "step": 456 + }, + { + "epoch": 0.2226372198765833, + "grad_norm": 3.6632297039031982, + "learning_rate": 4.98946420477841e-06, + "loss": 0.7496, + "step": 457 + }, + { + "epoch": 0.22312439103605067, + "grad_norm": 3.5343070030212402, + "learning_rate": 4.989405139832622e-06, + "loss": 0.7916, + "step": 458 + }, + { + "epoch": 0.22361156219551803, + "grad_norm": 3.1070194244384766, + "learning_rate": 4.989345910138232e-06, + "loss": 0.6498, + "step": 459 + }, + { + "epoch": 0.2240987333549854, + "grad_norm": 3.139521837234497, + "learning_rate": 4.989286515699162e-06, + "loss": 0.7679, + "step": 460 + }, + { + "epoch": 0.22458590451445273, + "grad_norm": 3.8357222080230713, + "learning_rate": 4.9892269565193426e-06, + "loss": 0.9922, + "step": 461 + }, + { + "epoch": 0.2250730756739201, + "grad_norm": 2.829184055328369, + "learning_rate": 4.989167232602715e-06, + "loss": 0.7151, + "step": 462 + }, + { + "epoch": 0.22556024683338746, + "grad_norm": 3.034545660018921, + "learning_rate": 4.9891073439532325e-06, + "loss": 0.7426, + "step": 463 + }, + { + "epoch": 0.22604741799285483, + "grad_norm": 3.3481853008270264, + "learning_rate": 4.989047290574857e-06, + "loss": 0.7852, + "step": 464 + }, + { + "epoch": 0.2265345891523222, + "grad_norm": 3.2835097312927246, + "learning_rate": 4.988987072471562e-06, + "loss": 0.6924, + "step": 465 + }, + { + "epoch": 0.22702176031178956, + "grad_norm": 2.890019416809082, + "learning_rate": 4.988926689647337e-06, + "loss": 0.7541, + "step": 466 + }, + { + "epoch": 0.2275089314712569, + "grad_norm": 2.8760833740234375, + "learning_rate": 4.988866142106175e-06, + "loss": 0.7421, + "step": 467 + }, + { + "epoch": 0.22799610263072426, + "grad_norm": 2.8886656761169434, + "learning_rate": 4.988805429852082e-06, + "loss": 0.7259, + "step": 468 + }, + { + "epoch": 0.22848327379019162, + "grad_norm": 3.0977914333343506, + "learning_rate": 4.988744552889078e-06, + "loss": 0.7408, + "step": 469 + }, + { + "epoch": 0.22897044494965899, + "grad_norm": 3.2646775245666504, + "learning_rate": 4.988683511221192e-06, + "loss": 0.8031, + "step": 470 + }, + { + "epoch": 0.22945761610912635, + "grad_norm": 3.3476619720458984, + "learning_rate": 4.988622304852462e-06, + "loss": 0.7719, + "step": 471 + }, + { + "epoch": 0.2299447872685937, + "grad_norm": 3.3434810638427734, + "learning_rate": 4.98856093378694e-06, + "loss": 0.7809, + "step": 472 + }, + { + "epoch": 0.23043195842806105, + "grad_norm": 3.072542428970337, + "learning_rate": 4.988499398028687e-06, + "loss": 0.8415, + "step": 473 + }, + { + "epoch": 0.23091912958752842, + "grad_norm": 3.5418765544891357, + "learning_rate": 4.988437697581776e-06, + "loss": 0.8244, + "step": 474 + }, + { + "epoch": 0.23140630074699578, + "grad_norm": 3.5211474895477295, + "learning_rate": 4.9883758324502895e-06, + "loss": 0.7789, + "step": 475 + }, + { + "epoch": 0.23189347190646314, + "grad_norm": 3.104797601699829, + "learning_rate": 4.988313802638322e-06, + "loss": 0.7691, + "step": 476 + }, + { + "epoch": 0.2323806430659305, + "grad_norm": 3.1609880924224854, + "learning_rate": 4.988251608149979e-06, + "loss": 0.6707, + "step": 477 + }, + { + "epoch": 0.23286781422539785, + "grad_norm": 2.924140691757202, + "learning_rate": 4.988189248989376e-06, + "loss": 0.7436, + "step": 478 + }, + { + "epoch": 0.2333549853848652, + "grad_norm": 2.668699026107788, + "learning_rate": 4.988126725160641e-06, + "loss": 0.6792, + "step": 479 + }, + { + "epoch": 0.23384215654433257, + "grad_norm": 3.304591655731201, + "learning_rate": 4.98806403666791e-06, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.23432932770379994, + "grad_norm": 2.911376953125, + "learning_rate": 4.988001183515333e-06, + "loss": 0.7695, + "step": 481 + }, + { + "epoch": 0.2348164988632673, + "grad_norm": 2.9467458724975586, + "learning_rate": 4.9879381657070705e-06, + "loss": 0.7405, + "step": 482 + }, + { + "epoch": 0.23530367002273467, + "grad_norm": 3.7774853706359863, + "learning_rate": 4.987874983247291e-06, + "loss": 0.8325, + "step": 483 + }, + { + "epoch": 0.235790841182202, + "grad_norm": 2.905942678451538, + "learning_rate": 4.9878116361401765e-06, + "loss": 0.7126, + "step": 484 + }, + { + "epoch": 0.23627801234166937, + "grad_norm": 2.787506341934204, + "learning_rate": 4.987748124389919e-06, + "loss": 0.8109, + "step": 485 + }, + { + "epoch": 0.23676518350113673, + "grad_norm": 3.7370619773864746, + "learning_rate": 4.987684448000723e-06, + "loss": 0.7646, + "step": 486 + }, + { + "epoch": 0.2372523546606041, + "grad_norm": 3.894577741622925, + "learning_rate": 4.9876206069768025e-06, + "loss": 0.8471, + "step": 487 + }, + { + "epoch": 0.23773952582007146, + "grad_norm": 3.475742816925049, + "learning_rate": 4.9875566013223815e-06, + "loss": 0.7193, + "step": 488 + }, + { + "epoch": 0.2382266969795388, + "grad_norm": 3.386749267578125, + "learning_rate": 4.987492431041696e-06, + "loss": 0.7243, + "step": 489 + }, + { + "epoch": 0.23871386813900616, + "grad_norm": 3.0964181423187256, + "learning_rate": 4.987428096138993e-06, + "loss": 0.6747, + "step": 490 + }, + { + "epoch": 0.23920103929847353, + "grad_norm": 2.7195754051208496, + "learning_rate": 4.987363596618531e-06, + "loss": 0.7758, + "step": 491 + }, + { + "epoch": 0.2396882104579409, + "grad_norm": 2.9415524005889893, + "learning_rate": 4.987298932484577e-06, + "loss": 0.7025, + "step": 492 + }, + { + "epoch": 0.24017538161740826, + "grad_norm": 3.0502498149871826, + "learning_rate": 4.987234103741411e-06, + "loss": 0.7409, + "step": 493 + }, + { + "epoch": 0.24066255277687562, + "grad_norm": 2.7624154090881348, + "learning_rate": 4.987169110393324e-06, + "loss": 0.7976, + "step": 494 + }, + { + "epoch": 0.24114972393634296, + "grad_norm": 2.7861759662628174, + "learning_rate": 4.987103952444618e-06, + "loss": 0.788, + "step": 495 + }, + { + "epoch": 0.24163689509581032, + "grad_norm": 3.0857086181640625, + "learning_rate": 4.9870386298996025e-06, + "loss": 0.7656, + "step": 496 + }, + { + "epoch": 0.2421240662552777, + "grad_norm": 2.8564188480377197, + "learning_rate": 4.9869731427626024e-06, + "loss": 0.7238, + "step": 497 + }, + { + "epoch": 0.24261123741474505, + "grad_norm": 3.1248390674591064, + "learning_rate": 4.986907491037952e-06, + "loss": 0.7913, + "step": 498 + }, + { + "epoch": 0.24309840857421242, + "grad_norm": 3.209285020828247, + "learning_rate": 4.986841674729994e-06, + "loss": 0.7461, + "step": 499 + }, + { + "epoch": 0.24358557973367975, + "grad_norm": 3.3589634895324707, + "learning_rate": 4.986775693843087e-06, + "loss": 0.7708, + "step": 500 + }, + { + "epoch": 0.24407275089314712, + "grad_norm": 3.9337844848632812, + "learning_rate": 4.9867095483815955e-06, + "loss": 0.7208, + "step": 501 + }, + { + "epoch": 0.24455992205261448, + "grad_norm": 2.8993723392486572, + "learning_rate": 4.986643238349898e-06, + "loss": 0.7658, + "step": 502 + }, + { + "epoch": 0.24504709321208185, + "grad_norm": 2.8959178924560547, + "learning_rate": 4.9865767637523824e-06, + "loss": 0.6669, + "step": 503 + }, + { + "epoch": 0.2455342643715492, + "grad_norm": 3.276644468307495, + "learning_rate": 4.986510124593448e-06, + "loss": 0.7547, + "step": 504 + }, + { + "epoch": 0.24602143553101657, + "grad_norm": 3.107353925704956, + "learning_rate": 4.986443320877504e-06, + "loss": 0.7175, + "step": 505 + }, + { + "epoch": 0.2465086066904839, + "grad_norm": 2.872662305831909, + "learning_rate": 4.986376352608974e-06, + "loss": 0.8354, + "step": 506 + }, + { + "epoch": 0.24699577784995128, + "grad_norm": 2.9970054626464844, + "learning_rate": 4.986309219792289e-06, + "loss": 0.6717, + "step": 507 + }, + { + "epoch": 0.24748294900941864, + "grad_norm": 2.6521317958831787, + "learning_rate": 4.98624192243189e-06, + "loss": 0.6963, + "step": 508 + }, + { + "epoch": 0.247970120168886, + "grad_norm": 2.9642088413238525, + "learning_rate": 4.986174460532233e-06, + "loss": 0.7671, + "step": 509 + }, + { + "epoch": 0.24845729132835337, + "grad_norm": 3.0520436763763428, + "learning_rate": 4.986106834097781e-06, + "loss": 0.7604, + "step": 510 + }, + { + "epoch": 0.24894446248782073, + "grad_norm": 2.9707155227661133, + "learning_rate": 4.986039043133011e-06, + "loss": 0.7288, + "step": 511 + }, + { + "epoch": 0.24943163364728807, + "grad_norm": 2.679050922393799, + "learning_rate": 4.985971087642408e-06, + "loss": 0.7318, + "step": 512 + }, + { + "epoch": 0.24991880480675543, + "grad_norm": 3.2742114067077637, + "learning_rate": 4.98590296763047e-06, + "loss": 0.7575, + "step": 513 + }, + { + "epoch": 0.2504059759662228, + "grad_norm": 3.611093044281006, + "learning_rate": 4.985834683101704e-06, + "loss": 0.6907, + "step": 514 + }, + { + "epoch": 0.25089314712569016, + "grad_norm": 3.2771365642547607, + "learning_rate": 4.985766234060633e-06, + "loss": 0.7396, + "step": 515 + }, + { + "epoch": 0.2513803182851575, + "grad_norm": 4.130541801452637, + "learning_rate": 4.985697620511782e-06, + "loss": 0.7304, + "step": 516 + }, + { + "epoch": 0.2518674894446249, + "grad_norm": 2.987365245819092, + "learning_rate": 4.9856288424596945e-06, + "loss": 0.7257, + "step": 517 + }, + { + "epoch": 0.25235466060409223, + "grad_norm": 2.9074463844299316, + "learning_rate": 4.985559899908922e-06, + "loss": 0.7108, + "step": 518 + }, + { + "epoch": 0.2528418317635596, + "grad_norm": 2.862644910812378, + "learning_rate": 4.985490792864026e-06, + "loss": 0.7222, + "step": 519 + }, + { + "epoch": 0.25332900292302696, + "grad_norm": 2.8999810218811035, + "learning_rate": 4.985421521329581e-06, + "loss": 0.74, + "step": 520 + }, + { + "epoch": 0.2538161740824943, + "grad_norm": 3.529918670654297, + "learning_rate": 4.985352085310171e-06, + "loss": 0.6894, + "step": 521 + }, + { + "epoch": 0.2543033452419617, + "grad_norm": 3.046361207962036, + "learning_rate": 4.985282484810392e-06, + "loss": 0.7399, + "step": 522 + }, + { + "epoch": 0.254790516401429, + "grad_norm": 2.6811437606811523, + "learning_rate": 4.985212719834849e-06, + "loss": 0.7142, + "step": 523 + }, + { + "epoch": 0.2552776875608964, + "grad_norm": 2.77946400642395, + "learning_rate": 4.98514279038816e-06, + "loss": 0.6433, + "step": 524 + }, + { + "epoch": 0.25576485872036375, + "grad_norm": 3.5190768241882324, + "learning_rate": 4.9850726964749525e-06, + "loss": 0.6571, + "step": 525 + }, + { + "epoch": 0.2562520298798311, + "grad_norm": 3.1387054920196533, + "learning_rate": 4.9850024380998655e-06, + "loss": 0.7406, + "step": 526 + }, + { + "epoch": 0.2567392010392985, + "grad_norm": 2.8703768253326416, + "learning_rate": 4.984932015267548e-06, + "loss": 0.6783, + "step": 527 + }, + { + "epoch": 0.2572263721987658, + "grad_norm": 3.0279030799865723, + "learning_rate": 4.984861427982661e-06, + "loss": 0.6609, + "step": 528 + }, + { + "epoch": 0.2577135433582332, + "grad_norm": 4.169360637664795, + "learning_rate": 4.984790676249877e-06, + "loss": 0.8431, + "step": 529 + }, + { + "epoch": 0.25820071451770055, + "grad_norm": 3.339386463165283, + "learning_rate": 4.984719760073877e-06, + "loss": 0.7297, + "step": 530 + }, + { + "epoch": 0.25868788567716794, + "grad_norm": 2.995542049407959, + "learning_rate": 4.984648679459355e-06, + "loss": 0.701, + "step": 531 + }, + { + "epoch": 0.2591750568366353, + "grad_norm": 2.852863073348999, + "learning_rate": 4.984577434411014e-06, + "loss": 0.7206, + "step": 532 + }, + { + "epoch": 0.2596622279961026, + "grad_norm": 3.4059152603149414, + "learning_rate": 4.984506024933571e-06, + "loss": 0.8386, + "step": 533 + }, + { + "epoch": 0.26014939915557, + "grad_norm": 3.0080018043518066, + "learning_rate": 4.984434451031749e-06, + "loss": 0.7003, + "step": 534 + }, + { + "epoch": 0.26063657031503734, + "grad_norm": 3.093029499053955, + "learning_rate": 4.9843627127102874e-06, + "loss": 0.6613, + "step": 535 + }, + { + "epoch": 0.26112374147450473, + "grad_norm": 3.2855396270751953, + "learning_rate": 4.984290809973933e-06, + "loss": 0.8489, + "step": 536 + }, + { + "epoch": 0.26161091263397207, + "grad_norm": 3.0735745429992676, + "learning_rate": 4.984218742827444e-06, + "loss": 0.7424, + "step": 537 + }, + { + "epoch": 0.2620980837934394, + "grad_norm": 3.222146511077881, + "learning_rate": 4.984146511275589e-06, + "loss": 0.7563, + "step": 538 + }, + { + "epoch": 0.2625852549529068, + "grad_norm": 3.1822657585144043, + "learning_rate": 4.9840741153231495e-06, + "loss": 0.7941, + "step": 539 + }, + { + "epoch": 0.26307242611237414, + "grad_norm": 2.878441333770752, + "learning_rate": 4.9840015549749175e-06, + "loss": 0.6758, + "step": 540 + }, + { + "epoch": 0.26355959727184153, + "grad_norm": 2.7886645793914795, + "learning_rate": 4.9839288302356924e-06, + "loss": 0.6711, + "step": 541 + }, + { + "epoch": 0.26404676843130886, + "grad_norm": 3.385986089706421, + "learning_rate": 4.98385594111029e-06, + "loss": 0.7244, + "step": 542 + }, + { + "epoch": 0.2645339395907762, + "grad_norm": 3.3058037757873535, + "learning_rate": 4.983782887603532e-06, + "loss": 0.735, + "step": 543 + }, + { + "epoch": 0.2650211107502436, + "grad_norm": 2.992058277130127, + "learning_rate": 4.983709669720254e-06, + "loss": 0.7617, + "step": 544 + }, + { + "epoch": 0.26550828190971093, + "grad_norm": 2.8403966426849365, + "learning_rate": 4.983636287465301e-06, + "loss": 0.7124, + "step": 545 + }, + { + "epoch": 0.2659954530691783, + "grad_norm": 3.4580605030059814, + "learning_rate": 4.98356274084353e-06, + "loss": 0.672, + "step": 546 + }, + { + "epoch": 0.26648262422864566, + "grad_norm": 3.4375038146972656, + "learning_rate": 4.9834890298598085e-06, + "loss": 0.8374, + "step": 547 + }, + { + "epoch": 0.26696979538811305, + "grad_norm": 3.2027034759521484, + "learning_rate": 4.983415154519014e-06, + "loss": 0.7842, + "step": 548 + }, + { + "epoch": 0.2674569665475804, + "grad_norm": 4.787693977355957, + "learning_rate": 4.983341114826035e-06, + "loss": 0.7532, + "step": 549 + }, + { + "epoch": 0.2679441377070477, + "grad_norm": 3.1346354484558105, + "learning_rate": 4.983266910785774e-06, + "loss": 0.6366, + "step": 550 + }, + { + "epoch": 0.2684313088665151, + "grad_norm": 3.039419412612915, + "learning_rate": 4.983192542403138e-06, + "loss": 0.7412, + "step": 551 + }, + { + "epoch": 0.26891848002598245, + "grad_norm": 3.012873411178589, + "learning_rate": 4.983118009683053e-06, + "loss": 0.6813, + "step": 552 + }, + { + "epoch": 0.26940565118544985, + "grad_norm": 3.143331527709961, + "learning_rate": 4.983043312630447e-06, + "loss": 0.802, + "step": 553 + }, + { + "epoch": 0.2698928223449172, + "grad_norm": 2.8300979137420654, + "learning_rate": 4.9829684512502665e-06, + "loss": 0.66, + "step": 554 + }, + { + "epoch": 0.2703799935043845, + "grad_norm": 3.426264762878418, + "learning_rate": 4.982893425547466e-06, + "loss": 0.9233, + "step": 555 + }, + { + "epoch": 0.2708671646638519, + "grad_norm": 3.17462420463562, + "learning_rate": 4.98281823552701e-06, + "loss": 0.6788, + "step": 556 + }, + { + "epoch": 0.27135433582331925, + "grad_norm": 3.6160736083984375, + "learning_rate": 4.982742881193873e-06, + "loss": 0.7763, + "step": 557 + }, + { + "epoch": 0.27184150698278664, + "grad_norm": 3.615028142929077, + "learning_rate": 4.9826673625530445e-06, + "loss": 0.7449, + "step": 558 + }, + { + "epoch": 0.272328678142254, + "grad_norm": 3.189872980117798, + "learning_rate": 4.98259167960952e-06, + "loss": 0.6797, + "step": 559 + }, + { + "epoch": 0.2728158493017213, + "grad_norm": 3.154878616333008, + "learning_rate": 4.98251583236831e-06, + "loss": 0.6252, + "step": 560 + }, + { + "epoch": 0.2733030204611887, + "grad_norm": 3.1456124782562256, + "learning_rate": 4.982439820834434e-06, + "loss": 0.7727, + "step": 561 + }, + { + "epoch": 0.27379019162065604, + "grad_norm": 2.9189112186431885, + "learning_rate": 4.982363645012921e-06, + "loss": 0.7573, + "step": 562 + }, + { + "epoch": 0.27427736278012343, + "grad_norm": 3.0716946125030518, + "learning_rate": 4.982287304908813e-06, + "loss": 0.7182, + "step": 563 + }, + { + "epoch": 0.27476453393959077, + "grad_norm": 2.9219491481781006, + "learning_rate": 4.982210800527163e-06, + "loss": 0.6996, + "step": 564 + }, + { + "epoch": 0.2752517050990581, + "grad_norm": 2.883068323135376, + "learning_rate": 4.982134131873033e-06, + "loss": 0.6964, + "step": 565 + }, + { + "epoch": 0.2757388762585255, + "grad_norm": 3.6582651138305664, + "learning_rate": 4.982057298951497e-06, + "loss": 0.7997, + "step": 566 + }, + { + "epoch": 0.27622604741799284, + "grad_norm": 3.082056999206543, + "learning_rate": 4.981980301767641e-06, + "loss": 0.7277, + "step": 567 + }, + { + "epoch": 0.27671321857746023, + "grad_norm": 2.6361477375030518, + "learning_rate": 4.981903140326558e-06, + "loss": 0.7503, + "step": 568 + }, + { + "epoch": 0.27720038973692757, + "grad_norm": 3.266690254211426, + "learning_rate": 4.981825814633358e-06, + "loss": 0.6717, + "step": 569 + }, + { + "epoch": 0.27768756089639496, + "grad_norm": 3.2274410724639893, + "learning_rate": 4.981748324693156e-06, + "loss": 0.7309, + "step": 570 + }, + { + "epoch": 0.2781747320558623, + "grad_norm": 2.9285659790039062, + "learning_rate": 4.981670670511081e-06, + "loss": 0.6537, + "step": 571 + }, + { + "epoch": 0.27866190321532963, + "grad_norm": 2.7468581199645996, + "learning_rate": 4.981592852092272e-06, + "loss": 0.7865, + "step": 572 + }, + { + "epoch": 0.279149074374797, + "grad_norm": 3.617844343185425, + "learning_rate": 4.98151486944188e-06, + "loss": 0.8394, + "step": 573 + }, + { + "epoch": 0.27963624553426436, + "grad_norm": 3.201842784881592, + "learning_rate": 4.981436722565064e-06, + "loss": 0.7932, + "step": 574 + }, + { + "epoch": 0.28012341669373175, + "grad_norm": 3.0306272506713867, + "learning_rate": 4.9813584114669975e-06, + "loss": 0.6068, + "step": 575 + }, + { + "epoch": 0.2806105878531991, + "grad_norm": 3.3684725761413574, + "learning_rate": 4.981279936152862e-06, + "loss": 0.7253, + "step": 576 + }, + { + "epoch": 0.2810977590126664, + "grad_norm": 2.852841377258301, + "learning_rate": 4.981201296627851e-06, + "loss": 0.703, + "step": 577 + }, + { + "epoch": 0.2815849301721338, + "grad_norm": 2.629058361053467, + "learning_rate": 4.98112249289717e-06, + "loss": 0.7066, + "step": 578 + }, + { + "epoch": 0.28207210133160115, + "grad_norm": 2.54529070854187, + "learning_rate": 4.981043524966033e-06, + "loss": 0.5487, + "step": 579 + }, + { + "epoch": 0.28255927249106855, + "grad_norm": 3.1242196559906006, + "learning_rate": 4.980964392839667e-06, + "loss": 0.8082, + "step": 580 + }, + { + "epoch": 0.2830464436505359, + "grad_norm": 3.220381021499634, + "learning_rate": 4.980885096523308e-06, + "loss": 0.7406, + "step": 581 + }, + { + "epoch": 0.2835336148100032, + "grad_norm": 2.699308395385742, + "learning_rate": 4.9808056360222045e-06, + "loss": 0.6562, + "step": 582 + }, + { + "epoch": 0.2840207859694706, + "grad_norm": 3.5496177673339844, + "learning_rate": 4.980726011341614e-06, + "loss": 0.7148, + "step": 583 + }, + { + "epoch": 0.28450795712893795, + "grad_norm": 3.1091442108154297, + "learning_rate": 4.980646222486809e-06, + "loss": 0.6952, + "step": 584 + }, + { + "epoch": 0.28499512828840534, + "grad_norm": 3.2739107608795166, + "learning_rate": 4.9805662694630676e-06, + "loss": 0.7852, + "step": 585 + }, + { + "epoch": 0.2854822994478727, + "grad_norm": 2.7408127784729004, + "learning_rate": 4.98048615227568e-06, + "loss": 0.6998, + "step": 586 + }, + { + "epoch": 0.28596947060734007, + "grad_norm": 2.532198429107666, + "learning_rate": 4.980405870929952e-06, + "loss": 0.6113, + "step": 587 + }, + { + "epoch": 0.2864566417668074, + "grad_norm": 3.1315362453460693, + "learning_rate": 4.980325425431193e-06, + "loss": 0.6832, + "step": 588 + }, + { + "epoch": 0.28694381292627474, + "grad_norm": 2.784146308898926, + "learning_rate": 4.980244815784728e-06, + "loss": 0.6567, + "step": 589 + }, + { + "epoch": 0.28743098408574214, + "grad_norm": 3.079704999923706, + "learning_rate": 4.980164041995893e-06, + "loss": 0.6797, + "step": 590 + }, + { + "epoch": 0.2879181552452095, + "grad_norm": 3.688685417175293, + "learning_rate": 4.9800831040700315e-06, + "loss": 0.8292, + "step": 591 + }, + { + "epoch": 0.28840532640467686, + "grad_norm": 2.845691442489624, + "learning_rate": 4.980002002012502e-06, + "loss": 0.7408, + "step": 592 + }, + { + "epoch": 0.2888924975641442, + "grad_norm": 3.1021828651428223, + "learning_rate": 4.979920735828671e-06, + "loss": 0.7511, + "step": 593 + }, + { + "epoch": 0.28937966872361154, + "grad_norm": 3.4156479835510254, + "learning_rate": 4.979839305523916e-06, + "loss": 0.753, + "step": 594 + }, + { + "epoch": 0.28986683988307893, + "grad_norm": 3.0878429412841797, + "learning_rate": 4.9797577111036275e-06, + "loss": 0.7721, + "step": 595 + }, + { + "epoch": 0.29035401104254627, + "grad_norm": 3.249091863632202, + "learning_rate": 4.979675952573204e-06, + "loss": 0.7679, + "step": 596 + }, + { + "epoch": 0.29084118220201366, + "grad_norm": 3.1718780994415283, + "learning_rate": 4.979594029938058e-06, + "loss": 0.6502, + "step": 597 + }, + { + "epoch": 0.291328353361481, + "grad_norm": 2.953247547149658, + "learning_rate": 4.979511943203609e-06, + "loss": 0.7032, + "step": 598 + }, + { + "epoch": 0.29181552452094833, + "grad_norm": 3.2849438190460205, + "learning_rate": 4.97942969237529e-06, + "loss": 0.7173, + "step": 599 + }, + { + "epoch": 0.2923026956804157, + "grad_norm": 2.6484012603759766, + "learning_rate": 4.979347277458546e-06, + "loss": 0.7184, + "step": 600 + }, + { + "epoch": 0.29278986683988306, + "grad_norm": 2.993244171142578, + "learning_rate": 4.979264698458829e-06, + "loss": 0.7644, + "step": 601 + }, + { + "epoch": 0.29327703799935045, + "grad_norm": 2.6556572914123535, + "learning_rate": 4.979181955381606e-06, + "loss": 0.6679, + "step": 602 + }, + { + "epoch": 0.2937642091588178, + "grad_norm": 2.9809210300445557, + "learning_rate": 4.979099048232351e-06, + "loss": 0.6882, + "step": 603 + }, + { + "epoch": 0.2942513803182852, + "grad_norm": 2.7689754962921143, + "learning_rate": 4.979015977016553e-06, + "loss": 0.7582, + "step": 604 + }, + { + "epoch": 0.2947385514777525, + "grad_norm": 2.7017085552215576, + "learning_rate": 4.978932741739708e-06, + "loss": 0.7578, + "step": 605 + }, + { + "epoch": 0.29522572263721986, + "grad_norm": 2.919285297393799, + "learning_rate": 4.978849342407325e-06, + "loss": 0.7323, + "step": 606 + }, + { + "epoch": 0.29571289379668725, + "grad_norm": 3.3418233394622803, + "learning_rate": 4.978765779024923e-06, + "loss": 0.7503, + "step": 607 + }, + { + "epoch": 0.2962000649561546, + "grad_norm": 2.8753433227539062, + "learning_rate": 4.978682051598033e-06, + "loss": 0.7692, + "step": 608 + }, + { + "epoch": 0.296687236115622, + "grad_norm": 2.9428939819335938, + "learning_rate": 4.978598160132195e-06, + "loss": 0.6647, + "step": 609 + }, + { + "epoch": 0.2971744072750893, + "grad_norm": 2.963970899581909, + "learning_rate": 4.9785141046329625e-06, + "loss": 0.784, + "step": 610 + }, + { + "epoch": 0.29766157843455665, + "grad_norm": 3.2333405017852783, + "learning_rate": 4.978429885105897e-06, + "loss": 0.7253, + "step": 611 + }, + { + "epoch": 0.29814874959402404, + "grad_norm": 3.369206428527832, + "learning_rate": 4.978345501556573e-06, + "loss": 0.7702, + "step": 612 + }, + { + "epoch": 0.2986359207534914, + "grad_norm": 3.9205472469329834, + "learning_rate": 4.978260953990573e-06, + "loss": 0.8777, + "step": 613 + }, + { + "epoch": 0.29912309191295877, + "grad_norm": 3.0797290802001953, + "learning_rate": 4.978176242413495e-06, + "loss": 0.6744, + "step": 614 + }, + { + "epoch": 0.2996102630724261, + "grad_norm": 3.2869179248809814, + "learning_rate": 4.978091366830944e-06, + "loss": 0.7378, + "step": 615 + }, + { + "epoch": 0.30009743423189345, + "grad_norm": 2.8514933586120605, + "learning_rate": 4.978006327248537e-06, + "loss": 0.793, + "step": 616 + }, + { + "epoch": 0.30058460539136084, + "grad_norm": 2.962007999420166, + "learning_rate": 4.977921123671902e-06, + "loss": 0.7228, + "step": 617 + }, + { + "epoch": 0.3010717765508282, + "grad_norm": 2.982631206512451, + "learning_rate": 4.9778357561066776e-06, + "loss": 0.6953, + "step": 618 + }, + { + "epoch": 0.30155894771029557, + "grad_norm": 3.340541124343872, + "learning_rate": 4.977750224558514e-06, + "loss": 0.7781, + "step": 619 + }, + { + "epoch": 0.3020461188697629, + "grad_norm": 3.2366039752960205, + "learning_rate": 4.977664529033071e-06, + "loss": 0.6705, + "step": 620 + }, + { + "epoch": 0.3025332900292303, + "grad_norm": 2.9599039554595947, + "learning_rate": 4.977578669536019e-06, + "loss": 0.7127, + "step": 621 + }, + { + "epoch": 0.30302046118869763, + "grad_norm": 3.5922718048095703, + "learning_rate": 4.977492646073043e-06, + "loss": 0.6213, + "step": 622 + }, + { + "epoch": 0.30350763234816497, + "grad_norm": 2.857668876647949, + "learning_rate": 4.977406458649835e-06, + "loss": 0.6311, + "step": 623 + }, + { + "epoch": 0.30399480350763236, + "grad_norm": 3.022108554840088, + "learning_rate": 4.977320107272097e-06, + "loss": 0.8246, + "step": 624 + }, + { + "epoch": 0.3044819746670997, + "grad_norm": 2.9922406673431396, + "learning_rate": 4.977233591945545e-06, + "loss": 0.6818, + "step": 625 + }, + { + "epoch": 0.3049691458265671, + "grad_norm": 2.8949661254882812, + "learning_rate": 4.9771469126759044e-06, + "loss": 0.6899, + "step": 626 + }, + { + "epoch": 0.3054563169860344, + "grad_norm": 3.0150210857391357, + "learning_rate": 4.9770600694689116e-06, + "loss": 0.7036, + "step": 627 + }, + { + "epoch": 0.30594348814550176, + "grad_norm": 3.3416731357574463, + "learning_rate": 4.976973062330316e-06, + "loss": 0.7771, + "step": 628 + }, + { + "epoch": 0.30643065930496916, + "grad_norm": 2.7653419971466064, + "learning_rate": 4.976885891265871e-06, + "loss": 0.6847, + "step": 629 + }, + { + "epoch": 0.3069178304644365, + "grad_norm": 3.253023147583008, + "learning_rate": 4.97679855628135e-06, + "loss": 0.7541, + "step": 630 + }, + { + "epoch": 0.3074050016239039, + "grad_norm": 3.293530225753784, + "learning_rate": 4.976711057382532e-06, + "loss": 0.7627, + "step": 631 + }, + { + "epoch": 0.3078921727833712, + "grad_norm": 2.828322410583496, + "learning_rate": 4.9766233945752055e-06, + "loss": 0.7018, + "step": 632 + }, + { + "epoch": 0.30837934394283856, + "grad_norm": 3.2225327491760254, + "learning_rate": 4.9765355678651735e-06, + "loss": 0.705, + "step": 633 + }, + { + "epoch": 0.30886651510230595, + "grad_norm": 3.6675002574920654, + "learning_rate": 4.976447577258248e-06, + "loss": 0.8318, + "step": 634 + }, + { + "epoch": 0.3093536862617733, + "grad_norm": 3.0039336681365967, + "learning_rate": 4.976359422760253e-06, + "loss": 0.7088, + "step": 635 + }, + { + "epoch": 0.3098408574212407, + "grad_norm": 3.253809928894043, + "learning_rate": 4.9762711043770215e-06, + "loss": 0.6708, + "step": 636 + }, + { + "epoch": 0.310328028580708, + "grad_norm": 2.906940460205078, + "learning_rate": 4.976182622114399e-06, + "loss": 0.7056, + "step": 637 + }, + { + "epoch": 0.3108151997401754, + "grad_norm": 3.162949800491333, + "learning_rate": 4.976093975978242e-06, + "loss": 0.7113, + "step": 638 + }, + { + "epoch": 0.31130237089964274, + "grad_norm": 2.9454689025878906, + "learning_rate": 4.976005165974416e-06, + "loss": 0.6571, + "step": 639 + }, + { + "epoch": 0.3117895420591101, + "grad_norm": 3.233058452606201, + "learning_rate": 4.9759161921087975e-06, + "loss": 0.6978, + "step": 640 + }, + { + "epoch": 0.3122767132185775, + "grad_norm": 2.7803280353546143, + "learning_rate": 4.975827054387277e-06, + "loss": 0.6566, + "step": 641 + }, + { + "epoch": 0.3127638843780448, + "grad_norm": 3.0447194576263428, + "learning_rate": 4.975737752815752e-06, + "loss": 0.6918, + "step": 642 + }, + { + "epoch": 0.3132510555375122, + "grad_norm": 3.3912508487701416, + "learning_rate": 4.975648287400135e-06, + "loss": 0.7443, + "step": 643 + }, + { + "epoch": 0.31373822669697954, + "grad_norm": 3.042163133621216, + "learning_rate": 4.975558658146342e-06, + "loss": 0.7058, + "step": 644 + }, + { + "epoch": 0.3142253978564469, + "grad_norm": 3.025132894515991, + "learning_rate": 4.975468865060309e-06, + "loss": 0.7199, + "step": 645 + }, + { + "epoch": 0.31471256901591427, + "grad_norm": 3.072265863418579, + "learning_rate": 4.975378908147977e-06, + "loss": 0.7687, + "step": 646 + }, + { + "epoch": 0.3151997401753816, + "grad_norm": 2.9606990814208984, + "learning_rate": 4.975288787415299e-06, + "loss": 0.7564, + "step": 647 + }, + { + "epoch": 0.315686911334849, + "grad_norm": 3.193209409713745, + "learning_rate": 4.97519850286824e-06, + "loss": 0.6424, + "step": 648 + }, + { + "epoch": 0.31617408249431633, + "grad_norm": 2.8444671630859375, + "learning_rate": 4.975108054512774e-06, + "loss": 0.6967, + "step": 649 + }, + { + "epoch": 0.31666125365378367, + "grad_norm": 2.893613815307617, + "learning_rate": 4.975017442354888e-06, + "loss": 0.6416, + "step": 650 + }, + { + "epoch": 0.31714842481325106, + "grad_norm": 3.3328137397766113, + "learning_rate": 4.974926666400578e-06, + "loss": 0.7959, + "step": 651 + }, + { + "epoch": 0.3176355959727184, + "grad_norm": 2.9793059825897217, + "learning_rate": 4.974835726655852e-06, + "loss": 0.7385, + "step": 652 + }, + { + "epoch": 0.3181227671321858, + "grad_norm": 3.041425943374634, + "learning_rate": 4.974744623126727e-06, + "loss": 0.7579, + "step": 653 + }, + { + "epoch": 0.3186099382916531, + "grad_norm": 3.385289192199707, + "learning_rate": 4.9746533558192345e-06, + "loss": 0.762, + "step": 654 + }, + { + "epoch": 0.3190971094511205, + "grad_norm": 3.2125935554504395, + "learning_rate": 4.9745619247394125e-06, + "loss": 0.7001, + "step": 655 + }, + { + "epoch": 0.31958428061058786, + "grad_norm": 2.844728708267212, + "learning_rate": 4.974470329893313e-06, + "loss": 0.7507, + "step": 656 + }, + { + "epoch": 0.3200714517700552, + "grad_norm": 2.9737110137939453, + "learning_rate": 4.974378571286997e-06, + "loss": 0.7036, + "step": 657 + }, + { + "epoch": 0.3205586229295226, + "grad_norm": 2.8542730808258057, + "learning_rate": 4.974286648926539e-06, + "loss": 0.6856, + "step": 658 + }, + { + "epoch": 0.3210457940889899, + "grad_norm": 3.1387994289398193, + "learning_rate": 4.97419456281802e-06, + "loss": 0.7731, + "step": 659 + }, + { + "epoch": 0.3215329652484573, + "grad_norm": 2.91030216217041, + "learning_rate": 4.9741023129675355e-06, + "loss": 0.7925, + "step": 660 + }, + { + "epoch": 0.32202013640792465, + "grad_norm": 3.107212543487549, + "learning_rate": 4.97400989938119e-06, + "loss": 0.773, + "step": 661 + }, + { + "epoch": 0.322507307567392, + "grad_norm": 2.8088126182556152, + "learning_rate": 4.9739173220650994e-06, + "loss": 0.7017, + "step": 662 + }, + { + "epoch": 0.3229944787268594, + "grad_norm": 2.8702244758605957, + "learning_rate": 4.973824581025391e-06, + "loss": 0.7337, + "step": 663 + }, + { + "epoch": 0.3234816498863267, + "grad_norm": 3.25809907913208, + "learning_rate": 4.973731676268203e-06, + "loss": 0.6926, + "step": 664 + }, + { + "epoch": 0.3239688210457941, + "grad_norm": 2.7823312282562256, + "learning_rate": 4.973638607799682e-06, + "loss": 0.8017, + "step": 665 + }, + { + "epoch": 0.32445599220526145, + "grad_norm": 3.024813175201416, + "learning_rate": 4.973545375625989e-06, + "loss": 0.7921, + "step": 666 + }, + { + "epoch": 0.3249431633647288, + "grad_norm": 2.8297739028930664, + "learning_rate": 4.973451979753293e-06, + "loss": 0.7895, + "step": 667 + }, + { + "epoch": 0.3254303345241962, + "grad_norm": 3.0611732006073, + "learning_rate": 4.973358420187776e-06, + "loss": 0.745, + "step": 668 + }, + { + "epoch": 0.3259175056836635, + "grad_norm": 2.9425783157348633, + "learning_rate": 4.9732646969356285e-06, + "loss": 0.8373, + "step": 669 + }, + { + "epoch": 0.3264046768431309, + "grad_norm": 2.7338247299194336, + "learning_rate": 4.973170810003054e-06, + "loss": 0.6901, + "step": 670 + }, + { + "epoch": 0.32689184800259824, + "grad_norm": 3.208296060562134, + "learning_rate": 4.973076759396265e-06, + "loss": 0.6973, + "step": 671 + }, + { + "epoch": 0.32737901916206563, + "grad_norm": 2.730543851852417, + "learning_rate": 4.972982545121487e-06, + "loss": 0.7195, + "step": 672 + }, + { + "epoch": 0.32786619032153297, + "grad_norm": 2.828946352005005, + "learning_rate": 4.972888167184954e-06, + "loss": 0.8461, + "step": 673 + }, + { + "epoch": 0.3283533614810003, + "grad_norm": 3.1049857139587402, + "learning_rate": 4.972793625592912e-06, + "loss": 0.6565, + "step": 674 + }, + { + "epoch": 0.3288405326404677, + "grad_norm": 2.7441859245300293, + "learning_rate": 4.972698920351619e-06, + "loss": 0.7528, + "step": 675 + }, + { + "epoch": 0.32932770379993503, + "grad_norm": 2.833142042160034, + "learning_rate": 4.972604051467341e-06, + "loss": 0.7101, + "step": 676 + }, + { + "epoch": 0.3298148749594024, + "grad_norm": 2.596060037612915, + "learning_rate": 4.9725090189463574e-06, + "loss": 0.7208, + "step": 677 + }, + { + "epoch": 0.33030204611886976, + "grad_norm": 2.757789373397827, + "learning_rate": 4.972413822794957e-06, + "loss": 0.6778, + "step": 678 + }, + { + "epoch": 0.3307892172783371, + "grad_norm": 3.197985887527466, + "learning_rate": 4.972318463019441e-06, + "loss": 0.7095, + "step": 679 + }, + { + "epoch": 0.3312763884378045, + "grad_norm": 3.425492525100708, + "learning_rate": 4.972222939626118e-06, + "loss": 0.8379, + "step": 680 + }, + { + "epoch": 0.33176355959727183, + "grad_norm": 2.917781352996826, + "learning_rate": 4.972127252621312e-06, + "loss": 0.6574, + "step": 681 + }, + { + "epoch": 0.3322507307567392, + "grad_norm": 3.099487781524658, + "learning_rate": 4.972031402011354e-06, + "loss": 0.7519, + "step": 682 + }, + { + "epoch": 0.33273790191620656, + "grad_norm": 4.981099605560303, + "learning_rate": 4.971935387802588e-06, + "loss": 0.6456, + "step": 683 + }, + { + "epoch": 0.3332250730756739, + "grad_norm": 2.9259285926818848, + "learning_rate": 4.971839210001368e-06, + "loss": 0.6284, + "step": 684 + }, + { + "epoch": 0.3337122442351413, + "grad_norm": 3.228144884109497, + "learning_rate": 4.97174286861406e-06, + "loss": 0.6887, + "step": 685 + }, + { + "epoch": 0.3341994153946086, + "grad_norm": 2.5236542224884033, + "learning_rate": 4.971646363647039e-06, + "loss": 0.6249, + "step": 686 + }, + { + "epoch": 0.334686586554076, + "grad_norm": 3.388943672180176, + "learning_rate": 4.971549695106692e-06, + "loss": 0.7024, + "step": 687 + }, + { + "epoch": 0.33517375771354335, + "grad_norm": 3.202430248260498, + "learning_rate": 4.971452862999416e-06, + "loss": 0.8064, + "step": 688 + }, + { + "epoch": 0.33566092887301074, + "grad_norm": 2.8268139362335205, + "learning_rate": 4.97135586733162e-06, + "loss": 0.674, + "step": 689 + }, + { + "epoch": 0.3361481000324781, + "grad_norm": 2.8757483959198, + "learning_rate": 4.971258708109722e-06, + "loss": 0.6943, + "step": 690 + }, + { + "epoch": 0.3366352711919454, + "grad_norm": 2.732343912124634, + "learning_rate": 4.971161385340154e-06, + "loss": 0.6731, + "step": 691 + }, + { + "epoch": 0.3371224423514128, + "grad_norm": 3.1368134021759033, + "learning_rate": 4.971063899029355e-06, + "loss": 0.718, + "step": 692 + }, + { + "epoch": 0.33760961351088015, + "grad_norm": 2.8329148292541504, + "learning_rate": 4.970966249183777e-06, + "loss": 0.6312, + "step": 693 + }, + { + "epoch": 0.33809678467034754, + "grad_norm": 3.236966848373413, + "learning_rate": 4.970868435809882e-06, + "loss": 0.762, + "step": 694 + }, + { + "epoch": 0.3385839558298149, + "grad_norm": 3.1643359661102295, + "learning_rate": 4.970770458914146e-06, + "loss": 0.6475, + "step": 695 + }, + { + "epoch": 0.3390711269892822, + "grad_norm": 2.8889825344085693, + "learning_rate": 4.97067231850305e-06, + "loss": 0.6422, + "step": 696 + }, + { + "epoch": 0.3395582981487496, + "grad_norm": 3.1033542156219482, + "learning_rate": 4.970574014583089e-06, + "loss": 0.6893, + "step": 697 + }, + { + "epoch": 0.34004546930821694, + "grad_norm": 3.114347219467163, + "learning_rate": 4.9704755471607715e-06, + "loss": 0.7776, + "step": 698 + }, + { + "epoch": 0.34053264046768433, + "grad_norm": 3.5101583003997803, + "learning_rate": 4.970376916242612e-06, + "loss": 0.6693, + "step": 699 + }, + { + "epoch": 0.34101981162715167, + "grad_norm": 3.126526355743408, + "learning_rate": 4.970278121835138e-06, + "loss": 0.7773, + "step": 700 + }, + { + "epoch": 0.341506982786619, + "grad_norm": 3.2738585472106934, + "learning_rate": 4.9701791639448875e-06, + "loss": 0.7726, + "step": 701 + }, + { + "epoch": 0.3419941539460864, + "grad_norm": 3.385855197906494, + "learning_rate": 4.97008004257841e-06, + "loss": 0.7583, + "step": 702 + }, + { + "epoch": 0.34248132510555374, + "grad_norm": 3.3120808601379395, + "learning_rate": 4.969980757742266e-06, + "loss": 0.6998, + "step": 703 + }, + { + "epoch": 0.3429684962650211, + "grad_norm": 3.030841588973999, + "learning_rate": 4.969881309443026e-06, + "loss": 0.7855, + "step": 704 + }, + { + "epoch": 0.34345566742448846, + "grad_norm": 3.1443891525268555, + "learning_rate": 4.969781697687269e-06, + "loss": 0.7803, + "step": 705 + }, + { + "epoch": 0.34394283858395586, + "grad_norm": 3.8327183723449707, + "learning_rate": 4.969681922481591e-06, + "loss": 0.7202, + "step": 706 + }, + { + "epoch": 0.3444300097434232, + "grad_norm": 3.1238725185394287, + "learning_rate": 4.969581983832592e-06, + "loss": 0.7095, + "step": 707 + }, + { + "epoch": 0.34491718090289053, + "grad_norm": 3.090975761413574, + "learning_rate": 4.9694818817468885e-06, + "loss": 0.72, + "step": 708 + }, + { + "epoch": 0.3454043520623579, + "grad_norm": 2.9895384311676025, + "learning_rate": 4.969381616231102e-06, + "loss": 0.6881, + "step": 709 + }, + { + "epoch": 0.34589152322182526, + "grad_norm": 3.4560930728912354, + "learning_rate": 4.969281187291872e-06, + "loss": 0.8304, + "step": 710 + }, + { + "epoch": 0.34637869438129265, + "grad_norm": 3.061272144317627, + "learning_rate": 4.969180594935843e-06, + "loss": 0.7701, + "step": 711 + }, + { + "epoch": 0.34686586554076, + "grad_norm": 3.3422701358795166, + "learning_rate": 4.9690798391696715e-06, + "loss": 0.7625, + "step": 712 + }, + { + "epoch": 0.3473530367002273, + "grad_norm": 2.9271738529205322, + "learning_rate": 4.968978920000026e-06, + "loss": 0.7527, + "step": 713 + }, + { + "epoch": 0.3478402078596947, + "grad_norm": 3.1923418045043945, + "learning_rate": 4.968877837433586e-06, + "loss": 0.8074, + "step": 714 + }, + { + "epoch": 0.34832737901916205, + "grad_norm": 3.0770676136016846, + "learning_rate": 4.9687765914770406e-06, + "loss": 0.7706, + "step": 715 + }, + { + "epoch": 0.34881455017862945, + "grad_norm": 3.02119517326355, + "learning_rate": 4.96867518213709e-06, + "loss": 0.76, + "step": 716 + }, + { + "epoch": 0.3493017213380968, + "grad_norm": 3.059785842895508, + "learning_rate": 4.968573609420446e-06, + "loss": 0.6696, + "step": 717 + }, + { + "epoch": 0.3497888924975641, + "grad_norm": 3.0176191329956055, + "learning_rate": 4.96847187333383e-06, + "loss": 0.7092, + "step": 718 + }, + { + "epoch": 0.3502760636570315, + "grad_norm": 2.873680591583252, + "learning_rate": 4.968369973883975e-06, + "loss": 0.7912, + "step": 719 + }, + { + "epoch": 0.35076323481649885, + "grad_norm": 2.7856075763702393, + "learning_rate": 4.968267911077625e-06, + "loss": 0.6928, + "step": 720 + }, + { + "epoch": 0.35125040597596624, + "grad_norm": 2.9769766330718994, + "learning_rate": 4.968165684921535e-06, + "loss": 0.7256, + "step": 721 + }, + { + "epoch": 0.3517375771354336, + "grad_norm": 3.3023297786712646, + "learning_rate": 4.968063295422471e-06, + "loss": 0.8165, + "step": 722 + }, + { + "epoch": 0.35222474829490097, + "grad_norm": 2.8819432258605957, + "learning_rate": 4.967960742587205e-06, + "loss": 0.7889, + "step": 723 + }, + { + "epoch": 0.3527119194543683, + "grad_norm": 3.27093505859375, + "learning_rate": 4.967858026422529e-06, + "loss": 0.7361, + "step": 724 + }, + { + "epoch": 0.35319909061383564, + "grad_norm": 3.015216588973999, + "learning_rate": 4.9677551469352385e-06, + "loss": 0.7083, + "step": 725 + }, + { + "epoch": 0.35368626177330303, + "grad_norm": 2.759962558746338, + "learning_rate": 4.967652104132142e-06, + "loss": 0.6785, + "step": 726 + }, + { + "epoch": 0.35417343293277037, + "grad_norm": 3.1907877922058105, + "learning_rate": 4.967548898020059e-06, + "loss": 0.6808, + "step": 727 + }, + { + "epoch": 0.35466060409223776, + "grad_norm": 3.15336012840271, + "learning_rate": 4.96744552860582e-06, + "loss": 0.7095, + "step": 728 + }, + { + "epoch": 0.3551477752517051, + "grad_norm": 3.6297004222869873, + "learning_rate": 4.967341995896265e-06, + "loss": 0.7858, + "step": 729 + }, + { + "epoch": 0.35563494641117244, + "grad_norm": 3.3379082679748535, + "learning_rate": 4.967238299898247e-06, + "loss": 0.7611, + "step": 730 + }, + { + "epoch": 0.35612211757063983, + "grad_norm": 3.0844526290893555, + "learning_rate": 4.967134440618627e-06, + "loss": 0.7311, + "step": 731 + }, + { + "epoch": 0.35660928873010717, + "grad_norm": 2.7313549518585205, + "learning_rate": 4.9670304180642805e-06, + "loss": 0.6903, + "step": 732 + }, + { + "epoch": 0.35709645988957456, + "grad_norm": 2.888186454772949, + "learning_rate": 4.966926232242091e-06, + "loss": 0.6992, + "step": 733 + }, + { + "epoch": 0.3575836310490419, + "grad_norm": 2.892136573791504, + "learning_rate": 4.966821883158952e-06, + "loss": 0.7886, + "step": 734 + }, + { + "epoch": 0.35807080220850923, + "grad_norm": 3.096757411956787, + "learning_rate": 4.966717370821771e-06, + "loss": 0.7176, + "step": 735 + }, + { + "epoch": 0.3585579733679766, + "grad_norm": 3.161013603210449, + "learning_rate": 4.966612695237464e-06, + "loss": 0.7088, + "step": 736 + }, + { + "epoch": 0.35904514452744396, + "grad_norm": 2.582063674926758, + "learning_rate": 4.966507856412959e-06, + "loss": 0.7285, + "step": 737 + }, + { + "epoch": 0.35953231568691135, + "grad_norm": 3.4485549926757812, + "learning_rate": 4.966402854355193e-06, + "loss": 0.8377, + "step": 738 + }, + { + "epoch": 0.3600194868463787, + "grad_norm": 3.3897018432617188, + "learning_rate": 4.966297689071117e-06, + "loss": 0.6737, + "step": 739 + }, + { + "epoch": 0.3605066580058461, + "grad_norm": 2.8640854358673096, + "learning_rate": 4.9661923605676885e-06, + "loss": 0.7938, + "step": 740 + }, + { + "epoch": 0.3609938291653134, + "grad_norm": 3.1328823566436768, + "learning_rate": 4.96608686885188e-06, + "loss": 0.6483, + "step": 741 + }, + { + "epoch": 0.36148100032478075, + "grad_norm": 2.9927916526794434, + "learning_rate": 4.9659812139306714e-06, + "loss": 0.7036, + "step": 742 + }, + { + "epoch": 0.36196817148424815, + "grad_norm": 2.704850196838379, + "learning_rate": 4.965875395811056e-06, + "loss": 0.6809, + "step": 743 + }, + { + "epoch": 0.3624553426437155, + "grad_norm": 2.7413852214813232, + "learning_rate": 4.965769414500037e-06, + "loss": 0.7799, + "step": 744 + }, + { + "epoch": 0.3629425138031829, + "grad_norm": 2.8349411487579346, + "learning_rate": 4.965663270004627e-06, + "loss": 0.6323, + "step": 745 + }, + { + "epoch": 0.3634296849626502, + "grad_norm": 3.059112310409546, + "learning_rate": 4.965556962331851e-06, + "loss": 0.7661, + "step": 746 + }, + { + "epoch": 0.36391685612211755, + "grad_norm": 2.9534528255462646, + "learning_rate": 4.965450491488746e-06, + "loss": 0.8461, + "step": 747 + }, + { + "epoch": 0.36440402728158494, + "grad_norm": 2.859358072280884, + "learning_rate": 4.965343857482356e-06, + "loss": 0.7425, + "step": 748 + }, + { + "epoch": 0.3648911984410523, + "grad_norm": 2.913658380508423, + "learning_rate": 4.96523706031974e-06, + "loss": 0.725, + "step": 749 + }, + { + "epoch": 0.36537836960051967, + "grad_norm": 3.1464006900787354, + "learning_rate": 4.965130100007964e-06, + "loss": 0.7782, + "step": 750 + }, + { + "epoch": 0.365865540759987, + "grad_norm": 3.1690781116485596, + "learning_rate": 4.965022976554109e-06, + "loss": 0.7101, + "step": 751 + }, + { + "epoch": 0.36635271191945434, + "grad_norm": 3.355820655822754, + "learning_rate": 4.964915689965262e-06, + "loss": 0.7405, + "step": 752 + }, + { + "epoch": 0.36683988307892174, + "grad_norm": 2.8038432598114014, + "learning_rate": 4.964808240248524e-06, + "loss": 0.7989, + "step": 753 + }, + { + "epoch": 0.3673270542383891, + "grad_norm": 3.256728410720825, + "learning_rate": 4.9647006274110066e-06, + "loss": 0.8028, + "step": 754 + }, + { + "epoch": 0.36781422539785646, + "grad_norm": 3.244044542312622, + "learning_rate": 4.964592851459831e-06, + "loss": 0.7671, + "step": 755 + }, + { + "epoch": 0.3683013965573238, + "grad_norm": 3.1787285804748535, + "learning_rate": 4.96448491240213e-06, + "loss": 0.7046, + "step": 756 + }, + { + "epoch": 0.3687885677167912, + "grad_norm": 2.6339385509490967, + "learning_rate": 4.9643768102450464e-06, + "loss": 0.684, + "step": 757 + }, + { + "epoch": 0.36927573887625853, + "grad_norm": 2.600348949432373, + "learning_rate": 4.964268544995736e-06, + "loss": 0.685, + "step": 758 + }, + { + "epoch": 0.36976291003572587, + "grad_norm": 4.353481292724609, + "learning_rate": 4.964160116661362e-06, + "loss": 0.666, + "step": 759 + }, + { + "epoch": 0.37025008119519326, + "grad_norm": 2.965785026550293, + "learning_rate": 4.964051525249102e-06, + "loss": 0.6492, + "step": 760 + }, + { + "epoch": 0.3707372523546606, + "grad_norm": 3.6632156372070312, + "learning_rate": 4.96394277076614e-06, + "loss": 0.7732, + "step": 761 + }, + { + "epoch": 0.371224423514128, + "grad_norm": 2.9973819255828857, + "learning_rate": 4.963833853219676e-06, + "loss": 0.7395, + "step": 762 + }, + { + "epoch": 0.3717115946735953, + "grad_norm": 3.2523787021636963, + "learning_rate": 4.963724772616917e-06, + "loss": 0.7135, + "step": 763 + }, + { + "epoch": 0.37219876583306266, + "grad_norm": 3.2469663619995117, + "learning_rate": 4.963615528965081e-06, + "loss": 0.7422, + "step": 764 + }, + { + "epoch": 0.37268593699253005, + "grad_norm": 3.313406229019165, + "learning_rate": 4.9635061222713995e-06, + "loss": 0.5667, + "step": 765 + }, + { + "epoch": 0.3731731081519974, + "grad_norm": 2.8879752159118652, + "learning_rate": 4.963396552543112e-06, + "loss": 0.641, + "step": 766 + }, + { + "epoch": 0.3736602793114648, + "grad_norm": 3.1395103931427, + "learning_rate": 4.9632868197874706e-06, + "loss": 0.791, + "step": 767 + }, + { + "epoch": 0.3741474504709321, + "grad_norm": 3.1205685138702393, + "learning_rate": 4.963176924011736e-06, + "loss": 0.6989, + "step": 768 + }, + { + "epoch": 0.37463462163039946, + "grad_norm": 3.1742031574249268, + "learning_rate": 4.9630668652231825e-06, + "loss": 0.6842, + "step": 769 + }, + { + "epoch": 0.37512179278986685, + "grad_norm": 2.8118529319763184, + "learning_rate": 4.9629566434290935e-06, + "loss": 0.6683, + "step": 770 + }, + { + "epoch": 0.3756089639493342, + "grad_norm": 3.344158172607422, + "learning_rate": 4.962846258636762e-06, + "loss": 0.7599, + "step": 771 + }, + { + "epoch": 0.3760961351088016, + "grad_norm": 3.0003392696380615, + "learning_rate": 4.962735710853496e-06, + "loss": 0.7204, + "step": 772 + }, + { + "epoch": 0.3765833062682689, + "grad_norm": 3.460519790649414, + "learning_rate": 4.962625000086608e-06, + "loss": 0.7755, + "step": 773 + }, + { + "epoch": 0.37707047742773625, + "grad_norm": 2.8102471828460693, + "learning_rate": 4.962514126343429e-06, + "loss": 0.7047, + "step": 774 + }, + { + "epoch": 0.37755764858720364, + "grad_norm": 2.8564882278442383, + "learning_rate": 4.9624030896312934e-06, + "loss": 0.6167, + "step": 775 + }, + { + "epoch": 0.378044819746671, + "grad_norm": 2.74745774269104, + "learning_rate": 4.96229188995755e-06, + "loss": 0.6937, + "step": 776 + }, + { + "epoch": 0.37853199090613837, + "grad_norm": 2.973332405090332, + "learning_rate": 4.96218052732956e-06, + "loss": 0.6734, + "step": 777 + }, + { + "epoch": 0.3790191620656057, + "grad_norm": 2.827725648880005, + "learning_rate": 4.96206900175469e-06, + "loss": 0.6998, + "step": 778 + }, + { + "epoch": 0.3795063332250731, + "grad_norm": 3.8159842491149902, + "learning_rate": 4.961957313240324e-06, + "loss": 0.7064, + "step": 779 + }, + { + "epoch": 0.37999350438454044, + "grad_norm": 2.8060832023620605, + "learning_rate": 4.961845461793853e-06, + "loss": 0.6966, + "step": 780 + }, + { + "epoch": 0.3804806755440078, + "grad_norm": 2.851733446121216, + "learning_rate": 4.961733447422677e-06, + "loss": 0.6613, + "step": 781 + }, + { + "epoch": 0.38096784670347517, + "grad_norm": 3.3790793418884277, + "learning_rate": 4.961621270134212e-06, + "loss": 0.779, + "step": 782 + }, + { + "epoch": 0.3814550178629425, + "grad_norm": 3.037398338317871, + "learning_rate": 4.9615089299358794e-06, + "loss": 0.7234, + "step": 783 + }, + { + "epoch": 0.3819421890224099, + "grad_norm": 2.8155462741851807, + "learning_rate": 4.961396426835117e-06, + "loss": 0.727, + "step": 784 + }, + { + "epoch": 0.38242936018187723, + "grad_norm": 3.0404229164123535, + "learning_rate": 4.961283760839366e-06, + "loss": 0.6933, + "step": 785 + }, + { + "epoch": 0.38291653134134457, + "grad_norm": 2.761014699935913, + "learning_rate": 4.9611709319560865e-06, + "loss": 0.7053, + "step": 786 + }, + { + "epoch": 0.38340370250081196, + "grad_norm": 2.824709415435791, + "learning_rate": 4.9610579401927425e-06, + "loss": 0.664, + "step": 787 + }, + { + "epoch": 0.3838908736602793, + "grad_norm": 2.950286388397217, + "learning_rate": 4.960944785556814e-06, + "loss": 0.7053, + "step": 788 + }, + { + "epoch": 0.3843780448197467, + "grad_norm": 2.8123116493225098, + "learning_rate": 4.960831468055789e-06, + "loss": 0.7324, + "step": 789 + }, + { + "epoch": 0.384865215979214, + "grad_norm": 3.2874491214752197, + "learning_rate": 4.9607179876971665e-06, + "loss": 0.7519, + "step": 790 + }, + { + "epoch": 0.38535238713868136, + "grad_norm": 3.0428056716918945, + "learning_rate": 4.960604344488457e-06, + "loss": 0.7029, + "step": 791 + }, + { + "epoch": 0.38583955829814875, + "grad_norm": 2.7900290489196777, + "learning_rate": 4.96049053843718e-06, + "loss": 0.668, + "step": 792 + }, + { + "epoch": 0.3863267294576161, + "grad_norm": 2.6960673332214355, + "learning_rate": 4.960376569550869e-06, + "loss": 0.5731, + "step": 793 + }, + { + "epoch": 0.3868139006170835, + "grad_norm": 2.959625720977783, + "learning_rate": 4.960262437837066e-06, + "loss": 0.6333, + "step": 794 + }, + { + "epoch": 0.3873010717765508, + "grad_norm": 2.850362539291382, + "learning_rate": 4.960148143303323e-06, + "loss": 0.7112, + "step": 795 + }, + { + "epoch": 0.3877882429360182, + "grad_norm": 3.550096273422241, + "learning_rate": 4.960033685957207e-06, + "loss": 0.7625, + "step": 796 + }, + { + "epoch": 0.38827541409548555, + "grad_norm": 2.616661310195923, + "learning_rate": 4.959919065806288e-06, + "loss": 0.6823, + "step": 797 + }, + { + "epoch": 0.3887625852549529, + "grad_norm": 2.9208765029907227, + "learning_rate": 4.959804282858156e-06, + "loss": 0.7786, + "step": 798 + }, + { + "epoch": 0.3892497564144203, + "grad_norm": 2.9757163524627686, + "learning_rate": 4.959689337120406e-06, + "loss": 0.7285, + "step": 799 + }, + { + "epoch": 0.3897369275738876, + "grad_norm": 3.16194224357605, + "learning_rate": 4.959574228600643e-06, + "loss": 0.7641, + "step": 800 + }, + { + "epoch": 0.390224098733355, + "grad_norm": 2.8474180698394775, + "learning_rate": 4.959458957306488e-06, + "loss": 0.6945, + "step": 801 + }, + { + "epoch": 0.39071126989282234, + "grad_norm": 3.288355588912964, + "learning_rate": 4.959343523245568e-06, + "loss": 0.7074, + "step": 802 + }, + { + "epoch": 0.3911984410522897, + "grad_norm": 3.0072669982910156, + "learning_rate": 4.959227926425522e-06, + "loss": 0.6661, + "step": 803 + }, + { + "epoch": 0.3916856122117571, + "grad_norm": 3.124610185623169, + "learning_rate": 4.959112166854001e-06, + "loss": 0.7073, + "step": 804 + }, + { + "epoch": 0.3921727833712244, + "grad_norm": 3.4099831581115723, + "learning_rate": 4.958996244538666e-06, + "loss": 0.6739, + "step": 805 + }, + { + "epoch": 0.3926599545306918, + "grad_norm": 2.9599106311798096, + "learning_rate": 4.9588801594871886e-06, + "loss": 0.6169, + "step": 806 + }, + { + "epoch": 0.39314712569015914, + "grad_norm": 3.173701763153076, + "learning_rate": 4.958763911707252e-06, + "loss": 0.7825, + "step": 807 + }, + { + "epoch": 0.3936342968496265, + "grad_norm": 2.9035353660583496, + "learning_rate": 4.958647501206547e-06, + "loss": 0.7678, + "step": 808 + }, + { + "epoch": 0.39412146800909387, + "grad_norm": 3.108928680419922, + "learning_rate": 4.95853092799278e-06, + "loss": 0.8108, + "step": 809 + }, + { + "epoch": 0.3946086391685612, + "grad_norm": 3.0525922775268555, + "learning_rate": 4.958414192073665e-06, + "loss": 0.735, + "step": 810 + }, + { + "epoch": 0.3950958103280286, + "grad_norm": 2.8083386421203613, + "learning_rate": 4.958297293456928e-06, + "loss": 0.6624, + "step": 811 + }, + { + "epoch": 0.39558298148749593, + "grad_norm": 2.874572992324829, + "learning_rate": 4.958180232150306e-06, + "loss": 0.7641, + "step": 812 + }, + { + "epoch": 0.3960701526469633, + "grad_norm": 2.8061399459838867, + "learning_rate": 4.958063008161544e-06, + "loss": 0.7062, + "step": 813 + }, + { + "epoch": 0.39655732380643066, + "grad_norm": 3.322542905807495, + "learning_rate": 4.957945621498402e-06, + "loss": 0.7041, + "step": 814 + }, + { + "epoch": 0.397044494965898, + "grad_norm": 3.096618890762329, + "learning_rate": 4.9578280721686455e-06, + "loss": 0.8773, + "step": 815 + }, + { + "epoch": 0.3975316661253654, + "grad_norm": 2.475116491317749, + "learning_rate": 4.957710360180058e-06, + "loss": 0.6849, + "step": 816 + }, + { + "epoch": 0.3980188372848327, + "grad_norm": 2.941108226776123, + "learning_rate": 4.957592485540427e-06, + "loss": 0.7392, + "step": 817 + }, + { + "epoch": 0.3985060084443001, + "grad_norm": 3.364873170852661, + "learning_rate": 4.957474448257554e-06, + "loss": 0.7386, + "step": 818 + }, + { + "epoch": 0.39899317960376746, + "grad_norm": 3.811307191848755, + "learning_rate": 4.957356248339251e-06, + "loss": 0.7334, + "step": 819 + }, + { + "epoch": 0.3994803507632348, + "grad_norm": 3.2343180179595947, + "learning_rate": 4.957237885793341e-06, + "loss": 0.6985, + "step": 820 + }, + { + "epoch": 0.3999675219227022, + "grad_norm": 3.09297776222229, + "learning_rate": 4.957119360627656e-06, + "loss": 0.7153, + "step": 821 + }, + { + "epoch": 0.4004546930821695, + "grad_norm": 2.753073215484619, + "learning_rate": 4.957000672850041e-06, + "loss": 0.6517, + "step": 822 + }, + { + "epoch": 0.4009418642416369, + "grad_norm": 3.0206501483917236, + "learning_rate": 4.956881822468349e-06, + "loss": 0.6133, + "step": 823 + }, + { + "epoch": 0.40142903540110425, + "grad_norm": 3.326246738433838, + "learning_rate": 4.956762809490447e-06, + "loss": 0.7563, + "step": 824 + }, + { + "epoch": 0.4019162065605716, + "grad_norm": 2.804481267929077, + "learning_rate": 4.956643633924212e-06, + "loss": 0.6151, + "step": 825 + }, + { + "epoch": 0.402403377720039, + "grad_norm": 2.9218714237213135, + "learning_rate": 4.95652429577753e-06, + "loss": 0.7079, + "step": 826 + }, + { + "epoch": 0.4028905488795063, + "grad_norm": 2.868002414703369, + "learning_rate": 4.956404795058298e-06, + "loss": 0.6382, + "step": 827 + }, + { + "epoch": 0.4033777200389737, + "grad_norm": 2.8526697158813477, + "learning_rate": 4.956285131774427e-06, + "loss": 0.6627, + "step": 828 + }, + { + "epoch": 0.40386489119844105, + "grad_norm": 3.0466036796569824, + "learning_rate": 4.956165305933833e-06, + "loss": 0.7474, + "step": 829 + }, + { + "epoch": 0.40435206235790844, + "grad_norm": 3.141603469848633, + "learning_rate": 4.956045317544449e-06, + "loss": 0.7261, + "step": 830 + }, + { + "epoch": 0.4048392335173758, + "grad_norm": 3.217470169067383, + "learning_rate": 4.9559251666142135e-06, + "loss": 0.7022, + "step": 831 + }, + { + "epoch": 0.4053264046768431, + "grad_norm": 3.205657482147217, + "learning_rate": 4.955804853151079e-06, + "loss": 0.6874, + "step": 832 + }, + { + "epoch": 0.4058135758363105, + "grad_norm": 3.0774147510528564, + "learning_rate": 4.955684377163009e-06, + "loss": 0.642, + "step": 833 + }, + { + "epoch": 0.40630074699577784, + "grad_norm": 3.12593150138855, + "learning_rate": 4.955563738657976e-06, + "loss": 0.6158, + "step": 834 + }, + { + "epoch": 0.40678791815524523, + "grad_norm": 3.0781733989715576, + "learning_rate": 4.955442937643963e-06, + "loss": 0.6928, + "step": 835 + }, + { + "epoch": 0.40727508931471257, + "grad_norm": 3.1655073165893555, + "learning_rate": 4.955321974128963e-06, + "loss": 0.7876, + "step": 836 + }, + { + "epoch": 0.4077622604741799, + "grad_norm": 2.77483868598938, + "learning_rate": 4.955200848120986e-06, + "loss": 0.6488, + "step": 837 + }, + { + "epoch": 0.4082494316336473, + "grad_norm": 2.9822964668273926, + "learning_rate": 4.955079559628045e-06, + "loss": 0.6983, + "step": 838 + }, + { + "epoch": 0.40873660279311463, + "grad_norm": 2.815422296524048, + "learning_rate": 4.954958108658167e-06, + "loss": 0.7145, + "step": 839 + }, + { + "epoch": 0.409223773952582, + "grad_norm": 2.7520341873168945, + "learning_rate": 4.9548364952193895e-06, + "loss": 0.688, + "step": 840 + }, + { + "epoch": 0.40971094511204936, + "grad_norm": 3.6288466453552246, + "learning_rate": 4.954714719319762e-06, + "loss": 0.6449, + "step": 841 + }, + { + "epoch": 0.4101981162715167, + "grad_norm": 2.645132541656494, + "learning_rate": 4.954592780967343e-06, + "loss": 0.6673, + "step": 842 + }, + { + "epoch": 0.4106852874309841, + "grad_norm": 3.7818257808685303, + "learning_rate": 4.954470680170202e-06, + "loss": 0.6455, + "step": 843 + }, + { + "epoch": 0.41117245859045143, + "grad_norm": 3.4410901069641113, + "learning_rate": 4.95434841693642e-06, + "loss": 0.7313, + "step": 844 + }, + { + "epoch": 0.4116596297499188, + "grad_norm": 3.298218011856079, + "learning_rate": 4.954225991274089e-06, + "loss": 0.6679, + "step": 845 + }, + { + "epoch": 0.41214680090938616, + "grad_norm": 3.038174629211426, + "learning_rate": 4.9541034031913095e-06, + "loss": 0.6399, + "step": 846 + }, + { + "epoch": 0.41263397206885355, + "grad_norm": 3.3298346996307373, + "learning_rate": 4.953980652696195e-06, + "loss": 0.6643, + "step": 847 + }, + { + "epoch": 0.4131211432283209, + "grad_norm": 5.142780780792236, + "learning_rate": 4.95385773979687e-06, + "loss": 0.6911, + "step": 848 + }, + { + "epoch": 0.4136083143877882, + "grad_norm": 2.8838491439819336, + "learning_rate": 4.953734664501468e-06, + "loss": 0.589, + "step": 849 + }, + { + "epoch": 0.4140954855472556, + "grad_norm": 3.7825253009796143, + "learning_rate": 4.953611426818136e-06, + "loss": 0.7469, + "step": 850 + }, + { + "epoch": 0.41458265670672295, + "grad_norm": 2.954834222793579, + "learning_rate": 4.953488026755026e-06, + "loss": 0.663, + "step": 851 + }, + { + "epoch": 0.41506982786619034, + "grad_norm": 2.952829360961914, + "learning_rate": 4.953364464320309e-06, + "loss": 0.7031, + "step": 852 + }, + { + "epoch": 0.4155569990256577, + "grad_norm": 2.906210422515869, + "learning_rate": 4.9532407395221585e-06, + "loss": 0.6323, + "step": 853 + }, + { + "epoch": 0.416044170185125, + "grad_norm": 3.1586973667144775, + "learning_rate": 4.953116852368765e-06, + "loss": 0.7041, + "step": 854 + }, + { + "epoch": 0.4165313413445924, + "grad_norm": 3.731808662414551, + "learning_rate": 4.952992802868326e-06, + "loss": 0.7523, + "step": 855 + }, + { + "epoch": 0.41701851250405975, + "grad_norm": 2.6852481365203857, + "learning_rate": 4.952868591029052e-06, + "loss": 0.6866, + "step": 856 + }, + { + "epoch": 0.41750568366352714, + "grad_norm": 2.6834824085235596, + "learning_rate": 4.952744216859164e-06, + "loss": 0.7187, + "step": 857 + }, + { + "epoch": 0.4179928548229945, + "grad_norm": 3.081655502319336, + "learning_rate": 4.952619680366891e-06, + "loss": 0.6922, + "step": 858 + }, + { + "epoch": 0.4184800259824618, + "grad_norm": 2.5715348720550537, + "learning_rate": 4.9524949815604765e-06, + "loss": 0.6456, + "step": 859 + }, + { + "epoch": 0.4189671971419292, + "grad_norm": 2.860729694366455, + "learning_rate": 4.952370120448172e-06, + "loss": 0.6285, + "step": 860 + }, + { + "epoch": 0.41945436830139654, + "grad_norm": 2.665663242340088, + "learning_rate": 4.952245097038241e-06, + "loss": 0.6812, + "step": 861 + }, + { + "epoch": 0.41994153946086393, + "grad_norm": 2.8201069831848145, + "learning_rate": 4.952119911338959e-06, + "loss": 0.6935, + "step": 862 + }, + { + "epoch": 0.42042871062033127, + "grad_norm": 2.7964589595794678, + "learning_rate": 4.951994563358608e-06, + "loss": 0.6718, + "step": 863 + }, + { + "epoch": 0.42091588177979866, + "grad_norm": 3.0835235118865967, + "learning_rate": 4.951869053105487e-06, + "loss": 0.7638, + "step": 864 + }, + { + "epoch": 0.421403052939266, + "grad_norm": 2.5425171852111816, + "learning_rate": 4.9517433805879e-06, + "loss": 0.7337, + "step": 865 + }, + { + "epoch": 0.42189022409873334, + "grad_norm": 3.1050055027008057, + "learning_rate": 4.951617545814164e-06, + "loss": 0.6544, + "step": 866 + }, + { + "epoch": 0.4223773952582007, + "grad_norm": 2.8168699741363525, + "learning_rate": 4.951491548792606e-06, + "loss": 0.7, + "step": 867 + }, + { + "epoch": 0.42286456641766806, + "grad_norm": 2.923502206802368, + "learning_rate": 4.951365389531567e-06, + "loss": 0.7246, + "step": 868 + }, + { + "epoch": 0.42335173757713546, + "grad_norm": 2.737367630004883, + "learning_rate": 4.951239068039395e-06, + "loss": 0.6399, + "step": 869 + }, + { + "epoch": 0.4238389087366028, + "grad_norm": 3.0989811420440674, + "learning_rate": 4.951112584324449e-06, + "loss": 0.7104, + "step": 870 + }, + { + "epoch": 0.42432607989607013, + "grad_norm": 3.168170690536499, + "learning_rate": 4.9509859383951e-06, + "loss": 0.7916, + "step": 871 + }, + { + "epoch": 0.4248132510555375, + "grad_norm": 2.9268569946289062, + "learning_rate": 4.950859130259731e-06, + "loss": 0.7538, + "step": 872 + }, + { + "epoch": 0.42530042221500486, + "grad_norm": 3.08477783203125, + "learning_rate": 4.950732159926732e-06, + "loss": 0.6837, + "step": 873 + }, + { + "epoch": 0.42578759337447225, + "grad_norm": 2.954622983932495, + "learning_rate": 4.9506050274045076e-06, + "loss": 0.6245, + "step": 874 + }, + { + "epoch": 0.4262747645339396, + "grad_norm": 2.7474029064178467, + "learning_rate": 4.95047773270147e-06, + "loss": 0.7213, + "step": 875 + }, + { + "epoch": 0.4267619356934069, + "grad_norm": 2.8774850368499756, + "learning_rate": 4.9503502758260445e-06, + "loss": 0.6819, + "step": 876 + }, + { + "epoch": 0.4272491068528743, + "grad_norm": 3.0016515254974365, + "learning_rate": 4.950222656786666e-06, + "loss": 0.6791, + "step": 877 + }, + { + "epoch": 0.42773627801234165, + "grad_norm": 2.8074467182159424, + "learning_rate": 4.950094875591781e-06, + "loss": 0.6534, + "step": 878 + }, + { + "epoch": 0.42822344917180905, + "grad_norm": 2.8037002086639404, + "learning_rate": 4.949966932249844e-06, + "loss": 0.7404, + "step": 879 + }, + { + "epoch": 0.4287106203312764, + "grad_norm": 3.387838363647461, + "learning_rate": 4.949838826769324e-06, + "loss": 0.7184, + "step": 880 + }, + { + "epoch": 0.4291977914907438, + "grad_norm": 2.8196122646331787, + "learning_rate": 4.949710559158699e-06, + "loss": 0.6807, + "step": 881 + }, + { + "epoch": 0.4296849626502111, + "grad_norm": 2.6130056381225586, + "learning_rate": 4.949582129426456e-06, + "loss": 0.7322, + "step": 882 + }, + { + "epoch": 0.43017213380967845, + "grad_norm": 2.988542318344116, + "learning_rate": 4.949453537581098e-06, + "loss": 0.7215, + "step": 883 + }, + { + "epoch": 0.43065930496914584, + "grad_norm": 2.8219540119171143, + "learning_rate": 4.949324783631132e-06, + "loss": 0.5232, + "step": 884 + }, + { + "epoch": 0.4311464761286132, + "grad_norm": 3.545703411102295, + "learning_rate": 4.949195867585079e-06, + "loss": 0.6442, + "step": 885 + }, + { + "epoch": 0.43163364728808057, + "grad_norm": 2.999408483505249, + "learning_rate": 4.949066789451472e-06, + "loss": 0.6975, + "step": 886 + }, + { + "epoch": 0.4321208184475479, + "grad_norm": 2.687673330307007, + "learning_rate": 4.948937549238854e-06, + "loss": 0.6931, + "step": 887 + }, + { + "epoch": 0.43260798960701524, + "grad_norm": 2.9466278553009033, + "learning_rate": 4.9488081469557755e-06, + "loss": 0.663, + "step": 888 + }, + { + "epoch": 0.43309516076648263, + "grad_norm": 2.952465295791626, + "learning_rate": 4.948678582610802e-06, + "loss": 0.6747, + "step": 889 + }, + { + "epoch": 0.43358233192594997, + "grad_norm": 2.7713959217071533, + "learning_rate": 4.948548856212509e-06, + "loss": 0.616, + "step": 890 + }, + { + "epoch": 0.43406950308541736, + "grad_norm": 3.278367757797241, + "learning_rate": 4.94841896776948e-06, + "loss": 0.7444, + "step": 891 + }, + { + "epoch": 0.4345566742448847, + "grad_norm": 3.6711766719818115, + "learning_rate": 4.948288917290312e-06, + "loss": 0.7989, + "step": 892 + }, + { + "epoch": 0.43504384540435204, + "grad_norm": 2.6600773334503174, + "learning_rate": 4.948158704783611e-06, + "loss": 0.6583, + "step": 893 + }, + { + "epoch": 0.43553101656381943, + "grad_norm": 2.805708646774292, + "learning_rate": 4.948028330257996e-06, + "loss": 0.7462, + "step": 894 + }, + { + "epoch": 0.43601818772328677, + "grad_norm": 2.628798723220825, + "learning_rate": 4.947897793722093e-06, + "loss": 0.6628, + "step": 895 + }, + { + "epoch": 0.43650535888275416, + "grad_norm": 2.708449602127075, + "learning_rate": 4.947767095184542e-06, + "loss": 0.6876, + "step": 896 + }, + { + "epoch": 0.4369925300422215, + "grad_norm": 3.135808229446411, + "learning_rate": 4.947636234653993e-06, + "loss": 0.7249, + "step": 897 + }, + { + "epoch": 0.4374797012016889, + "grad_norm": 3.0428903102874756, + "learning_rate": 4.947505212139105e-06, + "loss": 0.7038, + "step": 898 + }, + { + "epoch": 0.4379668723611562, + "grad_norm": 2.6677520275115967, + "learning_rate": 4.947374027648551e-06, + "loss": 0.7045, + "step": 899 + }, + { + "epoch": 0.43845404352062356, + "grad_norm": 3.1434218883514404, + "learning_rate": 4.947242681191011e-06, + "loss": 0.6653, + "step": 900 + }, + { + "epoch": 0.43894121468009095, + "grad_norm": 2.8240435123443604, + "learning_rate": 4.94711117277518e-06, + "loss": 0.7224, + "step": 901 + }, + { + "epoch": 0.4394283858395583, + "grad_norm": 3.0268328189849854, + "learning_rate": 4.946979502409758e-06, + "loss": 0.6664, + "step": 902 + }, + { + "epoch": 0.4399155569990257, + "grad_norm": 3.0497162342071533, + "learning_rate": 4.94684767010346e-06, + "loss": 0.6476, + "step": 903 + }, + { + "epoch": 0.440402728158493, + "grad_norm": 3.2347774505615234, + "learning_rate": 4.946715675865012e-06, + "loss": 0.7157, + "step": 904 + }, + { + "epoch": 0.44088989931796035, + "grad_norm": 2.932398557662964, + "learning_rate": 4.946583519703148e-06, + "loss": 0.6577, + "step": 905 + }, + { + "epoch": 0.44137707047742775, + "grad_norm": 2.6946096420288086, + "learning_rate": 4.946451201626614e-06, + "loss": 0.6672, + "step": 906 + }, + { + "epoch": 0.4418642416368951, + "grad_norm": 2.9501147270202637, + "learning_rate": 4.946318721644168e-06, + "loss": 0.7029, + "step": 907 + }, + { + "epoch": 0.4423514127963625, + "grad_norm": 2.7623543739318848, + "learning_rate": 4.946186079764577e-06, + "loss": 0.6257, + "step": 908 + }, + { + "epoch": 0.4428385839558298, + "grad_norm": 3.08900785446167, + "learning_rate": 4.946053275996618e-06, + "loss": 0.7297, + "step": 909 + }, + { + "epoch": 0.44332575511529715, + "grad_norm": 3.101914167404175, + "learning_rate": 4.9459203103490815e-06, + "loss": 0.6588, + "step": 910 + }, + { + "epoch": 0.44381292627476454, + "grad_norm": 3.1675543785095215, + "learning_rate": 4.945787182830767e-06, + "loss": 0.7202, + "step": 911 + }, + { + "epoch": 0.4443000974342319, + "grad_norm": 3.7978718280792236, + "learning_rate": 4.945653893450484e-06, + "loss": 0.7501, + "step": 912 + }, + { + "epoch": 0.44478726859369927, + "grad_norm": 3.0319149494171143, + "learning_rate": 4.9455204422170545e-06, + "loss": 0.7442, + "step": 913 + }, + { + "epoch": 0.4452744397531666, + "grad_norm": 2.892979621887207, + "learning_rate": 4.9453868291393095e-06, + "loss": 0.7184, + "step": 914 + }, + { + "epoch": 0.445761610912634, + "grad_norm": 3.2373123168945312, + "learning_rate": 4.945253054226092e-06, + "loss": 0.6927, + "step": 915 + }, + { + "epoch": 0.44624878207210134, + "grad_norm": 3.109800100326538, + "learning_rate": 4.945119117486255e-06, + "loss": 0.7486, + "step": 916 + }, + { + "epoch": 0.44673595323156867, + "grad_norm": 2.87620210647583, + "learning_rate": 4.944985018928662e-06, + "loss": 0.6827, + "step": 917 + }, + { + "epoch": 0.44722312439103606, + "grad_norm": 3.1890792846679688, + "learning_rate": 4.9448507585621895e-06, + "loss": 0.7253, + "step": 918 + }, + { + "epoch": 0.4477102955505034, + "grad_norm": 2.6164519786834717, + "learning_rate": 4.9447163363957205e-06, + "loss": 0.6599, + "step": 919 + }, + { + "epoch": 0.4481974667099708, + "grad_norm": 2.982541799545288, + "learning_rate": 4.9445817524381514e-06, + "loss": 0.7031, + "step": 920 + }, + { + "epoch": 0.44868463786943813, + "grad_norm": 2.7315728664398193, + "learning_rate": 4.944447006698391e-06, + "loss": 0.6627, + "step": 921 + }, + { + "epoch": 0.44917180902890547, + "grad_norm": 2.6117117404937744, + "learning_rate": 4.944312099185355e-06, + "loss": 0.6502, + "step": 922 + }, + { + "epoch": 0.44965898018837286, + "grad_norm": 2.6255176067352295, + "learning_rate": 4.944177029907971e-06, + "loss": 0.6418, + "step": 923 + }, + { + "epoch": 0.4501461513478402, + "grad_norm": 2.8090381622314453, + "learning_rate": 4.94404179887518e-06, + "loss": 0.618, + "step": 924 + }, + { + "epoch": 0.4506333225073076, + "grad_norm": 2.8472368717193604, + "learning_rate": 4.943906406095928e-06, + "loss": 0.7209, + "step": 925 + }, + { + "epoch": 0.4511204936667749, + "grad_norm": 2.960270881652832, + "learning_rate": 4.94377085157918e-06, + "loss": 0.7221, + "step": 926 + }, + { + "epoch": 0.45160766482624226, + "grad_norm": 3.5508499145507812, + "learning_rate": 4.943635135333904e-06, + "loss": 0.7543, + "step": 927 + }, + { + "epoch": 0.45209483598570965, + "grad_norm": 2.575913429260254, + "learning_rate": 4.943499257369082e-06, + "loss": 0.7092, + "step": 928 + }, + { + "epoch": 0.452582007145177, + "grad_norm": 2.957888126373291, + "learning_rate": 4.943363217693707e-06, + "loss": 0.7584, + "step": 929 + }, + { + "epoch": 0.4530691783046444, + "grad_norm": 2.802731513977051, + "learning_rate": 4.943227016316782e-06, + "loss": 0.6335, + "step": 930 + }, + { + "epoch": 0.4535563494641117, + "grad_norm": 2.688985824584961, + "learning_rate": 4.94309065324732e-06, + "loss": 0.6801, + "step": 931 + }, + { + "epoch": 0.4540435206235791, + "grad_norm": 2.9441144466400146, + "learning_rate": 4.942954128494346e-06, + "loss": 0.7076, + "step": 932 + }, + { + "epoch": 0.45453069178304645, + "grad_norm": 3.0513391494750977, + "learning_rate": 4.942817442066896e-06, + "loss": 0.6619, + "step": 933 + }, + { + "epoch": 0.4550178629425138, + "grad_norm": 3.0560243129730225, + "learning_rate": 4.942680593974014e-06, + "loss": 0.7047, + "step": 934 + }, + { + "epoch": 0.4555050341019812, + "grad_norm": 2.697582244873047, + "learning_rate": 4.942543584224758e-06, + "loss": 0.7517, + "step": 935 + }, + { + "epoch": 0.4559922052614485, + "grad_norm": 2.536072254180908, + "learning_rate": 4.942406412828196e-06, + "loss": 0.6865, + "step": 936 + }, + { + "epoch": 0.4564793764209159, + "grad_norm": 2.854029417037964, + "learning_rate": 4.942269079793405e-06, + "loss": 0.6908, + "step": 937 + }, + { + "epoch": 0.45696654758038324, + "grad_norm": 2.9755396842956543, + "learning_rate": 4.9421315851294726e-06, + "loss": 0.6666, + "step": 938 + }, + { + "epoch": 0.4574537187398506, + "grad_norm": 3.253432512283325, + "learning_rate": 4.9419939288455e-06, + "loss": 0.7572, + "step": 939 + }, + { + "epoch": 0.45794088989931797, + "grad_norm": 3.291907548904419, + "learning_rate": 4.941856110950597e-06, + "loss": 0.6304, + "step": 940 + }, + { + "epoch": 0.4584280610587853, + "grad_norm": 2.922811985015869, + "learning_rate": 4.941718131453883e-06, + "loss": 0.7008, + "step": 941 + }, + { + "epoch": 0.4589152322182527, + "grad_norm": 2.773911952972412, + "learning_rate": 4.941579990364491e-06, + "loss": 0.7647, + "step": 942 + }, + { + "epoch": 0.45940240337772004, + "grad_norm": 3.393615961074829, + "learning_rate": 4.9414416876915625e-06, + "loss": 0.6917, + "step": 943 + }, + { + "epoch": 0.4598895745371874, + "grad_norm": 3.2832698822021484, + "learning_rate": 4.94130322344425e-06, + "loss": 0.6842, + "step": 944 + }, + { + "epoch": 0.46037674569665477, + "grad_norm": 2.501657247543335, + "learning_rate": 4.941164597631719e-06, + "loss": 0.6242, + "step": 945 + }, + { + "epoch": 0.4608639168561221, + "grad_norm": 3.306835889816284, + "learning_rate": 4.941025810263141e-06, + "loss": 0.7247, + "step": 946 + }, + { + "epoch": 0.4613510880155895, + "grad_norm": 3.3257951736450195, + "learning_rate": 4.940886861347702e-06, + "loss": 0.8203, + "step": 947 + }, + { + "epoch": 0.46183825917505683, + "grad_norm": 2.779113531112671, + "learning_rate": 4.940747750894599e-06, + "loss": 0.7468, + "step": 948 + }, + { + "epoch": 0.4623254303345242, + "grad_norm": 2.804795742034912, + "learning_rate": 4.940608478913036e-06, + "loss": 0.7324, + "step": 949 + }, + { + "epoch": 0.46281260149399156, + "grad_norm": 3.1646060943603516, + "learning_rate": 4.940469045412232e-06, + "loss": 0.686, + "step": 950 + }, + { + "epoch": 0.4632997726534589, + "grad_norm": 2.9998586177825928, + "learning_rate": 4.940329450401413e-06, + "loss": 0.6643, + "step": 951 + }, + { + "epoch": 0.4637869438129263, + "grad_norm": 2.944629669189453, + "learning_rate": 4.940189693889819e-06, + "loss": 0.6868, + "step": 952 + }, + { + "epoch": 0.4642741149723936, + "grad_norm": 3.3283119201660156, + "learning_rate": 4.940049775886697e-06, + "loss": 0.697, + "step": 953 + }, + { + "epoch": 0.464761286131861, + "grad_norm": 3.062570571899414, + "learning_rate": 4.939909696401309e-06, + "loss": 0.7213, + "step": 954 + }, + { + "epoch": 0.46524845729132835, + "grad_norm": 3.165452241897583, + "learning_rate": 4.939769455442924e-06, + "loss": 0.6736, + "step": 955 + }, + { + "epoch": 0.4657356284507957, + "grad_norm": 3.0059127807617188, + "learning_rate": 4.939629053020825e-06, + "loss": 0.7177, + "step": 956 + }, + { + "epoch": 0.4662227996102631, + "grad_norm": 3.3514556884765625, + "learning_rate": 4.9394884891443005e-06, + "loss": 0.7968, + "step": 957 + }, + { + "epoch": 0.4667099707697304, + "grad_norm": 3.0937087535858154, + "learning_rate": 4.939347763822656e-06, + "loss": 0.6761, + "step": 958 + }, + { + "epoch": 0.4671971419291978, + "grad_norm": 3.0020763874053955, + "learning_rate": 4.939206877065203e-06, + "loss": 0.7475, + "step": 959 + }, + { + "epoch": 0.46768431308866515, + "grad_norm": 2.5966131687164307, + "learning_rate": 4.9390658288812675e-06, + "loss": 0.5923, + "step": 960 + }, + { + "epoch": 0.4681714842481325, + "grad_norm": 2.425727605819702, + "learning_rate": 4.9389246192801814e-06, + "loss": 0.6094, + "step": 961 + }, + { + "epoch": 0.4686586554075999, + "grad_norm": 3.1866202354431152, + "learning_rate": 4.938783248271291e-06, + "loss": 0.6996, + "step": 962 + }, + { + "epoch": 0.4691458265670672, + "grad_norm": 2.9760167598724365, + "learning_rate": 4.9386417158639535e-06, + "loss": 0.6664, + "step": 963 + }, + { + "epoch": 0.4696329977265346, + "grad_norm": 3.217965602874756, + "learning_rate": 4.938500022067534e-06, + "loss": 0.732, + "step": 964 + }, + { + "epoch": 0.47012016888600194, + "grad_norm": 2.8332862854003906, + "learning_rate": 4.938358166891409e-06, + "loss": 0.6149, + "step": 965 + }, + { + "epoch": 0.47060734004546934, + "grad_norm": 3.0543148517608643, + "learning_rate": 4.938216150344968e-06, + "loss": 0.6888, + "step": 966 + }, + { + "epoch": 0.4710945112049367, + "grad_norm": 3.252396821975708, + "learning_rate": 4.93807397243761e-06, + "loss": 0.7611, + "step": 967 + }, + { + "epoch": 0.471581682364404, + "grad_norm": 2.8159399032592773, + "learning_rate": 4.937931633178742e-06, + "loss": 0.6655, + "step": 968 + }, + { + "epoch": 0.4720688535238714, + "grad_norm": 2.9493396282196045, + "learning_rate": 4.937789132577787e-06, + "loss": 0.6851, + "step": 969 + }, + { + "epoch": 0.47255602468333874, + "grad_norm": 2.828371524810791, + "learning_rate": 4.937646470644174e-06, + "loss": 0.6998, + "step": 970 + }, + { + "epoch": 0.47304319584280613, + "grad_norm": 3.126873254776001, + "learning_rate": 4.937503647387344e-06, + "loss": 0.6169, + "step": 971 + }, + { + "epoch": 0.47353036700227347, + "grad_norm": 3.2905385494232178, + "learning_rate": 4.93736066281675e-06, + "loss": 0.6991, + "step": 972 + }, + { + "epoch": 0.4740175381617408, + "grad_norm": 3.146986961364746, + "learning_rate": 4.9372175169418536e-06, + "loss": 0.7052, + "step": 973 + }, + { + "epoch": 0.4745047093212082, + "grad_norm": 2.824061393737793, + "learning_rate": 4.9370742097721295e-06, + "loss": 0.6619, + "step": 974 + }, + { + "epoch": 0.47499188048067553, + "grad_norm": 3.0653462409973145, + "learning_rate": 4.936930741317061e-06, + "loss": 0.6818, + "step": 975 + }, + { + "epoch": 0.4754790516401429, + "grad_norm": 3.33349871635437, + "learning_rate": 4.936787111586143e-06, + "loss": 0.7238, + "step": 976 + }, + { + "epoch": 0.47596622279961026, + "grad_norm": 3.011549472808838, + "learning_rate": 4.936643320588882e-06, + "loss": 0.6351, + "step": 977 + }, + { + "epoch": 0.4764533939590776, + "grad_norm": 3.2372446060180664, + "learning_rate": 4.936499368334791e-06, + "loss": 0.6897, + "step": 978 + }, + { + "epoch": 0.476940565118545, + "grad_norm": 3.180311918258667, + "learning_rate": 4.9363552548333995e-06, + "loss": 0.7481, + "step": 979 + }, + { + "epoch": 0.4774277362780123, + "grad_norm": 3.221527099609375, + "learning_rate": 4.936210980094244e-06, + "loss": 0.6957, + "step": 980 + }, + { + "epoch": 0.4779149074374797, + "grad_norm": 3.2644543647766113, + "learning_rate": 4.936066544126873e-06, + "loss": 0.712, + "step": 981 + }, + { + "epoch": 0.47840207859694706, + "grad_norm": 2.966308832168579, + "learning_rate": 4.935921946940845e-06, + "loss": 0.7279, + "step": 982 + }, + { + "epoch": 0.47888924975641445, + "grad_norm": 2.897141695022583, + "learning_rate": 4.93577718854573e-06, + "loss": 0.7406, + "step": 983 + }, + { + "epoch": 0.4793764209158818, + "grad_norm": 2.974050998687744, + "learning_rate": 4.935632268951106e-06, + "loss": 0.7374, + "step": 984 + }, + { + "epoch": 0.4798635920753491, + "grad_norm": 2.94781231880188, + "learning_rate": 4.935487188166566e-06, + "loss": 0.6327, + "step": 985 + }, + { + "epoch": 0.4803507632348165, + "grad_norm": 2.9274091720581055, + "learning_rate": 4.93534194620171e-06, + "loss": 0.6717, + "step": 986 + }, + { + "epoch": 0.48083793439428385, + "grad_norm": 3.1931262016296387, + "learning_rate": 4.935196543066152e-06, + "loss": 0.707, + "step": 987 + }, + { + "epoch": 0.48132510555375124, + "grad_norm": 3.525869607925415, + "learning_rate": 4.935050978769513e-06, + "loss": 0.7754, + "step": 988 + }, + { + "epoch": 0.4818122767132186, + "grad_norm": 3.5500378608703613, + "learning_rate": 4.934905253321427e-06, + "loss": 0.7619, + "step": 989 + }, + { + "epoch": 0.4822994478726859, + "grad_norm": 2.7735486030578613, + "learning_rate": 4.934759366731537e-06, + "loss": 0.6365, + "step": 990 + }, + { + "epoch": 0.4827866190321533, + "grad_norm": 2.705486297607422, + "learning_rate": 4.9346133190095e-06, + "loss": 0.6484, + "step": 991 + }, + { + "epoch": 0.48327379019162064, + "grad_norm": 3.1812021732330322, + "learning_rate": 4.93446711016498e-06, + "loss": 0.6874, + "step": 992 + }, + { + "epoch": 0.48376096135108804, + "grad_norm": 2.729980707168579, + "learning_rate": 4.934320740207654e-06, + "loss": 0.6454, + "step": 993 + }, + { + "epoch": 0.4842481325105554, + "grad_norm": 3.212476968765259, + "learning_rate": 4.934174209147208e-06, + "loss": 0.7398, + "step": 994 + }, + { + "epoch": 0.4847353036700227, + "grad_norm": 2.528515577316284, + "learning_rate": 4.934027516993338e-06, + "loss": 0.5999, + "step": 995 + }, + { + "epoch": 0.4852224748294901, + "grad_norm": 2.582738161087036, + "learning_rate": 4.933880663755755e-06, + "loss": 0.5737, + "step": 996 + }, + { + "epoch": 0.48570964598895744, + "grad_norm": 2.869213819503784, + "learning_rate": 4.933733649444176e-06, + "loss": 0.6325, + "step": 997 + }, + { + "epoch": 0.48619681714842483, + "grad_norm": 2.399386405944824, + "learning_rate": 4.9335864740683305e-06, + "loss": 0.641, + "step": 998 + }, + { + "epoch": 0.48668398830789217, + "grad_norm": 2.9301421642303467, + "learning_rate": 4.933439137637959e-06, + "loss": 0.7068, + "step": 999 + }, + { + "epoch": 0.4871711594673595, + "grad_norm": 2.996840476989746, + "learning_rate": 4.933291640162812e-06, + "loss": 0.7297, + "step": 1000 + }, + { + "epoch": 0.4876583306268269, + "grad_norm": 2.8244705200195312, + "learning_rate": 4.93314398165265e-06, + "loss": 0.7028, + "step": 1001 + }, + { + "epoch": 0.48814550178629423, + "grad_norm": 2.72131609916687, + "learning_rate": 4.9329961621172474e-06, + "loss": 0.6134, + "step": 1002 + }, + { + "epoch": 0.4886326729457616, + "grad_norm": 3.087415933609009, + "learning_rate": 4.9328481815663845e-06, + "loss": 0.6481, + "step": 1003 + }, + { + "epoch": 0.48911984410522896, + "grad_norm": 3.3591127395629883, + "learning_rate": 4.932700040009856e-06, + "loss": 0.6274, + "step": 1004 + }, + { + "epoch": 0.48960701526469635, + "grad_norm": 2.6973142623901367, + "learning_rate": 4.9325517374574644e-06, + "loss": 0.6986, + "step": 1005 + }, + { + "epoch": 0.4900941864241637, + "grad_norm": 3.2651560306549072, + "learning_rate": 4.9324032739190256e-06, + "loss": 0.7605, + "step": 1006 + }, + { + "epoch": 0.49058135758363103, + "grad_norm": 2.922163486480713, + "learning_rate": 4.932254649404365e-06, + "loss": 0.6917, + "step": 1007 + }, + { + "epoch": 0.4910685287430984, + "grad_norm": 3.0268335342407227, + "learning_rate": 4.932105863923318e-06, + "loss": 0.7023, + "step": 1008 + }, + { + "epoch": 0.49155569990256576, + "grad_norm": 3.0823400020599365, + "learning_rate": 4.931956917485732e-06, + "loss": 0.6754, + "step": 1009 + }, + { + "epoch": 0.49204287106203315, + "grad_norm": 2.492593765258789, + "learning_rate": 4.931807810101463e-06, + "loss": 0.6416, + "step": 1010 + }, + { + "epoch": 0.4925300422215005, + "grad_norm": 2.693918466567993, + "learning_rate": 4.93165854178038e-06, + "loss": 0.6403, + "step": 1011 + }, + { + "epoch": 0.4930172133809678, + "grad_norm": 2.894146680831909, + "learning_rate": 4.93150911253236e-06, + "loss": 0.6601, + "step": 1012 + }, + { + "epoch": 0.4935043845404352, + "grad_norm": 2.917987108230591, + "learning_rate": 4.931359522367295e-06, + "loss": 0.7313, + "step": 1013 + }, + { + "epoch": 0.49399155569990255, + "grad_norm": 3.3123602867126465, + "learning_rate": 4.931209771295082e-06, + "loss": 0.6073, + "step": 1014 + }, + { + "epoch": 0.49447872685936994, + "grad_norm": 3.6420321464538574, + "learning_rate": 4.931059859325633e-06, + "loss": 0.7023, + "step": 1015 + }, + { + "epoch": 0.4949658980188373, + "grad_norm": 2.974235773086548, + "learning_rate": 4.930909786468869e-06, + "loss": 0.6785, + "step": 1016 + }, + { + "epoch": 0.4954530691783046, + "grad_norm": 2.8365585803985596, + "learning_rate": 4.9307595527347216e-06, + "loss": 0.6625, + "step": 1017 + }, + { + "epoch": 0.495940240337772, + "grad_norm": 2.7830541133880615, + "learning_rate": 4.9306091581331335e-06, + "loss": 0.6662, + "step": 1018 + }, + { + "epoch": 0.49642741149723935, + "grad_norm": 3.395017147064209, + "learning_rate": 4.930458602674058e-06, + "loss": 0.7419, + "step": 1019 + }, + { + "epoch": 0.49691458265670674, + "grad_norm": 3.3760900497436523, + "learning_rate": 4.930307886367458e-06, + "loss": 0.7435, + "step": 1020 + }, + { + "epoch": 0.4974017538161741, + "grad_norm": 3.1736550331115723, + "learning_rate": 4.9301570092233084e-06, + "loss": 0.6779, + "step": 1021 + }, + { + "epoch": 0.49788892497564147, + "grad_norm": 3.2735767364501953, + "learning_rate": 4.930005971251595e-06, + "loss": 0.7245, + "step": 1022 + }, + { + "epoch": 0.4983760961351088, + "grad_norm": 2.494114637374878, + "learning_rate": 4.929854772462312e-06, + "loss": 0.6266, + "step": 1023 + }, + { + "epoch": 0.49886326729457614, + "grad_norm": 2.925532579421997, + "learning_rate": 4.929703412865467e-06, + "loss": 0.6973, + "step": 1024 + }, + { + "epoch": 0.49935043845404353, + "grad_norm": 2.9729201793670654, + "learning_rate": 4.9295518924710754e-06, + "loss": 0.6256, + "step": 1025 + }, + { + "epoch": 0.49983760961351087, + "grad_norm": 3.051567554473877, + "learning_rate": 4.929400211289166e-06, + "loss": 0.6709, + "step": 1026 + }, + { + "epoch": 0.5003247807729783, + "grad_norm": 2.6071934700012207, + "learning_rate": 4.929248369329778e-06, + "loss": 0.6401, + "step": 1027 + }, + { + "epoch": 0.5008119519324457, + "grad_norm": 2.878251314163208, + "learning_rate": 4.9290963666029576e-06, + "loss": 0.7068, + "step": 1028 + }, + { + "epoch": 0.5012991230919129, + "grad_norm": 2.857102632522583, + "learning_rate": 4.928944203118767e-06, + "loss": 0.6272, + "step": 1029 + }, + { + "epoch": 0.5017862942513803, + "grad_norm": 2.9291653633117676, + "learning_rate": 4.928791878887274e-06, + "loss": 0.6831, + "step": 1030 + }, + { + "epoch": 0.5022734654108477, + "grad_norm": 3.4554789066314697, + "learning_rate": 4.928639393918562e-06, + "loss": 0.7127, + "step": 1031 + }, + { + "epoch": 0.502760636570315, + "grad_norm": 2.492072105407715, + "learning_rate": 4.92848674822272e-06, + "loss": 0.5293, + "step": 1032 + }, + { + "epoch": 0.5032478077297824, + "grad_norm": 3.074232816696167, + "learning_rate": 4.928333941809852e-06, + "loss": 0.7161, + "step": 1033 + }, + { + "epoch": 0.5037349788892498, + "grad_norm": 2.978029489517212, + "learning_rate": 4.928180974690069e-06, + "loss": 0.6764, + "step": 1034 + }, + { + "epoch": 0.5042221500487171, + "grad_norm": 3.1064999103546143, + "learning_rate": 4.928027846873496e-06, + "loss": 0.6952, + "step": 1035 + }, + { + "epoch": 0.5047093212081845, + "grad_norm": 2.9264984130859375, + "learning_rate": 4.927874558370266e-06, + "loss": 0.746, + "step": 1036 + }, + { + "epoch": 0.5051964923676519, + "grad_norm": 2.9964468479156494, + "learning_rate": 4.927721109190523e-06, + "loss": 0.6832, + "step": 1037 + }, + { + "epoch": 0.5056836635271192, + "grad_norm": 2.9041526317596436, + "learning_rate": 4.927567499344424e-06, + "loss": 0.7873, + "step": 1038 + }, + { + "epoch": 0.5061708346865865, + "grad_norm": 3.0163636207580566, + "learning_rate": 4.927413728842134e-06, + "loss": 0.7452, + "step": 1039 + }, + { + "epoch": 0.5066580058460539, + "grad_norm": 2.54750657081604, + "learning_rate": 4.927259797693828e-06, + "loss": 0.6769, + "step": 1040 + }, + { + "epoch": 0.5071451770055213, + "grad_norm": 3.108290195465088, + "learning_rate": 4.927105705909696e-06, + "loss": 0.5959, + "step": 1041 + }, + { + "epoch": 0.5076323481649886, + "grad_norm": 2.769896984100342, + "learning_rate": 4.9269514534999345e-06, + "loss": 0.7413, + "step": 1042 + }, + { + "epoch": 0.508119519324456, + "grad_norm": 2.90826153755188, + "learning_rate": 4.926797040474751e-06, + "loss": 0.7022, + "step": 1043 + }, + { + "epoch": 0.5086066904839234, + "grad_norm": 4.183291435241699, + "learning_rate": 4.9266424668443666e-06, + "loss": 0.6732, + "step": 1044 + }, + { + "epoch": 0.5090938616433908, + "grad_norm": 2.9835453033447266, + "learning_rate": 4.9264877326190084e-06, + "loss": 0.6845, + "step": 1045 + }, + { + "epoch": 0.509581032802858, + "grad_norm": 2.349344253540039, + "learning_rate": 4.926332837808918e-06, + "loss": 0.6804, + "step": 1046 + }, + { + "epoch": 0.5100682039623254, + "grad_norm": 2.845362663269043, + "learning_rate": 4.926177782424347e-06, + "loss": 0.7489, + "step": 1047 + }, + { + "epoch": 0.5105553751217928, + "grad_norm": 2.8332839012145996, + "learning_rate": 4.926022566475557e-06, + "loss": 0.7672, + "step": 1048 + }, + { + "epoch": 0.5110425462812601, + "grad_norm": 2.9139339923858643, + "learning_rate": 4.925867189972818e-06, + "loss": 0.7154, + "step": 1049 + }, + { + "epoch": 0.5115297174407275, + "grad_norm": 2.6853044033050537, + "learning_rate": 4.925711652926415e-06, + "loss": 0.6985, + "step": 1050 + }, + { + "epoch": 0.5120168886001949, + "grad_norm": 3.023705244064331, + "learning_rate": 4.92555595534664e-06, + "loss": 0.7003, + "step": 1051 + }, + { + "epoch": 0.5125040597596622, + "grad_norm": 3.1321451663970947, + "learning_rate": 4.925400097243799e-06, + "loss": 0.6148, + "step": 1052 + }, + { + "epoch": 0.5129912309191296, + "grad_norm": 2.731886148452759, + "learning_rate": 4.925244078628204e-06, + "loss": 0.607, + "step": 1053 + }, + { + "epoch": 0.513478402078597, + "grad_norm": 2.798536539077759, + "learning_rate": 4.925087899510183e-06, + "loss": 0.6934, + "step": 1054 + }, + { + "epoch": 0.5139655732380644, + "grad_norm": 2.9063515663146973, + "learning_rate": 4.924931559900071e-06, + "loss": 0.6468, + "step": 1055 + }, + { + "epoch": 0.5144527443975316, + "grad_norm": 2.7820327281951904, + "learning_rate": 4.924775059808212e-06, + "loss": 0.6389, + "step": 1056 + }, + { + "epoch": 0.514939915556999, + "grad_norm": 2.9730870723724365, + "learning_rate": 4.924618399244967e-06, + "loss": 0.6678, + "step": 1057 + }, + { + "epoch": 0.5154270867164664, + "grad_norm": 3.482926368713379, + "learning_rate": 4.924461578220701e-06, + "loss": 0.6826, + "step": 1058 + }, + { + "epoch": 0.5159142578759337, + "grad_norm": 2.5701849460601807, + "learning_rate": 4.9243045967457945e-06, + "loss": 0.6788, + "step": 1059 + }, + { + "epoch": 0.5164014290354011, + "grad_norm": 3.4126956462860107, + "learning_rate": 4.924147454830636e-06, + "loss": 0.7409, + "step": 1060 + }, + { + "epoch": 0.5168886001948685, + "grad_norm": 2.652310371398926, + "learning_rate": 4.923990152485624e-06, + "loss": 0.5661, + "step": 1061 + }, + { + "epoch": 0.5173757713543359, + "grad_norm": 3.178335428237915, + "learning_rate": 4.923832689721169e-06, + "loss": 0.7043, + "step": 1062 + }, + { + "epoch": 0.5178629425138032, + "grad_norm": 3.1010327339172363, + "learning_rate": 4.923675066547692e-06, + "loss": 0.7566, + "step": 1063 + }, + { + "epoch": 0.5183501136732706, + "grad_norm": 2.9874470233917236, + "learning_rate": 4.923517282975626e-06, + "loss": 0.544, + "step": 1064 + }, + { + "epoch": 0.5188372848327379, + "grad_norm": 2.8462681770324707, + "learning_rate": 4.923359339015411e-06, + "loss": 0.6708, + "step": 1065 + }, + { + "epoch": 0.5193244559922052, + "grad_norm": 2.654353618621826, + "learning_rate": 4.923201234677501e-06, + "loss": 0.659, + "step": 1066 + }, + { + "epoch": 0.5198116271516726, + "grad_norm": 2.8220603466033936, + "learning_rate": 4.9230429699723596e-06, + "loss": 0.6751, + "step": 1067 + }, + { + "epoch": 0.52029879831114, + "grad_norm": 2.556986093521118, + "learning_rate": 4.922884544910459e-06, + "loss": 0.6241, + "step": 1068 + }, + { + "epoch": 0.5207859694706073, + "grad_norm": 3.0685765743255615, + "learning_rate": 4.922725959502285e-06, + "loss": 0.6848, + "step": 1069 + }, + { + "epoch": 0.5212731406300747, + "grad_norm": 2.9253416061401367, + "learning_rate": 4.922567213758332e-06, + "loss": 0.7845, + "step": 1070 + }, + { + "epoch": 0.5217603117895421, + "grad_norm": 2.623793601989746, + "learning_rate": 4.922408307689108e-06, + "loss": 0.5898, + "step": 1071 + }, + { + "epoch": 0.5222474829490095, + "grad_norm": 2.6791207790374756, + "learning_rate": 4.922249241305127e-06, + "loss": 0.5813, + "step": 1072 + }, + { + "epoch": 0.5227346541084767, + "grad_norm": 2.910895824432373, + "learning_rate": 4.922090014616916e-06, + "loss": 0.7118, + "step": 1073 + }, + { + "epoch": 0.5232218252679441, + "grad_norm": 2.8293228149414062, + "learning_rate": 4.921930627635014e-06, + "loss": 0.7269, + "step": 1074 + }, + { + "epoch": 0.5237089964274115, + "grad_norm": 2.768953561782837, + "learning_rate": 4.921771080369969e-06, + "loss": 0.6629, + "step": 1075 + }, + { + "epoch": 0.5241961675868788, + "grad_norm": 2.9247262477874756, + "learning_rate": 4.921611372832339e-06, + "loss": 0.7129, + "step": 1076 + }, + { + "epoch": 0.5246833387463462, + "grad_norm": 2.9735381603240967, + "learning_rate": 4.921451505032695e-06, + "loss": 0.7033, + "step": 1077 + }, + { + "epoch": 0.5251705099058136, + "grad_norm": 2.9002068042755127, + "learning_rate": 4.921291476981614e-06, + "loss": 0.725, + "step": 1078 + }, + { + "epoch": 0.525657681065281, + "grad_norm": 3.007343053817749, + "learning_rate": 4.921131288689689e-06, + "loss": 0.6739, + "step": 1079 + }, + { + "epoch": 0.5261448522247483, + "grad_norm": 2.526569128036499, + "learning_rate": 4.920970940167522e-06, + "loss": 0.6237, + "step": 1080 + }, + { + "epoch": 0.5266320233842157, + "grad_norm": 2.7773635387420654, + "learning_rate": 4.920810431425722e-06, + "loss": 0.7165, + "step": 1081 + }, + { + "epoch": 0.5271191945436831, + "grad_norm": 3.171161651611328, + "learning_rate": 4.920649762474914e-06, + "loss": 0.6578, + "step": 1082 + }, + { + "epoch": 0.5276063657031503, + "grad_norm": 3.0068509578704834, + "learning_rate": 4.92048893332573e-06, + "loss": 0.69, + "step": 1083 + }, + { + "epoch": 0.5280935368626177, + "grad_norm": 3.0939905643463135, + "learning_rate": 4.920327943988814e-06, + "loss": 0.641, + "step": 1084 + }, + { + "epoch": 0.5285807080220851, + "grad_norm": 3.069173812866211, + "learning_rate": 4.92016679447482e-06, + "loss": 0.6925, + "step": 1085 + }, + { + "epoch": 0.5290678791815524, + "grad_norm": 2.7703135013580322, + "learning_rate": 4.920005484794412e-06, + "loss": 0.6513, + "step": 1086 + }, + { + "epoch": 0.5295550503410198, + "grad_norm": 2.7235524654388428, + "learning_rate": 4.9198440149582675e-06, + "loss": 0.5775, + "step": 1087 + }, + { + "epoch": 0.5300422215004872, + "grad_norm": 2.753606081008911, + "learning_rate": 4.919682384977071e-06, + "loss": 0.7193, + "step": 1088 + }, + { + "epoch": 0.5305293926599546, + "grad_norm": 2.68229341506958, + "learning_rate": 4.91952059486152e-06, + "loss": 0.622, + "step": 1089 + }, + { + "epoch": 0.5310165638194219, + "grad_norm": 2.7047665119171143, + "learning_rate": 4.91935864462232e-06, + "loss": 0.724, + "step": 1090 + }, + { + "epoch": 0.5315037349788893, + "grad_norm": 2.9090986251831055, + "learning_rate": 4.919196534270192e-06, + "loss": 0.6114, + "step": 1091 + }, + { + "epoch": 0.5319909061383566, + "grad_norm": 2.93923020362854, + "learning_rate": 4.919034263815861e-06, + "loss": 0.7204, + "step": 1092 + }, + { + "epoch": 0.5324780772978239, + "grad_norm": 2.540090560913086, + "learning_rate": 4.918871833270069e-06, + "loss": 0.7168, + "step": 1093 + }, + { + "epoch": 0.5329652484572913, + "grad_norm": 3.103158473968506, + "learning_rate": 4.918709242643563e-06, + "loss": 0.6648, + "step": 1094 + }, + { + "epoch": 0.5334524196167587, + "grad_norm": 2.625450372695923, + "learning_rate": 4.918546491947106e-06, + "loss": 0.6526, + "step": 1095 + }, + { + "epoch": 0.5339395907762261, + "grad_norm": 2.599675178527832, + "learning_rate": 4.9183835811914665e-06, + "loss": 0.5989, + "step": 1096 + }, + { + "epoch": 0.5344267619356934, + "grad_norm": 2.7103993892669678, + "learning_rate": 4.918220510387427e-06, + "loss": 0.602, + "step": 1097 + }, + { + "epoch": 0.5349139330951608, + "grad_norm": 2.724177598953247, + "learning_rate": 4.918057279545779e-06, + "loss": 0.6101, + "step": 1098 + }, + { + "epoch": 0.5354011042546282, + "grad_norm": 2.8167989253997803, + "learning_rate": 4.9178938886773255e-06, + "loss": 0.6831, + "step": 1099 + }, + { + "epoch": 0.5358882754140954, + "grad_norm": 2.9069645404815674, + "learning_rate": 4.91773033779288e-06, + "loss": 0.6204, + "step": 1100 + }, + { + "epoch": 0.5363754465735628, + "grad_norm": 2.8461880683898926, + "learning_rate": 4.917566626903267e-06, + "loss": 0.6078, + "step": 1101 + }, + { + "epoch": 0.5368626177330302, + "grad_norm": 2.848750591278076, + "learning_rate": 4.917402756019317e-06, + "loss": 0.7737, + "step": 1102 + }, + { + "epoch": 0.5373497888924975, + "grad_norm": 2.570277214050293, + "learning_rate": 4.91723872515188e-06, + "loss": 0.665, + "step": 1103 + }, + { + "epoch": 0.5378369600519649, + "grad_norm": 2.6272923946380615, + "learning_rate": 4.917074534311807e-06, + "loss": 0.6938, + "step": 1104 + }, + { + "epoch": 0.5383241312114323, + "grad_norm": 2.617082118988037, + "learning_rate": 4.9169101835099685e-06, + "loss": 0.5761, + "step": 1105 + }, + { + "epoch": 0.5388113023708997, + "grad_norm": 2.9861764907836914, + "learning_rate": 4.9167456727572385e-06, + "loss": 0.728, + "step": 1106 + }, + { + "epoch": 0.539298473530367, + "grad_norm": 2.960075616836548, + "learning_rate": 4.916581002064505e-06, + "loss": 0.7089, + "step": 1107 + }, + { + "epoch": 0.5397856446898344, + "grad_norm": 2.7446067333221436, + "learning_rate": 4.916416171442666e-06, + "loss": 0.6919, + "step": 1108 + }, + { + "epoch": 0.5402728158493018, + "grad_norm": 2.8137757778167725, + "learning_rate": 4.916251180902629e-06, + "loss": 0.6554, + "step": 1109 + }, + { + "epoch": 0.540759987008769, + "grad_norm": 3.1151537895202637, + "learning_rate": 4.916086030455315e-06, + "loss": 0.5952, + "step": 1110 + }, + { + "epoch": 0.5412471581682364, + "grad_norm": 2.4293153285980225, + "learning_rate": 4.915920720111651e-06, + "loss": 0.7004, + "step": 1111 + }, + { + "epoch": 0.5417343293277038, + "grad_norm": 3.3177921772003174, + "learning_rate": 4.91575524988258e-06, + "loss": 0.7057, + "step": 1112 + }, + { + "epoch": 0.5422215004871711, + "grad_norm": 3.028184652328491, + "learning_rate": 4.915589619779051e-06, + "loss": 0.7402, + "step": 1113 + }, + { + "epoch": 0.5427086716466385, + "grad_norm": 3.1493051052093506, + "learning_rate": 4.915423829812025e-06, + "loss": 0.6668, + "step": 1114 + }, + { + "epoch": 0.5431958428061059, + "grad_norm": 2.519679546356201, + "learning_rate": 4.9152578799924765e-06, + "loss": 0.6567, + "step": 1115 + }, + { + "epoch": 0.5436830139655733, + "grad_norm": 2.7507076263427734, + "learning_rate": 4.915091770331386e-06, + "loss": 0.7261, + "step": 1116 + }, + { + "epoch": 0.5441701851250406, + "grad_norm": 2.913113832473755, + "learning_rate": 4.914925500839746e-06, + "loss": 0.7259, + "step": 1117 + }, + { + "epoch": 0.544657356284508, + "grad_norm": 2.5625803470611572, + "learning_rate": 4.914759071528562e-06, + "loss": 0.632, + "step": 1118 + }, + { + "epoch": 0.5451445274439753, + "grad_norm": 2.6524088382720947, + "learning_rate": 4.9145924824088475e-06, + "loss": 0.6244, + "step": 1119 + }, + { + "epoch": 0.5456316986034426, + "grad_norm": 3.0280094146728516, + "learning_rate": 4.914425733491628e-06, + "loss": 0.6715, + "step": 1120 + }, + { + "epoch": 0.54611886976291, + "grad_norm": 2.9596850872039795, + "learning_rate": 4.914258824787937e-06, + "loss": 0.7245, + "step": 1121 + }, + { + "epoch": 0.5466060409223774, + "grad_norm": 2.737692356109619, + "learning_rate": 4.9140917563088225e-06, + "loss": 0.6386, + "step": 1122 + }, + { + "epoch": 0.5470932120818448, + "grad_norm": 2.968540668487549, + "learning_rate": 4.913924528065341e-06, + "loss": 0.7709, + "step": 1123 + }, + { + "epoch": 0.5475803832413121, + "grad_norm": 2.5927393436431885, + "learning_rate": 4.913757140068558e-06, + "loss": 0.6558, + "step": 1124 + }, + { + "epoch": 0.5480675544007795, + "grad_norm": 2.783342123031616, + "learning_rate": 4.913589592329552e-06, + "loss": 0.619, + "step": 1125 + }, + { + "epoch": 0.5485547255602469, + "grad_norm": 2.801626443862915, + "learning_rate": 4.913421884859412e-06, + "loss": 0.6882, + "step": 1126 + }, + { + "epoch": 0.5490418967197142, + "grad_norm": 2.8141798973083496, + "learning_rate": 4.913254017669237e-06, + "loss": 0.7948, + "step": 1127 + }, + { + "epoch": 0.5495290678791815, + "grad_norm": 2.6959855556488037, + "learning_rate": 4.913085990770135e-06, + "loss": 0.6769, + "step": 1128 + }, + { + "epoch": 0.5500162390386489, + "grad_norm": 2.588724374771118, + "learning_rate": 4.912917804173227e-06, + "loss": 0.6288, + "step": 1129 + }, + { + "epoch": 0.5505034101981162, + "grad_norm": 2.596820116043091, + "learning_rate": 4.912749457889644e-06, + "loss": 0.615, + "step": 1130 + }, + { + "epoch": 0.5509905813575836, + "grad_norm": 2.624694347381592, + "learning_rate": 4.912580951930526e-06, + "loss": 0.594, + "step": 1131 + }, + { + "epoch": 0.551477752517051, + "grad_norm": 2.635634183883667, + "learning_rate": 4.912412286307025e-06, + "loss": 0.6373, + "step": 1132 + }, + { + "epoch": 0.5519649236765184, + "grad_norm": 2.5630428791046143, + "learning_rate": 4.912243461030304e-06, + "loss": 0.6561, + "step": 1133 + }, + { + "epoch": 0.5524520948359857, + "grad_norm": 2.624570608139038, + "learning_rate": 4.912074476111536e-06, + "loss": 0.683, + "step": 1134 + }, + { + "epoch": 0.5529392659954531, + "grad_norm": 3.3587305545806885, + "learning_rate": 4.9119053315619025e-06, + "loss": 0.9111, + "step": 1135 + }, + { + "epoch": 0.5534264371549205, + "grad_norm": 2.8497469425201416, + "learning_rate": 4.911736027392599e-06, + "loss": 0.6461, + "step": 1136 + }, + { + "epoch": 0.5539136083143877, + "grad_norm": 2.6497693061828613, + "learning_rate": 4.91156656361483e-06, + "loss": 0.6296, + "step": 1137 + }, + { + "epoch": 0.5544007794738551, + "grad_norm": 3.3392043113708496, + "learning_rate": 4.911396940239811e-06, + "loss": 0.6389, + "step": 1138 + }, + { + "epoch": 0.5548879506333225, + "grad_norm": 3.125595808029175, + "learning_rate": 4.911227157278766e-06, + "loss": 0.7011, + "step": 1139 + }, + { + "epoch": 0.5553751217927899, + "grad_norm": 2.958301305770874, + "learning_rate": 4.911057214742934e-06, + "loss": 0.659, + "step": 1140 + }, + { + "epoch": 0.5558622929522572, + "grad_norm": 2.6320693492889404, + "learning_rate": 4.910887112643558e-06, + "loss": 0.6394, + "step": 1141 + }, + { + "epoch": 0.5563494641117246, + "grad_norm": 3.3005614280700684, + "learning_rate": 4.910716850991898e-06, + "loss": 0.7108, + "step": 1142 + }, + { + "epoch": 0.556836635271192, + "grad_norm": 2.7681660652160645, + "learning_rate": 4.910546429799223e-06, + "loss": 0.6095, + "step": 1143 + }, + { + "epoch": 0.5573238064306593, + "grad_norm": 2.9881365299224854, + "learning_rate": 4.910375849076807e-06, + "loss": 0.5872, + "step": 1144 + }, + { + "epoch": 0.5578109775901267, + "grad_norm": 3.0350120067596436, + "learning_rate": 4.910205108835943e-06, + "loss": 0.6357, + "step": 1145 + }, + { + "epoch": 0.558298148749594, + "grad_norm": 2.900965690612793, + "learning_rate": 4.910034209087929e-06, + "loss": 0.7526, + "step": 1146 + }, + { + "epoch": 0.5587853199090613, + "grad_norm": 3.087676525115967, + "learning_rate": 4.909863149844076e-06, + "loss": 0.747, + "step": 1147 + }, + { + "epoch": 0.5592724910685287, + "grad_norm": 3.310762882232666, + "learning_rate": 4.909691931115704e-06, + "loss": 0.7123, + "step": 1148 + }, + { + "epoch": 0.5597596622279961, + "grad_norm": 3.0361690521240234, + "learning_rate": 4.909520552914144e-06, + "loss": 0.6892, + "step": 1149 + }, + { + "epoch": 0.5602468333874635, + "grad_norm": 2.535762310028076, + "learning_rate": 4.90934901525074e-06, + "loss": 0.6112, + "step": 1150 + }, + { + "epoch": 0.5607340045469308, + "grad_norm": 3.086287021636963, + "learning_rate": 4.90917731813684e-06, + "loss": 0.6916, + "step": 1151 + }, + { + "epoch": 0.5612211757063982, + "grad_norm": 3.0162851810455322, + "learning_rate": 4.909005461583811e-06, + "loss": 0.7149, + "step": 1152 + }, + { + "epoch": 0.5617083468658656, + "grad_norm": 2.8515400886535645, + "learning_rate": 4.908833445603024e-06, + "loss": 0.7178, + "step": 1153 + }, + { + "epoch": 0.5621955180253329, + "grad_norm": 3.0707004070281982, + "learning_rate": 4.908661270205864e-06, + "loss": 0.6913, + "step": 1154 + }, + { + "epoch": 0.5626826891848002, + "grad_norm": 3.1491355895996094, + "learning_rate": 4.908488935403726e-06, + "loss": 0.6353, + "step": 1155 + }, + { + "epoch": 0.5631698603442676, + "grad_norm": 3.2622625827789307, + "learning_rate": 4.908316441208014e-06, + "loss": 0.7606, + "step": 1156 + }, + { + "epoch": 0.563657031503735, + "grad_norm": 2.5276904106140137, + "learning_rate": 4.908143787630145e-06, + "loss": 0.5226, + "step": 1157 + }, + { + "epoch": 0.5641442026632023, + "grad_norm": 2.8795313835144043, + "learning_rate": 4.907970974681543e-06, + "loss": 0.727, + "step": 1158 + }, + { + "epoch": 0.5646313738226697, + "grad_norm": 3.2028603553771973, + "learning_rate": 4.907798002373647e-06, + "loss": 0.8305, + "step": 1159 + }, + { + "epoch": 0.5651185449821371, + "grad_norm": 2.995795726776123, + "learning_rate": 4.9076248707179045e-06, + "loss": 0.6899, + "step": 1160 + }, + { + "epoch": 0.5656057161416044, + "grad_norm": 2.6402111053466797, + "learning_rate": 4.907451579725771e-06, + "loss": 0.658, + "step": 1161 + }, + { + "epoch": 0.5660928873010718, + "grad_norm": 3.11444354057312, + "learning_rate": 4.907278129408716e-06, + "loss": 0.6584, + "step": 1162 + }, + { + "epoch": 0.5665800584605392, + "grad_norm": 2.880690574645996, + "learning_rate": 4.907104519778218e-06, + "loss": 0.6116, + "step": 1163 + }, + { + "epoch": 0.5670672296200064, + "grad_norm": 2.422348737716675, + "learning_rate": 4.906930750845769e-06, + "loss": 0.5817, + "step": 1164 + }, + { + "epoch": 0.5675544007794738, + "grad_norm": 2.7165842056274414, + "learning_rate": 4.906756822622865e-06, + "loss": 0.6845, + "step": 1165 + }, + { + "epoch": 0.5680415719389412, + "grad_norm": 2.974665641784668, + "learning_rate": 4.906582735121019e-06, + "loss": 0.6638, + "step": 1166 + }, + { + "epoch": 0.5685287430984086, + "grad_norm": 3.0898873805999756, + "learning_rate": 4.906408488351753e-06, + "loss": 0.7058, + "step": 1167 + }, + { + "epoch": 0.5690159142578759, + "grad_norm": 2.6559629440307617, + "learning_rate": 4.906234082326597e-06, + "loss": 0.696, + "step": 1168 + }, + { + "epoch": 0.5695030854173433, + "grad_norm": 2.8202643394470215, + "learning_rate": 4.9060595170570925e-06, + "loss": 0.6951, + "step": 1169 + }, + { + "epoch": 0.5699902565768107, + "grad_norm": 3.0967023372650146, + "learning_rate": 4.905884792554794e-06, + "loss": 0.7604, + "step": 1170 + }, + { + "epoch": 0.570477427736278, + "grad_norm": 2.9298272132873535, + "learning_rate": 4.905709908831263e-06, + "loss": 0.6904, + "step": 1171 + }, + { + "epoch": 0.5709645988957454, + "grad_norm": 2.557478427886963, + "learning_rate": 4.905534865898075e-06, + "loss": 0.5917, + "step": 1172 + }, + { + "epoch": 0.5714517700552127, + "grad_norm": 3.338494062423706, + "learning_rate": 4.905359663766813e-06, + "loss": 0.6472, + "step": 1173 + }, + { + "epoch": 0.5719389412146801, + "grad_norm": 2.7269961833953857, + "learning_rate": 4.905184302449073e-06, + "loss": 0.6804, + "step": 1174 + }, + { + "epoch": 0.5724261123741474, + "grad_norm": 2.7366442680358887, + "learning_rate": 4.90500878195646e-06, + "loss": 0.7273, + "step": 1175 + }, + { + "epoch": 0.5729132835336148, + "grad_norm": 2.6776039600372314, + "learning_rate": 4.90483310230059e-06, + "loss": 0.6941, + "step": 1176 + }, + { + "epoch": 0.5734004546930822, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.904657263493089e-06, + "loss": 0.6625, + "step": 1177 + }, + { + "epoch": 0.5738876258525495, + "grad_norm": 3.045435667037964, + "learning_rate": 4.904481265545593e-06, + "loss": 0.6394, + "step": 1178 + }, + { + "epoch": 0.5743747970120169, + "grad_norm": 2.7269983291625977, + "learning_rate": 4.904305108469753e-06, + "loss": 0.6859, + "step": 1179 + }, + { + "epoch": 0.5748619681714843, + "grad_norm": 3.5289368629455566, + "learning_rate": 4.904128792277223e-06, + "loss": 0.7293, + "step": 1180 + }, + { + "epoch": 0.5753491393309516, + "grad_norm": 3.133817672729492, + "learning_rate": 4.9039523169796734e-06, + "loss": 0.7296, + "step": 1181 + }, + { + "epoch": 0.575836310490419, + "grad_norm": 3.2423651218414307, + "learning_rate": 4.903775682588784e-06, + "loss": 0.7061, + "step": 1182 + }, + { + "epoch": 0.5763234816498863, + "grad_norm": 2.6286673545837402, + "learning_rate": 4.903598889116243e-06, + "loss": 0.6204, + "step": 1183 + }, + { + "epoch": 0.5768106528093537, + "grad_norm": 2.6988279819488525, + "learning_rate": 4.9034219365737525e-06, + "loss": 0.702, + "step": 1184 + }, + { + "epoch": 0.577297823968821, + "grad_norm": 2.5735116004943848, + "learning_rate": 4.903244824973021e-06, + "loss": 0.6304, + "step": 1185 + }, + { + "epoch": 0.5777849951282884, + "grad_norm": 2.8130884170532227, + "learning_rate": 4.903067554325772e-06, + "loss": 0.681, + "step": 1186 + }, + { + "epoch": 0.5782721662877558, + "grad_norm": 2.907750129699707, + "learning_rate": 4.902890124643735e-06, + "loss": 0.6598, + "step": 1187 + }, + { + "epoch": 0.5787593374472231, + "grad_norm": 2.9107213020324707, + "learning_rate": 4.902712535938654e-06, + "loss": 0.6795, + "step": 1188 + }, + { + "epoch": 0.5792465086066905, + "grad_norm": 3.0805652141571045, + "learning_rate": 4.9025347882222806e-06, + "loss": 0.6808, + "step": 1189 + }, + { + "epoch": 0.5797336797661579, + "grad_norm": 3.2664177417755127, + "learning_rate": 4.902356881506379e-06, + "loss": 0.6843, + "step": 1190 + }, + { + "epoch": 0.5802208509256253, + "grad_norm": 2.859464406967163, + "learning_rate": 4.902178815802722e-06, + "loss": 0.7232, + "step": 1191 + }, + { + "epoch": 0.5807080220850925, + "grad_norm": 2.7322444915771484, + "learning_rate": 4.902000591123095e-06, + "loss": 0.5883, + "step": 1192 + }, + { + "epoch": 0.5811951932445599, + "grad_norm": 2.6795573234558105, + "learning_rate": 4.901822207479293e-06, + "loss": 0.6943, + "step": 1193 + }, + { + "epoch": 0.5816823644040273, + "grad_norm": 2.9793968200683594, + "learning_rate": 4.9016436648831206e-06, + "loss": 0.6454, + "step": 1194 + }, + { + "epoch": 0.5821695355634946, + "grad_norm": 2.8620493412017822, + "learning_rate": 4.901464963346394e-06, + "loss": 0.6356, + "step": 1195 + }, + { + "epoch": 0.582656706722962, + "grad_norm": 3.2030999660491943, + "learning_rate": 4.90128610288094e-06, + "loss": 0.6482, + "step": 1196 + }, + { + "epoch": 0.5831438778824294, + "grad_norm": 3.061072587966919, + "learning_rate": 4.901107083498595e-06, + "loss": 0.615, + "step": 1197 + }, + { + "epoch": 0.5836310490418967, + "grad_norm": 3.057978868484497, + "learning_rate": 4.9009279052112075e-06, + "loss": 0.7789, + "step": 1198 + }, + { + "epoch": 0.5841182202013641, + "grad_norm": 2.7651169300079346, + "learning_rate": 4.900748568030634e-06, + "loss": 0.6887, + "step": 1199 + }, + { + "epoch": 0.5846053913608315, + "grad_norm": 2.882568597793579, + "learning_rate": 4.900569071968744e-06, + "loss": 0.6945, + "step": 1200 + }, + { + "epoch": 0.5850925625202988, + "grad_norm": 3.2294602394104004, + "learning_rate": 4.900389417037417e-06, + "loss": 0.6883, + "step": 1201 + }, + { + "epoch": 0.5855797336797661, + "grad_norm": 3.1631877422332764, + "learning_rate": 4.900209603248541e-06, + "loss": 0.7505, + "step": 1202 + }, + { + "epoch": 0.5860669048392335, + "grad_norm": 3.047727108001709, + "learning_rate": 4.900029630614017e-06, + "loss": 0.6614, + "step": 1203 + }, + { + "epoch": 0.5865540759987009, + "grad_norm": 2.674960136413574, + "learning_rate": 4.899849499145757e-06, + "loss": 0.6137, + "step": 1204 + }, + { + "epoch": 0.5870412471581682, + "grad_norm": 2.826732635498047, + "learning_rate": 4.8996692088556794e-06, + "loss": 0.6367, + "step": 1205 + }, + { + "epoch": 0.5875284183176356, + "grad_norm": 2.8269195556640625, + "learning_rate": 4.899488759755717e-06, + "loss": 0.6683, + "step": 1206 + }, + { + "epoch": 0.588015589477103, + "grad_norm": 3.153785228729248, + "learning_rate": 4.899308151857812e-06, + "loss": 0.7003, + "step": 1207 + }, + { + "epoch": 0.5885027606365704, + "grad_norm": 3.655123233795166, + "learning_rate": 4.899127385173917e-06, + "loss": 0.6917, + "step": 1208 + }, + { + "epoch": 0.5889899317960376, + "grad_norm": 2.9560744762420654, + "learning_rate": 4.898946459715995e-06, + "loss": 0.6339, + "step": 1209 + }, + { + "epoch": 0.589477102955505, + "grad_norm": 2.6344680786132812, + "learning_rate": 4.89876537549602e-06, + "loss": 0.59, + "step": 1210 + }, + { + "epoch": 0.5899642741149724, + "grad_norm": 2.631334066390991, + "learning_rate": 4.898584132525976e-06, + "loss": 0.6103, + "step": 1211 + }, + { + "epoch": 0.5904514452744397, + "grad_norm": 3.2415876388549805, + "learning_rate": 4.898402730817856e-06, + "loss": 0.67, + "step": 1212 + }, + { + "epoch": 0.5909386164339071, + "grad_norm": 2.7919132709503174, + "learning_rate": 4.898221170383668e-06, + "loss": 0.6408, + "step": 1213 + }, + { + "epoch": 0.5914257875933745, + "grad_norm": 3.043227195739746, + "learning_rate": 4.898039451235426e-06, + "loss": 0.7439, + "step": 1214 + }, + { + "epoch": 0.5919129587528418, + "grad_norm": 2.805347442626953, + "learning_rate": 4.897857573385156e-06, + "loss": 0.6712, + "step": 1215 + }, + { + "epoch": 0.5924001299123092, + "grad_norm": 3.0195353031158447, + "learning_rate": 4.897675536844896e-06, + "loss": 0.7073, + "step": 1216 + }, + { + "epoch": 0.5928873010717766, + "grad_norm": 3.4065299034118652, + "learning_rate": 4.897493341626691e-06, + "loss": 0.6269, + "step": 1217 + }, + { + "epoch": 0.593374472231244, + "grad_norm": 3.0204195976257324, + "learning_rate": 4.8973109877426e-06, + "loss": 0.6793, + "step": 1218 + }, + { + "epoch": 0.5938616433907112, + "grad_norm": 2.684950590133667, + "learning_rate": 4.8971284752046924e-06, + "loss": 0.7053, + "step": 1219 + }, + { + "epoch": 0.5943488145501786, + "grad_norm": 2.6978840827941895, + "learning_rate": 4.896945804025045e-06, + "loss": 0.7247, + "step": 1220 + }, + { + "epoch": 0.594835985709646, + "grad_norm": 2.9530441761016846, + "learning_rate": 4.896762974215747e-06, + "loss": 0.6754, + "step": 1221 + }, + { + "epoch": 0.5953231568691133, + "grad_norm": 2.9510486125946045, + "learning_rate": 4.896579985788898e-06, + "loss": 0.7497, + "step": 1222 + }, + { + "epoch": 0.5958103280285807, + "grad_norm": 2.9127306938171387, + "learning_rate": 4.8963968387566105e-06, + "loss": 0.6787, + "step": 1223 + }, + { + "epoch": 0.5962974991880481, + "grad_norm": 2.918684482574463, + "learning_rate": 4.896213533131003e-06, + "loss": 0.6288, + "step": 1224 + }, + { + "epoch": 0.5967846703475155, + "grad_norm": 3.6824493408203125, + "learning_rate": 4.896030068924206e-06, + "loss": 0.6851, + "step": 1225 + }, + { + "epoch": 0.5972718415069828, + "grad_norm": 2.942737102508545, + "learning_rate": 4.895846446148362e-06, + "loss": 0.6787, + "step": 1226 + }, + { + "epoch": 0.5977590126664502, + "grad_norm": 3.372896909713745, + "learning_rate": 4.895662664815624e-06, + "loss": 0.6873, + "step": 1227 + }, + { + "epoch": 0.5982461838259175, + "grad_norm": 2.596632957458496, + "learning_rate": 4.8954787249381545e-06, + "loss": 0.6292, + "step": 1228 + }, + { + "epoch": 0.5987333549853848, + "grad_norm": 3.295400857925415, + "learning_rate": 4.895294626528124e-06, + "loss": 0.6839, + "step": 1229 + }, + { + "epoch": 0.5992205261448522, + "grad_norm": 3.0339982509613037, + "learning_rate": 4.8951103695977196e-06, + "loss": 0.6812, + "step": 1230 + }, + { + "epoch": 0.5997076973043196, + "grad_norm": 2.7537946701049805, + "learning_rate": 4.894925954159134e-06, + "loss": 0.677, + "step": 1231 + }, + { + "epoch": 0.6001948684637869, + "grad_norm": 3.138216018676758, + "learning_rate": 4.894741380224572e-06, + "loss": 0.6869, + "step": 1232 + }, + { + "epoch": 0.6006820396232543, + "grad_norm": 2.789608955383301, + "learning_rate": 4.894556647806248e-06, + "loss": 0.6767, + "step": 1233 + }, + { + "epoch": 0.6011692107827217, + "grad_norm": 2.8629252910614014, + "learning_rate": 4.8943717569163886e-06, + "loss": 0.6745, + "step": 1234 + }, + { + "epoch": 0.6016563819421891, + "grad_norm": 2.91640567779541, + "learning_rate": 4.894186707567228e-06, + "loss": 0.6417, + "step": 1235 + }, + { + "epoch": 0.6021435531016563, + "grad_norm": 3.3210175037384033, + "learning_rate": 4.894001499771015e-06, + "loss": 0.5988, + "step": 1236 + }, + { + "epoch": 0.6026307242611237, + "grad_norm": 2.8391079902648926, + "learning_rate": 4.893816133540006e-06, + "loss": 0.6389, + "step": 1237 + }, + { + "epoch": 0.6031178954205911, + "grad_norm": 2.812790870666504, + "learning_rate": 4.8936306088864675e-06, + "loss": 0.6214, + "step": 1238 + }, + { + "epoch": 0.6036050665800584, + "grad_norm": 2.8793914318084717, + "learning_rate": 4.893444925822679e-06, + "loss": 0.7183, + "step": 1239 + }, + { + "epoch": 0.6040922377395258, + "grad_norm": 3.4638001918792725, + "learning_rate": 4.893259084360927e-06, + "loss": 0.6817, + "step": 1240 + }, + { + "epoch": 0.6045794088989932, + "grad_norm": 3.061485767364502, + "learning_rate": 4.893073084513512e-06, + "loss": 0.6931, + "step": 1241 + }, + { + "epoch": 0.6050665800584606, + "grad_norm": 2.7251217365264893, + "learning_rate": 4.8928869262927435e-06, + "loss": 0.616, + "step": 1242 + }, + { + "epoch": 0.6055537512179279, + "grad_norm": 2.97714900970459, + "learning_rate": 4.892700609710941e-06, + "loss": 0.7758, + "step": 1243 + }, + { + "epoch": 0.6060409223773953, + "grad_norm": 2.9239413738250732, + "learning_rate": 4.892514134780435e-06, + "loss": 0.6857, + "step": 1244 + }, + { + "epoch": 0.6065280935368627, + "grad_norm": 2.8919365406036377, + "learning_rate": 4.8923275015135665e-06, + "loss": 0.6761, + "step": 1245 + }, + { + "epoch": 0.6070152646963299, + "grad_norm": 2.7838709354400635, + "learning_rate": 4.892140709922686e-06, + "loss": 0.6875, + "step": 1246 + }, + { + "epoch": 0.6075024358557973, + "grad_norm": 2.7668848037719727, + "learning_rate": 4.891953760020157e-06, + "loss": 0.7177, + "step": 1247 + }, + { + "epoch": 0.6079896070152647, + "grad_norm": 3.0985493659973145, + "learning_rate": 4.8917666518183505e-06, + "loss": 0.711, + "step": 1248 + }, + { + "epoch": 0.608476778174732, + "grad_norm": 2.663057804107666, + "learning_rate": 4.891579385329649e-06, + "loss": 0.616, + "step": 1249 + }, + { + "epoch": 0.6089639493341994, + "grad_norm": 3.041912794113159, + "learning_rate": 4.891391960566447e-06, + "loss": 0.6023, + "step": 1250 + }, + { + "epoch": 0.6094511204936668, + "grad_norm": 2.821688652038574, + "learning_rate": 4.891204377541148e-06, + "loss": 0.6915, + "step": 1251 + }, + { + "epoch": 0.6099382916531342, + "grad_norm": 3.426327705383301, + "learning_rate": 4.891016636266166e-06, + "loss": 0.7452, + "step": 1252 + }, + { + "epoch": 0.6104254628126015, + "grad_norm": 3.3691697120666504, + "learning_rate": 4.890828736753925e-06, + "loss": 0.7009, + "step": 1253 + }, + { + "epoch": 0.6109126339720689, + "grad_norm": 2.6485559940338135, + "learning_rate": 4.890640679016861e-06, + "loss": 0.6566, + "step": 1254 + }, + { + "epoch": 0.6113998051315362, + "grad_norm": 3.1091909408569336, + "learning_rate": 4.8904524630674205e-06, + "loss": 0.7224, + "step": 1255 + }, + { + "epoch": 0.6118869762910035, + "grad_norm": 2.925912380218506, + "learning_rate": 4.890264088918058e-06, + "loss": 0.7006, + "step": 1256 + }, + { + "epoch": 0.6123741474504709, + "grad_norm": 3.019599676132202, + "learning_rate": 4.890075556581241e-06, + "loss": 0.6844, + "step": 1257 + }, + { + "epoch": 0.6128613186099383, + "grad_norm": 2.7866744995117188, + "learning_rate": 4.889886866069446e-06, + "loss": 0.6725, + "step": 1258 + }, + { + "epoch": 0.6133484897694057, + "grad_norm": 3.033198833465576, + "learning_rate": 4.889698017395159e-06, + "loss": 0.6384, + "step": 1259 + }, + { + "epoch": 0.613835660928873, + "grad_norm": 2.9199202060699463, + "learning_rate": 4.889509010570882e-06, + "loss": 0.6925, + "step": 1260 + }, + { + "epoch": 0.6143228320883404, + "grad_norm": 2.795685052871704, + "learning_rate": 4.889319845609121e-06, + "loss": 0.7325, + "step": 1261 + }, + { + "epoch": 0.6148100032478078, + "grad_norm": 3.20528507232666, + "learning_rate": 4.889130522522394e-06, + "loss": 0.6829, + "step": 1262 + }, + { + "epoch": 0.615297174407275, + "grad_norm": 2.6329658031463623, + "learning_rate": 4.888941041323232e-06, + "loss": 0.664, + "step": 1263 + }, + { + "epoch": 0.6157843455667424, + "grad_norm": 2.711719036102295, + "learning_rate": 4.8887514020241745e-06, + "loss": 0.7229, + "step": 1264 + }, + { + "epoch": 0.6162715167262098, + "grad_norm": 2.9265105724334717, + "learning_rate": 4.888561604637772e-06, + "loss": 0.6062, + "step": 1265 + }, + { + "epoch": 0.6167586878856771, + "grad_norm": 2.8186028003692627, + "learning_rate": 4.888371649176584e-06, + "loss": 0.6548, + "step": 1266 + }, + { + "epoch": 0.6172458590451445, + "grad_norm": 3.65623140335083, + "learning_rate": 4.888181535653184e-06, + "loss": 0.6639, + "step": 1267 + }, + { + "epoch": 0.6177330302046119, + "grad_norm": 2.9881300926208496, + "learning_rate": 4.887991264080151e-06, + "loss": 0.68, + "step": 1268 + }, + { + "epoch": 0.6182202013640793, + "grad_norm": 3.012219190597534, + "learning_rate": 4.887800834470079e-06, + "loss": 0.6921, + "step": 1269 + }, + { + "epoch": 0.6187073725235466, + "grad_norm": 2.7714498043060303, + "learning_rate": 4.887610246835571e-06, + "loss": 0.6471, + "step": 1270 + }, + { + "epoch": 0.619194543683014, + "grad_norm": 2.7644546031951904, + "learning_rate": 4.887419501189238e-06, + "loss": 0.7581, + "step": 1271 + }, + { + "epoch": 0.6196817148424814, + "grad_norm": 2.9372506141662598, + "learning_rate": 4.887228597543704e-06, + "loss": 0.6274, + "step": 1272 + }, + { + "epoch": 0.6201688860019486, + "grad_norm": 2.867342710494995, + "learning_rate": 4.887037535911605e-06, + "loss": 0.6718, + "step": 1273 + }, + { + "epoch": 0.620656057161416, + "grad_norm": 2.98188853263855, + "learning_rate": 4.886846316305584e-06, + "loss": 0.6786, + "step": 1274 + }, + { + "epoch": 0.6211432283208834, + "grad_norm": 3.1635735034942627, + "learning_rate": 4.886654938738296e-06, + "loss": 0.6206, + "step": 1275 + }, + { + "epoch": 0.6216303994803508, + "grad_norm": 2.9690449237823486, + "learning_rate": 4.886463403222406e-06, + "loss": 0.7135, + "step": 1276 + }, + { + "epoch": 0.6221175706398181, + "grad_norm": 2.909879446029663, + "learning_rate": 4.8862717097705915e-06, + "loss": 0.605, + "step": 1277 + }, + { + "epoch": 0.6226047417992855, + "grad_norm": 2.6281769275665283, + "learning_rate": 4.886079858395536e-06, + "loss": 0.7012, + "step": 1278 + }, + { + "epoch": 0.6230919129587529, + "grad_norm": 3.60782527923584, + "learning_rate": 4.8858878491099385e-06, + "loss": 0.7479, + "step": 1279 + }, + { + "epoch": 0.6235790841182202, + "grad_norm": 2.9436447620391846, + "learning_rate": 4.885695681926505e-06, + "loss": 0.5978, + "step": 1280 + }, + { + "epoch": 0.6240662552776876, + "grad_norm": 3.1093945503234863, + "learning_rate": 4.885503356857954e-06, + "loss": 0.6314, + "step": 1281 + }, + { + "epoch": 0.624553426437155, + "grad_norm": 2.8307738304138184, + "learning_rate": 4.885310873917012e-06, + "loss": 0.7441, + "step": 1282 + }, + { + "epoch": 0.6250405975966222, + "grad_norm": 3.0014870166778564, + "learning_rate": 4.885118233116419e-06, + "loss": 0.6843, + "step": 1283 + }, + { + "epoch": 0.6255277687560896, + "grad_norm": 2.7991085052490234, + "learning_rate": 4.8849254344689235e-06, + "loss": 0.6048, + "step": 1284 + }, + { + "epoch": 0.626014939915557, + "grad_norm": 2.7178614139556885, + "learning_rate": 4.884732477987285e-06, + "loss": 0.6705, + "step": 1285 + }, + { + "epoch": 0.6265021110750244, + "grad_norm": 2.9674830436706543, + "learning_rate": 4.884539363684272e-06, + "loss": 0.695, + "step": 1286 + }, + { + "epoch": 0.6269892822344917, + "grad_norm": 2.40849232673645, + "learning_rate": 4.884346091572667e-06, + "loss": 0.6663, + "step": 1287 + }, + { + "epoch": 0.6274764533939591, + "grad_norm": 2.979879856109619, + "learning_rate": 4.88415266166526e-06, + "loss": 0.6754, + "step": 1288 + }, + { + "epoch": 0.6279636245534265, + "grad_norm": 3.1227662563323975, + "learning_rate": 4.883959073974851e-06, + "loss": 0.6745, + "step": 1289 + }, + { + "epoch": 0.6284507957128938, + "grad_norm": 2.702918529510498, + "learning_rate": 4.8837653285142525e-06, + "loss": 0.6862, + "step": 1290 + }, + { + "epoch": 0.6289379668723611, + "grad_norm": 2.864070177078247, + "learning_rate": 4.883571425296287e-06, + "loss": 0.6446, + "step": 1291 + }, + { + "epoch": 0.6294251380318285, + "grad_norm": 2.7680630683898926, + "learning_rate": 4.883377364333787e-06, + "loss": 0.6809, + "step": 1292 + }, + { + "epoch": 0.6299123091912959, + "grad_norm": 2.963129997253418, + "learning_rate": 4.883183145639594e-06, + "loss": 0.6389, + "step": 1293 + }, + { + "epoch": 0.6303994803507632, + "grad_norm": 2.9651098251342773, + "learning_rate": 4.8829887692265634e-06, + "loss": 0.7024, + "step": 1294 + }, + { + "epoch": 0.6308866515102306, + "grad_norm": 2.847716808319092, + "learning_rate": 4.882794235107557e-06, + "loss": 0.5901, + "step": 1295 + }, + { + "epoch": 0.631373822669698, + "grad_norm": 3.3205437660217285, + "learning_rate": 4.8825995432954495e-06, + "loss": 0.7053, + "step": 1296 + }, + { + "epoch": 0.6318609938291653, + "grad_norm": 3.562293529510498, + "learning_rate": 4.882404693803128e-06, + "loss": 0.735, + "step": 1297 + }, + { + "epoch": 0.6323481649886327, + "grad_norm": 4.713667869567871, + "learning_rate": 4.882209686643484e-06, + "loss": 0.6508, + "step": 1298 + }, + { + "epoch": 0.6328353361481001, + "grad_norm": 2.737187147140503, + "learning_rate": 4.882014521829425e-06, + "loss": 0.6637, + "step": 1299 + }, + { + "epoch": 0.6333225073075673, + "grad_norm": 2.7326292991638184, + "learning_rate": 4.881819199373868e-06, + "loss": 0.6633, + "step": 1300 + }, + { + "epoch": 0.6338096784670347, + "grad_norm": 3.158315658569336, + "learning_rate": 4.881623719289737e-06, + "loss": 0.6672, + "step": 1301 + }, + { + "epoch": 0.6342968496265021, + "grad_norm": 2.9179928302764893, + "learning_rate": 4.88142808158997e-06, + "loss": 0.6929, + "step": 1302 + }, + { + "epoch": 0.6347840207859695, + "grad_norm": 2.53971529006958, + "learning_rate": 4.881232286287515e-06, + "loss": 0.6332, + "step": 1303 + }, + { + "epoch": 0.6352711919454368, + "grad_norm": 2.5609347820281982, + "learning_rate": 4.881036333395329e-06, + "loss": 0.6886, + "step": 1304 + }, + { + "epoch": 0.6357583631049042, + "grad_norm": 2.6296966075897217, + "learning_rate": 4.88084022292638e-06, + "loss": 0.7568, + "step": 1305 + }, + { + "epoch": 0.6362455342643716, + "grad_norm": 2.6810879707336426, + "learning_rate": 4.880643954893646e-06, + "loss": 0.7075, + "step": 1306 + }, + { + "epoch": 0.6367327054238389, + "grad_norm": 2.8895821571350098, + "learning_rate": 4.880447529310118e-06, + "loss": 0.6639, + "step": 1307 + }, + { + "epoch": 0.6372198765833063, + "grad_norm": 2.7590932846069336, + "learning_rate": 4.880250946188793e-06, + "loss": 0.6992, + "step": 1308 + }, + { + "epoch": 0.6377070477427736, + "grad_norm": 2.735081434249878, + "learning_rate": 4.880054205542683e-06, + "loss": 0.6715, + "step": 1309 + }, + { + "epoch": 0.638194218902241, + "grad_norm": 3.249833106994629, + "learning_rate": 4.879857307384807e-06, + "loss": 0.646, + "step": 1310 + }, + { + "epoch": 0.6386813900617083, + "grad_norm": 3.2091174125671387, + "learning_rate": 4.879660251728196e-06, + "loss": 0.6826, + "step": 1311 + }, + { + "epoch": 0.6391685612211757, + "grad_norm": 2.53538179397583, + "learning_rate": 4.8794630385858906e-06, + "loss": 0.6551, + "step": 1312 + }, + { + "epoch": 0.6396557323806431, + "grad_norm": 2.865388870239258, + "learning_rate": 4.879265667970944e-06, + "loss": 0.6754, + "step": 1313 + }, + { + "epoch": 0.6401429035401104, + "grad_norm": 3.2241597175598145, + "learning_rate": 4.879068139896416e-06, + "loss": 0.7028, + "step": 1314 + }, + { + "epoch": 0.6406300746995778, + "grad_norm": 2.752300262451172, + "learning_rate": 4.878870454375381e-06, + "loss": 0.6949, + "step": 1315 + }, + { + "epoch": 0.6411172458590452, + "grad_norm": 3.228745937347412, + "learning_rate": 4.878672611420919e-06, + "loss": 0.637, + "step": 1316 + }, + { + "epoch": 0.6416044170185125, + "grad_norm": 3.06768536567688, + "learning_rate": 4.878474611046126e-06, + "loss": 0.6339, + "step": 1317 + }, + { + "epoch": 0.6420915881779798, + "grad_norm": 2.971081495285034, + "learning_rate": 4.878276453264106e-06, + "loss": 0.6733, + "step": 1318 + }, + { + "epoch": 0.6425787593374472, + "grad_norm": 2.8612172603607178, + "learning_rate": 4.87807813808797e-06, + "loss": 0.6007, + "step": 1319 + }, + { + "epoch": 0.6430659304969146, + "grad_norm": 2.720156192779541, + "learning_rate": 4.8778796655308455e-06, + "loss": 0.6742, + "step": 1320 + }, + { + "epoch": 0.6435531016563819, + "grad_norm": 3.245084047317505, + "learning_rate": 4.877681035605866e-06, + "loss": 0.7122, + "step": 1321 + }, + { + "epoch": 0.6440402728158493, + "grad_norm": 2.7091197967529297, + "learning_rate": 4.877482248326177e-06, + "loss": 0.6739, + "step": 1322 + }, + { + "epoch": 0.6445274439753167, + "grad_norm": 2.8105721473693848, + "learning_rate": 4.877283303704934e-06, + "loss": 0.7334, + "step": 1323 + }, + { + "epoch": 0.645014615134784, + "grad_norm": 2.8615012168884277, + "learning_rate": 4.8770842017553036e-06, + "loss": 0.7302, + "step": 1324 + }, + { + "epoch": 0.6455017862942514, + "grad_norm": 3.121901035308838, + "learning_rate": 4.876884942490462e-06, + "loss": 0.7064, + "step": 1325 + }, + { + "epoch": 0.6459889574537188, + "grad_norm": 2.742579936981201, + "learning_rate": 4.8766855259235956e-06, + "loss": 0.7068, + "step": 1326 + }, + { + "epoch": 0.6464761286131862, + "grad_norm": 2.8907008171081543, + "learning_rate": 4.876485952067903e-06, + "loss": 0.6068, + "step": 1327 + }, + { + "epoch": 0.6469632997726534, + "grad_norm": 2.805582284927368, + "learning_rate": 4.876286220936591e-06, + "loss": 0.7647, + "step": 1328 + }, + { + "epoch": 0.6474504709321208, + "grad_norm": 3.165165662765503, + "learning_rate": 4.876086332542878e-06, + "loss": 0.6643, + "step": 1329 + }, + { + "epoch": 0.6479376420915882, + "grad_norm": 3.0390419960021973, + "learning_rate": 4.8758862868999935e-06, + "loss": 0.7026, + "step": 1330 + }, + { + "epoch": 0.6484248132510555, + "grad_norm": 3.107884645462036, + "learning_rate": 4.8756860840211755e-06, + "loss": 0.6393, + "step": 1331 + }, + { + "epoch": 0.6489119844105229, + "grad_norm": 3.301903009414673, + "learning_rate": 4.875485723919674e-06, + "loss": 0.7818, + "step": 1332 + }, + { + "epoch": 0.6493991555699903, + "grad_norm": 2.841451406478882, + "learning_rate": 4.875285206608748e-06, + "loss": 0.7055, + "step": 1333 + }, + { + "epoch": 0.6498863267294576, + "grad_norm": 3.2265055179595947, + "learning_rate": 4.875084532101668e-06, + "loss": 0.7046, + "step": 1334 + }, + { + "epoch": 0.650373497888925, + "grad_norm": 2.852292060852051, + "learning_rate": 4.874883700411716e-06, + "loss": 0.6169, + "step": 1335 + }, + { + "epoch": 0.6508606690483923, + "grad_norm": 2.9566972255706787, + "learning_rate": 4.87468271155218e-06, + "loss": 0.566, + "step": 1336 + }, + { + "epoch": 0.6513478402078597, + "grad_norm": 2.912672519683838, + "learning_rate": 4.874481565536365e-06, + "loss": 0.7198, + "step": 1337 + }, + { + "epoch": 0.651835011367327, + "grad_norm": 2.8088269233703613, + "learning_rate": 4.87428026237758e-06, + "loss": 0.6704, + "step": 1338 + }, + { + "epoch": 0.6523221825267944, + "grad_norm": 2.900940418243408, + "learning_rate": 4.874078802089149e-06, + "loss": 0.6424, + "step": 1339 + }, + { + "epoch": 0.6528093536862618, + "grad_norm": 3.291261672973633, + "learning_rate": 4.873877184684404e-06, + "loss": 0.6207, + "step": 1340 + }, + { + "epoch": 0.6532965248457291, + "grad_norm": 2.7472972869873047, + "learning_rate": 4.873675410176687e-06, + "loss": 0.6576, + "step": 1341 + }, + { + "epoch": 0.6537836960051965, + "grad_norm": 2.88376784324646, + "learning_rate": 4.873473478579353e-06, + "loss": 0.6923, + "step": 1342 + }, + { + "epoch": 0.6542708671646639, + "grad_norm": 2.6092398166656494, + "learning_rate": 4.873271389905765e-06, + "loss": 0.658, + "step": 1343 + }, + { + "epoch": 0.6547580383241313, + "grad_norm": 2.915902853012085, + "learning_rate": 4.873069144169298e-06, + "loss": 0.6255, + "step": 1344 + }, + { + "epoch": 0.6552452094835985, + "grad_norm": 3.033963441848755, + "learning_rate": 4.872866741383336e-06, + "loss": 0.6808, + "step": 1345 + }, + { + "epoch": 0.6557323806430659, + "grad_norm": 2.91329026222229, + "learning_rate": 4.872664181561273e-06, + "loss": 0.7003, + "step": 1346 + }, + { + "epoch": 0.6562195518025333, + "grad_norm": 2.9898769855499268, + "learning_rate": 4.8724614647165165e-06, + "loss": 0.6593, + "step": 1347 + }, + { + "epoch": 0.6567067229620006, + "grad_norm": 2.867344379425049, + "learning_rate": 4.872258590862481e-06, + "loss": 0.6594, + "step": 1348 + }, + { + "epoch": 0.657193894121468, + "grad_norm": 2.6539762020111084, + "learning_rate": 4.872055560012593e-06, + "loss": 0.6661, + "step": 1349 + }, + { + "epoch": 0.6576810652809354, + "grad_norm": 2.7188942432403564, + "learning_rate": 4.871852372180288e-06, + "loss": 0.662, + "step": 1350 + }, + { + "epoch": 0.6581682364404027, + "grad_norm": 2.8805758953094482, + "learning_rate": 4.871649027379015e-06, + "loss": 0.6761, + "step": 1351 + }, + { + "epoch": 0.6586554075998701, + "grad_norm": 3.017864942550659, + "learning_rate": 4.871445525622229e-06, + "loss": 0.6951, + "step": 1352 + }, + { + "epoch": 0.6591425787593375, + "grad_norm": 2.7673416137695312, + "learning_rate": 4.8712418669234e-06, + "loss": 0.71, + "step": 1353 + }, + { + "epoch": 0.6596297499188049, + "grad_norm": 2.931844711303711, + "learning_rate": 4.871038051296005e-06, + "loss": 0.6877, + "step": 1354 + }, + { + "epoch": 0.6601169210782721, + "grad_norm": 2.865537643432617, + "learning_rate": 4.8708340787535326e-06, + "loss": 0.6851, + "step": 1355 + }, + { + "epoch": 0.6606040922377395, + "grad_norm": 3.097399950027466, + "learning_rate": 4.870629949309481e-06, + "loss": 0.679, + "step": 1356 + }, + { + "epoch": 0.6610912633972069, + "grad_norm": 2.6341848373413086, + "learning_rate": 4.870425662977361e-06, + "loss": 0.6873, + "step": 1357 + }, + { + "epoch": 0.6615784345566742, + "grad_norm": 2.8127968311309814, + "learning_rate": 4.87022121977069e-06, + "loss": 0.6351, + "step": 1358 + }, + { + "epoch": 0.6620656057161416, + "grad_norm": 2.8380773067474365, + "learning_rate": 4.8700166197029994e-06, + "loss": 0.6325, + "step": 1359 + }, + { + "epoch": 0.662552776875609, + "grad_norm": 2.711852788925171, + "learning_rate": 4.869811862787831e-06, + "loss": 0.6326, + "step": 1360 + }, + { + "epoch": 0.6630399480350764, + "grad_norm": 2.670659065246582, + "learning_rate": 4.869606949038734e-06, + "loss": 0.5698, + "step": 1361 + }, + { + "epoch": 0.6635271191945437, + "grad_norm": 3.1156880855560303, + "learning_rate": 4.869401878469269e-06, + "loss": 0.6805, + "step": 1362 + }, + { + "epoch": 0.664014290354011, + "grad_norm": 3.1888983249664307, + "learning_rate": 4.869196651093009e-06, + "loss": 0.6967, + "step": 1363 + }, + { + "epoch": 0.6645014615134784, + "grad_norm": 2.828157424926758, + "learning_rate": 4.868991266923534e-06, + "loss": 0.6792, + "step": 1364 + }, + { + "epoch": 0.6649886326729457, + "grad_norm": 2.862194061279297, + "learning_rate": 4.868785725974439e-06, + "loss": 0.6505, + "step": 1365 + }, + { + "epoch": 0.6654758038324131, + "grad_norm": 2.7665913105010986, + "learning_rate": 4.8685800282593245e-06, + "loss": 0.6851, + "step": 1366 + }, + { + "epoch": 0.6659629749918805, + "grad_norm": 2.7155845165252686, + "learning_rate": 4.868374173791805e-06, + "loss": 0.7082, + "step": 1367 + }, + { + "epoch": 0.6664501461513478, + "grad_norm": 2.922487497329712, + "learning_rate": 4.868168162585502e-06, + "loss": 0.735, + "step": 1368 + }, + { + "epoch": 0.6669373173108152, + "grad_norm": 2.970154285430908, + "learning_rate": 4.8679619946540515e-06, + "loss": 0.6664, + "step": 1369 + }, + { + "epoch": 0.6674244884702826, + "grad_norm": 3.083962917327881, + "learning_rate": 4.867755670011096e-06, + "loss": 0.6498, + "step": 1370 + }, + { + "epoch": 0.66791165962975, + "grad_norm": 2.931126356124878, + "learning_rate": 4.867549188670292e-06, + "loss": 0.7138, + "step": 1371 + }, + { + "epoch": 0.6683988307892172, + "grad_norm": 3.092895030975342, + "learning_rate": 4.867342550645302e-06, + "loss": 0.6215, + "step": 1372 + }, + { + "epoch": 0.6688860019486846, + "grad_norm": 2.8992607593536377, + "learning_rate": 4.867135755949802e-06, + "loss": 0.679, + "step": 1373 + }, + { + "epoch": 0.669373173108152, + "grad_norm": 3.169772148132324, + "learning_rate": 4.866928804597479e-06, + "loss": 0.6676, + "step": 1374 + }, + { + "epoch": 0.6698603442676193, + "grad_norm": 3.0948896408081055, + "learning_rate": 4.866721696602028e-06, + "loss": 0.7379, + "step": 1375 + }, + { + "epoch": 0.6703475154270867, + "grad_norm": 2.6151814460754395, + "learning_rate": 4.866514431977156e-06, + "loss": 0.6585, + "step": 1376 + }, + { + "epoch": 0.6708346865865541, + "grad_norm": 2.5203561782836914, + "learning_rate": 4.866307010736578e-06, + "loss": 0.7064, + "step": 1377 + }, + { + "epoch": 0.6713218577460215, + "grad_norm": 2.691878080368042, + "learning_rate": 4.866099432894023e-06, + "loss": 0.6368, + "step": 1378 + }, + { + "epoch": 0.6718090289054888, + "grad_norm": 3.3201122283935547, + "learning_rate": 4.865891698463228e-06, + "loss": 0.6415, + "step": 1379 + }, + { + "epoch": 0.6722962000649562, + "grad_norm": 2.678941249847412, + "learning_rate": 4.865683807457942e-06, + "loss": 0.6519, + "step": 1380 + }, + { + "epoch": 0.6727833712244236, + "grad_norm": 2.7869582176208496, + "learning_rate": 4.86547575989192e-06, + "loss": 0.6726, + "step": 1381 + }, + { + "epoch": 0.6732705423838908, + "grad_norm": 3.178537130355835, + "learning_rate": 4.865267555778933e-06, + "loss": 0.7269, + "step": 1382 + }, + { + "epoch": 0.6737577135433582, + "grad_norm": 3.1821537017822266, + "learning_rate": 4.865059195132759e-06, + "loss": 0.693, + "step": 1383 + }, + { + "epoch": 0.6742448847028256, + "grad_norm": 2.8188440799713135, + "learning_rate": 4.864850677967189e-06, + "loss": 0.807, + "step": 1384 + }, + { + "epoch": 0.6747320558622929, + "grad_norm": 3.099503993988037, + "learning_rate": 4.8646420042960205e-06, + "loss": 0.6747, + "step": 1385 + }, + { + "epoch": 0.6752192270217603, + "grad_norm": 3.1733510494232178, + "learning_rate": 4.8644331741330645e-06, + "loss": 0.6694, + "step": 1386 + }, + { + "epoch": 0.6757063981812277, + "grad_norm": 2.742971658706665, + "learning_rate": 4.864224187492141e-06, + "loss": 0.6773, + "step": 1387 + }, + { + "epoch": 0.6761935693406951, + "grad_norm": 3.0451159477233887, + "learning_rate": 4.864015044387082e-06, + "loss": 0.6967, + "step": 1388 + }, + { + "epoch": 0.6766807405001624, + "grad_norm": 2.8736109733581543, + "learning_rate": 4.863805744831726e-06, + "loss": 0.6881, + "step": 1389 + }, + { + "epoch": 0.6771679116596298, + "grad_norm": 2.7655134201049805, + "learning_rate": 4.863596288839926e-06, + "loss": 0.6474, + "step": 1390 + }, + { + "epoch": 0.6776550828190971, + "grad_norm": 3.1350884437561035, + "learning_rate": 4.8633866764255445e-06, + "loss": 0.669, + "step": 1391 + }, + { + "epoch": 0.6781422539785644, + "grad_norm": 2.8546204566955566, + "learning_rate": 4.863176907602453e-06, + "loss": 0.6556, + "step": 1392 + }, + { + "epoch": 0.6786294251380318, + "grad_norm": 3.3180091381073, + "learning_rate": 4.862966982384532e-06, + "loss": 0.6408, + "step": 1393 + }, + { + "epoch": 0.6791165962974992, + "grad_norm": 2.593170642852783, + "learning_rate": 4.8627569007856775e-06, + "loss": 0.6195, + "step": 1394 + }, + { + "epoch": 0.6796037674569666, + "grad_norm": 2.8153388500213623, + "learning_rate": 4.862546662819792e-06, + "loss": 0.7408, + "step": 1395 + }, + { + "epoch": 0.6800909386164339, + "grad_norm": 2.9460268020629883, + "learning_rate": 4.862336268500788e-06, + "loss": 0.6793, + "step": 1396 + }, + { + "epoch": 0.6805781097759013, + "grad_norm": 2.787916898727417, + "learning_rate": 4.862125717842589e-06, + "loss": 0.6949, + "step": 1397 + }, + { + "epoch": 0.6810652809353687, + "grad_norm": 3.311187267303467, + "learning_rate": 4.86191501085913e-06, + "loss": 0.6547, + "step": 1398 + }, + { + "epoch": 0.681552452094836, + "grad_norm": 2.7382853031158447, + "learning_rate": 4.861704147564356e-06, + "loss": 0.6746, + "step": 1399 + }, + { + "epoch": 0.6820396232543033, + "grad_norm": 2.8074758052825928, + "learning_rate": 4.861493127972222e-06, + "loss": 0.685, + "step": 1400 + }, + { + "epoch": 0.6825267944137707, + "grad_norm": 2.9132559299468994, + "learning_rate": 4.8612819520966915e-06, + "loss": 0.6904, + "step": 1401 + }, + { + "epoch": 0.683013965573238, + "grad_norm": 2.768872022628784, + "learning_rate": 4.8610706199517425e-06, + "loss": 0.6918, + "step": 1402 + }, + { + "epoch": 0.6835011367327054, + "grad_norm": 2.7861709594726562, + "learning_rate": 4.860859131551359e-06, + "loss": 0.6292, + "step": 1403 + }, + { + "epoch": 0.6839883078921728, + "grad_norm": 2.4699950218200684, + "learning_rate": 4.860647486909537e-06, + "loss": 0.6304, + "step": 1404 + }, + { + "epoch": 0.6844754790516402, + "grad_norm": 2.792409896850586, + "learning_rate": 4.860435686040286e-06, + "loss": 0.7922, + "step": 1405 + }, + { + "epoch": 0.6849626502111075, + "grad_norm": 2.892138957977295, + "learning_rate": 4.86022372895762e-06, + "loss": 0.6672, + "step": 1406 + }, + { + "epoch": 0.6854498213705749, + "grad_norm": 2.9412872791290283, + "learning_rate": 4.860011615675568e-06, + "loss": 0.6894, + "step": 1407 + }, + { + "epoch": 0.6859369925300423, + "grad_norm": 2.660968542098999, + "learning_rate": 4.859799346208166e-06, + "loss": 0.6346, + "step": 1408 + }, + { + "epoch": 0.6864241636895095, + "grad_norm": 3.0947563648223877, + "learning_rate": 4.859586920569464e-06, + "loss": 0.7427, + "step": 1409 + }, + { + "epoch": 0.6869113348489769, + "grad_norm": 2.956549882888794, + "learning_rate": 4.859374338773519e-06, + "loss": 0.7341, + "step": 1410 + }, + { + "epoch": 0.6873985060084443, + "grad_norm": 3.3777639865875244, + "learning_rate": 4.8591616008344e-06, + "loss": 0.6942, + "step": 1411 + }, + { + "epoch": 0.6878856771679117, + "grad_norm": 3.134417772293091, + "learning_rate": 4.858948706766186e-06, + "loss": 0.7597, + "step": 1412 + }, + { + "epoch": 0.688372848327379, + "grad_norm": 2.7429795265197754, + "learning_rate": 4.858735656582966e-06, + "loss": 0.6978, + "step": 1413 + }, + { + "epoch": 0.6888600194868464, + "grad_norm": 2.4149482250213623, + "learning_rate": 4.85852245029884e-06, + "loss": 0.6519, + "step": 1414 + }, + { + "epoch": 0.6893471906463138, + "grad_norm": 3.0898258686065674, + "learning_rate": 4.858309087927918e-06, + "loss": 0.6204, + "step": 1415 + }, + { + "epoch": 0.6898343618057811, + "grad_norm": 2.66479229927063, + "learning_rate": 4.85809556948432e-06, + "loss": 0.6478, + "step": 1416 + }, + { + "epoch": 0.6903215329652485, + "grad_norm": 2.729653835296631, + "learning_rate": 4.857881894982177e-06, + "loss": 0.6399, + "step": 1417 + }, + { + "epoch": 0.6908087041247158, + "grad_norm": 4.576007843017578, + "learning_rate": 4.85766806443563e-06, + "loss": 0.7259, + "step": 1418 + }, + { + "epoch": 0.6912958752841831, + "grad_norm": 2.8979780673980713, + "learning_rate": 4.857454077858829e-06, + "loss": 0.6133, + "step": 1419 + }, + { + "epoch": 0.6917830464436505, + "grad_norm": 2.66524076461792, + "learning_rate": 4.857239935265938e-06, + "loss": 0.7183, + "step": 1420 + }, + { + "epoch": 0.6922702176031179, + "grad_norm": 3.088717222213745, + "learning_rate": 4.857025636671127e-06, + "loss": 0.6624, + "step": 1421 + }, + { + "epoch": 0.6927573887625853, + "grad_norm": 2.802431583404541, + "learning_rate": 4.856811182088578e-06, + "loss": 0.6626, + "step": 1422 + }, + { + "epoch": 0.6932445599220526, + "grad_norm": 3.0806515216827393, + "learning_rate": 4.856596571532486e-06, + "loss": 0.6182, + "step": 1423 + }, + { + "epoch": 0.69373173108152, + "grad_norm": 3.296372890472412, + "learning_rate": 4.856381805017052e-06, + "loss": 0.7309, + "step": 1424 + }, + { + "epoch": 0.6942189022409874, + "grad_norm": 3.0013396739959717, + "learning_rate": 4.856166882556489e-06, + "loss": 0.6553, + "step": 1425 + }, + { + "epoch": 0.6947060734004546, + "grad_norm": 3.139897108078003, + "learning_rate": 4.855951804165021e-06, + "loss": 0.6972, + "step": 1426 + }, + { + "epoch": 0.695193244559922, + "grad_norm": 2.5325207710266113, + "learning_rate": 4.855736569856882e-06, + "loss": 0.6304, + "step": 1427 + }, + { + "epoch": 0.6956804157193894, + "grad_norm": 3.0275585651397705, + "learning_rate": 4.855521179646317e-06, + "loss": 0.5524, + "step": 1428 + }, + { + "epoch": 0.6961675868788568, + "grad_norm": 2.9808473587036133, + "learning_rate": 4.855305633547579e-06, + "loss": 0.6658, + "step": 1429 + }, + { + "epoch": 0.6966547580383241, + "grad_norm": 3.3235907554626465, + "learning_rate": 4.855089931574933e-06, + "loss": 0.7352, + "step": 1430 + }, + { + "epoch": 0.6971419291977915, + "grad_norm": 2.8101320266723633, + "learning_rate": 4.854874073742656e-06, + "loss": 0.7032, + "step": 1431 + }, + { + "epoch": 0.6976291003572589, + "grad_norm": 2.8666696548461914, + "learning_rate": 4.854658060065032e-06, + "loss": 0.6502, + "step": 1432 + }, + { + "epoch": 0.6981162715167262, + "grad_norm": 2.775832414627075, + "learning_rate": 4.8544418905563556e-06, + "loss": 0.6443, + "step": 1433 + }, + { + "epoch": 0.6986034426761936, + "grad_norm": 2.7817442417144775, + "learning_rate": 4.8542255652309346e-06, + "loss": 0.6228, + "step": 1434 + }, + { + "epoch": 0.699090613835661, + "grad_norm": 2.6929314136505127, + "learning_rate": 4.8540090841030855e-06, + "loss": 0.6787, + "step": 1435 + }, + { + "epoch": 0.6995777849951282, + "grad_norm": 2.9793765544891357, + "learning_rate": 4.8537924471871335e-06, + "loss": 0.6816, + "step": 1436 + }, + { + "epoch": 0.7000649561545956, + "grad_norm": 3.5930159091949463, + "learning_rate": 4.853575654497417e-06, + "loss": 0.6982, + "step": 1437 + }, + { + "epoch": 0.700552127314063, + "grad_norm": 2.6330065727233887, + "learning_rate": 4.8533587060482825e-06, + "loss": 0.548, + "step": 1438 + }, + { + "epoch": 0.7010392984735304, + "grad_norm": 2.9593679904937744, + "learning_rate": 4.8531416018540885e-06, + "loss": 0.6617, + "step": 1439 + }, + { + "epoch": 0.7015264696329977, + "grad_norm": 2.8706464767456055, + "learning_rate": 4.852924341929203e-06, + "loss": 0.7052, + "step": 1440 + }, + { + "epoch": 0.7020136407924651, + "grad_norm": 2.8764193058013916, + "learning_rate": 4.852706926288002e-06, + "loss": 0.6967, + "step": 1441 + }, + { + "epoch": 0.7025008119519325, + "grad_norm": 2.8912651538848877, + "learning_rate": 4.852489354944877e-06, + "loss": 0.644, + "step": 1442 + }, + { + "epoch": 0.7029879831113998, + "grad_norm": 2.8986287117004395, + "learning_rate": 4.852271627914225e-06, + "loss": 0.6662, + "step": 1443 + }, + { + "epoch": 0.7034751542708672, + "grad_norm": 2.651211738586426, + "learning_rate": 4.852053745210456e-06, + "loss": 0.6262, + "step": 1444 + }, + { + "epoch": 0.7039623254303345, + "grad_norm": 2.811964988708496, + "learning_rate": 4.85183570684799e-06, + "loss": 0.6309, + "step": 1445 + }, + { + "epoch": 0.7044494965898019, + "grad_norm": 2.8483409881591797, + "learning_rate": 4.851617512841254e-06, + "loss": 0.6901, + "step": 1446 + }, + { + "epoch": 0.7049366677492692, + "grad_norm": 2.7701985836029053, + "learning_rate": 4.851399163204691e-06, + "loss": 0.7064, + "step": 1447 + }, + { + "epoch": 0.7054238389087366, + "grad_norm": 2.911325216293335, + "learning_rate": 4.85118065795275e-06, + "loss": 0.7668, + "step": 1448 + }, + { + "epoch": 0.705911010068204, + "grad_norm": 3.0673491954803467, + "learning_rate": 4.850961997099892e-06, + "loss": 0.6906, + "step": 1449 + }, + { + "epoch": 0.7063981812276713, + "grad_norm": 3.0555496215820312, + "learning_rate": 4.850743180660589e-06, + "loss": 0.6346, + "step": 1450 + }, + { + "epoch": 0.7068853523871387, + "grad_norm": 3.0384905338287354, + "learning_rate": 4.8505242086493195e-06, + "loss": 0.68, + "step": 1451 + }, + { + "epoch": 0.7073725235466061, + "grad_norm": 2.8832099437713623, + "learning_rate": 4.850305081080578e-06, + "loss": 0.6522, + "step": 1452 + }, + { + "epoch": 0.7078596947060734, + "grad_norm": 2.875410795211792, + "learning_rate": 4.8500857979688655e-06, + "loss": 0.6517, + "step": 1453 + }, + { + "epoch": 0.7083468658655407, + "grad_norm": 2.6804556846618652, + "learning_rate": 4.849866359328692e-06, + "loss": 0.6175, + "step": 1454 + }, + { + "epoch": 0.7088340370250081, + "grad_norm": 2.869264841079712, + "learning_rate": 4.849646765174583e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.7093212081844755, + "grad_norm": 2.663464069366455, + "learning_rate": 4.84942701552107e-06, + "loss": 0.7199, + "step": 1456 + }, + { + "epoch": 0.7098083793439428, + "grad_norm": 2.6646957397460938, + "learning_rate": 4.849207110382695e-06, + "loss": 0.545, + "step": 1457 + }, + { + "epoch": 0.7102955505034102, + "grad_norm": 3.158817768096924, + "learning_rate": 4.848987049774012e-06, + "loss": 0.6958, + "step": 1458 + }, + { + "epoch": 0.7107827216628776, + "grad_norm": 2.73225998878479, + "learning_rate": 4.848766833709586e-06, + "loss": 0.6921, + "step": 1459 + }, + { + "epoch": 0.7112698928223449, + "grad_norm": 2.9388587474823, + "learning_rate": 4.848546462203989e-06, + "loss": 0.6774, + "step": 1460 + }, + { + "epoch": 0.7117570639818123, + "grad_norm": 2.9524388313293457, + "learning_rate": 4.848325935271806e-06, + "loss": 0.6179, + "step": 1461 + }, + { + "epoch": 0.7122442351412797, + "grad_norm": 2.6748063564300537, + "learning_rate": 4.848105252927632e-06, + "loss": 0.7091, + "step": 1462 + }, + { + "epoch": 0.712731406300747, + "grad_norm": 2.9452409744262695, + "learning_rate": 4.847884415186069e-06, + "loss": 0.5614, + "step": 1463 + }, + { + "epoch": 0.7132185774602143, + "grad_norm": 2.8729782104492188, + "learning_rate": 4.847663422061736e-06, + "loss": 0.6031, + "step": 1464 + }, + { + "epoch": 0.7137057486196817, + "grad_norm": 2.622037172317505, + "learning_rate": 4.847442273569255e-06, + "loss": 0.6478, + "step": 1465 + }, + { + "epoch": 0.7141929197791491, + "grad_norm": 2.6864190101623535, + "learning_rate": 4.847220969723265e-06, + "loss": 0.5952, + "step": 1466 + }, + { + "epoch": 0.7146800909386164, + "grad_norm": 2.8963305950164795, + "learning_rate": 4.846999510538407e-06, + "loss": 0.6792, + "step": 1467 + }, + { + "epoch": 0.7151672620980838, + "grad_norm": 2.9926562309265137, + "learning_rate": 4.846777896029341e-06, + "loss": 0.6347, + "step": 1468 + }, + { + "epoch": 0.7156544332575512, + "grad_norm": 3.092228412628174, + "learning_rate": 4.846556126210733e-06, + "loss": 0.6218, + "step": 1469 + }, + { + "epoch": 0.7161416044170185, + "grad_norm": 2.9564502239227295, + "learning_rate": 4.846334201097259e-06, + "loss": 0.6997, + "step": 1470 + }, + { + "epoch": 0.7166287755764859, + "grad_norm": 2.9941654205322266, + "learning_rate": 4.846112120703606e-06, + "loss": 0.7271, + "step": 1471 + }, + { + "epoch": 0.7171159467359532, + "grad_norm": 3.1310412883758545, + "learning_rate": 4.845889885044471e-06, + "loss": 0.6687, + "step": 1472 + }, + { + "epoch": 0.7176031178954206, + "grad_norm": 2.694955825805664, + "learning_rate": 4.845667494134561e-06, + "loss": 0.6239, + "step": 1473 + }, + { + "epoch": 0.7180902890548879, + "grad_norm": 2.753143548965454, + "learning_rate": 4.845444947988597e-06, + "loss": 0.6007, + "step": 1474 + }, + { + "epoch": 0.7185774602143553, + "grad_norm": 2.7791175842285156, + "learning_rate": 4.845222246621303e-06, + "loss": 0.6243, + "step": 1475 + }, + { + "epoch": 0.7190646313738227, + "grad_norm": 2.9054105281829834, + "learning_rate": 4.844999390047419e-06, + "loss": 0.7219, + "step": 1476 + }, + { + "epoch": 0.71955180253329, + "grad_norm": 2.682668447494507, + "learning_rate": 4.844776378281695e-06, + "loss": 0.6583, + "step": 1477 + }, + { + "epoch": 0.7200389736927574, + "grad_norm": 3.2399604320526123, + "learning_rate": 4.844553211338887e-06, + "loss": 0.6328, + "step": 1478 + }, + { + "epoch": 0.7205261448522248, + "grad_norm": 3.205878973007202, + "learning_rate": 4.8443298892337665e-06, + "loss": 0.7544, + "step": 1479 + }, + { + "epoch": 0.7210133160116922, + "grad_norm": 3.0898571014404297, + "learning_rate": 4.844106411981113e-06, + "loss": 0.7127, + "step": 1480 + }, + { + "epoch": 0.7215004871711594, + "grad_norm": 2.791027069091797, + "learning_rate": 4.843882779595715e-06, + "loss": 0.6829, + "step": 1481 + }, + { + "epoch": 0.7219876583306268, + "grad_norm": 2.757032632827759, + "learning_rate": 4.843658992092372e-06, + "loss": 0.6876, + "step": 1482 + }, + { + "epoch": 0.7224748294900942, + "grad_norm": 3.513105630874634, + "learning_rate": 4.843435049485896e-06, + "loss": 0.7534, + "step": 1483 + }, + { + "epoch": 0.7229620006495615, + "grad_norm": 2.889742136001587, + "learning_rate": 4.843210951791106e-06, + "loss": 0.6564, + "step": 1484 + }, + { + "epoch": 0.7234491718090289, + "grad_norm": 2.8261356353759766, + "learning_rate": 4.8429866990228345e-06, + "loss": 0.6473, + "step": 1485 + }, + { + "epoch": 0.7239363429684963, + "grad_norm": 2.7393062114715576, + "learning_rate": 4.8427622911959206e-06, + "loss": 0.7253, + "step": 1486 + }, + { + "epoch": 0.7244235141279636, + "grad_norm": 3.1168155670166016, + "learning_rate": 4.842537728325217e-06, + "loss": 0.7008, + "step": 1487 + }, + { + "epoch": 0.724910685287431, + "grad_norm": 2.7415263652801514, + "learning_rate": 4.842313010425585e-06, + "loss": 0.6246, + "step": 1488 + }, + { + "epoch": 0.7253978564468984, + "grad_norm": 3.3704833984375, + "learning_rate": 4.8420881375118944e-06, + "loss": 0.696, + "step": 1489 + }, + { + "epoch": 0.7258850276063658, + "grad_norm": 2.7834880352020264, + "learning_rate": 4.84186310959903e-06, + "loss": 0.6214, + "step": 1490 + }, + { + "epoch": 0.726372198765833, + "grad_norm": 3.021171808242798, + "learning_rate": 4.841637926701883e-06, + "loss": 0.6767, + "step": 1491 + }, + { + "epoch": 0.7268593699253004, + "grad_norm": 2.81209659576416, + "learning_rate": 4.841412588835356e-06, + "loss": 0.676, + "step": 1492 + }, + { + "epoch": 0.7273465410847678, + "grad_norm": 2.692965507507324, + "learning_rate": 4.841187096014362e-06, + "loss": 0.6305, + "step": 1493 + }, + { + "epoch": 0.7278337122442351, + "grad_norm": 2.5665605068206787, + "learning_rate": 4.840961448253825e-06, + "loss": 0.6421, + "step": 1494 + }, + { + "epoch": 0.7283208834037025, + "grad_norm": 3.0173540115356445, + "learning_rate": 4.8407356455686764e-06, + "loss": 0.654, + "step": 1495 + }, + { + "epoch": 0.7288080545631699, + "grad_norm": 2.666347026824951, + "learning_rate": 4.84050968797386e-06, + "loss": 0.7038, + "step": 1496 + }, + { + "epoch": 0.7292952257226373, + "grad_norm": 3.013444662094116, + "learning_rate": 4.840283575484331e-06, + "loss": 0.6809, + "step": 1497 + }, + { + "epoch": 0.7297823968821046, + "grad_norm": 2.8147640228271484, + "learning_rate": 4.8400573081150535e-06, + "loss": 0.5841, + "step": 1498 + }, + { + "epoch": 0.730269568041572, + "grad_norm": 3.1315267086029053, + "learning_rate": 4.839830885881001e-06, + "loss": 0.6188, + "step": 1499 + }, + { + "epoch": 0.7307567392010393, + "grad_norm": 2.8658041954040527, + "learning_rate": 4.839604308797158e-06, + "loss": 0.6378, + "step": 1500 + }, + { + "epoch": 0.7312439103605066, + "grad_norm": 2.8983044624328613, + "learning_rate": 4.8393775768785195e-06, + "loss": 0.6978, + "step": 1501 + }, + { + "epoch": 0.731731081519974, + "grad_norm": 2.7666687965393066, + "learning_rate": 4.839150690140091e-06, + "loss": 0.6399, + "step": 1502 + }, + { + "epoch": 0.7322182526794414, + "grad_norm": 3.2628889083862305, + "learning_rate": 4.838923648596889e-06, + "loss": 0.6056, + "step": 1503 + }, + { + "epoch": 0.7327054238389087, + "grad_norm": 2.7552928924560547, + "learning_rate": 4.838696452263936e-06, + "loss": 0.6332, + "step": 1504 + }, + { + "epoch": 0.7331925949983761, + "grad_norm": 2.7772674560546875, + "learning_rate": 4.83846910115627e-06, + "loss": 0.6601, + "step": 1505 + }, + { + "epoch": 0.7336797661578435, + "grad_norm": 3.1311991214752197, + "learning_rate": 4.838241595288936e-06, + "loss": 0.6823, + "step": 1506 + }, + { + "epoch": 0.7341669373173109, + "grad_norm": 2.922022819519043, + "learning_rate": 4.838013934676992e-06, + "loss": 0.6396, + "step": 1507 + }, + { + "epoch": 0.7346541084767781, + "grad_norm": 3.1259007453918457, + "learning_rate": 4.837786119335504e-06, + "loss": 0.6706, + "step": 1508 + }, + { + "epoch": 0.7351412796362455, + "grad_norm": 2.6733272075653076, + "learning_rate": 4.837558149279547e-06, + "loss": 0.6299, + "step": 1509 + }, + { + "epoch": 0.7356284507957129, + "grad_norm": 3.0329158306121826, + "learning_rate": 4.83733002452421e-06, + "loss": 0.6509, + "step": 1510 + }, + { + "epoch": 0.7361156219551802, + "grad_norm": 2.5678775310516357, + "learning_rate": 4.8371017450845895e-06, + "loss": 0.6443, + "step": 1511 + }, + { + "epoch": 0.7366027931146476, + "grad_norm": 2.8471391201019287, + "learning_rate": 4.836873310975793e-06, + "loss": 0.6501, + "step": 1512 + }, + { + "epoch": 0.737089964274115, + "grad_norm": 2.511535167694092, + "learning_rate": 4.836644722212939e-06, + "loss": 0.6498, + "step": 1513 + }, + { + "epoch": 0.7375771354335824, + "grad_norm": 2.6571738719940186, + "learning_rate": 4.8364159788111545e-06, + "loss": 0.6366, + "step": 1514 + }, + { + "epoch": 0.7380643065930497, + "grad_norm": 2.8165817260742188, + "learning_rate": 4.836187080785577e-06, + "loss": 0.6593, + "step": 1515 + }, + { + "epoch": 0.7385514777525171, + "grad_norm": 2.6863911151885986, + "learning_rate": 4.835958028151358e-06, + "loss": 0.6832, + "step": 1516 + }, + { + "epoch": 0.7390386489119845, + "grad_norm": 2.5124759674072266, + "learning_rate": 4.835728820923653e-06, + "loss": 0.6522, + "step": 1517 + }, + { + "epoch": 0.7395258200714517, + "grad_norm": 2.6282575130462646, + "learning_rate": 4.835499459117632e-06, + "loss": 0.7143, + "step": 1518 + }, + { + "epoch": 0.7400129912309191, + "grad_norm": 2.7077746391296387, + "learning_rate": 4.835269942748475e-06, + "loss": 0.6117, + "step": 1519 + }, + { + "epoch": 0.7405001623903865, + "grad_norm": 3.3443875312805176, + "learning_rate": 4.835040271831371e-06, + "loss": 0.7013, + "step": 1520 + }, + { + "epoch": 0.7409873335498538, + "grad_norm": 2.6956310272216797, + "learning_rate": 4.834810446381518e-06, + "loss": 0.6544, + "step": 1521 + }, + { + "epoch": 0.7414745047093212, + "grad_norm": 2.8815555572509766, + "learning_rate": 4.8345804664141275e-06, + "loss": 0.6673, + "step": 1522 + }, + { + "epoch": 0.7419616758687886, + "grad_norm": 2.6243109703063965, + "learning_rate": 4.83435033194442e-06, + "loss": 0.7107, + "step": 1523 + }, + { + "epoch": 0.742448847028256, + "grad_norm": 2.875157117843628, + "learning_rate": 4.834120042987623e-06, + "loss": 0.6691, + "step": 1524 + }, + { + "epoch": 0.7429360181877233, + "grad_norm": 2.654294729232788, + "learning_rate": 4.83388959955898e-06, + "loss": 0.6883, + "step": 1525 + }, + { + "epoch": 0.7434231893471906, + "grad_norm": 3.0916733741760254, + "learning_rate": 4.833659001673742e-06, + "loss": 0.6532, + "step": 1526 + }, + { + "epoch": 0.743910360506658, + "grad_norm": 2.5020391941070557, + "learning_rate": 4.833428249347167e-06, + "loss": 0.6059, + "step": 1527 + }, + { + "epoch": 0.7443975316661253, + "grad_norm": 2.667780637741089, + "learning_rate": 4.833197342594528e-06, + "loss": 0.7244, + "step": 1528 + }, + { + "epoch": 0.7448847028255927, + "grad_norm": 2.837090253829956, + "learning_rate": 4.832966281431106e-06, + "loss": 0.685, + "step": 1529 + }, + { + "epoch": 0.7453718739850601, + "grad_norm": 2.851560592651367, + "learning_rate": 4.832735065872193e-06, + "loss": 0.6687, + "step": 1530 + }, + { + "epoch": 0.7458590451445275, + "grad_norm": 3.315594434738159, + "learning_rate": 4.832503695933091e-06, + "loss": 0.6958, + "step": 1531 + }, + { + "epoch": 0.7463462163039948, + "grad_norm": 2.989652395248413, + "learning_rate": 4.832272171629111e-06, + "loss": 0.7733, + "step": 1532 + }, + { + "epoch": 0.7468333874634622, + "grad_norm": 2.7238001823425293, + "learning_rate": 4.832040492975576e-06, + "loss": 0.6528, + "step": 1533 + }, + { + "epoch": 0.7473205586229296, + "grad_norm": 2.9514448642730713, + "learning_rate": 4.831808659987818e-06, + "loss": 0.7389, + "step": 1534 + }, + { + "epoch": 0.7478077297823968, + "grad_norm": 2.8230507373809814, + "learning_rate": 4.831576672681181e-06, + "loss": 0.6996, + "step": 1535 + }, + { + "epoch": 0.7482949009418642, + "grad_norm": 2.536350727081299, + "learning_rate": 4.831344531071016e-06, + "loss": 0.5956, + "step": 1536 + }, + { + "epoch": 0.7487820721013316, + "grad_norm": 2.6593246459960938, + "learning_rate": 4.831112235172687e-06, + "loss": 0.7052, + "step": 1537 + }, + { + "epoch": 0.7492692432607989, + "grad_norm": 2.6742324829101562, + "learning_rate": 4.8308797850015675e-06, + "loss": 0.6793, + "step": 1538 + }, + { + "epoch": 0.7497564144202663, + "grad_norm": 2.714921474456787, + "learning_rate": 4.830647180573041e-06, + "loss": 0.6467, + "step": 1539 + }, + { + "epoch": 0.7502435855797337, + "grad_norm": 2.834484815597534, + "learning_rate": 4.830414421902501e-06, + "loss": 0.6377, + "step": 1540 + }, + { + "epoch": 0.7507307567392011, + "grad_norm": 2.7873177528381348, + "learning_rate": 4.830181509005352e-06, + "loss": 0.6271, + "step": 1541 + }, + { + "epoch": 0.7512179278986684, + "grad_norm": 2.8010365962982178, + "learning_rate": 4.829948441897008e-06, + "loss": 0.6796, + "step": 1542 + }, + { + "epoch": 0.7517050990581358, + "grad_norm": 2.6636765003204346, + "learning_rate": 4.829715220592892e-06, + "loss": 0.6452, + "step": 1543 + }, + { + "epoch": 0.7521922702176032, + "grad_norm": 2.5430893898010254, + "learning_rate": 4.82948184510844e-06, + "loss": 0.7341, + "step": 1544 + }, + { + "epoch": 0.7526794413770704, + "grad_norm": 2.8108255863189697, + "learning_rate": 4.829248315459096e-06, + "loss": 0.5981, + "step": 1545 + }, + { + "epoch": 0.7531666125365378, + "grad_norm": 3.0465662479400635, + "learning_rate": 4.8290146316603165e-06, + "loss": 0.65, + "step": 1546 + }, + { + "epoch": 0.7536537836960052, + "grad_norm": 2.579493761062622, + "learning_rate": 4.828780793727565e-06, + "loss": 0.6321, + "step": 1547 + }, + { + "epoch": 0.7541409548554725, + "grad_norm": 2.6736576557159424, + "learning_rate": 4.828546801676317e-06, + "loss": 0.625, + "step": 1548 + }, + { + "epoch": 0.7546281260149399, + "grad_norm": 3.0961058139801025, + "learning_rate": 4.828312655522059e-06, + "loss": 0.6716, + "step": 1549 + }, + { + "epoch": 0.7551152971744073, + "grad_norm": 2.784420967102051, + "learning_rate": 4.8280783552802865e-06, + "loss": 0.6665, + "step": 1550 + }, + { + "epoch": 0.7556024683338747, + "grad_norm": 2.897686004638672, + "learning_rate": 4.827843900966505e-06, + "loss": 0.6762, + "step": 1551 + }, + { + "epoch": 0.756089639493342, + "grad_norm": 3.0019984245300293, + "learning_rate": 4.827609292596231e-06, + "loss": 0.7046, + "step": 1552 + }, + { + "epoch": 0.7565768106528094, + "grad_norm": 2.8679444789886475, + "learning_rate": 4.827374530184991e-06, + "loss": 0.7177, + "step": 1553 + }, + { + "epoch": 0.7570639818122767, + "grad_norm": 2.8401541709899902, + "learning_rate": 4.827139613748321e-06, + "loss": 0.7278, + "step": 1554 + }, + { + "epoch": 0.757551152971744, + "grad_norm": 3.427031993865967, + "learning_rate": 4.826904543301767e-06, + "loss": 0.7396, + "step": 1555 + }, + { + "epoch": 0.7580383241312114, + "grad_norm": 2.789902925491333, + "learning_rate": 4.8266693188608885e-06, + "loss": 0.608, + "step": 1556 + }, + { + "epoch": 0.7585254952906788, + "grad_norm": 2.9815115928649902, + "learning_rate": 4.82643394044125e-06, + "loss": 0.7256, + "step": 1557 + }, + { + "epoch": 0.7590126664501462, + "grad_norm": 2.421722650527954, + "learning_rate": 4.8261984080584315e-06, + "loss": 0.6575, + "step": 1558 + }, + { + "epoch": 0.7594998376096135, + "grad_norm": 2.800110101699829, + "learning_rate": 4.825962721728018e-06, + "loss": 0.6227, + "step": 1559 + }, + { + "epoch": 0.7599870087690809, + "grad_norm": 2.864353656768799, + "learning_rate": 4.825726881465608e-06, + "loss": 0.6879, + "step": 1560 + }, + { + "epoch": 0.7604741799285483, + "grad_norm": 2.791027069091797, + "learning_rate": 4.82549088728681e-06, + "loss": 0.6308, + "step": 1561 + }, + { + "epoch": 0.7609613510880155, + "grad_norm": 2.907008647918701, + "learning_rate": 4.8252547392072415e-06, + "loss": 0.6207, + "step": 1562 + }, + { + "epoch": 0.7614485222474829, + "grad_norm": 2.851966381072998, + "learning_rate": 4.82501843724253e-06, + "loss": 0.7277, + "step": 1563 + }, + { + "epoch": 0.7619356934069503, + "grad_norm": 2.7802536487579346, + "learning_rate": 4.824781981408316e-06, + "loss": 0.616, + "step": 1564 + }, + { + "epoch": 0.7624228645664176, + "grad_norm": 2.7199723720550537, + "learning_rate": 4.824545371720246e-06, + "loss": 0.6827, + "step": 1565 + }, + { + "epoch": 0.762910035725885, + "grad_norm": 3.114469051361084, + "learning_rate": 4.82430860819398e-06, + "loss": 0.6612, + "step": 1566 + }, + { + "epoch": 0.7633972068853524, + "grad_norm": 3.0508153438568115, + "learning_rate": 4.824071690845188e-06, + "loss": 0.7538, + "step": 1567 + }, + { + "epoch": 0.7638843780448198, + "grad_norm": 3.1569244861602783, + "learning_rate": 4.823834619689547e-06, + "loss": 0.725, + "step": 1568 + }, + { + "epoch": 0.7643715492042871, + "grad_norm": 3.2019855976104736, + "learning_rate": 4.8235973947427475e-06, + "loss": 0.7207, + "step": 1569 + }, + { + "epoch": 0.7648587203637545, + "grad_norm": 2.993770122528076, + "learning_rate": 4.823360016020489e-06, + "loss": 0.6676, + "step": 1570 + }, + { + "epoch": 0.7653458915232219, + "grad_norm": 2.7239606380462646, + "learning_rate": 4.823122483538479e-06, + "loss": 0.6356, + "step": 1571 + }, + { + "epoch": 0.7658330626826891, + "grad_norm": 3.0987823009490967, + "learning_rate": 4.8228847973124415e-06, + "loss": 0.688, + "step": 1572 + }, + { + "epoch": 0.7663202338421565, + "grad_norm": 3.2940807342529297, + "learning_rate": 4.822646957358104e-06, + "loss": 0.7716, + "step": 1573 + }, + { + "epoch": 0.7668074050016239, + "grad_norm": 2.9178309440612793, + "learning_rate": 4.822408963691207e-06, + "loss": 0.7549, + "step": 1574 + }, + { + "epoch": 0.7672945761610913, + "grad_norm": 2.730964422225952, + "learning_rate": 4.822170816327501e-06, + "loss": 0.722, + "step": 1575 + }, + { + "epoch": 0.7677817473205586, + "grad_norm": 3.4310362339019775, + "learning_rate": 4.821932515282747e-06, + "loss": 0.644, + "step": 1576 + }, + { + "epoch": 0.768268918480026, + "grad_norm": 2.803546667098999, + "learning_rate": 4.8216940605727145e-06, + "loss": 0.5919, + "step": 1577 + }, + { + "epoch": 0.7687560896394934, + "grad_norm": 6.645774841308594, + "learning_rate": 4.8214554522131864e-06, + "loss": 0.6787, + "step": 1578 + }, + { + "epoch": 0.7692432607989607, + "grad_norm": 2.782576322555542, + "learning_rate": 4.8212166902199515e-06, + "loss": 0.6734, + "step": 1579 + }, + { + "epoch": 0.769730431958428, + "grad_norm": 2.883519172668457, + "learning_rate": 4.820977774608813e-06, + "loss": 0.6364, + "step": 1580 + }, + { + "epoch": 0.7702176031178954, + "grad_norm": 2.7184650897979736, + "learning_rate": 4.820738705395581e-06, + "loss": 0.6516, + "step": 1581 + }, + { + "epoch": 0.7707047742773627, + "grad_norm": 2.5719962120056152, + "learning_rate": 4.820499482596078e-06, + "loss": 0.6908, + "step": 1582 + }, + { + "epoch": 0.7711919454368301, + "grad_norm": 2.4087960720062256, + "learning_rate": 4.820260106226134e-06, + "loss": 0.6051, + "step": 1583 + }, + { + "epoch": 0.7716791165962975, + "grad_norm": 2.6729280948638916, + "learning_rate": 4.820020576301594e-06, + "loss": 0.6942, + "step": 1584 + }, + { + "epoch": 0.7721662877557649, + "grad_norm": 3.1016852855682373, + "learning_rate": 4.819780892838308e-06, + "loss": 0.6707, + "step": 1585 + }, + { + "epoch": 0.7726534589152322, + "grad_norm": 3.6113035678863525, + "learning_rate": 4.819541055852139e-06, + "loss": 0.675, + "step": 1586 + }, + { + "epoch": 0.7731406300746996, + "grad_norm": 3.012632369995117, + "learning_rate": 4.819301065358957e-06, + "loss": 0.6782, + "step": 1587 + }, + { + "epoch": 0.773627801234167, + "grad_norm": 3.3734211921691895, + "learning_rate": 4.819060921374648e-06, + "loss": 0.7019, + "step": 1588 + }, + { + "epoch": 0.7741149723936342, + "grad_norm": 2.9023215770721436, + "learning_rate": 4.8188206239151025e-06, + "loss": 0.6957, + "step": 1589 + }, + { + "epoch": 0.7746021435531016, + "grad_norm": 2.7459726333618164, + "learning_rate": 4.818580172996225e-06, + "loss": 0.6334, + "step": 1590 + }, + { + "epoch": 0.775089314712569, + "grad_norm": 3.2348151206970215, + "learning_rate": 4.818339568633926e-06, + "loss": 0.6794, + "step": 1591 + }, + { + "epoch": 0.7755764858720364, + "grad_norm": 3.11722731590271, + "learning_rate": 4.818098810844131e-06, + "loss": 0.7059, + "step": 1592 + }, + { + "epoch": 0.7760636570315037, + "grad_norm": 2.9740419387817383, + "learning_rate": 4.8178578996427726e-06, + "loss": 0.6744, + "step": 1593 + }, + { + "epoch": 0.7765508281909711, + "grad_norm": 2.9221930503845215, + "learning_rate": 4.8176168350457944e-06, + "loss": 0.7016, + "step": 1594 + }, + { + "epoch": 0.7770379993504385, + "grad_norm": 2.78851318359375, + "learning_rate": 4.81737561706915e-06, + "loss": 0.6404, + "step": 1595 + }, + { + "epoch": 0.7775251705099058, + "grad_norm": 2.93473744392395, + "learning_rate": 4.817134245728803e-06, + "loss": 0.6922, + "step": 1596 + }, + { + "epoch": 0.7780123416693732, + "grad_norm": 2.55315899848938, + "learning_rate": 4.816892721040727e-06, + "loss": 0.5716, + "step": 1597 + }, + { + "epoch": 0.7784995128288406, + "grad_norm": 2.800201416015625, + "learning_rate": 4.816651043020907e-06, + "loss": 0.6316, + "step": 1598 + }, + { + "epoch": 0.7789866839883078, + "grad_norm": 3.3347039222717285, + "learning_rate": 4.816409211685336e-06, + "loss": 0.6247, + "step": 1599 + }, + { + "epoch": 0.7794738551477752, + "grad_norm": 2.673733949661255, + "learning_rate": 4.816167227050019e-06, + "loss": 0.6242, + "step": 1600 + }, + { + "epoch": 0.7799610263072426, + "grad_norm": 2.917616844177246, + "learning_rate": 4.815925089130971e-06, + "loss": 0.7203, + "step": 1601 + }, + { + "epoch": 0.78044819746671, + "grad_norm": 3.0653932094573975, + "learning_rate": 4.815682797944217e-06, + "loss": 0.6736, + "step": 1602 + }, + { + "epoch": 0.7809353686261773, + "grad_norm": 2.825183153152466, + "learning_rate": 4.81544035350579e-06, + "loss": 0.6589, + "step": 1603 + }, + { + "epoch": 0.7814225397856447, + "grad_norm": 2.8591160774230957, + "learning_rate": 4.815197755831736e-06, + "loss": 0.7198, + "step": 1604 + }, + { + "epoch": 0.7819097109451121, + "grad_norm": 2.5680999755859375, + "learning_rate": 4.814955004938111e-06, + "loss": 0.6202, + "step": 1605 + }, + { + "epoch": 0.7823968821045794, + "grad_norm": 2.58031964302063, + "learning_rate": 4.814712100840979e-06, + "loss": 0.5976, + "step": 1606 + }, + { + "epoch": 0.7828840532640468, + "grad_norm": 2.5157158374786377, + "learning_rate": 4.814469043556416e-06, + "loss": 0.5787, + "step": 1607 + }, + { + "epoch": 0.7833712244235141, + "grad_norm": 2.9430766105651855, + "learning_rate": 4.814225833100507e-06, + "loss": 0.5407, + "step": 1608 + }, + { + "epoch": 0.7838583955829815, + "grad_norm": 2.859650135040283, + "learning_rate": 4.813982469489347e-06, + "loss": 0.6087, + "step": 1609 + }, + { + "epoch": 0.7843455667424488, + "grad_norm": 3.021313428878784, + "learning_rate": 4.813738952739043e-06, + "loss": 0.6627, + "step": 1610 + }, + { + "epoch": 0.7848327379019162, + "grad_norm": 3.2855145931243896, + "learning_rate": 4.813495282865711e-06, + "loss": 0.7216, + "step": 1611 + }, + { + "epoch": 0.7853199090613836, + "grad_norm": 2.8651838302612305, + "learning_rate": 4.813251459885476e-06, + "loss": 0.6459, + "step": 1612 + }, + { + "epoch": 0.7858070802208509, + "grad_norm": 2.792447090148926, + "learning_rate": 4.813007483814476e-06, + "loss": 0.6208, + "step": 1613 + }, + { + "epoch": 0.7862942513803183, + "grad_norm": 2.921827554702759, + "learning_rate": 4.812763354668855e-06, + "loss": 0.6705, + "step": 1614 + }, + { + "epoch": 0.7867814225397857, + "grad_norm": 2.7905936241149902, + "learning_rate": 4.812519072464771e-06, + "loss": 0.6474, + "step": 1615 + }, + { + "epoch": 0.787268593699253, + "grad_norm": 2.9696950912475586, + "learning_rate": 4.81227463721839e-06, + "loss": 0.6107, + "step": 1616 + }, + { + "epoch": 0.7877557648587203, + "grad_norm": 2.802077293395996, + "learning_rate": 4.812030048945889e-06, + "loss": 0.6159, + "step": 1617 + }, + { + "epoch": 0.7882429360181877, + "grad_norm": 2.9984991550445557, + "learning_rate": 4.811785307663454e-06, + "loss": 0.7765, + "step": 1618 + }, + { + "epoch": 0.7887301071776551, + "grad_norm": 2.675612211227417, + "learning_rate": 4.811540413387283e-06, + "loss": 0.6349, + "step": 1619 + }, + { + "epoch": 0.7892172783371224, + "grad_norm": 5.121064186096191, + "learning_rate": 4.811295366133583e-06, + "loss": 0.6336, + "step": 1620 + }, + { + "epoch": 0.7897044494965898, + "grad_norm": 3.219654083251953, + "learning_rate": 4.811050165918571e-06, + "loss": 0.6941, + "step": 1621 + }, + { + "epoch": 0.7901916206560572, + "grad_norm": 3.1843581199645996, + "learning_rate": 4.810804812758474e-06, + "loss": 0.6916, + "step": 1622 + }, + { + "epoch": 0.7906787918155245, + "grad_norm": 2.4533796310424805, + "learning_rate": 4.81055930666953e-06, + "loss": 0.5868, + "step": 1623 + }, + { + "epoch": 0.7911659629749919, + "grad_norm": 3.0580852031707764, + "learning_rate": 4.810313647667986e-06, + "loss": 0.7319, + "step": 1624 + }, + { + "epoch": 0.7916531341344593, + "grad_norm": 2.6133460998535156, + "learning_rate": 4.810067835770099e-06, + "loss": 0.7059, + "step": 1625 + }, + { + "epoch": 0.7921403052939266, + "grad_norm": 3.003751277923584, + "learning_rate": 4.809821870992139e-06, + "loss": 0.604, + "step": 1626 + }, + { + "epoch": 0.7926274764533939, + "grad_norm": 4.22611665725708, + "learning_rate": 4.809575753350382e-06, + "loss": 0.6897, + "step": 1627 + }, + { + "epoch": 0.7931146476128613, + "grad_norm": 2.6755928993225098, + "learning_rate": 4.809329482861117e-06, + "loss": 0.6886, + "step": 1628 + }, + { + "epoch": 0.7936018187723287, + "grad_norm": 2.8718373775482178, + "learning_rate": 4.8090830595406415e-06, + "loss": 0.7454, + "step": 1629 + }, + { + "epoch": 0.794088989931796, + "grad_norm": 2.6615827083587646, + "learning_rate": 4.8088364834052645e-06, + "loss": 0.6024, + "step": 1630 + }, + { + "epoch": 0.7945761610912634, + "grad_norm": 2.618401288986206, + "learning_rate": 4.808589754471304e-06, + "loss": 0.6236, + "step": 1631 + }, + { + "epoch": 0.7950633322507308, + "grad_norm": 2.824303388595581, + "learning_rate": 4.808342872755088e-06, + "loss": 0.6126, + "step": 1632 + }, + { + "epoch": 0.7955505034101981, + "grad_norm": 3.2916085720062256, + "learning_rate": 4.808095838272957e-06, + "loss": 0.6251, + "step": 1633 + }, + { + "epoch": 0.7960376745696655, + "grad_norm": 3.000215768814087, + "learning_rate": 4.807848651041257e-06, + "loss": 0.6149, + "step": 1634 + }, + { + "epoch": 0.7965248457291328, + "grad_norm": 2.919588327407837, + "learning_rate": 4.807601311076349e-06, + "loss": 0.6584, + "step": 1635 + }, + { + "epoch": 0.7970120168886002, + "grad_norm": 2.8931100368499756, + "learning_rate": 4.807353818394601e-06, + "loss": 0.695, + "step": 1636 + }, + { + "epoch": 0.7974991880480675, + "grad_norm": 2.945324420928955, + "learning_rate": 4.807106173012392e-06, + "loss": 0.6442, + "step": 1637 + }, + { + "epoch": 0.7979863592075349, + "grad_norm": 2.8930301666259766, + "learning_rate": 4.806858374946111e-06, + "loss": 0.6647, + "step": 1638 + }, + { + "epoch": 0.7984735303670023, + "grad_norm": 3.0065932273864746, + "learning_rate": 4.8066104242121584e-06, + "loss": 0.688, + "step": 1639 + }, + { + "epoch": 0.7989607015264696, + "grad_norm": 2.9815316200256348, + "learning_rate": 4.806362320826941e-06, + "loss": 0.6447, + "step": 1640 + }, + { + "epoch": 0.799447872685937, + "grad_norm": 2.6405844688415527, + "learning_rate": 4.806114064806882e-06, + "loss": 0.6736, + "step": 1641 + }, + { + "epoch": 0.7999350438454044, + "grad_norm": 2.864736318588257, + "learning_rate": 4.805865656168408e-06, + "loss": 0.6198, + "step": 1642 + }, + { + "epoch": 0.8004222150048718, + "grad_norm": 3.1961493492126465, + "learning_rate": 4.805617094927959e-06, + "loss": 0.7012, + "step": 1643 + }, + { + "epoch": 0.800909386164339, + "grad_norm": 2.494650363922119, + "learning_rate": 4.805368381101986e-06, + "loss": 0.5841, + "step": 1644 + }, + { + "epoch": 0.8013965573238064, + "grad_norm": 5.9223456382751465, + "learning_rate": 4.805119514706948e-06, + "loss": 0.687, + "step": 1645 + }, + { + "epoch": 0.8018837284832738, + "grad_norm": 2.844710350036621, + "learning_rate": 4.8048704957593165e-06, + "loss": 0.6228, + "step": 1646 + }, + { + "epoch": 0.8023708996427411, + "grad_norm": 2.8228306770324707, + "learning_rate": 4.804621324275568e-06, + "loss": 0.6701, + "step": 1647 + }, + { + "epoch": 0.8028580708022085, + "grad_norm": 2.7788565158843994, + "learning_rate": 4.804372000272196e-06, + "loss": 0.7454, + "step": 1648 + }, + { + "epoch": 0.8033452419616759, + "grad_norm": 2.6730830669403076, + "learning_rate": 4.8041225237657e-06, + "loss": 0.5959, + "step": 1649 + }, + { + "epoch": 0.8038324131211432, + "grad_norm": 2.7615954875946045, + "learning_rate": 4.803872894772589e-06, + "loss": 0.6909, + "step": 1650 + }, + { + "epoch": 0.8043195842806106, + "grad_norm": 2.639944553375244, + "learning_rate": 4.803623113309385e-06, + "loss": 0.6173, + "step": 1651 + }, + { + "epoch": 0.804806755440078, + "grad_norm": 2.7126355171203613, + "learning_rate": 4.803373179392618e-06, + "loss": 0.6077, + "step": 1652 + }, + { + "epoch": 0.8052939265995454, + "grad_norm": 2.471296787261963, + "learning_rate": 4.8031230930388284e-06, + "loss": 0.5768, + "step": 1653 + }, + { + "epoch": 0.8057810977590126, + "grad_norm": 2.8603663444519043, + "learning_rate": 4.802872854264567e-06, + "loss": 0.7256, + "step": 1654 + }, + { + "epoch": 0.80626826891848, + "grad_norm": 3.2085132598876953, + "learning_rate": 4.802622463086393e-06, + "loss": 0.6516, + "step": 1655 + }, + { + "epoch": 0.8067554400779474, + "grad_norm": 2.839597463607788, + "learning_rate": 4.802371919520881e-06, + "loss": 0.6728, + "step": 1656 + }, + { + "epoch": 0.8072426112374147, + "grad_norm": 2.851947546005249, + "learning_rate": 4.802121223584609e-06, + "loss": 0.6168, + "step": 1657 + }, + { + "epoch": 0.8077297823968821, + "grad_norm": 2.7272493839263916, + "learning_rate": 4.801870375294169e-06, + "loss": 0.7614, + "step": 1658 + }, + { + "epoch": 0.8082169535563495, + "grad_norm": 2.8600902557373047, + "learning_rate": 4.801619374666161e-06, + "loss": 0.6429, + "step": 1659 + }, + { + "epoch": 0.8087041247158169, + "grad_norm": 3.076272964477539, + "learning_rate": 4.801368221717198e-06, + "loss": 0.6932, + "step": 1660 + }, + { + "epoch": 0.8091912958752842, + "grad_norm": 2.8368048667907715, + "learning_rate": 4.8011169164639e-06, + "loss": 0.6803, + "step": 1661 + }, + { + "epoch": 0.8096784670347515, + "grad_norm": 3.0479695796966553, + "learning_rate": 4.800865458922899e-06, + "loss": 0.6963, + "step": 1662 + }, + { + "epoch": 0.8101656381942189, + "grad_norm": 2.466048240661621, + "learning_rate": 4.800613849110836e-06, + "loss": 0.6574, + "step": 1663 + }, + { + "epoch": 0.8106528093536862, + "grad_norm": 3.2765090465545654, + "learning_rate": 4.8003620870443625e-06, + "loss": 0.6094, + "step": 1664 + }, + { + "epoch": 0.8111399805131536, + "grad_norm": 2.859508514404297, + "learning_rate": 4.800110172740141e-06, + "loss": 0.648, + "step": 1665 + }, + { + "epoch": 0.811627151672621, + "grad_norm": 2.965350389480591, + "learning_rate": 4.799858106214842e-06, + "loss": 0.661, + "step": 1666 + }, + { + "epoch": 0.8121143228320883, + "grad_norm": 3.0386836528778076, + "learning_rate": 4.799605887485148e-06, + "loss": 0.6512, + "step": 1667 + }, + { + "epoch": 0.8126014939915557, + "grad_norm": 2.629664897918701, + "learning_rate": 4.79935351656775e-06, + "loss": 0.6066, + "step": 1668 + }, + { + "epoch": 0.8130886651510231, + "grad_norm": 2.691894054412842, + "learning_rate": 4.79910099347935e-06, + "loss": 0.6889, + "step": 1669 + }, + { + "epoch": 0.8135758363104905, + "grad_norm": 2.5522191524505615, + "learning_rate": 4.798848318236662e-06, + "loss": 0.6397, + "step": 1670 + }, + { + "epoch": 0.8140630074699577, + "grad_norm": 2.719240665435791, + "learning_rate": 4.798595490856405e-06, + "loss": 0.6237, + "step": 1671 + }, + { + "epoch": 0.8145501786294251, + "grad_norm": 2.540175676345825, + "learning_rate": 4.7983425113553126e-06, + "loss": 0.6108, + "step": 1672 + }, + { + "epoch": 0.8150373497888925, + "grad_norm": 3.2215304374694824, + "learning_rate": 4.798089379750127e-06, + "loss": 0.659, + "step": 1673 + }, + { + "epoch": 0.8155245209483598, + "grad_norm": 2.942089557647705, + "learning_rate": 4.7978360960576e-06, + "loss": 0.6808, + "step": 1674 + }, + { + "epoch": 0.8160116921078272, + "grad_norm": 2.968858003616333, + "learning_rate": 4.7975826602944945e-06, + "loss": 0.5858, + "step": 1675 + }, + { + "epoch": 0.8164988632672946, + "grad_norm": 2.9430527687072754, + "learning_rate": 4.797329072477582e-06, + "loss": 0.6862, + "step": 1676 + }, + { + "epoch": 0.816986034426762, + "grad_norm": 2.4118993282318115, + "learning_rate": 4.797075332623646e-06, + "loss": 0.6771, + "step": 1677 + }, + { + "epoch": 0.8174732055862293, + "grad_norm": 3.6525840759277344, + "learning_rate": 4.796821440749479e-06, + "loss": 0.5734, + "step": 1678 + }, + { + "epoch": 0.8179603767456967, + "grad_norm": 2.948838949203491, + "learning_rate": 4.796567396871881e-06, + "loss": 0.6684, + "step": 1679 + }, + { + "epoch": 0.818447547905164, + "grad_norm": 3.000809669494629, + "learning_rate": 4.796313201007667e-06, + "loss": 0.7151, + "step": 1680 + }, + { + "epoch": 0.8189347190646313, + "grad_norm": 2.64255690574646, + "learning_rate": 4.796058853173661e-06, + "loss": 0.685, + "step": 1681 + }, + { + "epoch": 0.8194218902240987, + "grad_norm": 2.7932355403900146, + "learning_rate": 4.7958043533866925e-06, + "loss": 0.6699, + "step": 1682 + }, + { + "epoch": 0.8199090613835661, + "grad_norm": 3.270399808883667, + "learning_rate": 4.795549701663605e-06, + "loss": 0.677, + "step": 1683 + }, + { + "epoch": 0.8203962325430334, + "grad_norm": 2.8925912380218506, + "learning_rate": 4.795294898021253e-06, + "loss": 0.667, + "step": 1684 + }, + { + "epoch": 0.8208834037025008, + "grad_norm": 2.645648241043091, + "learning_rate": 4.7950399424764985e-06, + "loss": 0.6466, + "step": 1685 + }, + { + "epoch": 0.8213705748619682, + "grad_norm": 2.456831455230713, + "learning_rate": 4.7947848350462145e-06, + "loss": 0.6259, + "step": 1686 + }, + { + "epoch": 0.8218577460214356, + "grad_norm": 3.1084372997283936, + "learning_rate": 4.794529575747283e-06, + "loss": 0.7106, + "step": 1687 + }, + { + "epoch": 0.8223449171809029, + "grad_norm": 2.658613920211792, + "learning_rate": 4.794274164596598e-06, + "loss": 0.6251, + "step": 1688 + }, + { + "epoch": 0.8228320883403702, + "grad_norm": 2.527292490005493, + "learning_rate": 4.794018601611063e-06, + "loss": 0.6477, + "step": 1689 + }, + { + "epoch": 0.8233192594998376, + "grad_norm": 2.570606231689453, + "learning_rate": 4.79376288680759e-06, + "loss": 0.6418, + "step": 1690 + }, + { + "epoch": 0.8238064306593049, + "grad_norm": 2.813328266143799, + "learning_rate": 4.793507020203104e-06, + "loss": 0.656, + "step": 1691 + }, + { + "epoch": 0.8242936018187723, + "grad_norm": 2.7522871494293213, + "learning_rate": 4.793251001814536e-06, + "loss": 0.6827, + "step": 1692 + }, + { + "epoch": 0.8247807729782397, + "grad_norm": 2.709200143814087, + "learning_rate": 4.79299483165883e-06, + "loss": 0.7248, + "step": 1693 + }, + { + "epoch": 0.8252679441377071, + "grad_norm": 2.8823297023773193, + "learning_rate": 4.792738509752939e-06, + "loss": 0.6374, + "step": 1694 + }, + { + "epoch": 0.8257551152971744, + "grad_norm": 2.75046706199646, + "learning_rate": 4.792482036113828e-06, + "loss": 0.6489, + "step": 1695 + }, + { + "epoch": 0.8262422864566418, + "grad_norm": 2.9617245197296143, + "learning_rate": 4.792225410758469e-06, + "loss": 0.6444, + "step": 1696 + }, + { + "epoch": 0.8267294576161092, + "grad_norm": 2.7676427364349365, + "learning_rate": 4.791968633703846e-06, + "loss": 0.6245, + "step": 1697 + }, + { + "epoch": 0.8272166287755764, + "grad_norm": 3.1251511573791504, + "learning_rate": 4.791711704966952e-06, + "loss": 0.5886, + "step": 1698 + }, + { + "epoch": 0.8277037999350438, + "grad_norm": 2.8801345825195312, + "learning_rate": 4.791454624564791e-06, + "loss": 0.7293, + "step": 1699 + }, + { + "epoch": 0.8281909710945112, + "grad_norm": 2.5419859886169434, + "learning_rate": 4.791197392514376e-06, + "loss": 0.6816, + "step": 1700 + }, + { + "epoch": 0.8286781422539785, + "grad_norm": 2.7969753742218018, + "learning_rate": 4.79094000883273e-06, + "loss": 0.658, + "step": 1701 + }, + { + "epoch": 0.8291653134134459, + "grad_norm": 3.2935400009155273, + "learning_rate": 4.790682473536889e-06, + "loss": 0.6772, + "step": 1702 + }, + { + "epoch": 0.8296524845729133, + "grad_norm": 2.763822555541992, + "learning_rate": 4.790424786643896e-06, + "loss": 0.5977, + "step": 1703 + }, + { + "epoch": 0.8301396557323807, + "grad_norm": 3.905242681503296, + "learning_rate": 4.790166948170803e-06, + "loss": 0.6054, + "step": 1704 + }, + { + "epoch": 0.830626826891848, + "grad_norm": 2.8511135578155518, + "learning_rate": 4.789908958134673e-06, + "loss": 0.7088, + "step": 1705 + }, + { + "epoch": 0.8311139980513154, + "grad_norm": 2.924529790878296, + "learning_rate": 4.7896508165525835e-06, + "loss": 0.7028, + "step": 1706 + }, + { + "epoch": 0.8316011692107828, + "grad_norm": 2.8980789184570312, + "learning_rate": 4.789392523441615e-06, + "loss": 0.6058, + "step": 1707 + }, + { + "epoch": 0.83208834037025, + "grad_norm": 4.417017936706543, + "learning_rate": 4.789134078818864e-06, + "loss": 0.6303, + "step": 1708 + }, + { + "epoch": 0.8325755115297174, + "grad_norm": 2.8203632831573486, + "learning_rate": 4.78887548270143e-06, + "loss": 0.6967, + "step": 1709 + }, + { + "epoch": 0.8330626826891848, + "grad_norm": 2.4767041206359863, + "learning_rate": 4.788616735106432e-06, + "loss": 0.6293, + "step": 1710 + }, + { + "epoch": 0.8335498538486522, + "grad_norm": 2.8282806873321533, + "learning_rate": 4.788357836050991e-06, + "loss": 0.7015, + "step": 1711 + }, + { + "epoch": 0.8340370250081195, + "grad_norm": 2.5334019660949707, + "learning_rate": 4.788098785552242e-06, + "loss": 0.6556, + "step": 1712 + }, + { + "epoch": 0.8345241961675869, + "grad_norm": 2.8480947017669678, + "learning_rate": 4.787839583627328e-06, + "loss": 0.6192, + "step": 1713 + }, + { + "epoch": 0.8350113673270543, + "grad_norm": 2.5977208614349365, + "learning_rate": 4.787580230293403e-06, + "loss": 0.6811, + "step": 1714 + }, + { + "epoch": 0.8354985384865216, + "grad_norm": 2.775914192199707, + "learning_rate": 4.787320725567632e-06, + "loss": 0.6354, + "step": 1715 + }, + { + "epoch": 0.835985709645989, + "grad_norm": 2.7444751262664795, + "learning_rate": 4.787061069467188e-06, + "loss": 0.6361, + "step": 1716 + }, + { + "epoch": 0.8364728808054563, + "grad_norm": 2.5416295528411865, + "learning_rate": 4.786801262009255e-06, + "loss": 0.7186, + "step": 1717 + }, + { + "epoch": 0.8369600519649236, + "grad_norm": 2.830871820449829, + "learning_rate": 4.786541303211028e-06, + "loss": 0.6871, + "step": 1718 + }, + { + "epoch": 0.837447223124391, + "grad_norm": 2.6855006217956543, + "learning_rate": 4.786281193089711e-06, + "loss": 0.6167, + "step": 1719 + }, + { + "epoch": 0.8379343942838584, + "grad_norm": 3.150886058807373, + "learning_rate": 4.786020931662516e-06, + "loss": 0.7514, + "step": 1720 + }, + { + "epoch": 0.8384215654433258, + "grad_norm": 2.7723240852355957, + "learning_rate": 4.78576051894667e-06, + "loss": 0.7308, + "step": 1721 + }, + { + "epoch": 0.8389087366027931, + "grad_norm": 3.0975492000579834, + "learning_rate": 4.785499954959405e-06, + "loss": 0.6373, + "step": 1722 + }, + { + "epoch": 0.8393959077622605, + "grad_norm": 2.422149896621704, + "learning_rate": 4.785239239717967e-06, + "loss": 0.5701, + "step": 1723 + }, + { + "epoch": 0.8398830789217279, + "grad_norm": 2.4786500930786133, + "learning_rate": 4.784978373239608e-06, + "loss": 0.6008, + "step": 1724 + }, + { + "epoch": 0.8403702500811951, + "grad_norm": 2.9390981197357178, + "learning_rate": 4.7847173555415925e-06, + "loss": 0.6675, + "step": 1725 + }, + { + "epoch": 0.8408574212406625, + "grad_norm": 2.875692844390869, + "learning_rate": 4.784456186641196e-06, + "loss": 0.569, + "step": 1726 + }, + { + "epoch": 0.8413445924001299, + "grad_norm": 3.093482494354248, + "learning_rate": 4.7841948665557016e-06, + "loss": 0.7123, + "step": 1727 + }, + { + "epoch": 0.8418317635595973, + "grad_norm": 2.536482572555542, + "learning_rate": 4.783933395302404e-06, + "loss": 0.6005, + "step": 1728 + }, + { + "epoch": 0.8423189347190646, + "grad_norm": 2.5597527027130127, + "learning_rate": 4.783671772898606e-06, + "loss": 0.6542, + "step": 1729 + }, + { + "epoch": 0.842806105878532, + "grad_norm": 2.620718240737915, + "learning_rate": 4.783409999361623e-06, + "loss": 0.6575, + "step": 1730 + }, + { + "epoch": 0.8432932770379994, + "grad_norm": 2.756053924560547, + "learning_rate": 4.783148074708779e-06, + "loss": 0.5432, + "step": 1731 + }, + { + "epoch": 0.8437804481974667, + "grad_norm": 2.7816274166107178, + "learning_rate": 4.782885998957409e-06, + "loss": 0.6815, + "step": 1732 + }, + { + "epoch": 0.8442676193569341, + "grad_norm": 2.8159849643707275, + "learning_rate": 4.782623772124854e-06, + "loss": 0.6504, + "step": 1733 + }, + { + "epoch": 0.8447547905164015, + "grad_norm": 2.6799442768096924, + "learning_rate": 4.7823613942284716e-06, + "loss": 0.5921, + "step": 1734 + }, + { + "epoch": 0.8452419616758687, + "grad_norm": 2.91845965385437, + "learning_rate": 4.782098865285625e-06, + "loss": 0.6547, + "step": 1735 + }, + { + "epoch": 0.8457291328353361, + "grad_norm": 2.841020107269287, + "learning_rate": 4.781836185313688e-06, + "loss": 0.6769, + "step": 1736 + }, + { + "epoch": 0.8462163039948035, + "grad_norm": 2.8927102088928223, + "learning_rate": 4.7815733543300445e-06, + "loss": 0.6489, + "step": 1737 + }, + { + "epoch": 0.8467034751542709, + "grad_norm": 2.9709253311157227, + "learning_rate": 4.781310372352089e-06, + "loss": 0.6884, + "step": 1738 + }, + { + "epoch": 0.8471906463137382, + "grad_norm": 2.7702372074127197, + "learning_rate": 4.781047239397225e-06, + "loss": 0.6814, + "step": 1739 + }, + { + "epoch": 0.8476778174732056, + "grad_norm": 2.652073860168457, + "learning_rate": 4.780783955482868e-06, + "loss": 0.565, + "step": 1740 + }, + { + "epoch": 0.848164988632673, + "grad_norm": 4.970697402954102, + "learning_rate": 4.78052052062644e-06, + "loss": 0.6543, + "step": 1741 + }, + { + "epoch": 0.8486521597921403, + "grad_norm": 2.732276201248169, + "learning_rate": 4.780256934845377e-06, + "loss": 0.5845, + "step": 1742 + }, + { + "epoch": 0.8491393309516077, + "grad_norm": 2.839522123336792, + "learning_rate": 4.779993198157122e-06, + "loss": 0.5932, + "step": 1743 + }, + { + "epoch": 0.849626502111075, + "grad_norm": 2.8928401470184326, + "learning_rate": 4.77972931057913e-06, + "loss": 0.6466, + "step": 1744 + }, + { + "epoch": 0.8501136732705424, + "grad_norm": 2.836385726928711, + "learning_rate": 4.779465272128864e-06, + "loss": 0.6464, + "step": 1745 + }, + { + "epoch": 0.8506008444300097, + "grad_norm": 2.9475929737091064, + "learning_rate": 4.779201082823799e-06, + "loss": 0.6323, + "step": 1746 + }, + { + "epoch": 0.8510880155894771, + "grad_norm": 3.0073442459106445, + "learning_rate": 4.778936742681418e-06, + "loss": 0.6492, + "step": 1747 + }, + { + "epoch": 0.8515751867489445, + "grad_norm": 2.7641022205352783, + "learning_rate": 4.7786722517192164e-06, + "loss": 0.6745, + "step": 1748 + }, + { + "epoch": 0.8520623579084118, + "grad_norm": 2.70119047164917, + "learning_rate": 4.7784076099546974e-06, + "loss": 0.6191, + "step": 1749 + }, + { + "epoch": 0.8525495290678792, + "grad_norm": 2.6288323402404785, + "learning_rate": 4.778142817405374e-06, + "loss": 0.6878, + "step": 1750 + }, + { + "epoch": 0.8530367002273466, + "grad_norm": 3.6958134174346924, + "learning_rate": 4.777877874088771e-06, + "loss": 0.6031, + "step": 1751 + }, + { + "epoch": 0.8535238713868138, + "grad_norm": 2.5743751525878906, + "learning_rate": 4.7776127800224235e-06, + "loss": 0.5943, + "step": 1752 + }, + { + "epoch": 0.8540110425462812, + "grad_norm": 7.183384418487549, + "learning_rate": 4.777347535223874e-06, + "loss": 0.6392, + "step": 1753 + }, + { + "epoch": 0.8544982137057486, + "grad_norm": 2.6348230838775635, + "learning_rate": 4.777082139710677e-06, + "loss": 0.6458, + "step": 1754 + }, + { + "epoch": 0.854985384865216, + "grad_norm": 2.880856990814209, + "learning_rate": 4.776816593500395e-06, + "loss": 0.6101, + "step": 1755 + }, + { + "epoch": 0.8554725560246833, + "grad_norm": 2.4800682067871094, + "learning_rate": 4.776550896610603e-06, + "loss": 0.6543, + "step": 1756 + }, + { + "epoch": 0.8559597271841507, + "grad_norm": 2.6741621494293213, + "learning_rate": 4.7762850490588855e-06, + "loss": 0.6151, + "step": 1757 + }, + { + "epoch": 0.8564468983436181, + "grad_norm": 2.6714537143707275, + "learning_rate": 4.776019050862834e-06, + "loss": 0.5673, + "step": 1758 + }, + { + "epoch": 0.8569340695030854, + "grad_norm": 2.5318028926849365, + "learning_rate": 4.775752902040056e-06, + "loss": 0.6269, + "step": 1759 + }, + { + "epoch": 0.8574212406625528, + "grad_norm": 3.5664288997650146, + "learning_rate": 4.775486602608162e-06, + "loss": 0.7008, + "step": 1760 + }, + { + "epoch": 0.8579084118220202, + "grad_norm": 2.5818753242492676, + "learning_rate": 4.775220152584775e-06, + "loss": 0.6232, + "step": 1761 + }, + { + "epoch": 0.8583955829814875, + "grad_norm": 5.0362749099731445, + "learning_rate": 4.774953551987531e-06, + "loss": 0.6317, + "step": 1762 + }, + { + "epoch": 0.8588827541409548, + "grad_norm": 2.8132097721099854, + "learning_rate": 4.774686800834073e-06, + "loss": 0.6576, + "step": 1763 + }, + { + "epoch": 0.8593699253004222, + "grad_norm": 2.89772367477417, + "learning_rate": 4.774419899142054e-06, + "loss": 0.6917, + "step": 1764 + }, + { + "epoch": 0.8598570964598896, + "grad_norm": 2.686187982559204, + "learning_rate": 4.774152846929139e-06, + "loss": 0.68, + "step": 1765 + }, + { + "epoch": 0.8603442676193569, + "grad_norm": 2.6978094577789307, + "learning_rate": 4.7738856442129985e-06, + "loss": 0.6086, + "step": 1766 + }, + { + "epoch": 0.8608314387788243, + "grad_norm": 3.1704514026641846, + "learning_rate": 4.77361829101132e-06, + "loss": 0.597, + "step": 1767 + }, + { + "epoch": 0.8613186099382917, + "grad_norm": 2.7017416954040527, + "learning_rate": 4.773350787341792e-06, + "loss": 0.6613, + "step": 1768 + }, + { + "epoch": 0.861805781097759, + "grad_norm": 3.0749270915985107, + "learning_rate": 4.773083133222123e-06, + "loss": 0.7056, + "step": 1769 + }, + { + "epoch": 0.8622929522572264, + "grad_norm": 3.2618939876556396, + "learning_rate": 4.772815328670023e-06, + "loss": 0.6211, + "step": 1770 + }, + { + "epoch": 0.8627801234166937, + "grad_norm": 3.171116828918457, + "learning_rate": 4.772547373703217e-06, + "loss": 0.7594, + "step": 1771 + }, + { + "epoch": 0.8632672945761611, + "grad_norm": 2.786775827407837, + "learning_rate": 4.7722792683394355e-06, + "loss": 0.6655, + "step": 1772 + }, + { + "epoch": 0.8637544657356284, + "grad_norm": 2.6025044918060303, + "learning_rate": 4.7720110125964255e-06, + "loss": 0.5437, + "step": 1773 + }, + { + "epoch": 0.8642416368950958, + "grad_norm": 2.623366117477417, + "learning_rate": 4.771742606491937e-06, + "loss": 0.597, + "step": 1774 + }, + { + "epoch": 0.8647288080545632, + "grad_norm": 2.8571362495422363, + "learning_rate": 4.771474050043735e-06, + "loss": 0.6992, + "step": 1775 + }, + { + "epoch": 0.8652159792140305, + "grad_norm": 2.5834884643554688, + "learning_rate": 4.771205343269592e-06, + "loss": 0.6104, + "step": 1776 + }, + { + "epoch": 0.8657031503734979, + "grad_norm": 2.577981948852539, + "learning_rate": 4.770936486187291e-06, + "loss": 0.5903, + "step": 1777 + }, + { + "epoch": 0.8661903215329653, + "grad_norm": 2.9371185302734375, + "learning_rate": 4.7706674788146245e-06, + "loss": 0.6574, + "step": 1778 + }, + { + "epoch": 0.8666774926924327, + "grad_norm": 2.5136783123016357, + "learning_rate": 4.770398321169396e-06, + "loss": 0.5599, + "step": 1779 + }, + { + "epoch": 0.8671646638518999, + "grad_norm": 3.0917625427246094, + "learning_rate": 4.770129013269417e-06, + "loss": 0.6692, + "step": 1780 + }, + { + "epoch": 0.8676518350113673, + "grad_norm": 2.73344349861145, + "learning_rate": 4.769859555132512e-06, + "loss": 0.6524, + "step": 1781 + }, + { + "epoch": 0.8681390061708347, + "grad_norm": 3.0138654708862305, + "learning_rate": 4.769589946776513e-06, + "loss": 0.6679, + "step": 1782 + }, + { + "epoch": 0.868626177330302, + "grad_norm": 2.960054636001587, + "learning_rate": 4.769320188219263e-06, + "loss": 0.674, + "step": 1783 + }, + { + "epoch": 0.8691133484897694, + "grad_norm": 2.7041687965393066, + "learning_rate": 4.769050279478614e-06, + "loss": 0.6791, + "step": 1784 + }, + { + "epoch": 0.8696005196492368, + "grad_norm": 2.957379102706909, + "learning_rate": 4.768780220572429e-06, + "loss": 0.6224, + "step": 1785 + }, + { + "epoch": 0.8700876908087041, + "grad_norm": 2.8396692276000977, + "learning_rate": 4.76851001151858e-06, + "loss": 0.6233, + "step": 1786 + }, + { + "epoch": 0.8705748619681715, + "grad_norm": 2.9208085536956787, + "learning_rate": 4.76823965233495e-06, + "loss": 0.6774, + "step": 1787 + }, + { + "epoch": 0.8710620331276389, + "grad_norm": 3.119649648666382, + "learning_rate": 4.7679691430394306e-06, + "loss": 0.6164, + "step": 1788 + }, + { + "epoch": 0.8715492042871062, + "grad_norm": 2.7975692749023438, + "learning_rate": 4.767698483649924e-06, + "loss": 0.693, + "step": 1789 + }, + { + "epoch": 0.8720363754465735, + "grad_norm": 2.6548798084259033, + "learning_rate": 4.767427674184344e-06, + "loss": 0.6817, + "step": 1790 + }, + { + "epoch": 0.8725235466060409, + "grad_norm": 2.9185261726379395, + "learning_rate": 4.7671567146606105e-06, + "loss": 0.6774, + "step": 1791 + }, + { + "epoch": 0.8730107177655083, + "grad_norm": 3.0116472244262695, + "learning_rate": 4.766885605096658e-06, + "loss": 0.717, + "step": 1792 + }, + { + "epoch": 0.8734978889249756, + "grad_norm": 2.7482826709747314, + "learning_rate": 4.766614345510426e-06, + "loss": 0.6532, + "step": 1793 + }, + { + "epoch": 0.873985060084443, + "grad_norm": 3.085552453994751, + "learning_rate": 4.7663429359198685e-06, + "loss": 0.6194, + "step": 1794 + }, + { + "epoch": 0.8744722312439104, + "grad_norm": 2.9721498489379883, + "learning_rate": 4.766071376342947e-06, + "loss": 0.5711, + "step": 1795 + }, + { + "epoch": 0.8749594024033778, + "grad_norm": 2.809293031692505, + "learning_rate": 4.765799666797632e-06, + "loss": 0.6476, + "step": 1796 + }, + { + "epoch": 0.875446573562845, + "grad_norm": 2.472198486328125, + "learning_rate": 4.765527807301906e-06, + "loss": 0.6434, + "step": 1797 + }, + { + "epoch": 0.8759337447223124, + "grad_norm": 2.6068003177642822, + "learning_rate": 4.7652557978737615e-06, + "loss": 0.6229, + "step": 1798 + }, + { + "epoch": 0.8764209158817798, + "grad_norm": 3.26090931892395, + "learning_rate": 4.764983638531198e-06, + "loss": 0.6349, + "step": 1799 + }, + { + "epoch": 0.8769080870412471, + "grad_norm": 2.871054172515869, + "learning_rate": 4.764711329292229e-06, + "loss": 0.7286, + "step": 1800 + }, + { + "epoch": 0.8773952582007145, + "grad_norm": 2.824431896209717, + "learning_rate": 4.764438870174875e-06, + "loss": 0.599, + "step": 1801 + }, + { + "epoch": 0.8778824293601819, + "grad_norm": 2.672170639038086, + "learning_rate": 4.764166261197168e-06, + "loss": 0.6718, + "step": 1802 + }, + { + "epoch": 0.8783696005196492, + "grad_norm": 2.9746270179748535, + "learning_rate": 4.763893502377148e-06, + "loss": 0.7736, + "step": 1803 + }, + { + "epoch": 0.8788567716791166, + "grad_norm": 2.833075523376465, + "learning_rate": 4.763620593732867e-06, + "loss": 0.649, + "step": 1804 + }, + { + "epoch": 0.879343942838584, + "grad_norm": 2.822556972503662, + "learning_rate": 4.7633475352823855e-06, + "loss": 0.655, + "step": 1805 + }, + { + "epoch": 0.8798311139980514, + "grad_norm": 2.9149892330169678, + "learning_rate": 4.7630743270437754e-06, + "loss": 0.6256, + "step": 1806 + }, + { + "epoch": 0.8803182851575186, + "grad_norm": 2.409228563308716, + "learning_rate": 4.762800969035117e-06, + "loss": 0.5416, + "step": 1807 + }, + { + "epoch": 0.880805456316986, + "grad_norm": 2.878772258758545, + "learning_rate": 4.762527461274501e-06, + "loss": 0.7175, + "step": 1808 + }, + { + "epoch": 0.8812926274764534, + "grad_norm": 2.738560676574707, + "learning_rate": 4.762253803780029e-06, + "loss": 0.6801, + "step": 1809 + }, + { + "epoch": 0.8817797986359207, + "grad_norm": 2.7061314582824707, + "learning_rate": 4.76197999656981e-06, + "loss": 0.5895, + "step": 1810 + }, + { + "epoch": 0.8822669697953881, + "grad_norm": 2.8491952419281006, + "learning_rate": 4.761706039661966e-06, + "loss": 0.5993, + "step": 1811 + }, + { + "epoch": 0.8827541409548555, + "grad_norm": 2.617893695831299, + "learning_rate": 4.7614319330746265e-06, + "loss": 0.602, + "step": 1812 + }, + { + "epoch": 0.8832413121143229, + "grad_norm": 2.7258870601654053, + "learning_rate": 4.761157676825933e-06, + "loss": 0.6165, + "step": 1813 + }, + { + "epoch": 0.8837284832737902, + "grad_norm": 2.4197027683258057, + "learning_rate": 4.7608832709340335e-06, + "loss": 0.7118, + "step": 1814 + }, + { + "epoch": 0.8842156544332576, + "grad_norm": 3.33412766456604, + "learning_rate": 4.760608715417091e-06, + "loss": 0.702, + "step": 1815 + }, + { + "epoch": 0.884702825592725, + "grad_norm": 2.578948497772217, + "learning_rate": 4.760334010293274e-06, + "loss": 0.6959, + "step": 1816 + }, + { + "epoch": 0.8851899967521922, + "grad_norm": 2.731266975402832, + "learning_rate": 4.760059155580763e-06, + "loss": 0.699, + "step": 1817 + }, + { + "epoch": 0.8856771679116596, + "grad_norm": 3.1176397800445557, + "learning_rate": 4.759784151297746e-06, + "loss": 0.6706, + "step": 1818 + }, + { + "epoch": 0.886164339071127, + "grad_norm": 2.640035390853882, + "learning_rate": 4.759508997462425e-06, + "loss": 0.5919, + "step": 1819 + }, + { + "epoch": 0.8866515102305943, + "grad_norm": 2.694939374923706, + "learning_rate": 4.75923369409301e-06, + "loss": 0.6917, + "step": 1820 + }, + { + "epoch": 0.8871386813900617, + "grad_norm": 2.9706029891967773, + "learning_rate": 4.758958241207718e-06, + "loss": 0.6749, + "step": 1821 + }, + { + "epoch": 0.8876258525495291, + "grad_norm": 3.05694580078125, + "learning_rate": 4.758682638824782e-06, + "loss": 0.644, + "step": 1822 + }, + { + "epoch": 0.8881130237089965, + "grad_norm": 2.9298934936523438, + "learning_rate": 4.758406886962438e-06, + "loss": 0.6902, + "step": 1823 + }, + { + "epoch": 0.8886001948684638, + "grad_norm": 2.8209517002105713, + "learning_rate": 4.758130985638936e-06, + "loss": 0.6834, + "step": 1824 + }, + { + "epoch": 0.8890873660279311, + "grad_norm": 3.8696930408477783, + "learning_rate": 4.757854934872536e-06, + "loss": 0.5656, + "step": 1825 + }, + { + "epoch": 0.8895745371873985, + "grad_norm": 3.0057787895202637, + "learning_rate": 4.757578734681508e-06, + "loss": 0.6953, + "step": 1826 + }, + { + "epoch": 0.8900617083468658, + "grad_norm": 2.9540843963623047, + "learning_rate": 4.757302385084128e-06, + "loss": 0.7474, + "step": 1827 + }, + { + "epoch": 0.8905488795063332, + "grad_norm": 2.9931507110595703, + "learning_rate": 4.7570258860986866e-06, + "loss": 0.6894, + "step": 1828 + }, + { + "epoch": 0.8910360506658006, + "grad_norm": 2.7935216426849365, + "learning_rate": 4.756749237743483e-06, + "loss": 0.6001, + "step": 1829 + }, + { + "epoch": 0.891523221825268, + "grad_norm": 2.6889216899871826, + "learning_rate": 4.756472440036824e-06, + "loss": 0.7074, + "step": 1830 + }, + { + "epoch": 0.8920103929847353, + "grad_norm": 2.6027767658233643, + "learning_rate": 4.756195492997029e-06, + "loss": 0.6862, + "step": 1831 + }, + { + "epoch": 0.8924975641442027, + "grad_norm": 2.611947774887085, + "learning_rate": 4.755918396642427e-06, + "loss": 0.6715, + "step": 1832 + }, + { + "epoch": 0.8929847353036701, + "grad_norm": 2.421370506286621, + "learning_rate": 4.755641150991356e-06, + "loss": 0.6313, + "step": 1833 + }, + { + "epoch": 0.8934719064631373, + "grad_norm": 2.6558837890625, + "learning_rate": 4.7553637560621625e-06, + "loss": 0.6536, + "step": 1834 + }, + { + "epoch": 0.8939590776226047, + "grad_norm": 2.844350814819336, + "learning_rate": 4.755086211873207e-06, + "loss": 0.6414, + "step": 1835 + }, + { + "epoch": 0.8944462487820721, + "grad_norm": 2.9931600093841553, + "learning_rate": 4.754808518442855e-06, + "loss": 0.6246, + "step": 1836 + }, + { + "epoch": 0.8949334199415394, + "grad_norm": 3.636462688446045, + "learning_rate": 4.754530675789485e-06, + "loss": 0.5748, + "step": 1837 + }, + { + "epoch": 0.8954205911010068, + "grad_norm": 2.9691007137298584, + "learning_rate": 4.7542526839314855e-06, + "loss": 0.6266, + "step": 1838 + }, + { + "epoch": 0.8959077622604742, + "grad_norm": 2.874422788619995, + "learning_rate": 4.753974542887253e-06, + "loss": 0.6966, + "step": 1839 + }, + { + "epoch": 0.8963949334199416, + "grad_norm": 2.723869800567627, + "learning_rate": 4.753696252675195e-06, + "loss": 0.6474, + "step": 1840 + }, + { + "epoch": 0.8968821045794089, + "grad_norm": 3.63291597366333, + "learning_rate": 4.75341781331373e-06, + "loss": 0.6445, + "step": 1841 + }, + { + "epoch": 0.8973692757388763, + "grad_norm": 2.392764091491699, + "learning_rate": 4.7531392248212826e-06, + "loss": 0.658, + "step": 1842 + }, + { + "epoch": 0.8978564468983437, + "grad_norm": 2.478520154953003, + "learning_rate": 4.752860487216292e-06, + "loss": 0.6093, + "step": 1843 + }, + { + "epoch": 0.8983436180578109, + "grad_norm": 2.7084758281707764, + "learning_rate": 4.752581600517204e-06, + "loss": 0.6066, + "step": 1844 + }, + { + "epoch": 0.8988307892172783, + "grad_norm": 2.657496452331543, + "learning_rate": 4.752302564742474e-06, + "loss": 0.6716, + "step": 1845 + }, + { + "epoch": 0.8993179603767457, + "grad_norm": 2.729076862335205, + "learning_rate": 4.752023379910571e-06, + "loss": 0.6418, + "step": 1846 + }, + { + "epoch": 0.8998051315362131, + "grad_norm": 2.2668187618255615, + "learning_rate": 4.751744046039971e-06, + "loss": 0.6275, + "step": 1847 + }, + { + "epoch": 0.9002923026956804, + "grad_norm": 2.5292437076568604, + "learning_rate": 4.75146456314916e-06, + "loss": 0.6392, + "step": 1848 + }, + { + "epoch": 0.9007794738551478, + "grad_norm": 2.8361191749572754, + "learning_rate": 4.751184931256632e-06, + "loss": 0.6188, + "step": 1849 + }, + { + "epoch": 0.9012666450146152, + "grad_norm": 2.506009817123413, + "learning_rate": 4.750905150380897e-06, + "loss": 0.5775, + "step": 1850 + }, + { + "epoch": 0.9017538161740825, + "grad_norm": 2.7887680530548096, + "learning_rate": 4.750625220540467e-06, + "loss": 0.6512, + "step": 1851 + }, + { + "epoch": 0.9022409873335498, + "grad_norm": 3.1933047771453857, + "learning_rate": 4.7503451417538706e-06, + "loss": 0.6674, + "step": 1852 + }, + { + "epoch": 0.9027281584930172, + "grad_norm": 2.644780158996582, + "learning_rate": 4.750064914039641e-06, + "loss": 0.6384, + "step": 1853 + }, + { + "epoch": 0.9032153296524845, + "grad_norm": 2.553333282470703, + "learning_rate": 4.749784537416327e-06, + "loss": 0.6518, + "step": 1854 + }, + { + "epoch": 0.9037025008119519, + "grad_norm": 2.929993152618408, + "learning_rate": 4.749504011902481e-06, + "loss": 0.6424, + "step": 1855 + }, + { + "epoch": 0.9041896719714193, + "grad_norm": 2.8733534812927246, + "learning_rate": 4.749223337516668e-06, + "loss": 0.6191, + "step": 1856 + }, + { + "epoch": 0.9046768431308867, + "grad_norm": 2.551684617996216, + "learning_rate": 4.7489425142774645e-06, + "loss": 0.5905, + "step": 1857 + }, + { + "epoch": 0.905164014290354, + "grad_norm": 2.682617664337158, + "learning_rate": 4.748661542203455e-06, + "loss": 0.6539, + "step": 1858 + }, + { + "epoch": 0.9056511854498214, + "grad_norm": 2.757833957672119, + "learning_rate": 4.748380421313234e-06, + "loss": 0.6609, + "step": 1859 + }, + { + "epoch": 0.9061383566092888, + "grad_norm": 2.590351104736328, + "learning_rate": 4.748099151625406e-06, + "loss": 0.5651, + "step": 1860 + }, + { + "epoch": 0.906625527768756, + "grad_norm": 2.7189817428588867, + "learning_rate": 4.747817733158585e-06, + "loss": 0.6876, + "step": 1861 + }, + { + "epoch": 0.9071126989282234, + "grad_norm": 2.8818840980529785, + "learning_rate": 4.747536165931397e-06, + "loss": 0.6598, + "step": 1862 + }, + { + "epoch": 0.9075998700876908, + "grad_norm": 2.995805263519287, + "learning_rate": 4.747254449962474e-06, + "loss": 0.6738, + "step": 1863 + }, + { + "epoch": 0.9080870412471582, + "grad_norm": 2.7625908851623535, + "learning_rate": 4.746972585270461e-06, + "loss": 0.638, + "step": 1864 + }, + { + "epoch": 0.9085742124066255, + "grad_norm": 3.080444574356079, + "learning_rate": 4.7466905718740105e-06, + "loss": 0.7049, + "step": 1865 + }, + { + "epoch": 0.9090613835660929, + "grad_norm": 2.9501311779022217, + "learning_rate": 4.746408409791788e-06, + "loss": 0.685, + "step": 1866 + }, + { + "epoch": 0.9095485547255603, + "grad_norm": 2.856534242630005, + "learning_rate": 4.746126099042466e-06, + "loss": 0.5978, + "step": 1867 + }, + { + "epoch": 0.9100357258850276, + "grad_norm": 2.713294744491577, + "learning_rate": 4.745843639644729e-06, + "loss": 0.6284, + "step": 1868 + }, + { + "epoch": 0.910522897044495, + "grad_norm": 2.854541778564453, + "learning_rate": 4.745561031617268e-06, + "loss": 0.6532, + "step": 1869 + }, + { + "epoch": 0.9110100682039624, + "grad_norm": 2.7404134273529053, + "learning_rate": 4.745278274978787e-06, + "loss": 0.619, + "step": 1870 + }, + { + "epoch": 0.9114972393634296, + "grad_norm": 2.7377026081085205, + "learning_rate": 4.744995369747999e-06, + "loss": 0.607, + "step": 1871 + }, + { + "epoch": 0.911984410522897, + "grad_norm": 3.0728209018707275, + "learning_rate": 4.744712315943627e-06, + "loss": 0.6371, + "step": 1872 + }, + { + "epoch": 0.9124715816823644, + "grad_norm": 2.7304697036743164, + "learning_rate": 4.744429113584403e-06, + "loss": 0.6728, + "step": 1873 + }, + { + "epoch": 0.9129587528418318, + "grad_norm": 2.4732489585876465, + "learning_rate": 4.744145762689068e-06, + "loss": 0.6306, + "step": 1874 + }, + { + "epoch": 0.9134459240012991, + "grad_norm": 2.819247245788574, + "learning_rate": 4.743862263276376e-06, + "loss": 0.6351, + "step": 1875 + }, + { + "epoch": 0.9139330951607665, + "grad_norm": 2.8093454837799072, + "learning_rate": 4.743578615365089e-06, + "loss": 0.6328, + "step": 1876 + }, + { + "epoch": 0.9144202663202339, + "grad_norm": 3.006072521209717, + "learning_rate": 4.7432948189739765e-06, + "loss": 0.6857, + "step": 1877 + }, + { + "epoch": 0.9149074374797012, + "grad_norm": 2.4597671031951904, + "learning_rate": 4.743010874121824e-06, + "loss": 0.605, + "step": 1878 + }, + { + "epoch": 0.9153946086391685, + "grad_norm": 2.5488526821136475, + "learning_rate": 4.742726780827419e-06, + "loss": 0.5765, + "step": 1879 + }, + { + "epoch": 0.9158817797986359, + "grad_norm": 2.676079511642456, + "learning_rate": 4.742442539109565e-06, + "loss": 0.6127, + "step": 1880 + }, + { + "epoch": 0.9163689509581033, + "grad_norm": 2.6127514839172363, + "learning_rate": 4.742158148987073e-06, + "loss": 0.5938, + "step": 1881 + }, + { + "epoch": 0.9168561221175706, + "grad_norm": 2.9890429973602295, + "learning_rate": 4.741873610478763e-06, + "loss": 0.7073, + "step": 1882 + }, + { + "epoch": 0.917343293277038, + "grad_norm": 2.5918984413146973, + "learning_rate": 4.741588923603467e-06, + "loss": 0.5688, + "step": 1883 + }, + { + "epoch": 0.9178304644365054, + "grad_norm": 2.8222508430480957, + "learning_rate": 4.741304088380024e-06, + "loss": 0.6311, + "step": 1884 + }, + { + "epoch": 0.9183176355959727, + "grad_norm": 2.720745801925659, + "learning_rate": 4.741019104827286e-06, + "loss": 0.6984, + "step": 1885 + }, + { + "epoch": 0.9188048067554401, + "grad_norm": 3.2683398723602295, + "learning_rate": 4.740733972964111e-06, + "loss": 0.6386, + "step": 1886 + }, + { + "epoch": 0.9192919779149075, + "grad_norm": 3.8714780807495117, + "learning_rate": 4.740448692809371e-06, + "loss": 0.7593, + "step": 1887 + }, + { + "epoch": 0.9197791490743747, + "grad_norm": 2.8123347759246826, + "learning_rate": 4.7401632643819455e-06, + "loss": 0.6412, + "step": 1888 + }, + { + "epoch": 0.9202663202338421, + "grad_norm": 2.612628698348999, + "learning_rate": 4.7398776877007234e-06, + "loss": 0.6713, + "step": 1889 + }, + { + "epoch": 0.9207534913933095, + "grad_norm": 3.1371636390686035, + "learning_rate": 4.739591962784605e-06, + "loss": 0.6746, + "step": 1890 + }, + { + "epoch": 0.9212406625527769, + "grad_norm": 2.713404893875122, + "learning_rate": 4.7393060896524975e-06, + "loss": 0.7245, + "step": 1891 + }, + { + "epoch": 0.9217278337122442, + "grad_norm": 2.461813449859619, + "learning_rate": 4.739020068323324e-06, + "loss": 0.6591, + "step": 1892 + }, + { + "epoch": 0.9222150048717116, + "grad_norm": 2.2156307697296143, + "learning_rate": 4.738733898816009e-06, + "loss": 0.5232, + "step": 1893 + }, + { + "epoch": 0.922702176031179, + "grad_norm": 2.683418035507202, + "learning_rate": 4.738447581149494e-06, + "loss": 0.6097, + "step": 1894 + }, + { + "epoch": 0.9231893471906463, + "grad_norm": 2.7981178760528564, + "learning_rate": 4.738161115342725e-06, + "loss": 0.6008, + "step": 1895 + }, + { + "epoch": 0.9236765183501137, + "grad_norm": 2.4182374477386475, + "learning_rate": 4.737874501414663e-06, + "loss": 0.5592, + "step": 1896 + }, + { + "epoch": 0.924163689509581, + "grad_norm": 2.780683994293213, + "learning_rate": 4.737587739384275e-06, + "loss": 0.6518, + "step": 1897 + }, + { + "epoch": 0.9246508606690484, + "grad_norm": 2.4911985397338867, + "learning_rate": 4.737300829270538e-06, + "loss": 0.6677, + "step": 1898 + }, + { + "epoch": 0.9251380318285157, + "grad_norm": 2.7835662364959717, + "learning_rate": 4.737013771092442e-06, + "loss": 0.6978, + "step": 1899 + }, + { + "epoch": 0.9256252029879831, + "grad_norm": 2.4145543575286865, + "learning_rate": 4.7367265648689805e-06, + "loss": 0.583, + "step": 1900 + }, + { + "epoch": 0.9261123741474505, + "grad_norm": 2.6378276348114014, + "learning_rate": 4.736439210619166e-06, + "loss": 0.6585, + "step": 1901 + }, + { + "epoch": 0.9265995453069178, + "grad_norm": 2.9576001167297363, + "learning_rate": 4.73615170836201e-06, + "loss": 0.6389, + "step": 1902 + }, + { + "epoch": 0.9270867164663852, + "grad_norm": 2.476680278778076, + "learning_rate": 4.735864058116545e-06, + "loss": 0.59, + "step": 1903 + }, + { + "epoch": 0.9275738876258526, + "grad_norm": 2.7953028678894043, + "learning_rate": 4.7355762599018025e-06, + "loss": 0.6788, + "step": 1904 + }, + { + "epoch": 0.9280610587853199, + "grad_norm": 2.768141746520996, + "learning_rate": 4.735288313736832e-06, + "loss": 0.702, + "step": 1905 + }, + { + "epoch": 0.9285482299447873, + "grad_norm": 2.7766036987304688, + "learning_rate": 4.735000219640689e-06, + "loss": 0.7203, + "step": 1906 + }, + { + "epoch": 0.9290354011042546, + "grad_norm": 3.0115091800689697, + "learning_rate": 4.73471197763244e-06, + "loss": 0.7374, + "step": 1907 + }, + { + "epoch": 0.929522572263722, + "grad_norm": 2.5350422859191895, + "learning_rate": 4.734423587731161e-06, + "loss": 0.5557, + "step": 1908 + }, + { + "epoch": 0.9300097434231893, + "grad_norm": 3.0745933055877686, + "learning_rate": 4.734135049955936e-06, + "loss": 0.6584, + "step": 1909 + }, + { + "epoch": 0.9304969145826567, + "grad_norm": 2.464172124862671, + "learning_rate": 4.7338463643258616e-06, + "loss": 0.5653, + "step": 1910 + }, + { + "epoch": 0.9309840857421241, + "grad_norm": 2.728201150894165, + "learning_rate": 4.733557530860043e-06, + "loss": 0.617, + "step": 1911 + }, + { + "epoch": 0.9314712569015914, + "grad_norm": 2.7186777591705322, + "learning_rate": 4.733268549577595e-06, + "loss": 0.7465, + "step": 1912 + }, + { + "epoch": 0.9319584280610588, + "grad_norm": 5.799171447753906, + "learning_rate": 4.732979420497643e-06, + "loss": 0.6459, + "step": 1913 + }, + { + "epoch": 0.9324455992205262, + "grad_norm": 3.1872918605804443, + "learning_rate": 4.73269014363932e-06, + "loss": 0.6512, + "step": 1914 + }, + { + "epoch": 0.9329327703799936, + "grad_norm": 2.700185775756836, + "learning_rate": 4.732400719021772e-06, + "loss": 0.6079, + "step": 1915 + }, + { + "epoch": 0.9334199415394608, + "grad_norm": 3.26216459274292, + "learning_rate": 4.732111146664151e-06, + "loss": 0.599, + "step": 1916 + }, + { + "epoch": 0.9339071126989282, + "grad_norm": 2.8430185317993164, + "learning_rate": 4.731821426585623e-06, + "loss": 0.6336, + "step": 1917 + }, + { + "epoch": 0.9343942838583956, + "grad_norm": 2.8044750690460205, + "learning_rate": 4.7315315588053605e-06, + "loss": 0.6413, + "step": 1918 + }, + { + "epoch": 0.9348814550178629, + "grad_norm": 2.73343825340271, + "learning_rate": 4.731241543342547e-06, + "loss": 0.6516, + "step": 1919 + }, + { + "epoch": 0.9353686261773303, + "grad_norm": 2.8428094387054443, + "learning_rate": 4.730951380216377e-06, + "loss": 0.6509, + "step": 1920 + }, + { + "epoch": 0.9358557973367977, + "grad_norm": 2.6652748584747314, + "learning_rate": 4.730661069446051e-06, + "loss": 0.6643, + "step": 1921 + }, + { + "epoch": 0.936342968496265, + "grad_norm": 3.2325143814086914, + "learning_rate": 4.730370611050784e-06, + "loss": 0.7395, + "step": 1922 + }, + { + "epoch": 0.9368301396557324, + "grad_norm": 2.8638267517089844, + "learning_rate": 4.730080005049798e-06, + "loss": 0.6036, + "step": 1923 + }, + { + "epoch": 0.9373173108151998, + "grad_norm": 2.9446749687194824, + "learning_rate": 4.729789251462324e-06, + "loss": 0.6964, + "step": 1924 + }, + { + "epoch": 0.9378044819746671, + "grad_norm": 2.7506000995635986, + "learning_rate": 4.7294983503076055e-06, + "loss": 0.6593, + "step": 1925 + }, + { + "epoch": 0.9382916531341344, + "grad_norm": 2.646747350692749, + "learning_rate": 4.729207301604895e-06, + "loss": 0.6844, + "step": 1926 + }, + { + "epoch": 0.9387788242936018, + "grad_norm": 2.6507275104522705, + "learning_rate": 4.728916105373451e-06, + "loss": 0.6307, + "step": 1927 + }, + { + "epoch": 0.9392659954530692, + "grad_norm": 2.732590436935425, + "learning_rate": 4.728624761632547e-06, + "loss": 0.6938, + "step": 1928 + }, + { + "epoch": 0.9397531666125365, + "grad_norm": 3.536129951477051, + "learning_rate": 4.728333270401464e-06, + "loss": 0.6937, + "step": 1929 + }, + { + "epoch": 0.9402403377720039, + "grad_norm": 3.9246675968170166, + "learning_rate": 4.728041631699493e-06, + "loss": 0.5764, + "step": 1930 + }, + { + "epoch": 0.9407275089314713, + "grad_norm": 2.997680187225342, + "learning_rate": 4.727749845545934e-06, + "loss": 0.7535, + "step": 1931 + }, + { + "epoch": 0.9412146800909387, + "grad_norm": 2.762873649597168, + "learning_rate": 4.727457911960098e-06, + "loss": 0.6232, + "step": 1932 + }, + { + "epoch": 0.941701851250406, + "grad_norm": 2.880753517150879, + "learning_rate": 4.727165830961305e-06, + "loss": 0.6584, + "step": 1933 + }, + { + "epoch": 0.9421890224098733, + "grad_norm": 2.5573067665100098, + "learning_rate": 4.726873602568884e-06, + "loss": 0.6704, + "step": 1934 + }, + { + "epoch": 0.9426761935693407, + "grad_norm": 2.897188663482666, + "learning_rate": 4.726581226802175e-06, + "loss": 0.6514, + "step": 1935 + }, + { + "epoch": 0.943163364728808, + "grad_norm": 2.6803908348083496, + "learning_rate": 4.7262887036805285e-06, + "loss": 0.6898, + "step": 1936 + }, + { + "epoch": 0.9436505358882754, + "grad_norm": 2.4502036571502686, + "learning_rate": 4.725996033223303e-06, + "loss": 0.6765, + "step": 1937 + }, + { + "epoch": 0.9441377070477428, + "grad_norm": 3.2004828453063965, + "learning_rate": 4.7257032154498675e-06, + "loss": 0.5987, + "step": 1938 + }, + { + "epoch": 0.9446248782072101, + "grad_norm": 2.8516674041748047, + "learning_rate": 4.7254102503796e-06, + "loss": 0.6581, + "step": 1939 + }, + { + "epoch": 0.9451120493666775, + "grad_norm": 2.6943001747131348, + "learning_rate": 4.725117138031889e-06, + "loss": 0.6858, + "step": 1940 + }, + { + "epoch": 0.9455992205261449, + "grad_norm": 2.55597186088562, + "learning_rate": 4.7248238784261335e-06, + "loss": 0.6231, + "step": 1941 + }, + { + "epoch": 0.9460863916856123, + "grad_norm": 2.607405185699463, + "learning_rate": 4.724530471581741e-06, + "loss": 0.6072, + "step": 1942 + }, + { + "epoch": 0.9465735628450795, + "grad_norm": 2.6038544178009033, + "learning_rate": 4.724236917518129e-06, + "loss": 0.675, + "step": 1943 + }, + { + "epoch": 0.9470607340045469, + "grad_norm": 3.354515314102173, + "learning_rate": 4.723943216254724e-06, + "loss": 0.5709, + "step": 1944 + }, + { + "epoch": 0.9475479051640143, + "grad_norm": 2.553802728652954, + "learning_rate": 4.7236493678109654e-06, + "loss": 0.5859, + "step": 1945 + }, + { + "epoch": 0.9480350763234816, + "grad_norm": 2.594815254211426, + "learning_rate": 4.723355372206297e-06, + "loss": 0.6198, + "step": 1946 + }, + { + "epoch": 0.948522247482949, + "grad_norm": 2.7860357761383057, + "learning_rate": 4.723061229460178e-06, + "loss": 0.63, + "step": 1947 + }, + { + "epoch": 0.9490094186424164, + "grad_norm": 2.646634101867676, + "learning_rate": 4.722766939592075e-06, + "loss": 0.6525, + "step": 1948 + }, + { + "epoch": 0.9494965898018838, + "grad_norm": 3.322751045227051, + "learning_rate": 4.722472502621462e-06, + "loss": 0.6969, + "step": 1949 + }, + { + "epoch": 0.9499837609613511, + "grad_norm": 2.984806537628174, + "learning_rate": 4.7221779185678255e-06, + "loss": 0.7359, + "step": 1950 + }, + { + "epoch": 0.9504709321208185, + "grad_norm": 2.6718947887420654, + "learning_rate": 4.721883187450662e-06, + "loss": 0.6281, + "step": 1951 + }, + { + "epoch": 0.9509581032802858, + "grad_norm": 2.482856035232544, + "learning_rate": 4.721588309289476e-06, + "loss": 0.6357, + "step": 1952 + }, + { + "epoch": 0.9514452744397531, + "grad_norm": 2.527796983718872, + "learning_rate": 4.721293284103781e-06, + "loss": 0.6564, + "step": 1953 + }, + { + "epoch": 0.9519324455992205, + "grad_norm": 2.5601255893707275, + "learning_rate": 4.720998111913105e-06, + "loss": 0.5823, + "step": 1954 + }, + { + "epoch": 0.9524196167586879, + "grad_norm": 2.8543596267700195, + "learning_rate": 4.72070279273698e-06, + "loss": 0.6285, + "step": 1955 + }, + { + "epoch": 0.9529067879181552, + "grad_norm": 2.4315574169158936, + "learning_rate": 4.720407326594951e-06, + "loss": 0.626, + "step": 1956 + }, + { + "epoch": 0.9533939590776226, + "grad_norm": 2.5006439685821533, + "learning_rate": 4.720111713506572e-06, + "loss": 0.596, + "step": 1957 + }, + { + "epoch": 0.95388113023709, + "grad_norm": 2.6445484161376953, + "learning_rate": 4.719815953491407e-06, + "loss": 0.5805, + "step": 1958 + }, + { + "epoch": 0.9543683013965574, + "grad_norm": 2.836704730987549, + "learning_rate": 4.719520046569029e-06, + "loss": 0.6594, + "step": 1959 + }, + { + "epoch": 0.9548554725560247, + "grad_norm": 2.849188804626465, + "learning_rate": 4.71922399275902e-06, + "loss": 0.6638, + "step": 1960 + }, + { + "epoch": 0.955342643715492, + "grad_norm": 2.7750256061553955, + "learning_rate": 4.718927792080974e-06, + "loss": 0.6416, + "step": 1961 + }, + { + "epoch": 0.9558298148749594, + "grad_norm": 2.5716843605041504, + "learning_rate": 4.718631444554493e-06, + "loss": 0.6563, + "step": 1962 + }, + { + "epoch": 0.9563169860344267, + "grad_norm": 2.600188732147217, + "learning_rate": 4.71833495019919e-06, + "loss": 0.6079, + "step": 1963 + }, + { + "epoch": 0.9568041571938941, + "grad_norm": 2.9701859951019287, + "learning_rate": 4.718038309034688e-06, + "loss": 0.6731, + "step": 1964 + }, + { + "epoch": 0.9572913283533615, + "grad_norm": 2.652195692062378, + "learning_rate": 4.717741521080615e-06, + "loss": 0.7015, + "step": 1965 + }, + { + "epoch": 0.9577784995128289, + "grad_norm": 3.3615341186523438, + "learning_rate": 4.717444586356617e-06, + "loss": 0.7758, + "step": 1966 + }, + { + "epoch": 0.9582656706722962, + "grad_norm": 2.6280970573425293, + "learning_rate": 4.717147504882341e-06, + "loss": 0.7067, + "step": 1967 + }, + { + "epoch": 0.9587528418317636, + "grad_norm": 2.6628780364990234, + "learning_rate": 4.7168502766774495e-06, + "loss": 0.5846, + "step": 1968 + }, + { + "epoch": 0.959240012991231, + "grad_norm": 2.763232469558716, + "learning_rate": 4.716552901761614e-06, + "loss": 0.6487, + "step": 1969 + }, + { + "epoch": 0.9597271841506982, + "grad_norm": 2.5388834476470947, + "learning_rate": 4.716255380154514e-06, + "loss": 0.7152, + "step": 1970 + }, + { + "epoch": 0.9602143553101656, + "grad_norm": 2.783630609512329, + "learning_rate": 4.715957711875838e-06, + "loss": 0.5881, + "step": 1971 + }, + { + "epoch": 0.960701526469633, + "grad_norm": 2.8640003204345703, + "learning_rate": 4.7156598969452886e-06, + "loss": 0.6471, + "step": 1972 + }, + { + "epoch": 0.9611886976291003, + "grad_norm": 2.9970688819885254, + "learning_rate": 4.715361935382573e-06, + "loss": 0.6226, + "step": 1973 + }, + { + "epoch": 0.9616758687885677, + "grad_norm": 2.853774309158325, + "learning_rate": 4.71506382720741e-06, + "loss": 0.6394, + "step": 1974 + }, + { + "epoch": 0.9621630399480351, + "grad_norm": 2.9204158782958984, + "learning_rate": 4.71476557243953e-06, + "loss": 0.6586, + "step": 1975 + }, + { + "epoch": 0.9626502111075025, + "grad_norm": 2.4485809803009033, + "learning_rate": 4.71446717109867e-06, + "loss": 0.6229, + "step": 1976 + }, + { + "epoch": 0.9631373822669698, + "grad_norm": 2.678495168685913, + "learning_rate": 4.7141686232045795e-06, + "loss": 0.6795, + "step": 1977 + }, + { + "epoch": 0.9636245534264372, + "grad_norm": 2.6118338108062744, + "learning_rate": 4.7138699287770156e-06, + "loss": 0.5672, + "step": 1978 + }, + { + "epoch": 0.9641117245859046, + "grad_norm": 2.7400400638580322, + "learning_rate": 4.713571087835746e-06, + "loss": 0.7033, + "step": 1979 + }, + { + "epoch": 0.9645988957453718, + "grad_norm": 2.6587555408477783, + "learning_rate": 4.713272100400548e-06, + "loss": 0.6444, + "step": 1980 + }, + { + "epoch": 0.9650860669048392, + "grad_norm": 2.8563742637634277, + "learning_rate": 4.712972966491208e-06, + "loss": 0.5903, + "step": 1981 + }, + { + "epoch": 0.9655732380643066, + "grad_norm": 2.9035396575927734, + "learning_rate": 4.712673686127524e-06, + "loss": 0.6956, + "step": 1982 + }, + { + "epoch": 0.9660604092237739, + "grad_norm": 2.758864164352417, + "learning_rate": 4.7123742593293e-06, + "loss": 0.7292, + "step": 1983 + }, + { + "epoch": 0.9665475803832413, + "grad_norm": 2.4242186546325684, + "learning_rate": 4.712074686116354e-06, + "loss": 0.5802, + "step": 1984 + }, + { + "epoch": 0.9670347515427087, + "grad_norm": 2.615192413330078, + "learning_rate": 4.711774966508512e-06, + "loss": 0.5766, + "step": 1985 + }, + { + "epoch": 0.9675219227021761, + "grad_norm": 3.1337411403656006, + "learning_rate": 4.711475100525609e-06, + "loss": 0.5717, + "step": 1986 + }, + { + "epoch": 0.9680090938616434, + "grad_norm": 2.754444122314453, + "learning_rate": 4.711175088187489e-06, + "loss": 0.6425, + "step": 1987 + }, + { + "epoch": 0.9684962650211107, + "grad_norm": 3.211772918701172, + "learning_rate": 4.710874929514008e-06, + "loss": 0.6635, + "step": 1988 + }, + { + "epoch": 0.9689834361805781, + "grad_norm": 2.685687303543091, + "learning_rate": 4.71057462452503e-06, + "loss": 0.6147, + "step": 1989 + }, + { + "epoch": 0.9694706073400454, + "grad_norm": 3.0321238040924072, + "learning_rate": 4.710274173240429e-06, + "loss": 0.7467, + "step": 1990 + }, + { + "epoch": 0.9699577784995128, + "grad_norm": 2.7745680809020996, + "learning_rate": 4.709973575680089e-06, + "loss": 0.6445, + "step": 1991 + }, + { + "epoch": 0.9704449496589802, + "grad_norm": 2.9365670680999756, + "learning_rate": 4.709672831863903e-06, + "loss": 0.6151, + "step": 1992 + }, + { + "epoch": 0.9709321208184476, + "grad_norm": 3.0751678943634033, + "learning_rate": 4.709371941811776e-06, + "loss": 0.5983, + "step": 1993 + }, + { + "epoch": 0.9714192919779149, + "grad_norm": 2.5358290672302246, + "learning_rate": 4.709070905543618e-06, + "loss": 0.5949, + "step": 1994 + }, + { + "epoch": 0.9719064631373823, + "grad_norm": 2.690899610519409, + "learning_rate": 4.708769723079355e-06, + "loss": 0.6711, + "step": 1995 + }, + { + "epoch": 0.9723936342968497, + "grad_norm": 2.8458821773529053, + "learning_rate": 4.708468394438917e-06, + "loss": 0.623, + "step": 1996 + }, + { + "epoch": 0.9728808054563169, + "grad_norm": 2.8401291370391846, + "learning_rate": 4.708166919642245e-06, + "loss": 0.6633, + "step": 1997 + }, + { + "epoch": 0.9733679766157843, + "grad_norm": 2.313567638397217, + "learning_rate": 4.707865298709293e-06, + "loss": 0.5412, + "step": 1998 + }, + { + "epoch": 0.9738551477752517, + "grad_norm": 2.8934807777404785, + "learning_rate": 4.70756353166002e-06, + "loss": 0.6943, + "step": 1999 + }, + { + "epoch": 0.974342318934719, + "grad_norm": 2.5478222370147705, + "learning_rate": 4.707261618514399e-06, + "loss": 0.6147, + "step": 2000 + }, + { + "epoch": 0.9748294900941864, + "grad_norm": 3.2183430194854736, + "learning_rate": 4.70695955929241e-06, + "loss": 0.7156, + "step": 2001 + }, + { + "epoch": 0.9753166612536538, + "grad_norm": 2.8861172199249268, + "learning_rate": 4.706657354014042e-06, + "loss": 0.5802, + "step": 2002 + }, + { + "epoch": 0.9758038324131212, + "grad_norm": 3.033914804458618, + "learning_rate": 4.706355002699296e-06, + "loss": 0.6435, + "step": 2003 + }, + { + "epoch": 0.9762910035725885, + "grad_norm": 2.8954670429229736, + "learning_rate": 4.706052505368182e-06, + "loss": 0.6766, + "step": 2004 + }, + { + "epoch": 0.9767781747320559, + "grad_norm": 2.9950602054595947, + "learning_rate": 4.705749862040718e-06, + "loss": 0.6454, + "step": 2005 + }, + { + "epoch": 0.9772653458915233, + "grad_norm": 2.765563726425171, + "learning_rate": 4.705447072736934e-06, + "loss": 0.6312, + "step": 2006 + }, + { + "epoch": 0.9777525170509905, + "grad_norm": 2.9135022163391113, + "learning_rate": 4.705144137476868e-06, + "loss": 0.724, + "step": 2007 + }, + { + "epoch": 0.9782396882104579, + "grad_norm": 2.7625973224639893, + "learning_rate": 4.704841056280568e-06, + "loss": 0.6426, + "step": 2008 + }, + { + "epoch": 0.9787268593699253, + "grad_norm": 2.818840503692627, + "learning_rate": 4.704537829168092e-06, + "loss": 0.6693, + "step": 2009 + }, + { + "epoch": 0.9792140305293927, + "grad_norm": 3.230715751647949, + "learning_rate": 4.704234456159508e-06, + "loss": 0.6661, + "step": 2010 + }, + { + "epoch": 0.97970120168886, + "grad_norm": 2.634568214416504, + "learning_rate": 4.703930937274893e-06, + "loss": 0.5577, + "step": 2011 + }, + { + "epoch": 0.9801883728483274, + "grad_norm": 2.5668134689331055, + "learning_rate": 4.703627272534335e-06, + "loss": 0.5874, + "step": 2012 + }, + { + "epoch": 0.9806755440077948, + "grad_norm": 3.0553674697875977, + "learning_rate": 4.703323461957928e-06, + "loss": 0.6107, + "step": 2013 + }, + { + "epoch": 0.9811627151672621, + "grad_norm": 2.832594394683838, + "learning_rate": 4.703019505565779e-06, + "loss": 0.6359, + "step": 2014 + }, + { + "epoch": 0.9816498863267294, + "grad_norm": 2.584470510482788, + "learning_rate": 4.702715403378005e-06, + "loss": 0.5695, + "step": 2015 + }, + { + "epoch": 0.9821370574861968, + "grad_norm": 2.8094727993011475, + "learning_rate": 4.70241115541473e-06, + "loss": 0.6225, + "step": 2016 + }, + { + "epoch": 0.9826242286456641, + "grad_norm": 2.543111562728882, + "learning_rate": 4.702106761696091e-06, + "loss": 0.5915, + "step": 2017 + }, + { + "epoch": 0.9831113998051315, + "grad_norm": 2.496753215789795, + "learning_rate": 4.701802222242231e-06, + "loss": 0.6144, + "step": 2018 + }, + { + "epoch": 0.9835985709645989, + "grad_norm": 3.0539820194244385, + "learning_rate": 4.701497537073305e-06, + "loss": 0.6786, + "step": 2019 + }, + { + "epoch": 0.9840857421240663, + "grad_norm": 2.5794825553894043, + "learning_rate": 4.701192706209477e-06, + "loss": 0.5907, + "step": 2020 + }, + { + "epoch": 0.9845729132835336, + "grad_norm": 3.1018481254577637, + "learning_rate": 4.700887729670921e-06, + "loss": 0.6809, + "step": 2021 + }, + { + "epoch": 0.985060084443001, + "grad_norm": 2.629465103149414, + "learning_rate": 4.700582607477819e-06, + "loss": 0.6546, + "step": 2022 + }, + { + "epoch": 0.9855472556024684, + "grad_norm": 2.592811346054077, + "learning_rate": 4.700277339650366e-06, + "loss": 0.6405, + "step": 2023 + }, + { + "epoch": 0.9860344267619356, + "grad_norm": 3.3388514518737793, + "learning_rate": 4.699971926208763e-06, + "loss": 0.5964, + "step": 2024 + }, + { + "epoch": 0.986521597921403, + "grad_norm": 3.026329755783081, + "learning_rate": 4.699666367173223e-06, + "loss": 0.7144, + "step": 2025 + }, + { + "epoch": 0.9870087690808704, + "grad_norm": 2.7518527507781982, + "learning_rate": 4.699360662563967e-06, + "loss": 0.648, + "step": 2026 + }, + { + "epoch": 0.9874959402403378, + "grad_norm": 3.3328449726104736, + "learning_rate": 4.699054812401229e-06, + "loss": 0.6679, + "step": 2027 + }, + { + "epoch": 0.9879831113998051, + "grad_norm": 2.949556350708008, + "learning_rate": 4.698748816705246e-06, + "loss": 0.6692, + "step": 2028 + }, + { + "epoch": 0.9884702825592725, + "grad_norm": 2.5895659923553467, + "learning_rate": 4.698442675496273e-06, + "loss": 0.5844, + "step": 2029 + }, + { + "epoch": 0.9889574537187399, + "grad_norm": 2.7603037357330322, + "learning_rate": 4.698136388794567e-06, + "loss": 0.6068, + "step": 2030 + }, + { + "epoch": 0.9894446248782072, + "grad_norm": 2.5493600368499756, + "learning_rate": 4.6978299566204e-06, + "loss": 0.5924, + "step": 2031 + }, + { + "epoch": 0.9899317960376746, + "grad_norm": 2.764676809310913, + "learning_rate": 4.697523378994051e-06, + "loss": 0.5896, + "step": 2032 + }, + { + "epoch": 0.990418967197142, + "grad_norm": 2.8489527702331543, + "learning_rate": 4.69721665593581e-06, + "loss": 0.7379, + "step": 2033 + }, + { + "epoch": 0.9909061383566092, + "grad_norm": 2.793085813522339, + "learning_rate": 4.696909787465975e-06, + "loss": 0.692, + "step": 2034 + }, + { + "epoch": 0.9913933095160766, + "grad_norm": 2.69687819480896, + "learning_rate": 4.696602773604855e-06, + "loss": 0.6513, + "step": 2035 + }, + { + "epoch": 0.991880480675544, + "grad_norm": 2.4406559467315674, + "learning_rate": 4.696295614372767e-06, + "loss": 0.6599, + "step": 2036 + }, + { + "epoch": 0.9923676518350114, + "grad_norm": 2.9056105613708496, + "learning_rate": 4.695988309790041e-06, + "loss": 0.6274, + "step": 2037 + }, + { + "epoch": 0.9928548229944787, + "grad_norm": 3.0408174991607666, + "learning_rate": 4.695680859877012e-06, + "loss": 0.6368, + "step": 2038 + }, + { + "epoch": 0.9933419941539461, + "grad_norm": 2.666773796081543, + "learning_rate": 4.695373264654029e-06, + "loss": 0.5689, + "step": 2039 + }, + { + "epoch": 0.9938291653134135, + "grad_norm": 2.8946385383605957, + "learning_rate": 4.695065524141448e-06, + "loss": 0.6491, + "step": 2040 + }, + { + "epoch": 0.9943163364728808, + "grad_norm": 3.184481143951416, + "learning_rate": 4.694757638359634e-06, + "loss": 0.6971, + "step": 2041 + }, + { + "epoch": 0.9948035076323481, + "grad_norm": 2.843937397003174, + "learning_rate": 4.694449607328965e-06, + "loss": 0.6743, + "step": 2042 + }, + { + "epoch": 0.9952906787918155, + "grad_norm": 3.239546537399292, + "learning_rate": 4.694141431069825e-06, + "loss": 0.5982, + "step": 2043 + }, + { + "epoch": 0.9957778499512829, + "grad_norm": 2.726733922958374, + "learning_rate": 4.693833109602609e-06, + "loss": 0.6174, + "step": 2044 + }, + { + "epoch": 0.9962650211107502, + "grad_norm": 2.5475666522979736, + "learning_rate": 4.693524642947722e-06, + "loss": 0.5558, + "step": 2045 + }, + { + "epoch": 0.9967521922702176, + "grad_norm": 2.8366470336914062, + "learning_rate": 4.693216031125579e-06, + "loss": 0.6709, + "step": 2046 + }, + { + "epoch": 0.997239363429685, + "grad_norm": 2.578596830368042, + "learning_rate": 4.692907274156603e-06, + "loss": 0.5595, + "step": 2047 + }, + { + "epoch": 0.9977265345891523, + "grad_norm": 2.649754762649536, + "learning_rate": 4.692598372061228e-06, + "loss": 0.6177, + "step": 2048 + }, + { + "epoch": 0.9982137057486197, + "grad_norm": 3.1366255283355713, + "learning_rate": 4.692289324859897e-06, + "loss": 0.6538, + "step": 2049 + }, + { + "epoch": 0.9987008769080871, + "grad_norm": 2.480708360671997, + "learning_rate": 4.6919801325730615e-06, + "loss": 0.6252, + "step": 2050 + }, + { + "epoch": 0.9991880480675543, + "grad_norm": 2.7265095710754395, + "learning_rate": 4.691670795221186e-06, + "loss": 0.6545, + "step": 2051 + }, + { + "epoch": 0.9996752192270217, + "grad_norm": 2.4723410606384277, + "learning_rate": 4.691361312824741e-06, + "loss": 0.6364, + "step": 2052 + }, + { + "epoch": 1.0, + "grad_norm": 2.4723410606384277, + "learning_rate": 4.691051685404209e-06, + "loss": 0.7507, + "step": 2053 + }, + { + "epoch": 1.0004871711594674, + "grad_norm": 4.271796226501465, + "learning_rate": 4.69074191298008e-06, + "loss": 0.5478, + "step": 2054 + }, + { + "epoch": 1.0009743423189348, + "grad_norm": 2.4522266387939453, + "learning_rate": 4.690431995572855e-06, + "loss": 0.5607, + "step": 2055 + }, + { + "epoch": 1.0014615134784022, + "grad_norm": 2.6194567680358887, + "learning_rate": 4.690121933203045e-06, + "loss": 0.5315, + "step": 2056 + }, + { + "epoch": 1.0019486846378693, + "grad_norm": 2.789560079574585, + "learning_rate": 4.689811725891169e-06, + "loss": 0.6932, + "step": 2057 + }, + { + "epoch": 1.0024358557973367, + "grad_norm": 5.321357727050781, + "learning_rate": 4.689501373657756e-06, + "loss": 0.5806, + "step": 2058 + }, + { + "epoch": 1.0029230269568041, + "grad_norm": 3.3521203994750977, + "learning_rate": 4.689190876523347e-06, + "loss": 0.5752, + "step": 2059 + }, + { + "epoch": 1.0034101981162715, + "grad_norm": 2.5092971324920654, + "learning_rate": 4.68888023450849e-06, + "loss": 0.4898, + "step": 2060 + }, + { + "epoch": 1.003897369275739, + "grad_norm": 2.6809635162353516, + "learning_rate": 4.6885694476337425e-06, + "loss": 0.6124, + "step": 2061 + }, + { + "epoch": 1.0043845404352063, + "grad_norm": 2.84936261177063, + "learning_rate": 4.688258515919673e-06, + "loss": 0.6937, + "step": 2062 + }, + { + "epoch": 1.0048717115946737, + "grad_norm": 2.638604164123535, + "learning_rate": 4.687947439386859e-06, + "loss": 0.6667, + "step": 2063 + }, + { + "epoch": 1.0053588827541409, + "grad_norm": 2.805375099182129, + "learning_rate": 4.687636218055886e-06, + "loss": 0.5516, + "step": 2064 + }, + { + "epoch": 1.0058460539136083, + "grad_norm": 2.4401586055755615, + "learning_rate": 4.687324851947353e-06, + "loss": 0.557, + "step": 2065 + }, + { + "epoch": 1.0063332250730757, + "grad_norm": 2.517761707305908, + "learning_rate": 4.687013341081864e-06, + "loss": 0.6244, + "step": 2066 + }, + { + "epoch": 1.006820396232543, + "grad_norm": 2.9480912685394287, + "learning_rate": 4.686701685480036e-06, + "loss": 0.6172, + "step": 2067 + }, + { + "epoch": 1.0073075673920104, + "grad_norm": 3.065389633178711, + "learning_rate": 4.686389885162494e-06, + "loss": 0.6164, + "step": 2068 + }, + { + "epoch": 1.0077947385514778, + "grad_norm": 2.982752561569214, + "learning_rate": 4.686077940149873e-06, + "loss": 0.5831, + "step": 2069 + }, + { + "epoch": 1.008281909710945, + "grad_norm": 2.80723237991333, + "learning_rate": 4.6857658504628185e-06, + "loss": 0.5479, + "step": 2070 + }, + { + "epoch": 1.0087690808704124, + "grad_norm": 2.6983211040496826, + "learning_rate": 4.685453616121983e-06, + "loss": 0.5889, + "step": 2071 + }, + { + "epoch": 1.0092562520298798, + "grad_norm": 2.6924548149108887, + "learning_rate": 4.685141237148031e-06, + "loss": 0.5409, + "step": 2072 + }, + { + "epoch": 1.0097434231893472, + "grad_norm": 2.6049201488494873, + "learning_rate": 4.684828713561634e-06, + "loss": 0.563, + "step": 2073 + }, + { + "epoch": 1.0102305943488146, + "grad_norm": 2.5006775856018066, + "learning_rate": 4.684516045383477e-06, + "loss": 0.5526, + "step": 2074 + }, + { + "epoch": 1.010717765508282, + "grad_norm": 2.8809561729431152, + "learning_rate": 4.684203232634252e-06, + "loss": 0.6029, + "step": 2075 + }, + { + "epoch": 1.0112049366677494, + "grad_norm": 2.627882957458496, + "learning_rate": 4.683890275334661e-06, + "loss": 0.5768, + "step": 2076 + }, + { + "epoch": 1.0116921078272165, + "grad_norm": 2.8152873516082764, + "learning_rate": 4.683577173505414e-06, + "loss": 0.5352, + "step": 2077 + }, + { + "epoch": 1.012179278986684, + "grad_norm": 2.733295202255249, + "learning_rate": 4.683263927167232e-06, + "loss": 0.5731, + "step": 2078 + }, + { + "epoch": 1.0126664501461513, + "grad_norm": 2.9358327388763428, + "learning_rate": 4.682950536340848e-06, + "loss": 0.5662, + "step": 2079 + }, + { + "epoch": 1.0131536213056187, + "grad_norm": 2.851231575012207, + "learning_rate": 4.682637001047e-06, + "loss": 0.5479, + "step": 2080 + }, + { + "epoch": 1.013640792465086, + "grad_norm": 2.6071937084198, + "learning_rate": 4.682323321306439e-06, + "loss": 0.6122, + "step": 2081 + }, + { + "epoch": 1.0141279636245535, + "grad_norm": 2.604856014251709, + "learning_rate": 4.682009497139924e-06, + "loss": 0.6071, + "step": 2082 + }, + { + "epoch": 1.0146151347840209, + "grad_norm": 2.629518508911133, + "learning_rate": 4.681695528568223e-06, + "loss": 0.5965, + "step": 2083 + }, + { + "epoch": 1.015102305943488, + "grad_norm": 2.672603130340576, + "learning_rate": 4.681381415612117e-06, + "loss": 0.5736, + "step": 2084 + }, + { + "epoch": 1.0155894771029554, + "grad_norm": 2.9917349815368652, + "learning_rate": 4.6810671582923895e-06, + "loss": 0.5569, + "step": 2085 + }, + { + "epoch": 1.0160766482624228, + "grad_norm": 2.5426175594329834, + "learning_rate": 4.680752756629841e-06, + "loss": 0.6083, + "step": 2086 + }, + { + "epoch": 1.0165638194218902, + "grad_norm": 2.5255961418151855, + "learning_rate": 4.6804382106452795e-06, + "loss": 0.5874, + "step": 2087 + }, + { + "epoch": 1.0170509905813576, + "grad_norm": 2.478966236114502, + "learning_rate": 4.68012352035952e-06, + "loss": 0.545, + "step": 2088 + }, + { + "epoch": 1.017538161740825, + "grad_norm": 4.001131057739258, + "learning_rate": 4.6798086857933875e-06, + "loss": 0.5377, + "step": 2089 + }, + { + "epoch": 1.0180253329002924, + "grad_norm": 2.688753604888916, + "learning_rate": 4.67949370696772e-06, + "loss": 0.5815, + "step": 2090 + }, + { + "epoch": 1.0185125040597596, + "grad_norm": 2.666856050491333, + "learning_rate": 4.679178583903362e-06, + "loss": 0.6034, + "step": 2091 + }, + { + "epoch": 1.018999675219227, + "grad_norm": 2.9675910472869873, + "learning_rate": 4.678863316621168e-06, + "loss": 0.5401, + "step": 2092 + }, + { + "epoch": 1.0194868463786944, + "grad_norm": 2.643982172012329, + "learning_rate": 4.678547905142003e-06, + "loss": 0.5821, + "step": 2093 + }, + { + "epoch": 1.0199740175381617, + "grad_norm": 2.5761210918426514, + "learning_rate": 4.678232349486741e-06, + "loss": 0.5931, + "step": 2094 + }, + { + "epoch": 1.0204611886976291, + "grad_norm": 2.6323447227478027, + "learning_rate": 4.677916649676265e-06, + "loss": 0.6141, + "step": 2095 + }, + { + "epoch": 1.0209483598570965, + "grad_norm": 3.0204811096191406, + "learning_rate": 4.677600805731467e-06, + "loss": 0.508, + "step": 2096 + }, + { + "epoch": 1.021435531016564, + "grad_norm": 2.921832323074341, + "learning_rate": 4.67728481767325e-06, + "loss": 0.6375, + "step": 2097 + }, + { + "epoch": 1.021922702176031, + "grad_norm": 2.649522066116333, + "learning_rate": 4.676968685522528e-06, + "loss": 0.5306, + "step": 2098 + }, + { + "epoch": 1.0224098733354985, + "grad_norm": 2.7227838039398193, + "learning_rate": 4.67665240930022e-06, + "loss": 0.6044, + "step": 2099 + }, + { + "epoch": 1.0228970444949659, + "grad_norm": 2.678169012069702, + "learning_rate": 4.676335989027259e-06, + "loss": 0.5439, + "step": 2100 + }, + { + "epoch": 1.0233842156544333, + "grad_norm": 2.772019624710083, + "learning_rate": 4.676019424724584e-06, + "loss": 0.5365, + "step": 2101 + }, + { + "epoch": 1.0238713868139007, + "grad_norm": 2.731652021408081, + "learning_rate": 4.675702716413147e-06, + "loss": 0.561, + "step": 2102 + }, + { + "epoch": 1.024358557973368, + "grad_norm": 3.066760301589966, + "learning_rate": 4.675385864113906e-06, + "loss": 0.5716, + "step": 2103 + }, + { + "epoch": 1.0248457291328354, + "grad_norm": 2.7809107303619385, + "learning_rate": 4.675068867847832e-06, + "loss": 0.5528, + "step": 2104 + }, + { + "epoch": 1.0253329002923026, + "grad_norm": 2.7124745845794678, + "learning_rate": 4.674751727635902e-06, + "loss": 0.5732, + "step": 2105 + }, + { + "epoch": 1.02582007145177, + "grad_norm": 2.798182964324951, + "learning_rate": 4.674434443499106e-06, + "loss": 0.5693, + "step": 2106 + }, + { + "epoch": 1.0263072426112374, + "grad_norm": 2.5630013942718506, + "learning_rate": 4.67411701545844e-06, + "loss": 0.5544, + "step": 2107 + }, + { + "epoch": 1.0267944137707048, + "grad_norm": 2.8776867389678955, + "learning_rate": 4.673799443534913e-06, + "loss": 0.5438, + "step": 2108 + }, + { + "epoch": 1.0272815849301722, + "grad_norm": 2.783783435821533, + "learning_rate": 4.673481727749542e-06, + "loss": 0.5371, + "step": 2109 + }, + { + "epoch": 1.0277687560896396, + "grad_norm": 2.522869110107422, + "learning_rate": 4.673163868123352e-06, + "loss": 0.5877, + "step": 2110 + }, + { + "epoch": 1.0282559272491067, + "grad_norm": 2.6746437549591064, + "learning_rate": 4.672845864677379e-06, + "loss": 0.5659, + "step": 2111 + }, + { + "epoch": 1.0287430984085741, + "grad_norm": 2.7580599784851074, + "learning_rate": 4.672527717432669e-06, + "loss": 0.5505, + "step": 2112 + }, + { + "epoch": 1.0292302695680415, + "grad_norm": 2.494053840637207, + "learning_rate": 4.672209426410277e-06, + "loss": 0.5711, + "step": 2113 + }, + { + "epoch": 1.029717440727509, + "grad_norm": 4.19864559173584, + "learning_rate": 4.671890991631268e-06, + "loss": 0.5798, + "step": 2114 + }, + { + "epoch": 1.0302046118869763, + "grad_norm": 2.918562412261963, + "learning_rate": 4.671572413116715e-06, + "loss": 0.5801, + "step": 2115 + }, + { + "epoch": 1.0306917830464437, + "grad_norm": 2.783236026763916, + "learning_rate": 4.671253690887702e-06, + "loss": 0.594, + "step": 2116 + }, + { + "epoch": 1.031178954205911, + "grad_norm": 2.783982753753662, + "learning_rate": 4.670934824965323e-06, + "loss": 0.647, + "step": 2117 + }, + { + "epoch": 1.0316661253653783, + "grad_norm": 2.965730905532837, + "learning_rate": 4.670615815370678e-06, + "loss": 0.5854, + "step": 2118 + }, + { + "epoch": 1.0321532965248457, + "grad_norm": 3.029357433319092, + "learning_rate": 4.670296662124881e-06, + "loss": 0.644, + "step": 2119 + }, + { + "epoch": 1.032640467684313, + "grad_norm": 2.5346035957336426, + "learning_rate": 4.669977365249053e-06, + "loss": 0.5948, + "step": 2120 + }, + { + "epoch": 1.0331276388437804, + "grad_norm": 2.844843864440918, + "learning_rate": 4.669657924764325e-06, + "loss": 0.5983, + "step": 2121 + }, + { + "epoch": 1.0336148100032478, + "grad_norm": 2.8159141540527344, + "learning_rate": 4.669338340691838e-06, + "loss": 0.4815, + "step": 2122 + }, + { + "epoch": 1.0341019811627152, + "grad_norm": 2.61793851852417, + "learning_rate": 4.6690186130527415e-06, + "loss": 0.5228, + "step": 2123 + }, + { + "epoch": 1.0345891523221826, + "grad_norm": 2.695662260055542, + "learning_rate": 4.668698741868195e-06, + "loss": 0.5732, + "step": 2124 + }, + { + "epoch": 1.0350763234816498, + "grad_norm": 6.356692314147949, + "learning_rate": 4.668378727159367e-06, + "loss": 0.5321, + "step": 2125 + }, + { + "epoch": 1.0355634946411172, + "grad_norm": 2.4827868938446045, + "learning_rate": 4.6680585689474376e-06, + "loss": 0.5077, + "step": 2126 + }, + { + "epoch": 1.0360506658005846, + "grad_norm": 2.5069069862365723, + "learning_rate": 4.667738267253593e-06, + "loss": 0.5452, + "step": 2127 + }, + { + "epoch": 1.036537836960052, + "grad_norm": 2.8135058879852295, + "learning_rate": 4.667417822099033e-06, + "loss": 0.6093, + "step": 2128 + }, + { + "epoch": 1.0370250081195194, + "grad_norm": 2.6845595836639404, + "learning_rate": 4.667097233504963e-06, + "loss": 0.5496, + "step": 2129 + }, + { + "epoch": 1.0375121792789868, + "grad_norm": 2.8885626792907715, + "learning_rate": 4.666776501492601e-06, + "loss": 0.5089, + "step": 2130 + }, + { + "epoch": 1.0379993504384541, + "grad_norm": 2.7338898181915283, + "learning_rate": 4.666455626083169e-06, + "loss": 0.5814, + "step": 2131 + }, + { + "epoch": 1.0384865215979213, + "grad_norm": 2.661667585372925, + "learning_rate": 4.666134607297907e-06, + "loss": 0.5807, + "step": 2132 + }, + { + "epoch": 1.0389736927573887, + "grad_norm": 2.9887278079986572, + "learning_rate": 4.6658134451580596e-06, + "loss": 0.6068, + "step": 2133 + }, + { + "epoch": 1.039460863916856, + "grad_norm": 2.6260030269622803, + "learning_rate": 4.665492139684879e-06, + "loss": 0.5751, + "step": 2134 + }, + { + "epoch": 1.0399480350763235, + "grad_norm": 2.7620582580566406, + "learning_rate": 4.6651706908996295e-06, + "loss": 0.6164, + "step": 2135 + }, + { + "epoch": 1.0404352062357909, + "grad_norm": 2.8758156299591064, + "learning_rate": 4.664849098823586e-06, + "loss": 0.5304, + "step": 2136 + }, + { + "epoch": 1.0409223773952583, + "grad_norm": 3.5743935108184814, + "learning_rate": 4.664527363478031e-06, + "loss": 0.5592, + "step": 2137 + }, + { + "epoch": 1.0414095485547255, + "grad_norm": 2.8853418827056885, + "learning_rate": 4.664205484884256e-06, + "loss": 0.5263, + "step": 2138 + }, + { + "epoch": 1.0418967197141928, + "grad_norm": 5.163991451263428, + "learning_rate": 4.663883463063564e-06, + "loss": 0.5924, + "step": 2139 + }, + { + "epoch": 1.0423838908736602, + "grad_norm": 2.8603978157043457, + "learning_rate": 4.663561298037266e-06, + "loss": 0.5795, + "step": 2140 + }, + { + "epoch": 1.0428710620331276, + "grad_norm": 2.9560983180999756, + "learning_rate": 4.6632389898266825e-06, + "loss": 0.5083, + "step": 2141 + }, + { + "epoch": 1.043358233192595, + "grad_norm": 2.6044600009918213, + "learning_rate": 4.6629165384531435e-06, + "loss": 0.5624, + "step": 2142 + }, + { + "epoch": 1.0438454043520624, + "grad_norm": 2.727571964263916, + "learning_rate": 4.6625939439379905e-06, + "loss": 0.6026, + "step": 2143 + }, + { + "epoch": 1.0443325755115298, + "grad_norm": 2.5859286785125732, + "learning_rate": 4.662271206302571e-06, + "loss": 0.5576, + "step": 2144 + }, + { + "epoch": 1.044819746670997, + "grad_norm": 2.59718918800354, + "learning_rate": 4.661948325568245e-06, + "loss": 0.6151, + "step": 2145 + }, + { + "epoch": 1.0453069178304644, + "grad_norm": 2.9069881439208984, + "learning_rate": 4.66162530175638e-06, + "loss": 0.5134, + "step": 2146 + }, + { + "epoch": 1.0457940889899318, + "grad_norm": 2.519467353820801, + "learning_rate": 4.661302134888354e-06, + "loss": 0.5404, + "step": 2147 + }, + { + "epoch": 1.0462812601493992, + "grad_norm": 2.434392213821411, + "learning_rate": 4.6609788249855535e-06, + "loss": 0.5958, + "step": 2148 + }, + { + "epoch": 1.0467684313088665, + "grad_norm": 2.930746078491211, + "learning_rate": 4.660655372069375e-06, + "loss": 0.5736, + "step": 2149 + }, + { + "epoch": 1.047255602468334, + "grad_norm": 2.6583712100982666, + "learning_rate": 4.660331776161227e-06, + "loss": 0.5694, + "step": 2150 + }, + { + "epoch": 1.0477427736278013, + "grad_norm": 2.679342031478882, + "learning_rate": 4.660008037282522e-06, + "loss": 0.5568, + "step": 2151 + }, + { + "epoch": 1.0482299447872685, + "grad_norm": 2.5642178058624268, + "learning_rate": 4.6596841554546854e-06, + "loss": 0.5692, + "step": 2152 + }, + { + "epoch": 1.048717115946736, + "grad_norm": 2.5338618755340576, + "learning_rate": 4.6593601306991535e-06, + "loss": 0.4932, + "step": 2153 + }, + { + "epoch": 1.0492042871062033, + "grad_norm": 2.766436815261841, + "learning_rate": 4.659035963037369e-06, + "loss": 0.5787, + "step": 2154 + }, + { + "epoch": 1.0496914582656707, + "grad_norm": 2.7245490550994873, + "learning_rate": 4.6587116524907864e-06, + "loss": 0.5615, + "step": 2155 + }, + { + "epoch": 1.050178629425138, + "grad_norm": 2.3732588291168213, + "learning_rate": 4.658387199080866e-06, + "loss": 0.5115, + "step": 2156 + }, + { + "epoch": 1.0506658005846055, + "grad_norm": 2.4861414432525635, + "learning_rate": 4.6580626028290835e-06, + "loss": 0.5485, + "step": 2157 + }, + { + "epoch": 1.0511529717440729, + "grad_norm": 2.7187747955322266, + "learning_rate": 4.657737863756917e-06, + "loss": 0.5671, + "step": 2158 + }, + { + "epoch": 1.05164014290354, + "grad_norm": 2.7013304233551025, + "learning_rate": 4.657412981885862e-06, + "loss": 0.5665, + "step": 2159 + }, + { + "epoch": 1.0521273140630074, + "grad_norm": 2.7664783000946045, + "learning_rate": 4.657087957237415e-06, + "loss": 0.5574, + "step": 2160 + }, + { + "epoch": 1.0526144852224748, + "grad_norm": 3.151124954223633, + "learning_rate": 4.656762789833088e-06, + "loss": 0.6352, + "step": 2161 + }, + { + "epoch": 1.0531016563819422, + "grad_norm": 2.8857107162475586, + "learning_rate": 4.656437479694401e-06, + "loss": 0.5541, + "step": 2162 + }, + { + "epoch": 1.0535888275414096, + "grad_norm": 3.0360682010650635, + "learning_rate": 4.656112026842882e-06, + "loss": 0.6598, + "step": 2163 + }, + { + "epoch": 1.054075998700877, + "grad_norm": 3.191641330718994, + "learning_rate": 4.65578643130007e-06, + "loss": 0.5906, + "step": 2164 + }, + { + "epoch": 1.0545631698603444, + "grad_norm": 2.7092056274414062, + "learning_rate": 4.655460693087512e-06, + "loss": 0.6325, + "step": 2165 + }, + { + "epoch": 1.0550503410198115, + "grad_norm": 3.1685545444488525, + "learning_rate": 4.655134812226767e-06, + "loss": 0.5563, + "step": 2166 + }, + { + "epoch": 1.055537512179279, + "grad_norm": 2.730607748031616, + "learning_rate": 4.654808788739401e-06, + "loss": 0.5783, + "step": 2167 + }, + { + "epoch": 1.0560246833387463, + "grad_norm": 2.7430622577667236, + "learning_rate": 4.65448262264699e-06, + "loss": 0.6251, + "step": 2168 + }, + { + "epoch": 1.0565118544982137, + "grad_norm": 2.7723639011383057, + "learning_rate": 4.654156313971119e-06, + "loss": 0.6504, + "step": 2169 + }, + { + "epoch": 1.0569990256576811, + "grad_norm": 2.8647243976593018, + "learning_rate": 4.6538298627333835e-06, + "loss": 0.5015, + "step": 2170 + }, + { + "epoch": 1.0574861968171485, + "grad_norm": 2.7482738494873047, + "learning_rate": 4.653503268955388e-06, + "loss": 0.5978, + "step": 2171 + }, + { + "epoch": 1.057973367976616, + "grad_norm": 2.8962318897247314, + "learning_rate": 4.653176532658748e-06, + "loss": 0.5327, + "step": 2172 + }, + { + "epoch": 1.058460539136083, + "grad_norm": 2.575655698776245, + "learning_rate": 4.652849653865085e-06, + "loss": 0.5098, + "step": 2173 + }, + { + "epoch": 1.0589477102955505, + "grad_norm": 2.966796875, + "learning_rate": 4.652522632596032e-06, + "loss": 0.5635, + "step": 2174 + }, + { + "epoch": 1.0594348814550179, + "grad_norm": 2.5692672729492188, + "learning_rate": 4.652195468873233e-06, + "loss": 0.5746, + "step": 2175 + }, + { + "epoch": 1.0599220526144852, + "grad_norm": 2.8504207134246826, + "learning_rate": 4.651868162718336e-06, + "loss": 0.5287, + "step": 2176 + }, + { + "epoch": 1.0604092237739526, + "grad_norm": 2.5734548568725586, + "learning_rate": 4.651540714153006e-06, + "loss": 0.5929, + "step": 2177 + }, + { + "epoch": 1.06089639493342, + "grad_norm": 2.7500762939453125, + "learning_rate": 4.651213123198912e-06, + "loss": 0.5637, + "step": 2178 + }, + { + "epoch": 1.0613835660928872, + "grad_norm": 2.768622398376465, + "learning_rate": 4.650885389877733e-06, + "loss": 0.5662, + "step": 2179 + }, + { + "epoch": 1.0618707372523546, + "grad_norm": 2.3955085277557373, + "learning_rate": 4.650557514211159e-06, + "loss": 0.6018, + "step": 2180 + }, + { + "epoch": 1.062357908411822, + "grad_norm": 2.6087632179260254, + "learning_rate": 4.65022949622089e-06, + "loss": 0.5482, + "step": 2181 + }, + { + "epoch": 1.0628450795712894, + "grad_norm": 2.604672908782959, + "learning_rate": 4.649901335928632e-06, + "loss": 0.5057, + "step": 2182 + }, + { + "epoch": 1.0633322507307568, + "grad_norm": 2.986675262451172, + "learning_rate": 4.649573033356105e-06, + "loss": 0.5575, + "step": 2183 + }, + { + "epoch": 1.0638194218902242, + "grad_norm": 3.05542254447937, + "learning_rate": 4.649244588525034e-06, + "loss": 0.6256, + "step": 2184 + }, + { + "epoch": 1.0643065930496916, + "grad_norm": 2.989687204360962, + "learning_rate": 4.648916001457157e-06, + "loss": 0.5794, + "step": 2185 + }, + { + "epoch": 1.0647937642091587, + "grad_norm": 2.970353603363037, + "learning_rate": 4.648587272174219e-06, + "loss": 0.6173, + "step": 2186 + }, + { + "epoch": 1.0652809353686261, + "grad_norm": 2.836885690689087, + "learning_rate": 4.6482584006979745e-06, + "loss": 0.5254, + "step": 2187 + }, + { + "epoch": 1.0657681065280935, + "grad_norm": 2.835261821746826, + "learning_rate": 4.647929387050189e-06, + "loss": 0.655, + "step": 2188 + }, + { + "epoch": 1.066255277687561, + "grad_norm": 3.024304151535034, + "learning_rate": 4.6476002312526385e-06, + "loss": 0.5549, + "step": 2189 + }, + { + "epoch": 1.0667424488470283, + "grad_norm": 2.6071386337280273, + "learning_rate": 4.647270933327103e-06, + "loss": 0.5273, + "step": 2190 + }, + { + "epoch": 1.0672296200064957, + "grad_norm": 2.5583298206329346, + "learning_rate": 4.646941493295377e-06, + "loss": 0.5835, + "step": 2191 + }, + { + "epoch": 1.067716791165963, + "grad_norm": 2.777275562286377, + "learning_rate": 4.646611911179264e-06, + "loss": 0.5842, + "step": 2192 + }, + { + "epoch": 1.0682039623254302, + "grad_norm": 3.1114954948425293, + "learning_rate": 4.646282187000574e-06, + "loss": 0.5919, + "step": 2193 + }, + { + "epoch": 1.0686911334848976, + "grad_norm": 2.749817132949829, + "learning_rate": 4.6459523207811295e-06, + "loss": 0.5722, + "step": 2194 + }, + { + "epoch": 1.069178304644365, + "grad_norm": 2.484736680984497, + "learning_rate": 4.645622312542759e-06, + "loss": 0.5872, + "step": 2195 + }, + { + "epoch": 1.0696654758038324, + "grad_norm": 2.786766767501831, + "learning_rate": 4.645292162307305e-06, + "loss": 0.5974, + "step": 2196 + }, + { + "epoch": 1.0701526469632998, + "grad_norm": 2.5767416954040527, + "learning_rate": 4.644961870096615e-06, + "loss": 0.5792, + "step": 2197 + }, + { + "epoch": 1.0706398181227672, + "grad_norm": 2.9308557510375977, + "learning_rate": 4.644631435932547e-06, + "loss": 0.6846, + "step": 2198 + }, + { + "epoch": 1.0711269892822344, + "grad_norm": 2.716264247894287, + "learning_rate": 4.6443008598369715e-06, + "loss": 0.5069, + "step": 2199 + }, + { + "epoch": 1.0716141604417018, + "grad_norm": 2.4755804538726807, + "learning_rate": 4.643970141831764e-06, + "loss": 0.5982, + "step": 2200 + }, + { + "epoch": 1.0721013316011692, + "grad_norm": 2.8186306953430176, + "learning_rate": 4.643639281938814e-06, + "loss": 0.5441, + "step": 2201 + }, + { + "epoch": 1.0725885027606366, + "grad_norm": 2.675227642059326, + "learning_rate": 4.643308280180014e-06, + "loss": 0.5605, + "step": 2202 + }, + { + "epoch": 1.073075673920104, + "grad_norm": 2.7755582332611084, + "learning_rate": 4.642977136577271e-06, + "loss": 0.5286, + "step": 2203 + }, + { + "epoch": 1.0735628450795713, + "grad_norm": 2.3209989070892334, + "learning_rate": 4.642645851152502e-06, + "loss": 0.5626, + "step": 2204 + }, + { + "epoch": 1.0740500162390387, + "grad_norm": 3.1823108196258545, + "learning_rate": 4.642314423927631e-06, + "loss": 0.5708, + "step": 2205 + }, + { + "epoch": 1.074537187398506, + "grad_norm": 2.5405075550079346, + "learning_rate": 4.6419828549245894e-06, + "loss": 0.5236, + "step": 2206 + }, + { + "epoch": 1.0750243585579733, + "grad_norm": 2.721665859222412, + "learning_rate": 4.641651144165322e-06, + "loss": 0.5099, + "step": 2207 + }, + { + "epoch": 1.0755115297174407, + "grad_norm": 2.7177488803863525, + "learning_rate": 4.641319291671782e-06, + "loss": 0.5812, + "step": 2208 + }, + { + "epoch": 1.075998700876908, + "grad_norm": 2.9655773639678955, + "learning_rate": 4.64098729746593e-06, + "loss": 0.5213, + "step": 2209 + }, + { + "epoch": 1.0764858720363755, + "grad_norm": 3.2053089141845703, + "learning_rate": 4.640655161569738e-06, + "loss": 0.6444, + "step": 2210 + }, + { + "epoch": 1.0769730431958429, + "grad_norm": 3.076167106628418, + "learning_rate": 4.640322884005187e-06, + "loss": 0.602, + "step": 2211 + }, + { + "epoch": 1.0774602143553103, + "grad_norm": 2.789241313934326, + "learning_rate": 4.639990464794268e-06, + "loss": 0.537, + "step": 2212 + }, + { + "epoch": 1.0779473855147774, + "grad_norm": 2.795335054397583, + "learning_rate": 4.639657903958977e-06, + "loss": 0.5741, + "step": 2213 + }, + { + "epoch": 1.0784345566742448, + "grad_norm": 2.817671060562134, + "learning_rate": 4.639325201521327e-06, + "loss": 0.6019, + "step": 2214 + }, + { + "epoch": 1.0789217278337122, + "grad_norm": 2.6204891204833984, + "learning_rate": 4.638992357503333e-06, + "loss": 0.5149, + "step": 2215 + }, + { + "epoch": 1.0794088989931796, + "grad_norm": 2.9368016719818115, + "learning_rate": 4.638659371927025e-06, + "loss": 0.6125, + "step": 2216 + }, + { + "epoch": 1.079896070152647, + "grad_norm": 2.6617281436920166, + "learning_rate": 4.638326244814439e-06, + "loss": 0.5938, + "step": 2217 + }, + { + "epoch": 1.0803832413121144, + "grad_norm": 2.888479709625244, + "learning_rate": 4.637992976187621e-06, + "loss": 0.5638, + "step": 2218 + }, + { + "epoch": 1.0808704124715818, + "grad_norm": 3.1526670455932617, + "learning_rate": 4.637659566068626e-06, + "loss": 0.5971, + "step": 2219 + }, + { + "epoch": 1.081357583631049, + "grad_norm": 2.859954595565796, + "learning_rate": 4.637326014479522e-06, + "loss": 0.6267, + "step": 2220 + }, + { + "epoch": 1.0818447547905163, + "grad_norm": 2.841395616531372, + "learning_rate": 4.636992321442379e-06, + "loss": 0.656, + "step": 2221 + }, + { + "epoch": 1.0823319259499837, + "grad_norm": 3.019949436187744, + "learning_rate": 4.636658486979285e-06, + "loss": 0.5008, + "step": 2222 + }, + { + "epoch": 1.0828190971094511, + "grad_norm": 2.9283266067504883, + "learning_rate": 4.636324511112331e-06, + "loss": 0.6201, + "step": 2223 + }, + { + "epoch": 1.0833062682689185, + "grad_norm": 2.686347723007202, + "learning_rate": 4.635990393863619e-06, + "loss": 0.5667, + "step": 2224 + }, + { + "epoch": 1.083793439428386, + "grad_norm": 2.618767738342285, + "learning_rate": 4.635656135255262e-06, + "loss": 0.5458, + "step": 2225 + }, + { + "epoch": 1.0842806105878533, + "grad_norm": 2.7507712841033936, + "learning_rate": 4.635321735309381e-06, + "loss": 0.567, + "step": 2226 + }, + { + "epoch": 1.0847677817473205, + "grad_norm": 2.8014514446258545, + "learning_rate": 4.634987194048106e-06, + "loss": 0.5813, + "step": 2227 + }, + { + "epoch": 1.0852549529067879, + "grad_norm": 2.6670055389404297, + "learning_rate": 4.634652511493578e-06, + "loss": 0.5974, + "step": 2228 + }, + { + "epoch": 1.0857421240662553, + "grad_norm": 2.9148333072662354, + "learning_rate": 4.634317687667945e-06, + "loss": 0.5845, + "step": 2229 + }, + { + "epoch": 1.0862292952257226, + "grad_norm": 2.7455508708953857, + "learning_rate": 4.633982722593367e-06, + "loss": 0.5727, + "step": 2230 + }, + { + "epoch": 1.08671646638519, + "grad_norm": 2.46783709526062, + "learning_rate": 4.6336476162920094e-06, + "loss": 0.5566, + "step": 2231 + }, + { + "epoch": 1.0872036375446574, + "grad_norm": 3.130129098892212, + "learning_rate": 4.633312368786053e-06, + "loss": 0.562, + "step": 2232 + }, + { + "epoch": 1.0876908087041248, + "grad_norm": 2.5780537128448486, + "learning_rate": 4.632976980097682e-06, + "loss": 0.6039, + "step": 2233 + }, + { + "epoch": 1.088177979863592, + "grad_norm": 2.5723373889923096, + "learning_rate": 4.632641450249093e-06, + "loss": 0.5855, + "step": 2234 + }, + { + "epoch": 1.0886651510230594, + "grad_norm": 2.656656265258789, + "learning_rate": 4.632305779262492e-06, + "loss": 0.605, + "step": 2235 + }, + { + "epoch": 1.0891523221825268, + "grad_norm": 3.164982318878174, + "learning_rate": 4.631969967160092e-06, + "loss": 0.562, + "step": 2236 + }, + { + "epoch": 1.0896394933419942, + "grad_norm": 2.666731834411621, + "learning_rate": 4.6316340139641195e-06, + "loss": 0.5357, + "step": 2237 + }, + { + "epoch": 1.0901266645014616, + "grad_norm": 2.7077624797821045, + "learning_rate": 4.631297919696805e-06, + "loss": 0.5081, + "step": 2238 + }, + { + "epoch": 1.090613835660929, + "grad_norm": 3.04278826713562, + "learning_rate": 4.630961684380394e-06, + "loss": 0.6606, + "step": 2239 + }, + { + "epoch": 1.0911010068203963, + "grad_norm": 3.1693670749664307, + "learning_rate": 4.630625308037136e-06, + "loss": 0.5364, + "step": 2240 + }, + { + "epoch": 1.0915881779798635, + "grad_norm": 2.5890049934387207, + "learning_rate": 4.630288790689293e-06, + "loss": 0.6223, + "step": 2241 + }, + { + "epoch": 1.092075349139331, + "grad_norm": 3.0299458503723145, + "learning_rate": 4.629952132359138e-06, + "loss": 0.5379, + "step": 2242 + }, + { + "epoch": 1.0925625202987983, + "grad_norm": 3.4686312675476074, + "learning_rate": 4.629615333068949e-06, + "loss": 0.5582, + "step": 2243 + }, + { + "epoch": 1.0930496914582657, + "grad_norm": 2.7653894424438477, + "learning_rate": 4.629278392841014e-06, + "loss": 0.5322, + "step": 2244 + }, + { + "epoch": 1.093536862617733, + "grad_norm": 2.8124635219573975, + "learning_rate": 4.6289413116976344e-06, + "loss": 0.6593, + "step": 2245 + }, + { + "epoch": 1.0940240337772005, + "grad_norm": 3.7366180419921875, + "learning_rate": 4.628604089661118e-06, + "loss": 0.6056, + "step": 2246 + }, + { + "epoch": 1.0945112049366676, + "grad_norm": 2.474555015563965, + "learning_rate": 4.628266726753779e-06, + "loss": 0.4771, + "step": 2247 + }, + { + "epoch": 1.094998376096135, + "grad_norm": 2.8801090717315674, + "learning_rate": 4.627929222997948e-06, + "loss": 0.562, + "step": 2248 + }, + { + "epoch": 1.0954855472556024, + "grad_norm": 3.039750814437866, + "learning_rate": 4.627591578415958e-06, + "loss": 0.4884, + "step": 2249 + }, + { + "epoch": 1.0959727184150698, + "grad_norm": 2.869478940963745, + "learning_rate": 4.627253793030155e-06, + "loss": 0.535, + "step": 2250 + }, + { + "epoch": 1.0964598895745372, + "grad_norm": 2.9447181224823, + "learning_rate": 4.626915866862896e-06, + "loss": 0.4927, + "step": 2251 + }, + { + "epoch": 1.0969470607340046, + "grad_norm": 2.96523118019104, + "learning_rate": 4.626577799936541e-06, + "loss": 0.5852, + "step": 2252 + }, + { + "epoch": 1.097434231893472, + "grad_norm": 2.5235745906829834, + "learning_rate": 4.6262395922734675e-06, + "loss": 0.4916, + "step": 2253 + }, + { + "epoch": 1.0979214030529392, + "grad_norm": 2.527085542678833, + "learning_rate": 4.625901243896054e-06, + "loss": 0.5478, + "step": 2254 + }, + { + "epoch": 1.0984085742124066, + "grad_norm": 2.739410161972046, + "learning_rate": 4.6255627548266944e-06, + "loss": 0.6462, + "step": 2255 + }, + { + "epoch": 1.098895745371874, + "grad_norm": 2.9406888484954834, + "learning_rate": 4.625224125087789e-06, + "loss": 0.6192, + "step": 2256 + }, + { + "epoch": 1.0993829165313413, + "grad_norm": 2.9735610485076904, + "learning_rate": 4.62488535470175e-06, + "loss": 0.5895, + "step": 2257 + }, + { + "epoch": 1.0998700876908087, + "grad_norm": 3.395421028137207, + "learning_rate": 4.624546443690996e-06, + "loss": 0.5681, + "step": 2258 + }, + { + "epoch": 1.1003572588502761, + "grad_norm": 2.847752571105957, + "learning_rate": 4.624207392077956e-06, + "loss": 0.589, + "step": 2259 + }, + { + "epoch": 1.1008444300097435, + "grad_norm": 2.6043901443481445, + "learning_rate": 4.623868199885068e-06, + "loss": 0.608, + "step": 2260 + }, + { + "epoch": 1.1013316011692107, + "grad_norm": 2.677841901779175, + "learning_rate": 4.623528867134781e-06, + "loss": 0.639, + "step": 2261 + }, + { + "epoch": 1.101818772328678, + "grad_norm": 2.844337224960327, + "learning_rate": 4.623189393849552e-06, + "loss": 0.5483, + "step": 2262 + }, + { + "epoch": 1.1023059434881455, + "grad_norm": 2.4141337871551514, + "learning_rate": 4.622849780051846e-06, + "loss": 0.4977, + "step": 2263 + }, + { + "epoch": 1.1027931146476129, + "grad_norm": 2.942962646484375, + "learning_rate": 4.6225100257641385e-06, + "loss": 0.5823, + "step": 2264 + }, + { + "epoch": 1.1032802858070803, + "grad_norm": 2.9807050228118896, + "learning_rate": 4.6221701310089164e-06, + "loss": 0.559, + "step": 2265 + }, + { + "epoch": 1.1037674569665477, + "grad_norm": 3.1349377632141113, + "learning_rate": 4.621830095808672e-06, + "loss": 0.4768, + "step": 2266 + }, + { + "epoch": 1.1042546281260148, + "grad_norm": 2.9695029258728027, + "learning_rate": 4.621489920185909e-06, + "loss": 0.5746, + "step": 2267 + }, + { + "epoch": 1.1047417992854822, + "grad_norm": 2.7055764198303223, + "learning_rate": 4.621149604163141e-06, + "loss": 0.5137, + "step": 2268 + }, + { + "epoch": 1.1052289704449496, + "grad_norm": 2.719174861907959, + "learning_rate": 4.62080914776289e-06, + "loss": 0.5036, + "step": 2269 + }, + { + "epoch": 1.105716141604417, + "grad_norm": 3.049334764480591, + "learning_rate": 4.620468551007686e-06, + "loss": 0.5489, + "step": 2270 + }, + { + "epoch": 1.1062033127638844, + "grad_norm": 2.8946311473846436, + "learning_rate": 4.620127813920073e-06, + "loss": 0.5491, + "step": 2271 + }, + { + "epoch": 1.1066904839233518, + "grad_norm": 2.7922651767730713, + "learning_rate": 4.619786936522596e-06, + "loss": 0.5785, + "step": 2272 + }, + { + "epoch": 1.1071776550828192, + "grad_norm": 2.528576612472534, + "learning_rate": 4.619445918837818e-06, + "loss": 0.5594, + "step": 2273 + }, + { + "epoch": 1.1076648262422863, + "grad_norm": 2.7297322750091553, + "learning_rate": 4.619104760888307e-06, + "loss": 0.5755, + "step": 2274 + }, + { + "epoch": 1.1081519974017537, + "grad_norm": 2.84513783454895, + "learning_rate": 4.61876346269664e-06, + "loss": 0.5501, + "step": 2275 + }, + { + "epoch": 1.1086391685612211, + "grad_norm": 2.8665544986724854, + "learning_rate": 4.618422024285404e-06, + "loss": 0.5983, + "step": 2276 + }, + { + "epoch": 1.1091263397206885, + "grad_norm": 2.8044464588165283, + "learning_rate": 4.618080445677197e-06, + "loss": 0.5429, + "step": 2277 + }, + { + "epoch": 1.109613510880156, + "grad_norm": 3.1704256534576416, + "learning_rate": 4.617738726894622e-06, + "loss": 0.5661, + "step": 2278 + }, + { + "epoch": 1.1101006820396233, + "grad_norm": 2.7728028297424316, + "learning_rate": 4.6173968679602954e-06, + "loss": 0.5916, + "step": 2279 + }, + { + "epoch": 1.1105878531990907, + "grad_norm": 2.840179920196533, + "learning_rate": 4.6170548688968414e-06, + "loss": 0.6012, + "step": 2280 + }, + { + "epoch": 1.1110750243585579, + "grad_norm": 2.710991621017456, + "learning_rate": 4.616712729726893e-06, + "loss": 0.6325, + "step": 2281 + }, + { + "epoch": 1.1115621955180253, + "grad_norm": 2.9726510047912598, + "learning_rate": 4.616370450473094e-06, + "loss": 0.5768, + "step": 2282 + }, + { + "epoch": 1.1120493666774927, + "grad_norm": 2.690660238265991, + "learning_rate": 4.616028031158095e-06, + "loss": 0.5246, + "step": 2283 + }, + { + "epoch": 1.11253653783696, + "grad_norm": 2.4715518951416016, + "learning_rate": 4.6156854718045585e-06, + "loss": 0.5652, + "step": 2284 + }, + { + "epoch": 1.1130237089964274, + "grad_norm": 2.9480907917022705, + "learning_rate": 4.615342772435154e-06, + "loss": 0.5335, + "step": 2285 + }, + { + "epoch": 1.1135108801558948, + "grad_norm": 2.809520721435547, + "learning_rate": 4.614999933072562e-06, + "loss": 0.6634, + "step": 2286 + }, + { + "epoch": 1.1139980513153622, + "grad_norm": 2.7221944332122803, + "learning_rate": 4.614656953739472e-06, + "loss": 0.5416, + "step": 2287 + }, + { + "epoch": 1.1144852224748294, + "grad_norm": 2.584869623184204, + "learning_rate": 4.61431383445858e-06, + "loss": 0.6503, + "step": 2288 + }, + { + "epoch": 1.1149723936342968, + "grad_norm": 2.672809362411499, + "learning_rate": 4.613970575252597e-06, + "loss": 0.5085, + "step": 2289 + }, + { + "epoch": 1.1154595647937642, + "grad_norm": 3.0067214965820312, + "learning_rate": 4.6136271761442375e-06, + "loss": 0.5704, + "step": 2290 + }, + { + "epoch": 1.1159467359532316, + "grad_norm": 2.7077791690826416, + "learning_rate": 4.613283637156229e-06, + "loss": 0.5701, + "step": 2291 + }, + { + "epoch": 1.116433907112699, + "grad_norm": 2.8523800373077393, + "learning_rate": 4.612939958311305e-06, + "loss": 0.5996, + "step": 2292 + }, + { + "epoch": 1.1169210782721664, + "grad_norm": 2.906377077102661, + "learning_rate": 4.612596139632212e-06, + "loss": 0.5806, + "step": 2293 + }, + { + "epoch": 1.1174082494316337, + "grad_norm": 4.028451919555664, + "learning_rate": 4.6122521811417035e-06, + "loss": 0.4981, + "step": 2294 + }, + { + "epoch": 1.117895420591101, + "grad_norm": 2.7259726524353027, + "learning_rate": 4.611908082862541e-06, + "loss": 0.5553, + "step": 2295 + }, + { + "epoch": 1.1183825917505683, + "grad_norm": 2.795499563217163, + "learning_rate": 4.611563844817499e-06, + "loss": 0.6215, + "step": 2296 + }, + { + "epoch": 1.1188697629100357, + "grad_norm": 3.0742533206939697, + "learning_rate": 4.6112194670293594e-06, + "loss": 0.5714, + "step": 2297 + }, + { + "epoch": 1.119356934069503, + "grad_norm": 3.1698405742645264, + "learning_rate": 4.610874949520911e-06, + "loss": 0.66, + "step": 2298 + }, + { + "epoch": 1.1198441052289705, + "grad_norm": 2.717935085296631, + "learning_rate": 4.610530292314955e-06, + "loss": 0.6032, + "step": 2299 + }, + { + "epoch": 1.1203312763884379, + "grad_norm": 2.856475353240967, + "learning_rate": 4.610185495434302e-06, + "loss": 0.6173, + "step": 2300 + }, + { + "epoch": 1.1208184475479053, + "grad_norm": 3.0328004360198975, + "learning_rate": 4.6098405589017685e-06, + "loss": 0.528, + "step": 2301 + }, + { + "epoch": 1.1213056187073724, + "grad_norm": 3.045421600341797, + "learning_rate": 4.609495482740183e-06, + "loss": 0.6027, + "step": 2302 + }, + { + "epoch": 1.1217927898668398, + "grad_norm": 2.9758408069610596, + "learning_rate": 4.6091502669723825e-06, + "loss": 0.6461, + "step": 2303 + }, + { + "epoch": 1.1222799610263072, + "grad_norm": 2.9456968307495117, + "learning_rate": 4.608804911621214e-06, + "loss": 0.4769, + "step": 2304 + }, + { + "epoch": 1.1227671321857746, + "grad_norm": 3.139524459838867, + "learning_rate": 4.608459416709533e-06, + "loss": 0.5293, + "step": 2305 + }, + { + "epoch": 1.123254303345242, + "grad_norm": 2.6803505420684814, + "learning_rate": 4.608113782260205e-06, + "loss": 0.5515, + "step": 2306 + }, + { + "epoch": 1.1237414745047094, + "grad_norm": 2.7592084407806396, + "learning_rate": 4.6077680082961025e-06, + "loss": 0.5561, + "step": 2307 + }, + { + "epoch": 1.1242286456641768, + "grad_norm": 2.962873935699463, + "learning_rate": 4.607422094840109e-06, + "loss": 0.6325, + "step": 2308 + }, + { + "epoch": 1.124715816823644, + "grad_norm": 2.8350396156311035, + "learning_rate": 4.6070760419151184e-06, + "loss": 0.5111, + "step": 2309 + }, + { + "epoch": 1.1252029879831114, + "grad_norm": 2.6283016204833984, + "learning_rate": 4.606729849544032e-06, + "loss": 0.5349, + "step": 2310 + }, + { + "epoch": 1.1256901591425788, + "grad_norm": 2.7001912593841553, + "learning_rate": 4.606383517749759e-06, + "loss": 0.6206, + "step": 2311 + }, + { + "epoch": 1.1261773303020461, + "grad_norm": 2.7472846508026123, + "learning_rate": 4.606037046555221e-06, + "loss": 0.5738, + "step": 2312 + }, + { + "epoch": 1.1266645014615135, + "grad_norm": 2.876810312271118, + "learning_rate": 4.605690435983348e-06, + "loss": 0.5615, + "step": 2313 + }, + { + "epoch": 1.127151672620981, + "grad_norm": 2.979128360748291, + "learning_rate": 4.605343686057077e-06, + "loss": 0.5691, + "step": 2314 + }, + { + "epoch": 1.1276388437804483, + "grad_norm": 2.915635347366333, + "learning_rate": 4.604996796799358e-06, + "loss": 0.5959, + "step": 2315 + }, + { + "epoch": 1.1281260149399155, + "grad_norm": 2.999089479446411, + "learning_rate": 4.604649768233147e-06, + "loss": 0.5513, + "step": 2316 + }, + { + "epoch": 1.1286131860993829, + "grad_norm": 2.9983723163604736, + "learning_rate": 4.604302600381411e-06, + "loss": 0.4933, + "step": 2317 + }, + { + "epoch": 1.1291003572588503, + "grad_norm": 2.4815733432769775, + "learning_rate": 4.603955293267123e-06, + "loss": 0.5596, + "step": 2318 + }, + { + "epoch": 1.1295875284183177, + "grad_norm": 2.7884483337402344, + "learning_rate": 4.60360784691327e-06, + "loss": 0.6019, + "step": 2319 + }, + { + "epoch": 1.130074699577785, + "grad_norm": 2.5081613063812256, + "learning_rate": 4.603260261342847e-06, + "loss": 0.5473, + "step": 2320 + }, + { + "epoch": 1.1305618707372525, + "grad_norm": 2.5478949546813965, + "learning_rate": 4.602912536578855e-06, + "loss": 0.5579, + "step": 2321 + }, + { + "epoch": 1.1310490418967196, + "grad_norm": 2.7724907398223877, + "learning_rate": 4.602564672644307e-06, + "loss": 0.6581, + "step": 2322 + }, + { + "epoch": 1.131536213056187, + "grad_norm": 2.9631428718566895, + "learning_rate": 4.602216669562225e-06, + "loss": 0.538, + "step": 2323 + }, + { + "epoch": 1.1320233842156544, + "grad_norm": 2.877471923828125, + "learning_rate": 4.6018685273556384e-06, + "loss": 0.5303, + "step": 2324 + }, + { + "epoch": 1.1325105553751218, + "grad_norm": 2.841881036758423, + "learning_rate": 4.6015202460475896e-06, + "loss": 0.5672, + "step": 2325 + }, + { + "epoch": 1.1329977265345892, + "grad_norm": 3.1488702297210693, + "learning_rate": 4.601171825661127e-06, + "loss": 0.6066, + "step": 2326 + }, + { + "epoch": 1.1334848976940566, + "grad_norm": 2.915126323699951, + "learning_rate": 4.600823266219307e-06, + "loss": 0.5727, + "step": 2327 + }, + { + "epoch": 1.1339720688535238, + "grad_norm": 2.730637311935425, + "learning_rate": 4.6004745677452e-06, + "loss": 0.5965, + "step": 2328 + }, + { + "epoch": 1.1344592400129911, + "grad_norm": 2.585397481918335, + "learning_rate": 4.600125730261882e-06, + "loss": 0.6283, + "step": 2329 + }, + { + "epoch": 1.1349464111724585, + "grad_norm": 3.103215456008911, + "learning_rate": 4.5997767537924385e-06, + "loss": 0.6232, + "step": 2330 + }, + { + "epoch": 1.135433582331926, + "grad_norm": 3.040714740753174, + "learning_rate": 4.5994276383599645e-06, + "loss": 0.5358, + "step": 2331 + }, + { + "epoch": 1.1359207534913933, + "grad_norm": 2.4605515003204346, + "learning_rate": 4.599078383987566e-06, + "loss": 0.6816, + "step": 2332 + }, + { + "epoch": 1.1364079246508607, + "grad_norm": 2.916703224182129, + "learning_rate": 4.598728990698354e-06, + "loss": 0.5769, + "step": 2333 + }, + { + "epoch": 1.136895095810328, + "grad_norm": 3.356475830078125, + "learning_rate": 4.598379458515453e-06, + "loss": 0.5841, + "step": 2334 + }, + { + "epoch": 1.1373822669697953, + "grad_norm": 2.562690258026123, + "learning_rate": 4.598029787461996e-06, + "loss": 0.5141, + "step": 2335 + }, + { + "epoch": 1.1378694381292627, + "grad_norm": 2.6746251583099365, + "learning_rate": 4.597679977561122e-06, + "loss": 0.6443, + "step": 2336 + }, + { + "epoch": 1.13835660928873, + "grad_norm": 2.5905721187591553, + "learning_rate": 4.597330028835983e-06, + "loss": 0.5773, + "step": 2337 + }, + { + "epoch": 1.1388437804481975, + "grad_norm": 2.7195987701416016, + "learning_rate": 4.596979941309738e-06, + "loss": 0.5595, + "step": 2338 + }, + { + "epoch": 1.1393309516076648, + "grad_norm": 2.478087902069092, + "learning_rate": 4.596629715005555e-06, + "loss": 0.5811, + "step": 2339 + }, + { + "epoch": 1.1398181227671322, + "grad_norm": 2.6510679721832275, + "learning_rate": 4.596279349946613e-06, + "loss": 0.5951, + "step": 2340 + }, + { + "epoch": 1.1403052939265996, + "grad_norm": 2.7975804805755615, + "learning_rate": 4.595928846156099e-06, + "loss": 0.5739, + "step": 2341 + }, + { + "epoch": 1.1407924650860668, + "grad_norm": 2.7602388858795166, + "learning_rate": 4.59557820365721e-06, + "loss": 0.4869, + "step": 2342 + }, + { + "epoch": 1.1412796362455342, + "grad_norm": 2.564473867416382, + "learning_rate": 4.5952274224731494e-06, + "loss": 0.5985, + "step": 2343 + }, + { + "epoch": 1.1417668074050016, + "grad_norm": 3.214627504348755, + "learning_rate": 4.594876502627133e-06, + "loss": 0.6065, + "step": 2344 + }, + { + "epoch": 1.142253978564469, + "grad_norm": 3.0695221424102783, + "learning_rate": 4.594525444142386e-06, + "loss": 0.5577, + "step": 2345 + }, + { + "epoch": 1.1427411497239364, + "grad_norm": 2.940066337585449, + "learning_rate": 4.594174247042139e-06, + "loss": 0.6308, + "step": 2346 + }, + { + "epoch": 1.1432283208834038, + "grad_norm": 2.7924060821533203, + "learning_rate": 4.5938229113496355e-06, + "loss": 0.5667, + "step": 2347 + }, + { + "epoch": 1.1437154920428712, + "grad_norm": 2.813570261001587, + "learning_rate": 4.593471437088128e-06, + "loss": 0.6007, + "step": 2348 + }, + { + "epoch": 1.1442026632023383, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.593119824280874e-06, + "loss": 0.5911, + "step": 2349 + }, + { + "epoch": 1.1446898343618057, + "grad_norm": 2.7526888847351074, + "learning_rate": 4.592768072951146e-06, + "loss": 0.6362, + "step": 2350 + }, + { + "epoch": 1.145177005521273, + "grad_norm": 3.119004964828491, + "learning_rate": 4.592416183122221e-06, + "loss": 0.5327, + "step": 2351 + }, + { + "epoch": 1.1456641766807405, + "grad_norm": 3.008434295654297, + "learning_rate": 4.592064154817388e-06, + "loss": 0.6248, + "step": 2352 + }, + { + "epoch": 1.146151347840208, + "grad_norm": 2.7384815216064453, + "learning_rate": 4.591711988059945e-06, + "loss": 0.5469, + "step": 2353 + }, + { + "epoch": 1.1466385189996753, + "grad_norm": 2.8623807430267334, + "learning_rate": 4.591359682873197e-06, + "loss": 0.5044, + "step": 2354 + }, + { + "epoch": 1.1471256901591427, + "grad_norm": 2.464040756225586, + "learning_rate": 4.59100723928046e-06, + "loss": 0.5729, + "step": 2355 + }, + { + "epoch": 1.1476128613186098, + "grad_norm": 2.7397615909576416, + "learning_rate": 4.590654657305057e-06, + "loss": 0.5622, + "step": 2356 + }, + { + "epoch": 1.1481000324780772, + "grad_norm": 2.7450449466705322, + "learning_rate": 4.590301936970325e-06, + "loss": 0.578, + "step": 2357 + }, + { + "epoch": 1.1485872036375446, + "grad_norm": 2.695934772491455, + "learning_rate": 4.589949078299605e-06, + "loss": 0.6188, + "step": 2358 + }, + { + "epoch": 1.149074374797012, + "grad_norm": 2.881281852722168, + "learning_rate": 4.58959608131625e-06, + "loss": 0.5848, + "step": 2359 + }, + { + "epoch": 1.1495615459564794, + "grad_norm": 2.9010136127471924, + "learning_rate": 4.58924294604362e-06, + "loss": 0.5431, + "step": 2360 + }, + { + "epoch": 1.1500487171159468, + "grad_norm": 2.3557021617889404, + "learning_rate": 4.588889672505088e-06, + "loss": 0.595, + "step": 2361 + }, + { + "epoch": 1.1505358882754142, + "grad_norm": 2.6904313564300537, + "learning_rate": 4.588536260724031e-06, + "loss": 0.5507, + "step": 2362 + }, + { + "epoch": 1.1510230594348814, + "grad_norm": 2.5639474391937256, + "learning_rate": 4.5881827107238385e-06, + "loss": 0.6167, + "step": 2363 + }, + { + "epoch": 1.1515102305943488, + "grad_norm": 4.007477283477783, + "learning_rate": 4.58782902252791e-06, + "loss": 0.6424, + "step": 2364 + }, + { + "epoch": 1.1519974017538162, + "grad_norm": 2.7735745906829834, + "learning_rate": 4.5874751961596495e-06, + "loss": 0.6093, + "step": 2365 + }, + { + "epoch": 1.1524845729132835, + "grad_norm": 2.5832040309906006, + "learning_rate": 4.5871212316424755e-06, + "loss": 0.6381, + "step": 2366 + }, + { + "epoch": 1.152971744072751, + "grad_norm": 2.949803113937378, + "learning_rate": 4.586767128999813e-06, + "loss": 0.5802, + "step": 2367 + }, + { + "epoch": 1.1534589152322183, + "grad_norm": 2.7254128456115723, + "learning_rate": 4.586412888255096e-06, + "loss": 0.5382, + "step": 2368 + }, + { + "epoch": 1.1539460863916857, + "grad_norm": 2.745072364807129, + "learning_rate": 4.586058509431769e-06, + "loss": 0.588, + "step": 2369 + }, + { + "epoch": 1.154433257551153, + "grad_norm": 2.7625458240509033, + "learning_rate": 4.585703992553283e-06, + "loss": 0.5458, + "step": 2370 + }, + { + "epoch": 1.1549204287106203, + "grad_norm": 2.842344284057617, + "learning_rate": 4.585349337643101e-06, + "loss": 0.5915, + "step": 2371 + }, + { + "epoch": 1.1554075998700877, + "grad_norm": 2.480079174041748, + "learning_rate": 4.584994544724695e-06, + "loss": 0.5652, + "step": 2372 + }, + { + "epoch": 1.155894771029555, + "grad_norm": 2.8131937980651855, + "learning_rate": 4.584639613821545e-06, + "loss": 0.5987, + "step": 2373 + }, + { + "epoch": 1.1563819421890225, + "grad_norm": 2.6191322803497314, + "learning_rate": 4.584284544957137e-06, + "loss": 0.5833, + "step": 2374 + }, + { + "epoch": 1.1568691133484899, + "grad_norm": 4.382278919219971, + "learning_rate": 4.583929338154973e-06, + "loss": 0.6475, + "step": 2375 + }, + { + "epoch": 1.1573562845079572, + "grad_norm": 2.90503191947937, + "learning_rate": 4.583573993438559e-06, + "loss": 0.5221, + "step": 2376 + }, + { + "epoch": 1.1578434556674244, + "grad_norm": 2.953878164291382, + "learning_rate": 4.583218510831414e-06, + "loss": 0.6051, + "step": 2377 + }, + { + "epoch": 1.1583306268268918, + "grad_norm": 2.602182149887085, + "learning_rate": 4.5828628903570604e-06, + "loss": 0.5008, + "step": 2378 + }, + { + "epoch": 1.1588177979863592, + "grad_norm": 2.981954336166382, + "learning_rate": 4.5825071320390345e-06, + "loss": 0.6267, + "step": 2379 + }, + { + "epoch": 1.1593049691458266, + "grad_norm": 2.8299286365509033, + "learning_rate": 4.5821512359008816e-06, + "loss": 0.4877, + "step": 2380 + }, + { + "epoch": 1.159792140305294, + "grad_norm": 2.572594404220581, + "learning_rate": 4.581795201966153e-06, + "loss": 0.5199, + "step": 2381 + }, + { + "epoch": 1.1602793114647614, + "grad_norm": 2.8213512897491455, + "learning_rate": 4.581439030258411e-06, + "loss": 0.5703, + "step": 2382 + }, + { + "epoch": 1.1607664826242288, + "grad_norm": 3.043307304382324, + "learning_rate": 4.58108272080123e-06, + "loss": 0.646, + "step": 2383 + }, + { + "epoch": 1.161253653783696, + "grad_norm": 2.6094627380371094, + "learning_rate": 4.580726273618187e-06, + "loss": 0.5458, + "step": 2384 + }, + { + "epoch": 1.1617408249431633, + "grad_norm": 2.9488165378570557, + "learning_rate": 4.580369688732873e-06, + "loss": 0.5719, + "step": 2385 + }, + { + "epoch": 1.1622279961026307, + "grad_norm": 2.8557677268981934, + "learning_rate": 4.5800129661688865e-06, + "loss": 0.6473, + "step": 2386 + }, + { + "epoch": 1.1627151672620981, + "grad_norm": 2.5459704399108887, + "learning_rate": 4.5796561059498355e-06, + "loss": 0.5321, + "step": 2387 + }, + { + "epoch": 1.1632023384215655, + "grad_norm": 3.078120470046997, + "learning_rate": 4.5792991080993376e-06, + "loss": 0.5145, + "step": 2388 + }, + { + "epoch": 1.163689509581033, + "grad_norm": 2.18644642829895, + "learning_rate": 4.578941972641018e-06, + "loss": 0.5368, + "step": 2389 + }, + { + "epoch": 1.1641766807405, + "grad_norm": 2.7420356273651123, + "learning_rate": 4.578584699598513e-06, + "loss": 0.4913, + "step": 2390 + }, + { + "epoch": 1.1646638518999675, + "grad_norm": 3.0193707942962646, + "learning_rate": 4.578227288995465e-06, + "loss": 0.6261, + "step": 2391 + }, + { + "epoch": 1.1651510230594349, + "grad_norm": 2.832667589187622, + "learning_rate": 4.577869740855529e-06, + "loss": 0.6343, + "step": 2392 + }, + { + "epoch": 1.1656381942189022, + "grad_norm": 2.8881397247314453, + "learning_rate": 4.577512055202367e-06, + "loss": 0.5488, + "step": 2393 + }, + { + "epoch": 1.1661253653783696, + "grad_norm": 2.6420066356658936, + "learning_rate": 4.577154232059651e-06, + "loss": 0.5714, + "step": 2394 + }, + { + "epoch": 1.166612536537837, + "grad_norm": 2.777855634689331, + "learning_rate": 4.576796271451061e-06, + "loss": 0.5494, + "step": 2395 + }, + { + "epoch": 1.1670997076973042, + "grad_norm": 2.896953582763672, + "learning_rate": 4.576438173400287e-06, + "loss": 0.5853, + "step": 2396 + }, + { + "epoch": 1.1675868788567716, + "grad_norm": 2.670149564743042, + "learning_rate": 4.576079937931029e-06, + "loss": 0.5646, + "step": 2397 + }, + { + "epoch": 1.168074050016239, + "grad_norm": 2.583988904953003, + "learning_rate": 4.575721565066993e-06, + "loss": 0.5522, + "step": 2398 + }, + { + "epoch": 1.1685612211757064, + "grad_norm": 2.9450387954711914, + "learning_rate": 4.575363054831898e-06, + "loss": 0.5106, + "step": 2399 + }, + { + "epoch": 1.1690483923351738, + "grad_norm": 2.6557648181915283, + "learning_rate": 4.575004407249468e-06, + "loss": 0.5389, + "step": 2400 + }, + { + "epoch": 1.1695355634946412, + "grad_norm": 3.1303493976593018, + "learning_rate": 4.57464562234344e-06, + "loss": 0.5826, + "step": 2401 + }, + { + "epoch": 1.1700227346541086, + "grad_norm": 3.0899746417999268, + "learning_rate": 4.574286700137559e-06, + "loss": 0.5701, + "step": 2402 + }, + { + "epoch": 1.1705099058135757, + "grad_norm": 2.4170732498168945, + "learning_rate": 4.573927640655576e-06, + "loss": 0.6221, + "step": 2403 + }, + { + "epoch": 1.1709970769730431, + "grad_norm": 2.7211127281188965, + "learning_rate": 4.573568443921256e-06, + "loss": 0.6122, + "step": 2404 + }, + { + "epoch": 1.1714842481325105, + "grad_norm": 2.811253309249878, + "learning_rate": 4.573209109958368e-06, + "loss": 0.5855, + "step": 2405 + }, + { + "epoch": 1.171971419291978, + "grad_norm": 2.5643720626831055, + "learning_rate": 4.5728496387906954e-06, + "loss": 0.5147, + "step": 2406 + }, + { + "epoch": 1.1724585904514453, + "grad_norm": 3.1337218284606934, + "learning_rate": 4.572490030442027e-06, + "loss": 0.631, + "step": 2407 + }, + { + "epoch": 1.1729457616109127, + "grad_norm": 4.931292533874512, + "learning_rate": 4.57213028493616e-06, + "loss": 0.6621, + "step": 2408 + }, + { + "epoch": 1.17343293277038, + "grad_norm": 2.819248914718628, + "learning_rate": 4.5717704022969045e-06, + "loss": 0.5252, + "step": 2409 + }, + { + "epoch": 1.1739201039298472, + "grad_norm": 2.674919605255127, + "learning_rate": 4.5714103825480775e-06, + "loss": 0.5582, + "step": 2410 + }, + { + "epoch": 1.1744072750893146, + "grad_norm": 2.5691118240356445, + "learning_rate": 4.571050225713503e-06, + "loss": 0.5953, + "step": 2411 + }, + { + "epoch": 1.174894446248782, + "grad_norm": 2.61698579788208, + "learning_rate": 4.570689931817018e-06, + "loss": 0.563, + "step": 2412 + }, + { + "epoch": 1.1753816174082494, + "grad_norm": 2.495426893234253, + "learning_rate": 4.570329500882465e-06, + "loss": 0.6346, + "step": 2413 + }, + { + "epoch": 1.1758687885677168, + "grad_norm": 2.7325992584228516, + "learning_rate": 4.5699689329337e-06, + "loss": 0.6017, + "step": 2414 + }, + { + "epoch": 1.1763559597271842, + "grad_norm": 2.9711663722991943, + "learning_rate": 4.569608227994583e-06, + "loss": 0.5783, + "step": 2415 + }, + { + "epoch": 1.1768431308866516, + "grad_norm": 2.998420238494873, + "learning_rate": 4.569247386088987e-06, + "loss": 0.5493, + "step": 2416 + }, + { + "epoch": 1.1773303020461188, + "grad_norm": 2.805405855178833, + "learning_rate": 4.568886407240791e-06, + "loss": 0.5268, + "step": 2417 + }, + { + "epoch": 1.1778174732055862, + "grad_norm": 2.6920201778411865, + "learning_rate": 4.568525291473885e-06, + "loss": 0.5578, + "step": 2418 + }, + { + "epoch": 1.1783046443650536, + "grad_norm": 2.8098299503326416, + "learning_rate": 4.568164038812168e-06, + "loss": 0.7429, + "step": 2419 + }, + { + "epoch": 1.178791815524521, + "grad_norm": 2.960599899291992, + "learning_rate": 4.567802649279548e-06, + "loss": 0.4972, + "step": 2420 + }, + { + "epoch": 1.1792789866839883, + "grad_norm": 2.5879101753234863, + "learning_rate": 4.567441122899941e-06, + "loss": 0.587, + "step": 2421 + }, + { + "epoch": 1.1797661578434557, + "grad_norm": 3.1168715953826904, + "learning_rate": 4.5670794596972724e-06, + "loss": 0.6832, + "step": 2422 + }, + { + "epoch": 1.1802533290029231, + "grad_norm": 2.9566876888275146, + "learning_rate": 4.566717659695478e-06, + "loss": 0.5441, + "step": 2423 + }, + { + "epoch": 1.1807405001623903, + "grad_norm": 2.805867910385132, + "learning_rate": 4.5663557229185015e-06, + "loss": 0.5652, + "step": 2424 + }, + { + "epoch": 1.1812276713218577, + "grad_norm": 2.7001521587371826, + "learning_rate": 4.565993649390296e-06, + "loss": 0.5619, + "step": 2425 + }, + { + "epoch": 1.181714842481325, + "grad_norm": 3.2214207649230957, + "learning_rate": 4.565631439134822e-06, + "loss": 0.6705, + "step": 2426 + }, + { + "epoch": 1.1822020136407925, + "grad_norm": 2.876047134399414, + "learning_rate": 4.565269092176053e-06, + "loss": 0.5334, + "step": 2427 + }, + { + "epoch": 1.1826891848002599, + "grad_norm": 2.7881405353546143, + "learning_rate": 4.564906608537967e-06, + "loss": 0.6476, + "step": 2428 + }, + { + "epoch": 1.1831763559597273, + "grad_norm": 3.22598934173584, + "learning_rate": 4.5645439882445545e-06, + "loss": 0.6231, + "step": 2429 + }, + { + "epoch": 1.1836635271191946, + "grad_norm": 2.9333784580230713, + "learning_rate": 4.564181231319812e-06, + "loss": 0.6826, + "step": 2430 + }, + { + "epoch": 1.1841506982786618, + "grad_norm": 3.024423122406006, + "learning_rate": 4.5638183377877484e-06, + "loss": 0.6104, + "step": 2431 + }, + { + "epoch": 1.1846378694381292, + "grad_norm": 3.0884368419647217, + "learning_rate": 4.56345530767238e-06, + "loss": 0.5945, + "step": 2432 + }, + { + "epoch": 1.1851250405975966, + "grad_norm": 2.925002336502075, + "learning_rate": 4.563092140997731e-06, + "loss": 0.5961, + "step": 2433 + }, + { + "epoch": 1.185612211757064, + "grad_norm": 2.755260467529297, + "learning_rate": 4.562728837787836e-06, + "loss": 0.6263, + "step": 2434 + }, + { + "epoch": 1.1860993829165314, + "grad_norm": 2.850696325302124, + "learning_rate": 4.562365398066739e-06, + "loss": 0.531, + "step": 2435 + }, + { + "epoch": 1.1865865540759988, + "grad_norm": 3.0258700847625732, + "learning_rate": 4.562001821858492e-06, + "loss": 0.5606, + "step": 2436 + }, + { + "epoch": 1.1870737252354662, + "grad_norm": 2.7628767490386963, + "learning_rate": 4.561638109187157e-06, + "loss": 0.6122, + "step": 2437 + }, + { + "epoch": 1.1875608963949333, + "grad_norm": 3.0189878940582275, + "learning_rate": 4.561274260076803e-06, + "loss": 0.5291, + "step": 2438 + }, + { + "epoch": 1.1880480675544007, + "grad_norm": 2.5630807876586914, + "learning_rate": 4.56091027455151e-06, + "loss": 0.5417, + "step": 2439 + }, + { + "epoch": 1.1885352387138681, + "grad_norm": 2.747924327850342, + "learning_rate": 4.560546152635368e-06, + "loss": 0.5988, + "step": 2440 + }, + { + "epoch": 1.1890224098733355, + "grad_norm": 2.6322972774505615, + "learning_rate": 4.560181894352473e-06, + "loss": 0.5787, + "step": 2441 + }, + { + "epoch": 1.189509581032803, + "grad_norm": 2.6190261840820312, + "learning_rate": 4.559817499726933e-06, + "loss": 0.5566, + "step": 2442 + }, + { + "epoch": 1.1899967521922703, + "grad_norm": 2.660268545150757, + "learning_rate": 4.5594529687828615e-06, + "loss": 0.5362, + "step": 2443 + }, + { + "epoch": 1.1904839233517377, + "grad_norm": 3.179797887802124, + "learning_rate": 4.559088301544385e-06, + "loss": 0.5097, + "step": 2444 + }, + { + "epoch": 1.1909710945112049, + "grad_norm": 2.9575507640838623, + "learning_rate": 4.558723498035635e-06, + "loss": 0.6081, + "step": 2445 + }, + { + "epoch": 1.1914582656706723, + "grad_norm": 2.884690523147583, + "learning_rate": 4.558358558280758e-06, + "loss": 0.5778, + "step": 2446 + }, + { + "epoch": 1.1919454368301396, + "grad_norm": 3.0036518573760986, + "learning_rate": 4.557993482303902e-06, + "loss": 0.5705, + "step": 2447 + }, + { + "epoch": 1.192432607989607, + "grad_norm": 2.860417127609253, + "learning_rate": 4.557628270129229e-06, + "loss": 0.5756, + "step": 2448 + }, + { + "epoch": 1.1929197791490744, + "grad_norm": 3.101024866104126, + "learning_rate": 4.5572629217809085e-06, + "loss": 0.5996, + "step": 2449 + }, + { + "epoch": 1.1934069503085418, + "grad_norm": 3.3909502029418945, + "learning_rate": 4.556897437283119e-06, + "loss": 0.5657, + "step": 2450 + }, + { + "epoch": 1.1938941214680092, + "grad_norm": 2.5865566730499268, + "learning_rate": 4.556531816660049e-06, + "loss": 0.5673, + "step": 2451 + }, + { + "epoch": 1.1943812926274764, + "grad_norm": 2.6879184246063232, + "learning_rate": 4.5561660599358955e-06, + "loss": 0.551, + "step": 2452 + }, + { + "epoch": 1.1948684637869438, + "grad_norm": 3.0687270164489746, + "learning_rate": 4.555800167134864e-06, + "loss": 0.6464, + "step": 2453 + }, + { + "epoch": 1.1953556349464112, + "grad_norm": 2.8800928592681885, + "learning_rate": 4.555434138281167e-06, + "loss": 0.6758, + "step": 2454 + }, + { + "epoch": 1.1958428061058786, + "grad_norm": 2.727818727493286, + "learning_rate": 4.5550679733990326e-06, + "loss": 0.5727, + "step": 2455 + }, + { + "epoch": 1.196329977265346, + "grad_norm": 2.837873697280884, + "learning_rate": 4.5547016725126895e-06, + "loss": 0.6082, + "step": 2456 + }, + { + "epoch": 1.1968171484248131, + "grad_norm": 2.7087948322296143, + "learning_rate": 4.554335235646381e-06, + "loss": 0.6625, + "step": 2457 + }, + { + "epoch": 1.1973043195842805, + "grad_norm": 2.9681456089019775, + "learning_rate": 4.553968662824358e-06, + "loss": 0.5548, + "step": 2458 + }, + { + "epoch": 1.197791490743748, + "grad_norm": 2.6016743183135986, + "learning_rate": 4.5536019540708806e-06, + "loss": 0.5631, + "step": 2459 + }, + { + "epoch": 1.1982786619032153, + "grad_norm": 2.7272868156433105, + "learning_rate": 4.5532351094102164e-06, + "loss": 0.5998, + "step": 2460 + }, + { + "epoch": 1.1987658330626827, + "grad_norm": 2.99001407623291, + "learning_rate": 4.552868128866644e-06, + "loss": 0.5724, + "step": 2461 + }, + { + "epoch": 1.19925300422215, + "grad_norm": 2.904127359390259, + "learning_rate": 4.552501012464451e-06, + "loss": 0.5964, + "step": 2462 + }, + { + "epoch": 1.1997401753816175, + "grad_norm": 2.7340481281280518, + "learning_rate": 4.552133760227931e-06, + "loss": 0.5794, + "step": 2463 + }, + { + "epoch": 1.2002273465410846, + "grad_norm": 3.2207257747650146, + "learning_rate": 4.551766372181391e-06, + "loss": 0.5551, + "step": 2464 + }, + { + "epoch": 1.200714517700552, + "grad_norm": 2.811436891555786, + "learning_rate": 4.551398848349142e-06, + "loss": 0.5424, + "step": 2465 + }, + { + "epoch": 1.2012016888600194, + "grad_norm": 2.569868564605713, + "learning_rate": 4.5510311887555095e-06, + "loss": 0.5565, + "step": 2466 + }, + { + "epoch": 1.2016888600194868, + "grad_norm": 2.797970771789551, + "learning_rate": 4.5506633934248234e-06, + "loss": 0.5703, + "step": 2467 + }, + { + "epoch": 1.2021760311789542, + "grad_norm": 2.769767999649048, + "learning_rate": 4.550295462381425e-06, + "loss": 0.484, + "step": 2468 + }, + { + "epoch": 1.2026632023384216, + "grad_norm": 2.5655477046966553, + "learning_rate": 4.549927395649664e-06, + "loss": 0.495, + "step": 2469 + }, + { + "epoch": 1.203150373497889, + "grad_norm": 2.369896650314331, + "learning_rate": 4.549559193253898e-06, + "loss": 0.5544, + "step": 2470 + }, + { + "epoch": 1.2036375446573562, + "grad_norm": 2.475151300430298, + "learning_rate": 4.549190855218496e-06, + "loss": 0.614, + "step": 2471 + }, + { + "epoch": 1.2041247158168236, + "grad_norm": 3.1195309162139893, + "learning_rate": 4.548822381567834e-06, + "loss": 0.584, + "step": 2472 + }, + { + "epoch": 1.204611886976291, + "grad_norm": 2.714754581451416, + "learning_rate": 4.548453772326298e-06, + "loss": 0.54, + "step": 2473 + }, + { + "epoch": 1.2050990581357583, + "grad_norm": 2.6439402103424072, + "learning_rate": 4.548085027518282e-06, + "loss": 0.5655, + "step": 2474 + }, + { + "epoch": 1.2055862292952257, + "grad_norm": 2.6897528171539307, + "learning_rate": 4.547716147168189e-06, + "loss": 0.6159, + "step": 2475 + }, + { + "epoch": 1.2060734004546931, + "grad_norm": 2.8643133640289307, + "learning_rate": 4.547347131300433e-06, + "loss": 0.4892, + "step": 2476 + }, + { + "epoch": 1.2065605716141605, + "grad_norm": 2.5964763164520264, + "learning_rate": 4.546977979939434e-06, + "loss": 0.651, + "step": 2477 + }, + { + "epoch": 1.2070477427736277, + "grad_norm": 3.04134464263916, + "learning_rate": 4.546608693109623e-06, + "loss": 0.6697, + "step": 2478 + }, + { + "epoch": 1.207534913933095, + "grad_norm": 3.749264717102051, + "learning_rate": 4.546239270835439e-06, + "loss": 0.5995, + "step": 2479 + }, + { + "epoch": 1.2080220850925625, + "grad_norm": 2.9449589252471924, + "learning_rate": 4.5458697131413305e-06, + "loss": 0.624, + "step": 2480 + }, + { + "epoch": 1.2085092562520299, + "grad_norm": 2.83860445022583, + "learning_rate": 4.545500020051755e-06, + "loss": 0.6255, + "step": 2481 + }, + { + "epoch": 1.2089964274114973, + "grad_norm": 2.4550697803497314, + "learning_rate": 4.545130191591179e-06, + "loss": 0.5608, + "step": 2482 + }, + { + "epoch": 1.2094835985709647, + "grad_norm": 2.697411298751831, + "learning_rate": 4.544760227784077e-06, + "loss": 0.6591, + "step": 2483 + }, + { + "epoch": 1.209970769730432, + "grad_norm": 2.7904810905456543, + "learning_rate": 4.544390128654933e-06, + "loss": 0.615, + "step": 2484 + }, + { + "epoch": 1.2104579408898992, + "grad_norm": 2.7608375549316406, + "learning_rate": 4.544019894228242e-06, + "loss": 0.5559, + "step": 2485 + }, + { + "epoch": 1.2109451120493666, + "grad_norm": 3.109513521194458, + "learning_rate": 4.543649524528504e-06, + "loss": 0.65, + "step": 2486 + }, + { + "epoch": 1.211432283208834, + "grad_norm": 3.1037628650665283, + "learning_rate": 4.543279019580231e-06, + "loss": 0.5967, + "step": 2487 + }, + { + "epoch": 1.2119194543683014, + "grad_norm": 2.8756327629089355, + "learning_rate": 4.542908379407942e-06, + "loss": 0.531, + "step": 2488 + }, + { + "epoch": 1.2124066255277688, + "grad_norm": 2.7715890407562256, + "learning_rate": 4.542537604036168e-06, + "loss": 0.5788, + "step": 2489 + }, + { + "epoch": 1.2128937966872362, + "grad_norm": 2.514050245285034, + "learning_rate": 4.542166693489444e-06, + "loss": 0.5743, + "step": 2490 + }, + { + "epoch": 1.2133809678467036, + "grad_norm": 2.9868741035461426, + "learning_rate": 4.54179564779232e-06, + "loss": 0.5737, + "step": 2491 + }, + { + "epoch": 1.2138681390061707, + "grad_norm": 2.7268712520599365, + "learning_rate": 4.541424466969349e-06, + "loss": 0.5572, + "step": 2492 + }, + { + "epoch": 1.2143553101656381, + "grad_norm": 2.9451286792755127, + "learning_rate": 4.541053151045097e-06, + "loss": 0.6135, + "step": 2493 + }, + { + "epoch": 1.2148424813251055, + "grad_norm": 2.8532917499542236, + "learning_rate": 4.5406817000441386e-06, + "loss": 0.6098, + "step": 2494 + }, + { + "epoch": 1.215329652484573, + "grad_norm": 2.6144237518310547, + "learning_rate": 4.540310113991054e-06, + "loss": 0.5493, + "step": 2495 + }, + { + "epoch": 1.2158168236440403, + "grad_norm": 6.1479644775390625, + "learning_rate": 4.539938392910437e-06, + "loss": 0.5457, + "step": 2496 + }, + { + "epoch": 1.2163039948035077, + "grad_norm": 2.9845032691955566, + "learning_rate": 4.539566536826886e-06, + "loss": 0.5557, + "step": 2497 + }, + { + "epoch": 1.216791165962975, + "grad_norm": 3.2495555877685547, + "learning_rate": 4.539194545765012e-06, + "loss": 0.6152, + "step": 2498 + }, + { + "epoch": 1.2172783371224423, + "grad_norm": 2.8222854137420654, + "learning_rate": 4.538822419749433e-06, + "loss": 0.5268, + "step": 2499 + }, + { + "epoch": 1.2177655082819097, + "grad_norm": 3.036560297012329, + "learning_rate": 4.538450158804777e-06, + "loss": 0.6009, + "step": 2500 + }, + { + "epoch": 1.218252679441377, + "grad_norm": 2.8789520263671875, + "learning_rate": 4.538077762955678e-06, + "loss": 0.5506, + "step": 2501 + }, + { + "epoch": 1.2187398506008444, + "grad_norm": 2.875253677368164, + "learning_rate": 4.537705232226783e-06, + "loss": 0.5217, + "step": 2502 + }, + { + "epoch": 1.2192270217603118, + "grad_norm": 3.3245303630828857, + "learning_rate": 4.537332566642745e-06, + "loss": 0.588, + "step": 2503 + }, + { + "epoch": 1.2197141929197792, + "grad_norm": 2.909040927886963, + "learning_rate": 4.536959766228229e-06, + "loss": 0.6624, + "step": 2504 + }, + { + "epoch": 1.2202013640792466, + "grad_norm": 3.3315746784210205, + "learning_rate": 4.536586831007903e-06, + "loss": 0.5992, + "step": 2505 + }, + { + "epoch": 1.2206885352387138, + "grad_norm": 2.8663153648376465, + "learning_rate": 4.536213761006452e-06, + "loss": 0.636, + "step": 2506 + }, + { + "epoch": 1.2211757063981812, + "grad_norm": 2.785860538482666, + "learning_rate": 4.535840556248563e-06, + "loss": 0.5427, + "step": 2507 + }, + { + "epoch": 1.2216628775576486, + "grad_norm": 2.414602279663086, + "learning_rate": 4.535467216758936e-06, + "loss": 0.5341, + "step": 2508 + }, + { + "epoch": 1.222150048717116, + "grad_norm": 2.945741891860962, + "learning_rate": 4.535093742562278e-06, + "loss": 0.4783, + "step": 2509 + }, + { + "epoch": 1.2226372198765834, + "grad_norm": 2.677344799041748, + "learning_rate": 4.534720133683306e-06, + "loss": 0.5234, + "step": 2510 + }, + { + "epoch": 1.2231243910360508, + "grad_norm": 2.363163948059082, + "learning_rate": 4.534346390146744e-06, + "loss": 0.4981, + "step": 2511 + }, + { + "epoch": 1.2236115621955181, + "grad_norm": 3.290342330932617, + "learning_rate": 4.533972511977328e-06, + "loss": 0.5592, + "step": 2512 + }, + { + "epoch": 1.2240987333549853, + "grad_norm": 2.6569297313690186, + "learning_rate": 4.5335984991998005e-06, + "loss": 0.5563, + "step": 2513 + }, + { + "epoch": 1.2245859045144527, + "grad_norm": 2.9604578018188477, + "learning_rate": 4.533224351838914e-06, + "loss": 0.6321, + "step": 2514 + }, + { + "epoch": 1.22507307567392, + "grad_norm": 3.0533530712127686, + "learning_rate": 4.532850069919429e-06, + "loss": 0.5815, + "step": 2515 + }, + { + "epoch": 1.2255602468333875, + "grad_norm": 2.61440372467041, + "learning_rate": 4.532475653466115e-06, + "loss": 0.5865, + "step": 2516 + }, + { + "epoch": 1.2260474179928549, + "grad_norm": 2.613497018814087, + "learning_rate": 4.532101102503753e-06, + "loss": 0.5316, + "step": 2517 + }, + { + "epoch": 1.2265345891523223, + "grad_norm": 2.8812060356140137, + "learning_rate": 4.531726417057128e-06, + "loss": 0.6687, + "step": 2518 + }, + { + "epoch": 1.2270217603117897, + "grad_norm": 2.8062944412231445, + "learning_rate": 4.531351597151038e-06, + "loss": 0.5963, + "step": 2519 + }, + { + "epoch": 1.2275089314712568, + "grad_norm": 2.6505331993103027, + "learning_rate": 4.530976642810289e-06, + "loss": 0.5802, + "step": 2520 + }, + { + "epoch": 1.2279961026307242, + "grad_norm": 2.6740853786468506, + "learning_rate": 4.5306015540596944e-06, + "loss": 0.6222, + "step": 2521 + }, + { + "epoch": 1.2284832737901916, + "grad_norm": 3.061666965484619, + "learning_rate": 4.530226330924078e-06, + "loss": 0.499, + "step": 2522 + }, + { + "epoch": 1.228970444949659, + "grad_norm": 2.6552176475524902, + "learning_rate": 4.529850973428273e-06, + "loss": 0.6069, + "step": 2523 + }, + { + "epoch": 1.2294576161091264, + "grad_norm": 2.854926586151123, + "learning_rate": 4.529475481597118e-06, + "loss": 0.6434, + "step": 2524 + }, + { + "epoch": 1.2299447872685936, + "grad_norm": 3.4438085556030273, + "learning_rate": 4.529099855455467e-06, + "loss": 0.5908, + "step": 2525 + }, + { + "epoch": 1.230431958428061, + "grad_norm": 2.788752317428589, + "learning_rate": 4.528724095028175e-06, + "loss": 0.5565, + "step": 2526 + }, + { + "epoch": 1.2309191295875284, + "grad_norm": 2.9042491912841797, + "learning_rate": 4.5283482003401115e-06, + "loss": 0.5136, + "step": 2527 + }, + { + "epoch": 1.2314063007469958, + "grad_norm": 2.881950855255127, + "learning_rate": 4.527972171416153e-06, + "loss": 0.5636, + "step": 2528 + }, + { + "epoch": 1.2318934719064631, + "grad_norm": 2.5842370986938477, + "learning_rate": 4.527596008281184e-06, + "loss": 0.5313, + "step": 2529 + }, + { + "epoch": 1.2323806430659305, + "grad_norm": 2.9726943969726562, + "learning_rate": 4.527219710960101e-06, + "loss": 0.5583, + "step": 2530 + }, + { + "epoch": 1.232867814225398, + "grad_norm": 2.921773910522461, + "learning_rate": 4.526843279477807e-06, + "loss": 0.5573, + "step": 2531 + }, + { + "epoch": 1.233354985384865, + "grad_norm": 3.0915191173553467, + "learning_rate": 4.526466713859212e-06, + "loss": 0.5814, + "step": 2532 + }, + { + "epoch": 1.2338421565443325, + "grad_norm": 2.8473572731018066, + "learning_rate": 4.52609001412924e-06, + "loss": 0.5953, + "step": 2533 + }, + { + "epoch": 1.2343293277037999, + "grad_norm": 3.000953197479248, + "learning_rate": 4.525713180312818e-06, + "loss": 0.6263, + "step": 2534 + }, + { + "epoch": 1.2348164988632673, + "grad_norm": 3.1622626781463623, + "learning_rate": 4.525336212434887e-06, + "loss": 0.6727, + "step": 2535 + }, + { + "epoch": 1.2353036700227347, + "grad_norm": 2.5150210857391357, + "learning_rate": 4.524959110520395e-06, + "loss": 0.5595, + "step": 2536 + }, + { + "epoch": 1.235790841182202, + "grad_norm": 2.846153974533081, + "learning_rate": 4.524581874594297e-06, + "loss": 0.6178, + "step": 2537 + }, + { + "epoch": 1.2362780123416695, + "grad_norm": 2.787376880645752, + "learning_rate": 4.524204504681558e-06, + "loss": 0.6208, + "step": 2538 + }, + { + "epoch": 1.2367651835011366, + "grad_norm": 2.6546473503112793, + "learning_rate": 4.523827000807155e-06, + "loss": 0.5399, + "step": 2539 + }, + { + "epoch": 1.237252354660604, + "grad_norm": 2.6734495162963867, + "learning_rate": 4.523449362996069e-06, + "loss": 0.6372, + "step": 2540 + }, + { + "epoch": 1.2377395258200714, + "grad_norm": 2.755573034286499, + "learning_rate": 4.5230715912732925e-06, + "loss": 0.6049, + "step": 2541 + }, + { + "epoch": 1.2382266969795388, + "grad_norm": 3.0823442935943604, + "learning_rate": 4.522693685663826e-06, + "loss": 0.5143, + "step": 2542 + }, + { + "epoch": 1.2387138681390062, + "grad_norm": 6.010650634765625, + "learning_rate": 4.522315646192681e-06, + "loss": 0.5946, + "step": 2543 + }, + { + "epoch": 1.2392010392984736, + "grad_norm": 3.4582467079162598, + "learning_rate": 4.521937472884874e-06, + "loss": 0.7008, + "step": 2544 + }, + { + "epoch": 1.239688210457941, + "grad_norm": 2.9213647842407227, + "learning_rate": 4.521559165765433e-06, + "loss": 0.6254, + "step": 2545 + }, + { + "epoch": 1.2401753816174081, + "grad_norm": 3.1245105266571045, + "learning_rate": 4.521180724859395e-06, + "loss": 0.5666, + "step": 2546 + }, + { + "epoch": 1.2406625527768755, + "grad_norm": 3.180208206176758, + "learning_rate": 4.520802150191805e-06, + "loss": 0.6165, + "step": 2547 + }, + { + "epoch": 1.241149723936343, + "grad_norm": 3.003445625305176, + "learning_rate": 4.520423441787717e-06, + "loss": 0.6165, + "step": 2548 + }, + { + "epoch": 1.2416368950958103, + "grad_norm": 3.0981016159057617, + "learning_rate": 4.520044599672193e-06, + "loss": 0.5314, + "step": 2549 + }, + { + "epoch": 1.2421240662552777, + "grad_norm": 2.9670498371124268, + "learning_rate": 4.5196656238703065e-06, + "loss": 0.5539, + "step": 2550 + }, + { + "epoch": 1.242611237414745, + "grad_norm": 2.910249710083008, + "learning_rate": 4.519286514407137e-06, + "loss": 0.5444, + "step": 2551 + }, + { + "epoch": 1.2430984085742125, + "grad_norm": 2.6049280166625977, + "learning_rate": 4.518907271307773e-06, + "loss": 0.6137, + "step": 2552 + }, + { + "epoch": 1.2435855797336797, + "grad_norm": 2.838146448135376, + "learning_rate": 4.518527894597315e-06, + "loss": 0.5399, + "step": 2553 + }, + { + "epoch": 1.244072750893147, + "grad_norm": 2.7533860206604004, + "learning_rate": 4.518148384300868e-06, + "loss": 0.5778, + "step": 2554 + }, + { + "epoch": 1.2445599220526145, + "grad_norm": 2.7707862854003906, + "learning_rate": 4.517768740443549e-06, + "loss": 0.5618, + "step": 2555 + }, + { + "epoch": 1.2450470932120818, + "grad_norm": 2.718510627746582, + "learning_rate": 4.517388963050482e-06, + "loss": 0.6264, + "step": 2556 + }, + { + "epoch": 1.2455342643715492, + "grad_norm": 2.883047580718994, + "learning_rate": 4.517009052146802e-06, + "loss": 0.6567, + "step": 2557 + }, + { + "epoch": 1.2460214355310166, + "grad_norm": 2.8093819618225098, + "learning_rate": 4.5166290077576515e-06, + "loss": 0.5295, + "step": 2558 + }, + { + "epoch": 1.246508606690484, + "grad_norm": 3.020583152770996, + "learning_rate": 4.516248829908181e-06, + "loss": 0.4822, + "step": 2559 + }, + { + "epoch": 1.2469957778499512, + "grad_norm": 2.6894407272338867, + "learning_rate": 4.51586851862355e-06, + "loss": 0.556, + "step": 2560 + }, + { + "epoch": 1.2474829490094186, + "grad_norm": 2.446040630340576, + "learning_rate": 4.515488073928927e-06, + "loss": 0.5967, + "step": 2561 + }, + { + "epoch": 1.247970120168886, + "grad_norm": 2.553210735321045, + "learning_rate": 4.515107495849493e-06, + "loss": 0.5636, + "step": 2562 + }, + { + "epoch": 1.2484572913283534, + "grad_norm": 2.918545961380005, + "learning_rate": 4.514726784410431e-06, + "loss": 0.5921, + "step": 2563 + }, + { + "epoch": 1.2489444624878208, + "grad_norm": 2.7270894050598145, + "learning_rate": 4.514345939636939e-06, + "loss": 0.554, + "step": 2564 + }, + { + "epoch": 1.2494316336472882, + "grad_norm": 4.283918380737305, + "learning_rate": 4.51396496155422e-06, + "loss": 0.5658, + "step": 2565 + }, + { + "epoch": 1.2499188048067555, + "grad_norm": 3.006279230117798, + "learning_rate": 4.513583850187488e-06, + "loss": 0.5846, + "step": 2566 + }, + { + "epoch": 1.2504059759662227, + "grad_norm": 2.7191874980926514, + "learning_rate": 4.5132026055619634e-06, + "loss": 0.5791, + "step": 2567 + }, + { + "epoch": 1.25089314712569, + "grad_norm": 3.2445428371429443, + "learning_rate": 4.512821227702878e-06, + "loss": 0.6166, + "step": 2568 + }, + { + "epoch": 1.2513803182851575, + "grad_norm": 2.5941784381866455, + "learning_rate": 4.512439716635472e-06, + "loss": 0.5834, + "step": 2569 + }, + { + "epoch": 1.251867489444625, + "grad_norm": 2.6572322845458984, + "learning_rate": 4.5120580723849925e-06, + "loss": 0.5569, + "step": 2570 + }, + { + "epoch": 1.2523546606040923, + "grad_norm": 3.1713969707489014, + "learning_rate": 4.511676294976697e-06, + "loss": 0.5268, + "step": 2571 + }, + { + "epoch": 1.2528418317635597, + "grad_norm": 2.848407745361328, + "learning_rate": 4.511294384435851e-06, + "loss": 0.6274, + "step": 2572 + }, + { + "epoch": 1.253329002923027, + "grad_norm": 2.838974952697754, + "learning_rate": 4.510912340787731e-06, + "loss": 0.5538, + "step": 2573 + }, + { + "epoch": 1.2538161740824942, + "grad_norm": 2.8233704566955566, + "learning_rate": 4.5105301640576186e-06, + "loss": 0.5511, + "step": 2574 + }, + { + "epoch": 1.2543033452419616, + "grad_norm": 2.6723082065582275, + "learning_rate": 4.510147854270808e-06, + "loss": 0.5283, + "step": 2575 + }, + { + "epoch": 1.254790516401429, + "grad_norm": 2.4741344451904297, + "learning_rate": 4.509765411452599e-06, + "loss": 0.6188, + "step": 2576 + }, + { + "epoch": 1.2552776875608964, + "grad_norm": 2.616361141204834, + "learning_rate": 4.509382835628303e-06, + "loss": 0.5148, + "step": 2577 + }, + { + "epoch": 1.2557648587203638, + "grad_norm": 3.006361961364746, + "learning_rate": 4.509000126823237e-06, + "loss": 0.6219, + "step": 2578 + }, + { + "epoch": 1.256252029879831, + "grad_norm": 3.284032106399536, + "learning_rate": 4.50861728506273e-06, + "loss": 0.5924, + "step": 2579 + }, + { + "epoch": 1.2567392010392986, + "grad_norm": 2.715196132659912, + "learning_rate": 4.508234310372118e-06, + "loss": 0.5633, + "step": 2580 + }, + { + "epoch": 1.2572263721987658, + "grad_norm": 2.52996826171875, + "learning_rate": 4.507851202776747e-06, + "loss": 0.589, + "step": 2581 + }, + { + "epoch": 1.2577135433582332, + "grad_norm": 2.9561727046966553, + "learning_rate": 4.50746796230197e-06, + "loss": 0.7009, + "step": 2582 + }, + { + "epoch": 1.2582007145177005, + "grad_norm": 3.183326482772827, + "learning_rate": 4.50708458897315e-06, + "loss": 0.68, + "step": 2583 + }, + { + "epoch": 1.258687885677168, + "grad_norm": 3.2352590560913086, + "learning_rate": 4.50670108281566e-06, + "loss": 0.5565, + "step": 2584 + }, + { + "epoch": 1.2591750568366353, + "grad_norm": 2.8046836853027344, + "learning_rate": 4.5063174438548775e-06, + "loss": 0.553, + "step": 2585 + }, + { + "epoch": 1.2596622279961025, + "grad_norm": 2.8518340587615967, + "learning_rate": 4.505933672116194e-06, + "loss": 0.6049, + "step": 2586 + }, + { + "epoch": 1.2601493991555701, + "grad_norm": 2.6242260932922363, + "learning_rate": 4.505549767625007e-06, + "loss": 0.6066, + "step": 2587 + }, + { + "epoch": 1.2606365703150373, + "grad_norm": 2.583493947982788, + "learning_rate": 4.505165730406723e-06, + "loss": 0.6265, + "step": 2588 + }, + { + "epoch": 1.2611237414745047, + "grad_norm": 3.033698558807373, + "learning_rate": 4.504781560486758e-06, + "loss": 0.572, + "step": 2589 + }, + { + "epoch": 1.261610912633972, + "grad_norm": 2.5957581996917725, + "learning_rate": 4.504397257890536e-06, + "loss": 0.5773, + "step": 2590 + }, + { + "epoch": 1.2620980837934395, + "grad_norm": 3.0853049755096436, + "learning_rate": 4.50401282264349e-06, + "loss": 0.5615, + "step": 2591 + }, + { + "epoch": 1.2625852549529069, + "grad_norm": 2.8241875171661377, + "learning_rate": 4.5036282547710615e-06, + "loss": 0.547, + "step": 2592 + }, + { + "epoch": 1.263072426112374, + "grad_norm": 2.655130386352539, + "learning_rate": 4.503243554298702e-06, + "loss": 0.5167, + "step": 2593 + }, + { + "epoch": 1.2635595972718416, + "grad_norm": 2.8257036209106445, + "learning_rate": 4.5028587212518705e-06, + "loss": 0.6094, + "step": 2594 + }, + { + "epoch": 1.2640467684313088, + "grad_norm": 2.7168850898742676, + "learning_rate": 4.502473755656036e-06, + "loss": 0.5481, + "step": 2595 + }, + { + "epoch": 1.2645339395907762, + "grad_norm": 2.7603390216827393, + "learning_rate": 4.502088657536674e-06, + "loss": 0.5758, + "step": 2596 + }, + { + "epoch": 1.2650211107502436, + "grad_norm": 3.1443023681640625, + "learning_rate": 4.501703426919271e-06, + "loss": 0.5698, + "step": 2597 + }, + { + "epoch": 1.265508281909711, + "grad_norm": 2.8113696575164795, + "learning_rate": 4.501318063829323e-06, + "loss": 0.6171, + "step": 2598 + }, + { + "epoch": 1.2659954530691784, + "grad_norm": 2.9009954929351807, + "learning_rate": 4.5009325682923305e-06, + "loss": 0.5671, + "step": 2599 + }, + { + "epoch": 1.2664826242286455, + "grad_norm": 2.532987356185913, + "learning_rate": 4.500546940333807e-06, + "loss": 0.5089, + "step": 2600 + }, + { + "epoch": 1.2669697953881132, + "grad_norm": 2.759620428085327, + "learning_rate": 4.500161179979275e-06, + "loss": 0.6473, + "step": 2601 + }, + { + "epoch": 1.2674569665475803, + "grad_norm": 3.0337772369384766, + "learning_rate": 4.49977528725426e-06, + "loss": 0.5325, + "step": 2602 + }, + { + "epoch": 1.2679441377070477, + "grad_norm": 2.996462821960449, + "learning_rate": 4.499389262184304e-06, + "loss": 0.6359, + "step": 2603 + }, + { + "epoch": 1.2684313088665151, + "grad_norm": 2.517245054244995, + "learning_rate": 4.499003104794952e-06, + "loss": 0.4998, + "step": 2604 + }, + { + "epoch": 1.2689184800259825, + "grad_norm": 2.7514967918395996, + "learning_rate": 4.498616815111762e-06, + "loss": 0.5706, + "step": 2605 + }, + { + "epoch": 1.26940565118545, + "grad_norm": 2.7698144912719727, + "learning_rate": 4.498230393160297e-06, + "loss": 0.5505, + "step": 2606 + }, + { + "epoch": 1.269892822344917, + "grad_norm": 2.9053797721862793, + "learning_rate": 4.4978438389661295e-06, + "loss": 0.5944, + "step": 2607 + }, + { + "epoch": 1.2703799935043845, + "grad_norm": 2.690213680267334, + "learning_rate": 4.497457152554844e-06, + "loss": 0.6309, + "step": 2608 + }, + { + "epoch": 1.2708671646638519, + "grad_norm": 3.8909943103790283, + "learning_rate": 4.497070333952029e-06, + "loss": 0.6069, + "step": 2609 + }, + { + "epoch": 1.2713543358233192, + "grad_norm": 2.7033982276916504, + "learning_rate": 4.496683383183286e-06, + "loss": 0.5636, + "step": 2610 + }, + { + "epoch": 1.2718415069827866, + "grad_norm": 2.737581491470337, + "learning_rate": 4.496296300274223e-06, + "loss": 0.54, + "step": 2611 + }, + { + "epoch": 1.272328678142254, + "grad_norm": 2.6048054695129395, + "learning_rate": 4.495909085250456e-06, + "loss": 0.5693, + "step": 2612 + }, + { + "epoch": 1.2728158493017214, + "grad_norm": 2.617307186126709, + "learning_rate": 4.495521738137612e-06, + "loss": 0.517, + "step": 2613 + }, + { + "epoch": 1.2733030204611886, + "grad_norm": 2.9942970275878906, + "learning_rate": 4.495134258961325e-06, + "loss": 0.5808, + "step": 2614 + }, + { + "epoch": 1.273790191620656, + "grad_norm": 2.8893909454345703, + "learning_rate": 4.494746647747238e-06, + "loss": 0.5607, + "step": 2615 + }, + { + "epoch": 1.2742773627801234, + "grad_norm": 2.782464027404785, + "learning_rate": 4.494358904521005e-06, + "loss": 0.5576, + "step": 2616 + }, + { + "epoch": 1.2747645339395908, + "grad_norm": 2.966750383377075, + "learning_rate": 4.493971029308284e-06, + "loss": 0.6786, + "step": 2617 + }, + { + "epoch": 1.2752517050990582, + "grad_norm": 3.0885729789733887, + "learning_rate": 4.493583022134746e-06, + "loss": 0.5981, + "step": 2618 + }, + { + "epoch": 1.2757388762585256, + "grad_norm": 2.8833818435668945, + "learning_rate": 4.493194883026071e-06, + "loss": 0.5513, + "step": 2619 + }, + { + "epoch": 1.276226047417993, + "grad_norm": 2.9152684211730957, + "learning_rate": 4.492806612007943e-06, + "loss": 0.5325, + "step": 2620 + }, + { + "epoch": 1.2767132185774601, + "grad_norm": 2.545198917388916, + "learning_rate": 4.4924182091060585e-06, + "loss": 0.6838, + "step": 2621 + }, + { + "epoch": 1.2772003897369275, + "grad_norm": 3.078113317489624, + "learning_rate": 4.492029674346123e-06, + "loss": 0.5205, + "step": 2622 + }, + { + "epoch": 1.277687560896395, + "grad_norm": 2.644131898880005, + "learning_rate": 4.491641007753849e-06, + "loss": 0.6083, + "step": 2623 + }, + { + "epoch": 1.2781747320558623, + "grad_norm": 2.751107931137085, + "learning_rate": 4.491252209354959e-06, + "loss": 0.56, + "step": 2624 + }, + { + "epoch": 1.2786619032153297, + "grad_norm": 2.460390329360962, + "learning_rate": 4.4908632791751825e-06, + "loss": 0.5121, + "step": 2625 + }, + { + "epoch": 1.279149074374797, + "grad_norm": 2.630091905593872, + "learning_rate": 4.4904742172402605e-06, + "loss": 0.6065, + "step": 2626 + }, + { + "epoch": 1.2796362455342645, + "grad_norm": 2.597182512283325, + "learning_rate": 4.490085023575939e-06, + "loss": 0.5265, + "step": 2627 + }, + { + "epoch": 1.2801234166937316, + "grad_norm": 2.4664769172668457, + "learning_rate": 4.489695698207977e-06, + "loss": 0.5616, + "step": 2628 + }, + { + "epoch": 1.280610587853199, + "grad_norm": 2.3367788791656494, + "learning_rate": 4.489306241162139e-06, + "loss": 0.5325, + "step": 2629 + }, + { + "epoch": 1.2810977590126664, + "grad_norm": 2.429753541946411, + "learning_rate": 4.488916652464198e-06, + "loss": 0.5547, + "step": 2630 + }, + { + "epoch": 1.2815849301721338, + "grad_norm": 2.369037628173828, + "learning_rate": 4.488526932139939e-06, + "loss": 0.5794, + "step": 2631 + }, + { + "epoch": 1.2820721013316012, + "grad_norm": 2.671706199645996, + "learning_rate": 4.488137080215152e-06, + "loss": 0.5821, + "step": 2632 + }, + { + "epoch": 1.2825592724910686, + "grad_norm": 2.588916063308716, + "learning_rate": 4.48774709671564e-06, + "loss": 0.5661, + "step": 2633 + }, + { + "epoch": 1.283046443650536, + "grad_norm": 2.782611846923828, + "learning_rate": 4.487356981667209e-06, + "loss": 0.5425, + "step": 2634 + }, + { + "epoch": 1.2835336148100032, + "grad_norm": 2.7505874633789062, + "learning_rate": 4.486966735095678e-06, + "loss": 0.6171, + "step": 2635 + }, + { + "epoch": 1.2840207859694706, + "grad_norm": 2.889120578765869, + "learning_rate": 4.486576357026874e-06, + "loss": 0.5847, + "step": 2636 + }, + { + "epoch": 1.284507957128938, + "grad_norm": 2.5447590351104736, + "learning_rate": 4.486185847486631e-06, + "loss": 0.634, + "step": 2637 + }, + { + "epoch": 1.2849951282884053, + "grad_norm": 3.574097156524658, + "learning_rate": 4.485795206500794e-06, + "loss": 0.6364, + "step": 2638 + }, + { + "epoch": 1.2854822994478727, + "grad_norm": 2.962902069091797, + "learning_rate": 4.485404434095215e-06, + "loss": 0.5638, + "step": 2639 + }, + { + "epoch": 1.2859694706073401, + "grad_norm": 2.6743128299713135, + "learning_rate": 4.485013530295755e-06, + "loss": 0.6259, + "step": 2640 + }, + { + "epoch": 1.2864566417668075, + "grad_norm": 2.5481555461883545, + "learning_rate": 4.484622495128285e-06, + "loss": 0.5224, + "step": 2641 + }, + { + "epoch": 1.2869438129262747, + "grad_norm": 3.1361351013183594, + "learning_rate": 4.484231328618683e-06, + "loss": 0.5673, + "step": 2642 + }, + { + "epoch": 1.287430984085742, + "grad_norm": 2.692227363586426, + "learning_rate": 4.483840030792836e-06, + "loss": 0.5246, + "step": 2643 + }, + { + "epoch": 1.2879181552452095, + "grad_norm": 3.6108367443084717, + "learning_rate": 4.483448601676642e-06, + "loss": 0.5949, + "step": 2644 + }, + { + "epoch": 1.2884053264046769, + "grad_norm": 2.8302369117736816, + "learning_rate": 4.483057041296003e-06, + "loss": 0.6083, + "step": 2645 + }, + { + "epoch": 1.2888924975641443, + "grad_norm": 2.7552835941314697, + "learning_rate": 4.482665349676833e-06, + "loss": 0.5598, + "step": 2646 + }, + { + "epoch": 1.2893796687236114, + "grad_norm": 2.777360200881958, + "learning_rate": 4.482273526845055e-06, + "loss": 0.5109, + "step": 2647 + }, + { + "epoch": 1.289866839883079, + "grad_norm": 2.4993844032287598, + "learning_rate": 4.481881572826601e-06, + "loss": 0.5772, + "step": 2648 + }, + { + "epoch": 1.2903540110425462, + "grad_norm": 2.8850669860839844, + "learning_rate": 4.481489487647408e-06, + "loss": 0.6068, + "step": 2649 + }, + { + "epoch": 1.2908411822020136, + "grad_norm": 2.7317872047424316, + "learning_rate": 4.481097271333425e-06, + "loss": 0.6137, + "step": 2650 + }, + { + "epoch": 1.291328353361481, + "grad_norm": 2.781651496887207, + "learning_rate": 4.480704923910611e-06, + "loss": 0.5696, + "step": 2651 + }, + { + "epoch": 1.2918155245209484, + "grad_norm": 2.6508610248565674, + "learning_rate": 4.480312445404928e-06, + "loss": 0.5997, + "step": 2652 + }, + { + "epoch": 1.2923026956804158, + "grad_norm": 2.7831809520721436, + "learning_rate": 4.4799198358423515e-06, + "loss": 0.6113, + "step": 2653 + }, + { + "epoch": 1.292789866839883, + "grad_norm": 3.2718937397003174, + "learning_rate": 4.479527095248865e-06, + "loss": 0.5269, + "step": 2654 + }, + { + "epoch": 1.2932770379993506, + "grad_norm": 2.5265448093414307, + "learning_rate": 4.47913422365046e-06, + "loss": 0.5441, + "step": 2655 + }, + { + "epoch": 1.2937642091588177, + "grad_norm": 2.9526443481445312, + "learning_rate": 4.478741221073136e-06, + "loss": 0.6035, + "step": 2656 + }, + { + "epoch": 1.2942513803182851, + "grad_norm": 2.769726037979126, + "learning_rate": 4.478348087542902e-06, + "loss": 0.4899, + "step": 2657 + }, + { + "epoch": 1.2947385514777525, + "grad_norm": 2.598862648010254, + "learning_rate": 4.477954823085776e-06, + "loss": 0.5537, + "step": 2658 + }, + { + "epoch": 1.29522572263722, + "grad_norm": 3.099825143814087, + "learning_rate": 4.477561427727784e-06, + "loss": 0.5418, + "step": 2659 + }, + { + "epoch": 1.2957128937966873, + "grad_norm": 2.6322388648986816, + "learning_rate": 4.477167901494961e-06, + "loss": 0.5301, + "step": 2660 + }, + { + "epoch": 1.2962000649561545, + "grad_norm": 2.6254937648773193, + "learning_rate": 4.476774244413351e-06, + "loss": 0.643, + "step": 2661 + }, + { + "epoch": 1.296687236115622, + "grad_norm": 2.5662310123443604, + "learning_rate": 4.476380456509004e-06, + "loss": 0.566, + "step": 2662 + }, + { + "epoch": 1.2971744072750893, + "grad_norm": 2.7497987747192383, + "learning_rate": 4.475986537807984e-06, + "loss": 0.5598, + "step": 2663 + }, + { + "epoch": 1.2976615784345567, + "grad_norm": 2.5806846618652344, + "learning_rate": 4.475592488336358e-06, + "loss": 0.5805, + "step": 2664 + }, + { + "epoch": 1.298148749594024, + "grad_norm": 2.833782911300659, + "learning_rate": 4.475198308120205e-06, + "loss": 0.6285, + "step": 2665 + }, + { + "epoch": 1.2986359207534914, + "grad_norm": 3.037559986114502, + "learning_rate": 4.474803997185613e-06, + "loss": 0.5636, + "step": 2666 + }, + { + "epoch": 1.2991230919129588, + "grad_norm": 2.840493679046631, + "learning_rate": 4.474409555558675e-06, + "loss": 0.576, + "step": 2667 + }, + { + "epoch": 1.299610263072426, + "grad_norm": 3.2027037143707275, + "learning_rate": 4.474014983265498e-06, + "loss": 0.5806, + "step": 2668 + }, + { + "epoch": 1.3000974342318934, + "grad_norm": 3.2064497470855713, + "learning_rate": 4.473620280332192e-06, + "loss": 0.5745, + "step": 2669 + }, + { + "epoch": 1.3005846053913608, + "grad_norm": 2.807873487472534, + "learning_rate": 4.47322544678488e-06, + "loss": 0.6183, + "step": 2670 + }, + { + "epoch": 1.3010717765508282, + "grad_norm": 2.9865119457244873, + "learning_rate": 4.472830482649691e-06, + "loss": 0.5755, + "step": 2671 + }, + { + "epoch": 1.3015589477102956, + "grad_norm": 2.5179100036621094, + "learning_rate": 4.472435387952766e-06, + "loss": 0.5662, + "step": 2672 + }, + { + "epoch": 1.302046118869763, + "grad_norm": 2.8079543113708496, + "learning_rate": 4.472040162720249e-06, + "loss": 0.5319, + "step": 2673 + }, + { + "epoch": 1.3025332900292304, + "grad_norm": 2.9218146800994873, + "learning_rate": 4.471644806978298e-06, + "loss": 0.5464, + "step": 2674 + }, + { + "epoch": 1.3030204611886975, + "grad_norm": 3.6714110374450684, + "learning_rate": 4.471249320753078e-06, + "loss": 0.5542, + "step": 2675 + }, + { + "epoch": 1.303507632348165, + "grad_norm": 2.788975238800049, + "learning_rate": 4.470853704070761e-06, + "loss": 0.6611, + "step": 2676 + }, + { + "epoch": 1.3039948035076323, + "grad_norm": 2.7497527599334717, + "learning_rate": 4.47045795695753e-06, + "loss": 0.5291, + "step": 2677 + }, + { + "epoch": 1.3044819746670997, + "grad_norm": 2.813725471496582, + "learning_rate": 4.4700620794395745e-06, + "loss": 0.6231, + "step": 2678 + }, + { + "epoch": 1.304969145826567, + "grad_norm": 2.9838552474975586, + "learning_rate": 4.469666071543094e-06, + "loss": 0.6541, + "step": 2679 + }, + { + "epoch": 1.3054563169860345, + "grad_norm": 2.9000661373138428, + "learning_rate": 4.469269933294296e-06, + "loss": 0.662, + "step": 2680 + }, + { + "epoch": 1.3059434881455019, + "grad_norm": 2.733574628829956, + "learning_rate": 4.468873664719398e-06, + "loss": 0.5431, + "step": 2681 + }, + { + "epoch": 1.306430659304969, + "grad_norm": 3.4491496086120605, + "learning_rate": 4.468477265844623e-06, + "loss": 0.6328, + "step": 2682 + }, + { + "epoch": 1.3069178304644364, + "grad_norm": 2.8113162517547607, + "learning_rate": 4.468080736696206e-06, + "loss": 0.6212, + "step": 2683 + }, + { + "epoch": 1.3074050016239038, + "grad_norm": 3.033055543899536, + "learning_rate": 4.46768407730039e-06, + "loss": 0.4987, + "step": 2684 + }, + { + "epoch": 1.3078921727833712, + "grad_norm": 2.5716516971588135, + "learning_rate": 4.467287287683425e-06, + "loss": 0.5814, + "step": 2685 + }, + { + "epoch": 1.3083793439428386, + "grad_norm": 2.4821014404296875, + "learning_rate": 4.4668903678715705e-06, + "loss": 0.6093, + "step": 2686 + }, + { + "epoch": 1.308866515102306, + "grad_norm": 2.9516100883483887, + "learning_rate": 4.4664933178910945e-06, + "loss": 0.5305, + "step": 2687 + }, + { + "epoch": 1.3093536862617734, + "grad_norm": 2.5255680084228516, + "learning_rate": 4.4660961377682735e-06, + "loss": 0.5902, + "step": 2688 + }, + { + "epoch": 1.3098408574212406, + "grad_norm": 2.828105926513672, + "learning_rate": 4.4656988275293934e-06, + "loss": 0.5306, + "step": 2689 + }, + { + "epoch": 1.310328028580708, + "grad_norm": 2.4518916606903076, + "learning_rate": 4.465301387200748e-06, + "loss": 0.5911, + "step": 2690 + }, + { + "epoch": 1.3108151997401754, + "grad_norm": 3.031892776489258, + "learning_rate": 4.464903816808639e-06, + "loss": 0.595, + "step": 2691 + }, + { + "epoch": 1.3113023708996427, + "grad_norm": 2.7388522624969482, + "learning_rate": 4.464506116379379e-06, + "loss": 0.5767, + "step": 2692 + }, + { + "epoch": 1.3117895420591101, + "grad_norm": 3.0621023178100586, + "learning_rate": 4.464108285939287e-06, + "loss": 0.5839, + "step": 2693 + }, + { + "epoch": 1.3122767132185775, + "grad_norm": 3.195301055908203, + "learning_rate": 4.463710325514692e-06, + "loss": 0.5729, + "step": 2694 + }, + { + "epoch": 1.312763884378045, + "grad_norm": 2.492788076400757, + "learning_rate": 4.46331223513193e-06, + "loss": 0.5225, + "step": 2695 + }, + { + "epoch": 1.313251055537512, + "grad_norm": 2.6036934852600098, + "learning_rate": 4.462914014817348e-06, + "loss": 0.5409, + "step": 2696 + }, + { + "epoch": 1.3137382266969795, + "grad_norm": 2.8170831203460693, + "learning_rate": 4.462515664597298e-06, + "loss": 0.6118, + "step": 2697 + }, + { + "epoch": 1.3142253978564469, + "grad_norm": 2.698979139328003, + "learning_rate": 4.462117184498145e-06, + "loss": 0.5487, + "step": 2698 + }, + { + "epoch": 1.3147125690159143, + "grad_norm": 2.522156238555908, + "learning_rate": 4.461718574546259e-06, + "loss": 0.5415, + "step": 2699 + }, + { + "epoch": 1.3151997401753817, + "grad_norm": 2.8315505981445312, + "learning_rate": 4.4613198347680206e-06, + "loss": 0.5893, + "step": 2700 + }, + { + "epoch": 1.315686911334849, + "grad_norm": 2.6818623542785645, + "learning_rate": 4.460920965189818e-06, + "loss": 0.5842, + "step": 2701 + }, + { + "epoch": 1.3161740824943164, + "grad_norm": 2.6697962284088135, + "learning_rate": 4.460521965838048e-06, + "loss": 0.5623, + "step": 2702 + }, + { + "epoch": 1.3166612536537836, + "grad_norm": 2.799041748046875, + "learning_rate": 4.460122836739118e-06, + "loss": 0.5579, + "step": 2703 + }, + { + "epoch": 1.317148424813251, + "grad_norm": 2.3257200717926025, + "learning_rate": 4.45972357791944e-06, + "loss": 0.5091, + "step": 2704 + }, + { + "epoch": 1.3176355959727184, + "grad_norm": 2.576106548309326, + "learning_rate": 4.459324189405439e-06, + "loss": 0.5501, + "step": 2705 + }, + { + "epoch": 1.3181227671321858, + "grad_norm": 2.6331210136413574, + "learning_rate": 4.4589246712235456e-06, + "loss": 0.5715, + "step": 2706 + }, + { + "epoch": 1.3186099382916532, + "grad_norm": 2.615739345550537, + "learning_rate": 4.4585250234002e-06, + "loss": 0.5741, + "step": 2707 + }, + { + "epoch": 1.3190971094511206, + "grad_norm": 2.4693963527679443, + "learning_rate": 4.4581252459618505e-06, + "loss": 0.5649, + "step": 2708 + }, + { + "epoch": 1.319584280610588, + "grad_norm": 2.704479455947876, + "learning_rate": 4.457725338934954e-06, + "loss": 0.6041, + "step": 2709 + }, + { + "epoch": 1.3200714517700551, + "grad_norm": 2.9278759956359863, + "learning_rate": 4.4573253023459776e-06, + "loss": 0.5338, + "step": 2710 + }, + { + "epoch": 1.3205586229295225, + "grad_norm": 2.5485472679138184, + "learning_rate": 4.456925136221394e-06, + "loss": 0.5686, + "step": 2711 + }, + { + "epoch": 1.32104579408899, + "grad_norm": 2.5377233028411865, + "learning_rate": 4.456524840587688e-06, + "loss": 0.576, + "step": 2712 + }, + { + "epoch": 1.3215329652484573, + "grad_norm": 3.7573447227478027, + "learning_rate": 4.45612441547135e-06, + "loss": 0.5895, + "step": 2713 + }, + { + "epoch": 1.3220201364079247, + "grad_norm": 2.5095114707946777, + "learning_rate": 4.455723860898881e-06, + "loss": 0.5504, + "step": 2714 + }, + { + "epoch": 1.3225073075673919, + "grad_norm": 2.9174599647521973, + "learning_rate": 4.455323176896788e-06, + "loss": 0.5966, + "step": 2715 + }, + { + "epoch": 1.3229944787268595, + "grad_norm": 2.819927930831909, + "learning_rate": 4.454922363491589e-06, + "loss": 0.6607, + "step": 2716 + }, + { + "epoch": 1.3234816498863267, + "grad_norm": 3.098043918609619, + "learning_rate": 4.4545214207098116e-06, + "loss": 0.5641, + "step": 2717 + }, + { + "epoch": 1.323968821045794, + "grad_norm": 2.7157533168792725, + "learning_rate": 4.454120348577987e-06, + "loss": 0.4773, + "step": 2718 + }, + { + "epoch": 1.3244559922052614, + "grad_norm": 2.461726188659668, + "learning_rate": 4.45371914712266e-06, + "loss": 0.5632, + "step": 2719 + }, + { + "epoch": 1.3249431633647288, + "grad_norm": 2.5255331993103027, + "learning_rate": 4.4533178163703826e-06, + "loss": 0.5405, + "step": 2720 + }, + { + "epoch": 1.3254303345241962, + "grad_norm": 3.1714437007904053, + "learning_rate": 4.4529163563477144e-06, + "loss": 0.583, + "step": 2721 + }, + { + "epoch": 1.3259175056836634, + "grad_norm": 2.7523887157440186, + "learning_rate": 4.452514767081222e-06, + "loss": 0.6969, + "step": 2722 + }, + { + "epoch": 1.326404676843131, + "grad_norm": 2.8845577239990234, + "learning_rate": 4.452113048597485e-06, + "loss": 0.5215, + "step": 2723 + }, + { + "epoch": 1.3268918480025982, + "grad_norm": 2.725421190261841, + "learning_rate": 4.451711200923088e-06, + "loss": 0.5322, + "step": 2724 + }, + { + "epoch": 1.3273790191620656, + "grad_norm": 2.629425287246704, + "learning_rate": 4.451309224084626e-06, + "loss": 0.5703, + "step": 2725 + }, + { + "epoch": 1.327866190321533, + "grad_norm": 2.6784350872039795, + "learning_rate": 4.4509071181086995e-06, + "loss": 0.6471, + "step": 2726 + }, + { + "epoch": 1.3283533614810004, + "grad_norm": 2.9275763034820557, + "learning_rate": 4.450504883021923e-06, + "loss": 0.5228, + "step": 2727 + }, + { + "epoch": 1.3288405326404678, + "grad_norm": 2.6438703536987305, + "learning_rate": 4.450102518850915e-06, + "loss": 0.4986, + "step": 2728 + }, + { + "epoch": 1.329327703799935, + "grad_norm": 3.023615598678589, + "learning_rate": 4.4497000256223044e-06, + "loss": 0.478, + "step": 2729 + }, + { + "epoch": 1.3298148749594025, + "grad_norm": 2.287832260131836, + "learning_rate": 4.4492974033627265e-06, + "loss": 0.5797, + "step": 2730 + }, + { + "epoch": 1.3303020461188697, + "grad_norm": 2.764441728591919, + "learning_rate": 4.448894652098829e-06, + "loss": 0.5567, + "step": 2731 + }, + { + "epoch": 1.330789217278337, + "grad_norm": 2.520728826522827, + "learning_rate": 4.448491771857264e-06, + "loss": 0.6282, + "step": 2732 + }, + { + "epoch": 1.3312763884378045, + "grad_norm": 2.8247814178466797, + "learning_rate": 4.448088762664695e-06, + "loss": 0.5302, + "step": 2733 + }, + { + "epoch": 1.3317635595972719, + "grad_norm": 2.876258134841919, + "learning_rate": 4.447685624547794e-06, + "loss": 0.5636, + "step": 2734 + }, + { + "epoch": 1.3322507307567393, + "grad_norm": 2.546354293823242, + "learning_rate": 4.4472823575332395e-06, + "loss": 0.61, + "step": 2735 + }, + { + "epoch": 1.3327379019162064, + "grad_norm": 2.9330642223358154, + "learning_rate": 4.4468789616477184e-06, + "loss": 0.5813, + "step": 2736 + }, + { + "epoch": 1.3332250730756738, + "grad_norm": 2.8993875980377197, + "learning_rate": 4.44647543691793e-06, + "loss": 0.613, + "step": 2737 + }, + { + "epoch": 1.3337122442351412, + "grad_norm": 2.6963186264038086, + "learning_rate": 4.446071783370579e-06, + "loss": 0.5742, + "step": 2738 + }, + { + "epoch": 1.3341994153946086, + "grad_norm": 2.8073761463165283, + "learning_rate": 4.445668001032379e-06, + "loss": 0.6038, + "step": 2739 + }, + { + "epoch": 1.334686586554076, + "grad_norm": 2.5836710929870605, + "learning_rate": 4.44526408993005e-06, + "loss": 0.5992, + "step": 2740 + }, + { + "epoch": 1.3351737577135434, + "grad_norm": 2.7242937088012695, + "learning_rate": 4.444860050090326e-06, + "loss": 0.4986, + "step": 2741 + }, + { + "epoch": 1.3356609288730108, + "grad_norm": 2.6571319103240967, + "learning_rate": 4.4444558815399445e-06, + "loss": 0.512, + "step": 2742 + }, + { + "epoch": 1.336148100032478, + "grad_norm": 2.5277745723724365, + "learning_rate": 4.4440515843056544e-06, + "loss": 0.5333, + "step": 2743 + }, + { + "epoch": 1.3366352711919454, + "grad_norm": 2.6077065467834473, + "learning_rate": 4.44364715841421e-06, + "loss": 0.614, + "step": 2744 + }, + { + "epoch": 1.3371224423514128, + "grad_norm": 2.626702308654785, + "learning_rate": 4.4432426038923794e-06, + "loss": 0.5847, + "step": 2745 + }, + { + "epoch": 1.3376096135108801, + "grad_norm": 2.885105848312378, + "learning_rate": 4.442837920766934e-06, + "loss": 0.6054, + "step": 2746 + }, + { + "epoch": 1.3380967846703475, + "grad_norm": 2.6807634830474854, + "learning_rate": 4.442433109064655e-06, + "loss": 0.5404, + "step": 2747 + }, + { + "epoch": 1.338583955829815, + "grad_norm": 2.723670721054077, + "learning_rate": 4.442028168812334e-06, + "loss": 0.5673, + "step": 2748 + }, + { + "epoch": 1.3390711269892823, + "grad_norm": 2.547699213027954, + "learning_rate": 4.44162310003677e-06, + "loss": 0.5283, + "step": 2749 + }, + { + "epoch": 1.3395582981487495, + "grad_norm": 2.438793897628784, + "learning_rate": 4.44121790276477e-06, + "loss": 0.5614, + "step": 2750 + }, + { + "epoch": 1.3400454693082169, + "grad_norm": 2.827037811279297, + "learning_rate": 4.4408125770231495e-06, + "loss": 0.5346, + "step": 2751 + }, + { + "epoch": 1.3405326404676843, + "grad_norm": 2.4102678298950195, + "learning_rate": 4.440407122838734e-06, + "loss": 0.5228, + "step": 2752 + }, + { + "epoch": 1.3410198116271517, + "grad_norm": 3.2095682621002197, + "learning_rate": 4.440001540238356e-06, + "loss": 0.602, + "step": 2753 + }, + { + "epoch": 1.341506982786619, + "grad_norm": 2.5140323638916016, + "learning_rate": 4.4395958292488556e-06, + "loss": 0.5505, + "step": 2754 + }, + { + "epoch": 1.3419941539460865, + "grad_norm": 3.019561529159546, + "learning_rate": 4.439189989897084e-06, + "loss": 0.6443, + "step": 2755 + }, + { + "epoch": 1.3424813251055538, + "grad_norm": 3.1722757816314697, + "learning_rate": 4.4387840222099e-06, + "loss": 0.5616, + "step": 2756 + }, + { + "epoch": 1.342968496265021, + "grad_norm": 3.2663016319274902, + "learning_rate": 4.43837792621417e-06, + "loss": 0.4637, + "step": 2757 + }, + { + "epoch": 1.3434556674244884, + "grad_norm": 3.0957560539245605, + "learning_rate": 4.437971701936769e-06, + "loss": 0.5651, + "step": 2758 + }, + { + "epoch": 1.3439428385839558, + "grad_norm": 2.9166133403778076, + "learning_rate": 4.437565349404581e-06, + "loss": 0.5672, + "step": 2759 + }, + { + "epoch": 1.3444300097434232, + "grad_norm": 2.576301336288452, + "learning_rate": 4.4371588686445e-06, + "loss": 0.5363, + "step": 2760 + }, + { + "epoch": 1.3449171809028906, + "grad_norm": 2.7068564891815186, + "learning_rate": 4.4367522596834245e-06, + "loss": 0.587, + "step": 2761 + }, + { + "epoch": 1.345404352062358, + "grad_norm": 2.965141773223877, + "learning_rate": 4.436345522548264e-06, + "loss": 0.612, + "step": 2762 + }, + { + "epoch": 1.3458915232218254, + "grad_norm": 2.9287776947021484, + "learning_rate": 4.435938657265939e-06, + "loss": 0.5841, + "step": 2763 + }, + { + "epoch": 1.3463786943812925, + "grad_norm": 2.966031789779663, + "learning_rate": 4.4355316638633725e-06, + "loss": 0.603, + "step": 2764 + }, + { + "epoch": 1.34686586554076, + "grad_norm": 2.5258851051330566, + "learning_rate": 4.435124542367501e-06, + "loss": 0.5571, + "step": 2765 + }, + { + "epoch": 1.3473530367002273, + "grad_norm": 2.497694492340088, + "learning_rate": 4.434717292805267e-06, + "loss": 0.6055, + "step": 2766 + }, + { + "epoch": 1.3478402078596947, + "grad_norm": 2.581587791442871, + "learning_rate": 4.434309915203624e-06, + "loss": 0.6154, + "step": 2767 + }, + { + "epoch": 1.348327379019162, + "grad_norm": 2.8528659343719482, + "learning_rate": 4.43390240958953e-06, + "loss": 0.5483, + "step": 2768 + }, + { + "epoch": 1.3488145501786295, + "grad_norm": 3.10967755317688, + "learning_rate": 4.433494775989955e-06, + "loss": 0.5614, + "step": 2769 + }, + { + "epoch": 1.349301721338097, + "grad_norm": 2.4328768253326416, + "learning_rate": 4.4330870144318755e-06, + "loss": 0.6082, + "step": 2770 + }, + { + "epoch": 1.349788892497564, + "grad_norm": 2.7110517024993896, + "learning_rate": 4.432679124942278e-06, + "loss": 0.5267, + "step": 2771 + }, + { + "epoch": 1.3502760636570315, + "grad_norm": 2.9273059368133545, + "learning_rate": 4.432271107548155e-06, + "loss": 0.5312, + "step": 2772 + }, + { + "epoch": 1.3507632348164988, + "grad_norm": 2.8727049827575684, + "learning_rate": 4.4318629622765105e-06, + "loss": 0.6282, + "step": 2773 + }, + { + "epoch": 1.3512504059759662, + "grad_norm": 3.0850226879119873, + "learning_rate": 4.431454689154355e-06, + "loss": 0.6171, + "step": 2774 + }, + { + "epoch": 1.3517375771354336, + "grad_norm": 2.8854594230651855, + "learning_rate": 4.431046288208708e-06, + "loss": 0.6166, + "step": 2775 + }, + { + "epoch": 1.352224748294901, + "grad_norm": 2.6787257194519043, + "learning_rate": 4.430637759466598e-06, + "loss": 0.4923, + "step": 2776 + }, + { + "epoch": 1.3527119194543684, + "grad_norm": 2.744450092315674, + "learning_rate": 4.43022910295506e-06, + "loss": 0.6223, + "step": 2777 + }, + { + "epoch": 1.3531990906138356, + "grad_norm": 2.7572832107543945, + "learning_rate": 4.4298203187011406e-06, + "loss": 0.5379, + "step": 2778 + }, + { + "epoch": 1.353686261773303, + "grad_norm": 2.904282569885254, + "learning_rate": 4.429411406731892e-06, + "loss": 0.5109, + "step": 2779 + }, + { + "epoch": 1.3541734329327704, + "grad_norm": 2.898763418197632, + "learning_rate": 4.4290023670743755e-06, + "loss": 0.6752, + "step": 2780 + }, + { + "epoch": 1.3546606040922378, + "grad_norm": 2.863982677459717, + "learning_rate": 4.428593199755662e-06, + "loss": 0.5969, + "step": 2781 + }, + { + "epoch": 1.3551477752517052, + "grad_norm": 3.0059282779693604, + "learning_rate": 4.428183904802831e-06, + "loss": 0.59, + "step": 2782 + }, + { + "epoch": 1.3556349464111723, + "grad_norm": 3.022444009780884, + "learning_rate": 4.427774482242968e-06, + "loss": 0.5953, + "step": 2783 + }, + { + "epoch": 1.35612211757064, + "grad_norm": 2.8332273960113525, + "learning_rate": 4.42736493210317e-06, + "loss": 0.5545, + "step": 2784 + }, + { + "epoch": 1.356609288730107, + "grad_norm": 2.8214850425720215, + "learning_rate": 4.42695525441054e-06, + "loss": 0.5819, + "step": 2785 + }, + { + "epoch": 1.3570964598895745, + "grad_norm": 2.7027463912963867, + "learning_rate": 4.42654544919219e-06, + "loss": 0.5641, + "step": 2786 + }, + { + "epoch": 1.357583631049042, + "grad_norm": 3.1006999015808105, + "learning_rate": 4.426135516475242e-06, + "loss": 0.5892, + "step": 2787 + }, + { + "epoch": 1.3580708022085093, + "grad_norm": 2.684481620788574, + "learning_rate": 4.425725456286825e-06, + "loss": 0.6237, + "step": 2788 + }, + { + "epoch": 1.3585579733679767, + "grad_norm": 2.672241449356079, + "learning_rate": 4.425315268654077e-06, + "loss": 0.649, + "step": 2789 + }, + { + "epoch": 1.3590451445274438, + "grad_norm": 2.6828315258026123, + "learning_rate": 4.424904953604143e-06, + "loss": 0.5506, + "step": 2790 + }, + { + "epoch": 1.3595323156869115, + "grad_norm": 3.0671000480651855, + "learning_rate": 4.424494511164179e-06, + "loss": 0.543, + "step": 2791 + }, + { + "epoch": 1.3600194868463786, + "grad_norm": 2.929980516433716, + "learning_rate": 4.424083941361347e-06, + "loss": 0.6173, + "step": 2792 + }, + { + "epoch": 1.360506658005846, + "grad_norm": 2.80680513381958, + "learning_rate": 4.423673244222819e-06, + "loss": 0.6279, + "step": 2793 + }, + { + "epoch": 1.3609938291653134, + "grad_norm": 2.838385820388794, + "learning_rate": 4.423262419775775e-06, + "loss": 0.5878, + "step": 2794 + }, + { + "epoch": 1.3614810003247808, + "grad_norm": 2.9775049686431885, + "learning_rate": 4.4228514680474035e-06, + "loss": 0.5789, + "step": 2795 + }, + { + "epoch": 1.3619681714842482, + "grad_norm": 2.8313138484954834, + "learning_rate": 4.422440389064901e-06, + "loss": 0.5655, + "step": 2796 + }, + { + "epoch": 1.3624553426437154, + "grad_norm": 3.813946008682251, + "learning_rate": 4.4220291828554715e-06, + "loss": 0.582, + "step": 2797 + }, + { + "epoch": 1.362942513803183, + "grad_norm": 2.707127332687378, + "learning_rate": 4.4216178494463305e-06, + "loss": 0.5256, + "step": 2798 + }, + { + "epoch": 1.3634296849626502, + "grad_norm": 2.4181535243988037, + "learning_rate": 4.4212063888646995e-06, + "loss": 0.5452, + "step": 2799 + }, + { + "epoch": 1.3639168561221175, + "grad_norm": 2.9436962604522705, + "learning_rate": 4.420794801137807e-06, + "loss": 0.5346, + "step": 2800 + }, + { + "epoch": 1.364404027281585, + "grad_norm": 2.69236159324646, + "learning_rate": 4.4203830862928954e-06, + "loss": 0.5454, + "step": 2801 + }, + { + "epoch": 1.3648911984410523, + "grad_norm": 2.258228063583374, + "learning_rate": 4.4199712443572085e-06, + "loss": 0.5106, + "step": 2802 + }, + { + "epoch": 1.3653783696005197, + "grad_norm": 2.744932174682617, + "learning_rate": 4.419559275358004e-06, + "loss": 0.5557, + "step": 2803 + }, + { + "epoch": 1.365865540759987, + "grad_norm": 2.731269359588623, + "learning_rate": 4.419147179322546e-06, + "loss": 0.597, + "step": 2804 + }, + { + "epoch": 1.3663527119194543, + "grad_norm": 2.951632499694824, + "learning_rate": 4.418734956278105e-06, + "loss": 0.5272, + "step": 2805 + }, + { + "epoch": 1.3668398830789217, + "grad_norm": 2.60044527053833, + "learning_rate": 4.418322606251965e-06, + "loss": 0.6227, + "step": 2806 + }, + { + "epoch": 1.367327054238389, + "grad_norm": 2.987895965576172, + "learning_rate": 4.4179101292714125e-06, + "loss": 0.6471, + "step": 2807 + }, + { + "epoch": 1.3678142253978565, + "grad_norm": 2.7151479721069336, + "learning_rate": 4.417497525363746e-06, + "loss": 0.5903, + "step": 2808 + }, + { + "epoch": 1.3683013965573239, + "grad_norm": 2.7001330852508545, + "learning_rate": 4.417084794556272e-06, + "loss": 0.58, + "step": 2809 + }, + { + "epoch": 1.3687885677167912, + "grad_norm": 2.8885576725006104, + "learning_rate": 4.416671936876306e-06, + "loss": 0.5209, + "step": 2810 + }, + { + "epoch": 1.3692757388762584, + "grad_norm": 2.8402702808380127, + "learning_rate": 4.416258952351168e-06, + "loss": 0.5566, + "step": 2811 + }, + { + "epoch": 1.3697629100357258, + "grad_norm": 2.564209222793579, + "learning_rate": 4.4158458410081915e-06, + "loss": 0.6034, + "step": 2812 + }, + { + "epoch": 1.3702500811951932, + "grad_norm": 2.8142433166503906, + "learning_rate": 4.415432602874716e-06, + "loss": 0.6256, + "step": 2813 + }, + { + "epoch": 1.3707372523546606, + "grad_norm": 3.062380790710449, + "learning_rate": 4.415019237978089e-06, + "loss": 0.6043, + "step": 2814 + }, + { + "epoch": 1.371224423514128, + "grad_norm": 2.4872708320617676, + "learning_rate": 4.414605746345667e-06, + "loss": 0.5619, + "step": 2815 + }, + { + "epoch": 1.3717115946735954, + "grad_norm": 2.91640305519104, + "learning_rate": 4.414192128004815e-06, + "loss": 0.5804, + "step": 2816 + }, + { + "epoch": 1.3721987658330628, + "grad_norm": 2.868206024169922, + "learning_rate": 4.413778382982907e-06, + "loss": 0.6217, + "step": 2817 + }, + { + "epoch": 1.37268593699253, + "grad_norm": 2.9718732833862305, + "learning_rate": 4.4133645113073235e-06, + "loss": 0.6057, + "step": 2818 + }, + { + "epoch": 1.3731731081519973, + "grad_norm": 2.8543813228607178, + "learning_rate": 4.412950513005454e-06, + "loss": 0.6361, + "step": 2819 + }, + { + "epoch": 1.3736602793114647, + "grad_norm": 2.549443006515503, + "learning_rate": 4.412536388104698e-06, + "loss": 0.5485, + "step": 2820 + }, + { + "epoch": 1.3741474504709321, + "grad_norm": 2.6977131366729736, + "learning_rate": 4.412122136632462e-06, + "loss": 0.5041, + "step": 2821 + }, + { + "epoch": 1.3746346216303995, + "grad_norm": 2.5098788738250732, + "learning_rate": 4.4117077586161614e-06, + "loss": 0.5877, + "step": 2822 + }, + { + "epoch": 1.375121792789867, + "grad_norm": 2.966036558151245, + "learning_rate": 4.411293254083219e-06, + "loss": 0.5785, + "step": 2823 + }, + { + "epoch": 1.3756089639493343, + "grad_norm": 2.852757692337036, + "learning_rate": 4.410878623061068e-06, + "loss": 0.591, + "step": 2824 + }, + { + "epoch": 1.3760961351088015, + "grad_norm": 2.9351003170013428, + "learning_rate": 4.410463865577146e-06, + "loss": 0.5709, + "step": 2825 + }, + { + "epoch": 1.3765833062682689, + "grad_norm": 2.6671361923217773, + "learning_rate": 4.410048981658904e-06, + "loss": 0.5452, + "step": 2826 + }, + { + "epoch": 1.3770704774277363, + "grad_norm": 2.3994529247283936, + "learning_rate": 4.4096339713337985e-06, + "loss": 0.6141, + "step": 2827 + }, + { + "epoch": 1.3775576485872036, + "grad_norm": 3.18749737739563, + "learning_rate": 4.409218834629295e-06, + "loss": 0.5548, + "step": 2828 + }, + { + "epoch": 1.378044819746671, + "grad_norm": 2.533371925354004, + "learning_rate": 4.408803571572866e-06, + "loss": 0.5607, + "step": 2829 + }, + { + "epoch": 1.3785319909061384, + "grad_norm": 2.8112192153930664, + "learning_rate": 4.408388182191995e-06, + "loss": 0.5394, + "step": 2830 + }, + { + "epoch": 1.3790191620656058, + "grad_norm": 2.7301132678985596, + "learning_rate": 4.407972666514172e-06, + "loss": 0.5192, + "step": 2831 + }, + { + "epoch": 1.379506333225073, + "grad_norm": 2.750743865966797, + "learning_rate": 4.4075570245668945e-06, + "loss": 0.523, + "step": 2832 + }, + { + "epoch": 1.3799935043845404, + "grad_norm": 2.3804070949554443, + "learning_rate": 4.407141256377672e-06, + "loss": 0.5231, + "step": 2833 + }, + { + "epoch": 1.3804806755440078, + "grad_norm": 2.8739912509918213, + "learning_rate": 4.406725361974017e-06, + "loss": 0.5043, + "step": 2834 + }, + { + "epoch": 1.3809678467034752, + "grad_norm": 2.7004189491271973, + "learning_rate": 4.4063093413834564e-06, + "loss": 0.6289, + "step": 2835 + }, + { + "epoch": 1.3814550178629426, + "grad_norm": 3.0208795070648193, + "learning_rate": 4.405893194633521e-06, + "loss": 0.5812, + "step": 2836 + }, + { + "epoch": 1.38194218902241, + "grad_norm": 2.886932373046875, + "learning_rate": 4.405476921751751e-06, + "loss": 0.5731, + "step": 2837 + }, + { + "epoch": 1.3824293601818773, + "grad_norm": 2.9845869541168213, + "learning_rate": 4.405060522765697e-06, + "loss": 0.6245, + "step": 2838 + }, + { + "epoch": 1.3829165313413445, + "grad_norm": 2.960303544998169, + "learning_rate": 4.404643997702914e-06, + "loss": 0.593, + "step": 2839 + }, + { + "epoch": 1.383403702500812, + "grad_norm": 2.696019411087036, + "learning_rate": 4.404227346590968e-06, + "loss": 0.5899, + "step": 2840 + }, + { + "epoch": 1.3838908736602793, + "grad_norm": 3.0462608337402344, + "learning_rate": 4.403810569457435e-06, + "loss": 0.6419, + "step": 2841 + }, + { + "epoch": 1.3843780448197467, + "grad_norm": 2.8580029010772705, + "learning_rate": 4.4033936663298945e-06, + "loss": 0.5528, + "step": 2842 + }, + { + "epoch": 1.384865215979214, + "grad_norm": 2.623544454574585, + "learning_rate": 4.402976637235939e-06, + "loss": 0.5202, + "step": 2843 + }, + { + "epoch": 1.3853523871386813, + "grad_norm": 2.8025929927825928, + "learning_rate": 4.402559482203167e-06, + "loss": 0.566, + "step": 2844 + }, + { + "epoch": 1.3858395582981489, + "grad_norm": 2.7944540977478027, + "learning_rate": 4.402142201259185e-06, + "loss": 0.6062, + "step": 2845 + }, + { + "epoch": 1.386326729457616, + "grad_norm": 2.8249354362487793, + "learning_rate": 4.4017247944316095e-06, + "loss": 0.6036, + "step": 2846 + }, + { + "epoch": 1.3868139006170834, + "grad_norm": 2.8196027278900146, + "learning_rate": 4.4013072617480644e-06, + "loss": 0.5522, + "step": 2847 + }, + { + "epoch": 1.3873010717765508, + "grad_norm": 2.5105321407318115, + "learning_rate": 4.400889603236182e-06, + "loss": 0.4731, + "step": 2848 + }, + { + "epoch": 1.3877882429360182, + "grad_norm": 2.2745769023895264, + "learning_rate": 4.400471818923603e-06, + "loss": 0.5172, + "step": 2849 + }, + { + "epoch": 1.3882754140954856, + "grad_norm": 2.4815666675567627, + "learning_rate": 4.4000539088379745e-06, + "loss": 0.6117, + "step": 2850 + }, + { + "epoch": 1.3887625852549528, + "grad_norm": 2.618299961090088, + "learning_rate": 4.399635873006957e-06, + "loss": 0.6067, + "step": 2851 + }, + { + "epoch": 1.3892497564144204, + "grad_norm": 2.986555576324463, + "learning_rate": 4.399217711458212e-06, + "loss": 0.5306, + "step": 2852 + }, + { + "epoch": 1.3897369275738876, + "grad_norm": 2.7270994186401367, + "learning_rate": 4.398799424219418e-06, + "loss": 0.5967, + "step": 2853 + }, + { + "epoch": 1.390224098733355, + "grad_norm": 2.9676949977874756, + "learning_rate": 4.398381011318254e-06, + "loss": 0.5429, + "step": 2854 + }, + { + "epoch": 1.3907112698928223, + "grad_norm": 2.9615583419799805, + "learning_rate": 4.397962472782411e-06, + "loss": 0.5843, + "step": 2855 + }, + { + "epoch": 1.3911984410522897, + "grad_norm": 3.9990711212158203, + "learning_rate": 4.397543808639589e-06, + "loss": 0.5481, + "step": 2856 + }, + { + "epoch": 1.3916856122117571, + "grad_norm": 2.840947151184082, + "learning_rate": 4.397125018917495e-06, + "loss": 0.6196, + "step": 2857 + }, + { + "epoch": 1.3921727833712243, + "grad_norm": 3.087571859359741, + "learning_rate": 4.396706103643843e-06, + "loss": 0.6538, + "step": 2858 + }, + { + "epoch": 1.392659954530692, + "grad_norm": 2.6609413623809814, + "learning_rate": 4.3962870628463575e-06, + "loss": 0.5528, + "step": 2859 + }, + { + "epoch": 1.393147125690159, + "grad_norm": 2.55149507522583, + "learning_rate": 4.395867896552771e-06, + "loss": 0.5465, + "step": 2860 + }, + { + "epoch": 1.3936342968496265, + "grad_norm": 3.165189504623413, + "learning_rate": 4.395448604790824e-06, + "loss": 0.6062, + "step": 2861 + }, + { + "epoch": 1.3941214680090939, + "grad_norm": 3.0806219577789307, + "learning_rate": 4.395029187588265e-06, + "loss": 0.5263, + "step": 2862 + }, + { + "epoch": 1.3946086391685613, + "grad_norm": 2.535928249359131, + "learning_rate": 4.3946096449728504e-06, + "loss": 0.5156, + "step": 2863 + }, + { + "epoch": 1.3950958103280287, + "grad_norm": 2.6332719326019287, + "learning_rate": 4.394189976972346e-06, + "loss": 0.6137, + "step": 2864 + }, + { + "epoch": 1.3955829814874958, + "grad_norm": 2.8959083557128906, + "learning_rate": 4.393770183614525e-06, + "loss": 0.5538, + "step": 2865 + }, + { + "epoch": 1.3960701526469634, + "grad_norm": 2.5014145374298096, + "learning_rate": 4.39335026492717e-06, + "loss": 0.6469, + "step": 2866 + }, + { + "epoch": 1.3965573238064306, + "grad_norm": 2.6898345947265625, + "learning_rate": 4.39293022093807e-06, + "loss": 0.4886, + "step": 2867 + }, + { + "epoch": 1.397044494965898, + "grad_norm": 2.5352377891540527, + "learning_rate": 4.392510051675025e-06, + "loss": 0.5604, + "step": 2868 + }, + { + "epoch": 1.3975316661253654, + "grad_norm": 2.606215715408325, + "learning_rate": 4.392089757165841e-06, + "loss": 0.5086, + "step": 2869 + }, + { + "epoch": 1.3980188372848328, + "grad_norm": 2.368014335632324, + "learning_rate": 4.3916693374383335e-06, + "loss": 0.5716, + "step": 2870 + }, + { + "epoch": 1.3985060084443002, + "grad_norm": 2.9533262252807617, + "learning_rate": 4.391248792520324e-06, + "loss": 0.6147, + "step": 2871 + }, + { + "epoch": 1.3989931796037673, + "grad_norm": 2.7501862049102783, + "learning_rate": 4.390828122439647e-06, + "loss": 0.5354, + "step": 2872 + }, + { + "epoch": 1.3994803507632347, + "grad_norm": 2.996600866317749, + "learning_rate": 4.3904073272241405e-06, + "loss": 0.5724, + "step": 2873 + }, + { + "epoch": 1.3999675219227021, + "grad_norm": 2.902757406234741, + "learning_rate": 4.3899864069016525e-06, + "loss": 0.6295, + "step": 2874 + }, + { + "epoch": 1.4004546930821695, + "grad_norm": 2.897550344467163, + "learning_rate": 4.389565361500041e-06, + "loss": 0.5471, + "step": 2875 + }, + { + "epoch": 1.400941864241637, + "grad_norm": 2.967756748199463, + "learning_rate": 4.389144191047169e-06, + "loss": 0.6267, + "step": 2876 + }, + { + "epoch": 1.4014290354011043, + "grad_norm": 2.691723585128784, + "learning_rate": 4.38872289557091e-06, + "loss": 0.688, + "step": 2877 + }, + { + "epoch": 1.4019162065605717, + "grad_norm": 2.7954354286193848, + "learning_rate": 4.388301475099147e-06, + "loss": 0.5453, + "step": 2878 + }, + { + "epoch": 1.4024033777200389, + "grad_norm": 2.542625904083252, + "learning_rate": 4.387879929659767e-06, + "loss": 0.5471, + "step": 2879 + }, + { + "epoch": 1.4028905488795063, + "grad_norm": 2.6403536796569824, + "learning_rate": 4.3874582592806705e-06, + "loss": 0.5468, + "step": 2880 + }, + { + "epoch": 1.4033777200389737, + "grad_norm": 2.439573049545288, + "learning_rate": 4.387036463989761e-06, + "loss": 0.5715, + "step": 2881 + }, + { + "epoch": 1.403864891198441, + "grad_norm": 2.7296829223632812, + "learning_rate": 4.3866145438149545e-06, + "loss": 0.5699, + "step": 2882 + }, + { + "epoch": 1.4043520623579084, + "grad_norm": 2.671851873397827, + "learning_rate": 4.386192498784173e-06, + "loss": 0.5446, + "step": 2883 + }, + { + "epoch": 1.4048392335173758, + "grad_norm": 3.0329627990722656, + "learning_rate": 4.3857703289253474e-06, + "loss": 0.5625, + "step": 2884 + }, + { + "epoch": 1.4053264046768432, + "grad_norm": 2.5713326930999756, + "learning_rate": 4.385348034266417e-06, + "loss": 0.5269, + "step": 2885 + }, + { + "epoch": 1.4058135758363104, + "grad_norm": 2.758553981781006, + "learning_rate": 4.38492561483533e-06, + "loss": 0.5082, + "step": 2886 + }, + { + "epoch": 1.4063007469957778, + "grad_norm": 2.4484426975250244, + "learning_rate": 4.384503070660041e-06, + "loss": 0.5087, + "step": 2887 + }, + { + "epoch": 1.4067879181552452, + "grad_norm": 2.6055612564086914, + "learning_rate": 4.3840804017685135e-06, + "loss": 0.489, + "step": 2888 + }, + { + "epoch": 1.4072750893147126, + "grad_norm": 2.5102667808532715, + "learning_rate": 4.383657608188721e-06, + "loss": 0.5664, + "step": 2889 + }, + { + "epoch": 1.40776226047418, + "grad_norm": 2.8685219287872314, + "learning_rate": 4.383234689948644e-06, + "loss": 0.5336, + "step": 2890 + }, + { + "epoch": 1.4082494316336474, + "grad_norm": 2.6836156845092773, + "learning_rate": 4.38281164707627e-06, + "loss": 0.5285, + "step": 2891 + }, + { + "epoch": 1.4087366027931147, + "grad_norm": 2.514221429824829, + "learning_rate": 4.382388479599596e-06, + "loss": 0.5853, + "step": 2892 + }, + { + "epoch": 1.409223773952582, + "grad_norm": 2.765934944152832, + "learning_rate": 4.381965187546628e-06, + "loss": 0.6153, + "step": 2893 + }, + { + "epoch": 1.4097109451120493, + "grad_norm": 2.9519412517547607, + "learning_rate": 4.38154177094538e-06, + "loss": 0.5017, + "step": 2894 + }, + { + "epoch": 1.4101981162715167, + "grad_norm": 2.5243685245513916, + "learning_rate": 4.381118229823872e-06, + "loss": 0.5727, + "step": 2895 + }, + { + "epoch": 1.410685287430984, + "grad_norm": 2.7156848907470703, + "learning_rate": 4.380694564210135e-06, + "loss": 0.6002, + "step": 2896 + }, + { + "epoch": 1.4111724585904515, + "grad_norm": 2.7883858680725098, + "learning_rate": 4.380270774132206e-06, + "loss": 0.569, + "step": 2897 + }, + { + "epoch": 1.4116596297499189, + "grad_norm": 2.782036542892456, + "learning_rate": 4.379846859618133e-06, + "loss": 0.6009, + "step": 2898 + }, + { + "epoch": 1.4121468009093863, + "grad_norm": 2.9848241806030273, + "learning_rate": 4.37942282069597e-06, + "loss": 0.478, + "step": 2899 + }, + { + "epoch": 1.4126339720688534, + "grad_norm": 2.8061070442199707, + "learning_rate": 4.37899865739378e-06, + "loss": 0.5721, + "step": 2900 + }, + { + "epoch": 1.4131211432283208, + "grad_norm": 2.551666498184204, + "learning_rate": 4.378574369739633e-06, + "loss": 0.5956, + "step": 2901 + }, + { + "epoch": 1.4136083143877882, + "grad_norm": 2.7150275707244873, + "learning_rate": 4.37814995776161e-06, + "loss": 0.5698, + "step": 2902 + }, + { + "epoch": 1.4140954855472556, + "grad_norm": 2.598482370376587, + "learning_rate": 4.377725421487797e-06, + "loss": 0.5382, + "step": 2903 + }, + { + "epoch": 1.414582656706723, + "grad_norm": 2.898581027984619, + "learning_rate": 4.3773007609462914e-06, + "loss": 0.6067, + "step": 2904 + }, + { + "epoch": 1.4150698278661904, + "grad_norm": 2.887429714202881, + "learning_rate": 4.376875976165196e-06, + "loss": 0.6032, + "step": 2905 + }, + { + "epoch": 1.4155569990256578, + "grad_norm": 2.7505576610565186, + "learning_rate": 4.376451067172623e-06, + "loss": 0.551, + "step": 2906 + }, + { + "epoch": 1.416044170185125, + "grad_norm": 2.6692802906036377, + "learning_rate": 4.376026033996692e-06, + "loss": 0.5341, + "step": 2907 + }, + { + "epoch": 1.4165313413445924, + "grad_norm": 2.4872190952301025, + "learning_rate": 4.3756008766655336e-06, + "loss": 0.4529, + "step": 2908 + }, + { + "epoch": 1.4170185125040597, + "grad_norm": 2.872093439102173, + "learning_rate": 4.375175595207285e-06, + "loss": 0.6562, + "step": 2909 + }, + { + "epoch": 1.4175056836635271, + "grad_norm": 2.8738768100738525, + "learning_rate": 4.374750189650089e-06, + "loss": 0.6326, + "step": 2910 + }, + { + "epoch": 1.4179928548229945, + "grad_norm": 2.8912742137908936, + "learning_rate": 4.3743246600221e-06, + "loss": 0.5761, + "step": 2911 + }, + { + "epoch": 1.4184800259824617, + "grad_norm": 2.767561912536621, + "learning_rate": 4.3738990063514794e-06, + "loss": 0.6267, + "step": 2912 + }, + { + "epoch": 1.4189671971419293, + "grad_norm": 3.125046968460083, + "learning_rate": 4.373473228666397e-06, + "loss": 0.5764, + "step": 2913 + }, + { + "epoch": 1.4194543683013965, + "grad_norm": 3.0772035121917725, + "learning_rate": 4.373047326995031e-06, + "loss": 0.6452, + "step": 2914 + }, + { + "epoch": 1.4199415394608639, + "grad_norm": 2.8218729496002197, + "learning_rate": 4.372621301365568e-06, + "loss": 0.624, + "step": 2915 + }, + { + "epoch": 1.4204287106203313, + "grad_norm": 3.0337350368499756, + "learning_rate": 4.372195151806201e-06, + "loss": 0.7008, + "step": 2916 + }, + { + "epoch": 1.4209158817797987, + "grad_norm": 2.8606550693511963, + "learning_rate": 4.371768878345133e-06, + "loss": 0.5952, + "step": 2917 + }, + { + "epoch": 1.421403052939266, + "grad_norm": 2.8748743534088135, + "learning_rate": 4.3713424810105754e-06, + "loss": 0.5547, + "step": 2918 + }, + { + "epoch": 1.4218902240987332, + "grad_norm": 2.673685312271118, + "learning_rate": 4.370915959830747e-06, + "loss": 0.5838, + "step": 2919 + }, + { + "epoch": 1.4223773952582008, + "grad_norm": 2.537074327468872, + "learning_rate": 4.370489314833873e-06, + "loss": 0.5346, + "step": 2920 + }, + { + "epoch": 1.422864566417668, + "grad_norm": 2.7199954986572266, + "learning_rate": 4.370062546048192e-06, + "loss": 0.5287, + "step": 2921 + }, + { + "epoch": 1.4233517375771354, + "grad_norm": 2.7368485927581787, + "learning_rate": 4.369635653501946e-06, + "loss": 0.5929, + "step": 2922 + }, + { + "epoch": 1.4238389087366028, + "grad_norm": 2.987720251083374, + "learning_rate": 4.369208637223386e-06, + "loss": 0.5993, + "step": 2923 + }, + { + "epoch": 1.4243260798960702, + "grad_norm": 3.290637969970703, + "learning_rate": 4.368781497240771e-06, + "loss": 0.5849, + "step": 2924 + }, + { + "epoch": 1.4248132510555376, + "grad_norm": 2.9147961139678955, + "learning_rate": 4.368354233582372e-06, + "loss": 0.5931, + "step": 2925 + }, + { + "epoch": 1.4253004222150047, + "grad_norm": 3.710723876953125, + "learning_rate": 4.367926846276463e-06, + "loss": 0.5334, + "step": 2926 + }, + { + "epoch": 1.4257875933744724, + "grad_norm": 2.789346218109131, + "learning_rate": 4.3674993353513306e-06, + "loss": 0.5949, + "step": 2927 + }, + { + "epoch": 1.4262747645339395, + "grad_norm": 2.6310298442840576, + "learning_rate": 4.367071700835266e-06, + "loss": 0.5141, + "step": 2928 + }, + { + "epoch": 1.426761935693407, + "grad_norm": 2.560945749282837, + "learning_rate": 4.366643942756569e-06, + "loss": 0.6663, + "step": 2929 + }, + { + "epoch": 1.4272491068528743, + "grad_norm": 2.487168312072754, + "learning_rate": 4.36621606114355e-06, + "loss": 0.508, + "step": 2930 + }, + { + "epoch": 1.4277362780123417, + "grad_norm": 2.66035795211792, + "learning_rate": 4.3657880560245255e-06, + "loss": 0.5792, + "step": 2931 + }, + { + "epoch": 1.428223449171809, + "grad_norm": 2.7733852863311768, + "learning_rate": 4.3653599274278225e-06, + "loss": 0.5598, + "step": 2932 + }, + { + "epoch": 1.4287106203312763, + "grad_norm": 2.4562742710113525, + "learning_rate": 4.3649316753817715e-06, + "loss": 0.5223, + "step": 2933 + }, + { + "epoch": 1.4291977914907439, + "grad_norm": 2.617439031600952, + "learning_rate": 4.364503299914717e-06, + "loss": 0.6079, + "step": 2934 + }, + { + "epoch": 1.429684962650211, + "grad_norm": 2.684230327606201, + "learning_rate": 4.364074801055008e-06, + "loss": 0.5061, + "step": 2935 + }, + { + "epoch": 1.4301721338096784, + "grad_norm": 2.7573513984680176, + "learning_rate": 4.363646178831002e-06, + "loss": 0.5515, + "step": 2936 + }, + { + "epoch": 1.4306593049691458, + "grad_norm": 2.8063087463378906, + "learning_rate": 4.363217433271065e-06, + "loss": 0.5982, + "step": 2937 + }, + { + "epoch": 1.4311464761286132, + "grad_norm": 3.611027956008911, + "learning_rate": 4.362788564403572e-06, + "loss": 0.5306, + "step": 2938 + }, + { + "epoch": 1.4316336472880806, + "grad_norm": 2.682823419570923, + "learning_rate": 4.362359572256905e-06, + "loss": 0.5517, + "step": 2939 + }, + { + "epoch": 1.4321208184475478, + "grad_norm": 2.749704122543335, + "learning_rate": 4.361930456859455e-06, + "loss": 0.5094, + "step": 2940 + }, + { + "epoch": 1.4326079896070152, + "grad_norm": 2.384639263153076, + "learning_rate": 4.361501218239621e-06, + "loss": 0.5358, + "step": 2941 + }, + { + "epoch": 1.4330951607664826, + "grad_norm": 2.4731431007385254, + "learning_rate": 4.361071856425809e-06, + "loss": 0.5651, + "step": 2942 + }, + { + "epoch": 1.43358233192595, + "grad_norm": 2.690906047821045, + "learning_rate": 4.360642371446436e-06, + "loss": 0.6196, + "step": 2943 + }, + { + "epoch": 1.4340695030854174, + "grad_norm": 2.761388063430786, + "learning_rate": 4.360212763329924e-06, + "loss": 0.4902, + "step": 2944 + }, + { + "epoch": 1.4345566742448848, + "grad_norm": 3.1378612518310547, + "learning_rate": 4.359783032104704e-06, + "loss": 0.5815, + "step": 2945 + }, + { + "epoch": 1.4350438454043521, + "grad_norm": 2.8764729499816895, + "learning_rate": 4.359353177799216e-06, + "loss": 0.5622, + "step": 2946 + }, + { + "epoch": 1.4355310165638193, + "grad_norm": 2.551290988922119, + "learning_rate": 4.358923200441908e-06, + "loss": 0.5482, + "step": 2947 + }, + { + "epoch": 1.4360181877232867, + "grad_norm": 2.4832868576049805, + "learning_rate": 4.358493100061236e-06, + "loss": 0.6204, + "step": 2948 + }, + { + "epoch": 1.436505358882754, + "grad_norm": 2.9623336791992188, + "learning_rate": 4.358062876685663e-06, + "loss": 0.6277, + "step": 2949 + }, + { + "epoch": 1.4369925300422215, + "grad_norm": 2.9700801372528076, + "learning_rate": 4.357632530343663e-06, + "loss": 0.6201, + "step": 2950 + }, + { + "epoch": 1.4374797012016889, + "grad_norm": 2.7620127201080322, + "learning_rate": 4.357202061063715e-06, + "loss": 0.5835, + "step": 2951 + }, + { + "epoch": 1.4379668723611563, + "grad_norm": 2.9391562938690186, + "learning_rate": 4.356771468874306e-06, + "loss": 0.6246, + "step": 2952 + }, + { + "epoch": 1.4384540435206237, + "grad_norm": 2.9288647174835205, + "learning_rate": 4.356340753803935e-06, + "loss": 0.5685, + "step": 2953 + }, + { + "epoch": 1.4389412146800908, + "grad_norm": 2.6081507205963135, + "learning_rate": 4.3559099158811054e-06, + "loss": 0.6203, + "step": 2954 + }, + { + "epoch": 1.4394283858395582, + "grad_norm": 2.879058361053467, + "learning_rate": 4.355478955134331e-06, + "loss": 0.5733, + "step": 2955 + }, + { + "epoch": 1.4399155569990256, + "grad_norm": 2.463820695877075, + "learning_rate": 4.355047871592132e-06, + "loss": 0.643, + "step": 2956 + }, + { + "epoch": 1.440402728158493, + "grad_norm": 2.7148680686950684, + "learning_rate": 4.354616665283038e-06, + "loss": 0.5854, + "step": 2957 + }, + { + "epoch": 1.4408898993179604, + "grad_norm": 2.7309532165527344, + "learning_rate": 4.354185336235586e-06, + "loss": 0.581, + "step": 2958 + }, + { + "epoch": 1.4413770704774278, + "grad_norm": 2.5543322563171387, + "learning_rate": 4.35375388447832e-06, + "loss": 0.5485, + "step": 2959 + }, + { + "epoch": 1.4418642416368952, + "grad_norm": 2.815032958984375, + "learning_rate": 4.353322310039795e-06, + "loss": 0.541, + "step": 2960 + }, + { + "epoch": 1.4423514127963624, + "grad_norm": 2.8084449768066406, + "learning_rate": 4.352890612948572e-06, + "loss": 0.6194, + "step": 2961 + }, + { + "epoch": 1.4428385839558298, + "grad_norm": 3.045295238494873, + "learning_rate": 4.3524587932332205e-06, + "loss": 0.5291, + "step": 2962 + }, + { + "epoch": 1.4433257551152971, + "grad_norm": 2.9605112075805664, + "learning_rate": 4.3520268509223196e-06, + "loss": 0.6301, + "step": 2963 + }, + { + "epoch": 1.4438129262747645, + "grad_norm": 3.048701524734497, + "learning_rate": 4.3515947860444535e-06, + "loss": 0.5579, + "step": 2964 + }, + { + "epoch": 1.444300097434232, + "grad_norm": 2.633289098739624, + "learning_rate": 4.351162598628217e-06, + "loss": 0.5593, + "step": 2965 + }, + { + "epoch": 1.4447872685936993, + "grad_norm": 3.358085870742798, + "learning_rate": 4.350730288702212e-06, + "loss": 0.5617, + "step": 2966 + }, + { + "epoch": 1.4452744397531667, + "grad_norm": 2.9023964405059814, + "learning_rate": 4.350297856295049e-06, + "loss": 0.5439, + "step": 2967 + }, + { + "epoch": 1.4457616109126339, + "grad_norm": 2.5065982341766357, + "learning_rate": 4.349865301435345e-06, + "loss": 0.6589, + "step": 2968 + }, + { + "epoch": 1.4462487820721013, + "grad_norm": 3.2279393672943115, + "learning_rate": 4.349432624151729e-06, + "loss": 0.5571, + "step": 2969 + }, + { + "epoch": 1.4467359532315687, + "grad_norm": 2.808168888092041, + "learning_rate": 4.3489998244728345e-06, + "loss": 0.5512, + "step": 2970 + }, + { + "epoch": 1.447223124391036, + "grad_norm": 2.6766579151153564, + "learning_rate": 4.3485669024273025e-06, + "loss": 0.5329, + "step": 2971 + }, + { + "epoch": 1.4477102955505035, + "grad_norm": 2.359405040740967, + "learning_rate": 4.348133858043786e-06, + "loss": 0.7134, + "step": 2972 + }, + { + "epoch": 1.4481974667099708, + "grad_norm": 2.942478895187378, + "learning_rate": 4.347700691350943e-06, + "loss": 0.4703, + "step": 2973 + }, + { + "epoch": 1.4486846378694382, + "grad_norm": 2.562680959701538, + "learning_rate": 4.3472674023774396e-06, + "loss": 0.5413, + "step": 2974 + }, + { + "epoch": 1.4491718090289054, + "grad_norm": 2.7624926567077637, + "learning_rate": 4.3468339911519516e-06, + "loss": 0.6029, + "step": 2975 + }, + { + "epoch": 1.4496589801883728, + "grad_norm": 2.6813135147094727, + "learning_rate": 4.346400457703162e-06, + "loss": 0.505, + "step": 2976 + }, + { + "epoch": 1.4501461513478402, + "grad_norm": 2.4942448139190674, + "learning_rate": 4.3459668020597625e-06, + "loss": 0.5773, + "step": 2977 + }, + { + "epoch": 1.4506333225073076, + "grad_norm": 2.9535396099090576, + "learning_rate": 4.345533024250451e-06, + "loss": 0.6088, + "step": 2978 + }, + { + "epoch": 1.451120493666775, + "grad_norm": 2.8076632022857666, + "learning_rate": 4.3450991243039365e-06, + "loss": 0.6252, + "step": 2979 + }, + { + "epoch": 1.4516076648262422, + "grad_norm": 2.781756639480591, + "learning_rate": 4.344665102248934e-06, + "loss": 0.5813, + "step": 2980 + }, + { + "epoch": 1.4520948359857098, + "grad_norm": 3.0583455562591553, + "learning_rate": 4.344230958114166e-06, + "loss": 0.5869, + "step": 2981 + }, + { + "epoch": 1.452582007145177, + "grad_norm": 2.310990571975708, + "learning_rate": 4.343796691928366e-06, + "loss": 0.6015, + "step": 2982 + }, + { + "epoch": 1.4530691783046443, + "grad_norm": 3.088137626647949, + "learning_rate": 4.343362303720272e-06, + "loss": 0.5016, + "step": 2983 + }, + { + "epoch": 1.4535563494641117, + "grad_norm": 2.7681801319122314, + "learning_rate": 4.342927793518632e-06, + "loss": 0.5548, + "step": 2984 + }, + { + "epoch": 1.454043520623579, + "grad_norm": 2.919395923614502, + "learning_rate": 4.3424931613522035e-06, + "loss": 0.5467, + "step": 2985 + }, + { + "epoch": 1.4545306917830465, + "grad_norm": 2.8213398456573486, + "learning_rate": 4.342058407249748e-06, + "loss": 0.633, + "step": 2986 + }, + { + "epoch": 1.4550178629425137, + "grad_norm": 2.950021505355835, + "learning_rate": 4.341623531240039e-06, + "loss": 0.5846, + "step": 2987 + }, + { + "epoch": 1.4555050341019813, + "grad_norm": 2.672565460205078, + "learning_rate": 4.3411885333518565e-06, + "loss": 0.5855, + "step": 2988 + }, + { + "epoch": 1.4559922052614485, + "grad_norm": 2.8390796184539795, + "learning_rate": 4.340753413613988e-06, + "loss": 0.5898, + "step": 2989 + }, + { + "epoch": 1.4564793764209158, + "grad_norm": 3.0807507038116455, + "learning_rate": 4.34031817205523e-06, + "loss": 0.6309, + "step": 2990 + }, + { + "epoch": 1.4569665475803832, + "grad_norm": 3.769460678100586, + "learning_rate": 4.339882808704387e-06, + "loss": 0.5335, + "step": 2991 + }, + { + "epoch": 1.4574537187398506, + "grad_norm": 2.716188669204712, + "learning_rate": 4.33944732359027e-06, + "loss": 0.5829, + "step": 2992 + }, + { + "epoch": 1.457940889899318, + "grad_norm": 2.496126413345337, + "learning_rate": 4.339011716741702e-06, + "loss": 0.5065, + "step": 2993 + }, + { + "epoch": 1.4584280610587852, + "grad_norm": 2.984905242919922, + "learning_rate": 4.3385759881875085e-06, + "loss": 0.5357, + "step": 2994 + }, + { + "epoch": 1.4589152322182528, + "grad_norm": 2.942248582839966, + "learning_rate": 4.338140137956528e-06, + "loss": 0.5712, + "step": 2995 + }, + { + "epoch": 1.45940240337772, + "grad_norm": 2.7925870418548584, + "learning_rate": 4.337704166077604e-06, + "loss": 0.5353, + "step": 2996 + }, + { + "epoch": 1.4598895745371874, + "grad_norm": 2.5642850399017334, + "learning_rate": 4.33726807257959e-06, + "loss": 0.554, + "step": 2997 + }, + { + "epoch": 1.4603767456966548, + "grad_norm": 2.615609884262085, + "learning_rate": 4.3368318574913456e-06, + "loss": 0.6008, + "step": 2998 + }, + { + "epoch": 1.4608639168561222, + "grad_norm": 3.2458078861236572, + "learning_rate": 4.33639552084174e-06, + "loss": 0.5997, + "step": 2999 + }, + { + "epoch": 1.4613510880155895, + "grad_norm": 2.743152618408203, + "learning_rate": 4.33595906265965e-06, + "loss": 0.5208, + "step": 3000 + }, + { + "epoch": 1.4618382591750567, + "grad_norm": 2.956622838973999, + "learning_rate": 4.33552248297396e-06, + "loss": 0.5796, + "step": 3001 + }, + { + "epoch": 1.4623254303345243, + "grad_norm": 2.799286127090454, + "learning_rate": 4.335085781813563e-06, + "loss": 0.5713, + "step": 3002 + }, + { + "epoch": 1.4628126014939915, + "grad_norm": 2.5869123935699463, + "learning_rate": 4.33464895920736e-06, + "loss": 0.6444, + "step": 3003 + }, + { + "epoch": 1.463299772653459, + "grad_norm": 2.806053876876831, + "learning_rate": 4.334212015184259e-06, + "loss": 0.6243, + "step": 3004 + }, + { + "epoch": 1.4637869438129263, + "grad_norm": 3.0957090854644775, + "learning_rate": 4.333774949773179e-06, + "loss": 0.5786, + "step": 3005 + }, + { + "epoch": 1.4642741149723937, + "grad_norm": 2.7834742069244385, + "learning_rate": 4.333337763003043e-06, + "loss": 0.4751, + "step": 3006 + }, + { + "epoch": 1.464761286131861, + "grad_norm": 2.372882127761841, + "learning_rate": 4.3329004549027845e-06, + "loss": 0.5394, + "step": 3007 + }, + { + "epoch": 1.4652484572913282, + "grad_norm": 2.5066936016082764, + "learning_rate": 4.332463025501344e-06, + "loss": 0.6014, + "step": 3008 + }, + { + "epoch": 1.4657356284507956, + "grad_norm": 2.8558552265167236, + "learning_rate": 4.3320254748276715e-06, + "loss": 0.556, + "step": 3009 + }, + { + "epoch": 1.466222799610263, + "grad_norm": 2.801563024520874, + "learning_rate": 4.331587802910724e-06, + "loss": 0.5776, + "step": 3010 + }, + { + "epoch": 1.4667099707697304, + "grad_norm": 2.572490930557251, + "learning_rate": 4.3311500097794655e-06, + "loss": 0.5721, + "step": 3011 + }, + { + "epoch": 1.4671971419291978, + "grad_norm": 2.7342028617858887, + "learning_rate": 4.330712095462871e-06, + "loss": 0.559, + "step": 3012 + }, + { + "epoch": 1.4676843130886652, + "grad_norm": 2.8601155281066895, + "learning_rate": 4.33027405998992e-06, + "loss": 0.5753, + "step": 3013 + }, + { + "epoch": 1.4681714842481326, + "grad_norm": 2.814683198928833, + "learning_rate": 4.329835903389601e-06, + "loss": 0.4907, + "step": 3014 + }, + { + "epoch": 1.4686586554075998, + "grad_norm": 2.401991367340088, + "learning_rate": 4.329397625690914e-06, + "loss": 0.6057, + "step": 3015 + }, + { + "epoch": 1.4691458265670672, + "grad_norm": 2.5891449451446533, + "learning_rate": 4.328959226922862e-06, + "loss": 0.6, + "step": 3016 + }, + { + "epoch": 1.4696329977265346, + "grad_norm": 2.5300683975219727, + "learning_rate": 4.328520707114458e-06, + "loss": 0.5834, + "step": 3017 + }, + { + "epoch": 1.470120168886002, + "grad_norm": 3.0535504817962646, + "learning_rate": 4.328082066294724e-06, + "loss": 0.6027, + "step": 3018 + }, + { + "epoch": 1.4706073400454693, + "grad_norm": 2.560825824737549, + "learning_rate": 4.3276433044926905e-06, + "loss": 0.6194, + "step": 3019 + }, + { + "epoch": 1.4710945112049367, + "grad_norm": 2.5611817836761475, + "learning_rate": 4.327204421737392e-06, + "loss": 0.5394, + "step": 3020 + }, + { + "epoch": 1.4715816823644041, + "grad_norm": 2.987435817718506, + "learning_rate": 4.326765418057875e-06, + "loss": 0.5736, + "step": 3021 + }, + { + "epoch": 1.4720688535238713, + "grad_norm": 2.5884156227111816, + "learning_rate": 4.326326293483193e-06, + "loss": 0.5614, + "step": 3022 + }, + { + "epoch": 1.4725560246833387, + "grad_norm": 2.8710756301879883, + "learning_rate": 4.325887048042407e-06, + "loss": 0.5981, + "step": 3023 + }, + { + "epoch": 1.473043195842806, + "grad_norm": 3.033029556274414, + "learning_rate": 4.325447681764586e-06, + "loss": 0.5399, + "step": 3024 + }, + { + "epoch": 1.4735303670022735, + "grad_norm": 2.388206720352173, + "learning_rate": 4.325008194678807e-06, + "loss": 0.5577, + "step": 3025 + }, + { + "epoch": 1.4740175381617409, + "grad_norm": 2.3925416469573975, + "learning_rate": 4.3245685868141565e-06, + "loss": 0.4932, + "step": 3026 + }, + { + "epoch": 1.4745047093212083, + "grad_norm": 2.394986629486084, + "learning_rate": 4.324128858199727e-06, + "loss": 0.666, + "step": 3027 + }, + { + "epoch": 1.4749918804806756, + "grad_norm": 3.0291929244995117, + "learning_rate": 4.323689008864619e-06, + "loss": 0.6121, + "step": 3028 + }, + { + "epoch": 1.4754790516401428, + "grad_norm": 3.473628520965576, + "learning_rate": 4.323249038837942e-06, + "loss": 0.5723, + "step": 3029 + }, + { + "epoch": 1.4759662227996102, + "grad_norm": 3.122032403945923, + "learning_rate": 4.322808948148814e-06, + "loss": 0.5515, + "step": 3030 + }, + { + "epoch": 1.4764533939590776, + "grad_norm": 3.0819644927978516, + "learning_rate": 4.322368736826359e-06, + "loss": 0.5452, + "step": 3031 + }, + { + "epoch": 1.476940565118545, + "grad_norm": 2.7280080318450928, + "learning_rate": 4.321928404899711e-06, + "loss": 0.5099, + "step": 3032 + }, + { + "epoch": 1.4774277362780124, + "grad_norm": 2.875681161880493, + "learning_rate": 4.32148795239801e-06, + "loss": 0.5441, + "step": 3033 + }, + { + "epoch": 1.4779149074374798, + "grad_norm": 2.9071054458618164, + "learning_rate": 4.321047379350407e-06, + "loss": 0.5415, + "step": 3034 + }, + { + "epoch": 1.4784020785969472, + "grad_norm": 2.4379355907440186, + "learning_rate": 4.320606685786057e-06, + "loss": 0.5593, + "step": 3035 + }, + { + "epoch": 1.4788892497564143, + "grad_norm": 2.721301555633545, + "learning_rate": 4.320165871734128e-06, + "loss": 0.6102, + "step": 3036 + }, + { + "epoch": 1.4793764209158817, + "grad_norm": 2.731069326400757, + "learning_rate": 4.319724937223789e-06, + "loss": 0.5413, + "step": 3037 + }, + { + "epoch": 1.4798635920753491, + "grad_norm": 2.388378620147705, + "learning_rate": 4.3192838822842245e-06, + "loss": 0.5505, + "step": 3038 + }, + { + "epoch": 1.4803507632348165, + "grad_norm": 3.035205602645874, + "learning_rate": 4.318842706944622e-06, + "loss": 0.6319, + "step": 3039 + }, + { + "epoch": 1.480837934394284, + "grad_norm": 2.715198278427124, + "learning_rate": 4.318401411234178e-06, + "loss": 0.5829, + "step": 3040 + }, + { + "epoch": 1.4813251055537513, + "grad_norm": 2.4831149578094482, + "learning_rate": 4.3179599951820975e-06, + "loss": 0.5129, + "step": 3041 + }, + { + "epoch": 1.4818122767132187, + "grad_norm": 2.80399751663208, + "learning_rate": 4.317518458817595e-06, + "loss": 0.575, + "step": 3042 + }, + { + "epoch": 1.4822994478726859, + "grad_norm": 3.0094850063323975, + "learning_rate": 4.317076802169889e-06, + "loss": 0.5487, + "step": 3043 + }, + { + "epoch": 1.4827866190321533, + "grad_norm": 2.5773086547851562, + "learning_rate": 4.316635025268209e-06, + "loss": 0.5952, + "step": 3044 + }, + { + "epoch": 1.4832737901916206, + "grad_norm": 2.8063340187072754, + "learning_rate": 4.316193128141793e-06, + "loss": 0.5506, + "step": 3045 + }, + { + "epoch": 1.483760961351088, + "grad_norm": 3.267258405685425, + "learning_rate": 4.315751110819885e-06, + "loss": 0.5642, + "step": 3046 + }, + { + "epoch": 1.4842481325105554, + "grad_norm": 2.6136252880096436, + "learning_rate": 4.315308973331737e-06, + "loss": 0.5832, + "step": 3047 + }, + { + "epoch": 1.4847353036700226, + "grad_norm": 2.753310441970825, + "learning_rate": 4.3148667157066114e-06, + "loss": 0.4762, + "step": 3048 + }, + { + "epoch": 1.4852224748294902, + "grad_norm": 2.400177478790283, + "learning_rate": 4.314424337973775e-06, + "loss": 0.5092, + "step": 3049 + }, + { + "epoch": 1.4857096459889574, + "grad_norm": 2.580936908721924, + "learning_rate": 4.313981840162505e-06, + "loss": 0.5433, + "step": 3050 + }, + { + "epoch": 1.4861968171484248, + "grad_norm": 2.994781494140625, + "learning_rate": 4.3135392223020855e-06, + "loss": 0.5473, + "step": 3051 + }, + { + "epoch": 1.4866839883078922, + "grad_norm": 2.683244228363037, + "learning_rate": 4.313096484421809e-06, + "loss": 0.5144, + "step": 3052 + }, + { + "epoch": 1.4871711594673596, + "grad_norm": 2.8223559856414795, + "learning_rate": 4.312653626550976e-06, + "loss": 0.5866, + "step": 3053 + }, + { + "epoch": 1.487658330626827, + "grad_norm": 3.142163038253784, + "learning_rate": 4.312210648718895e-06, + "loss": 0.5218, + "step": 3054 + }, + { + "epoch": 1.4881455017862941, + "grad_norm": 2.9021449089050293, + "learning_rate": 4.311767550954882e-06, + "loss": 0.601, + "step": 3055 + }, + { + "epoch": 1.4886326729457617, + "grad_norm": 2.553351879119873, + "learning_rate": 4.31132433328826e-06, + "loss": 0.5599, + "step": 3056 + }, + { + "epoch": 1.489119844105229, + "grad_norm": 2.8480710983276367, + "learning_rate": 4.310880995748364e-06, + "loss": 0.5689, + "step": 3057 + }, + { + "epoch": 1.4896070152646963, + "grad_norm": 2.7450449466705322, + "learning_rate": 4.310437538364532e-06, + "loss": 0.5611, + "step": 3058 + }, + { + "epoch": 1.4900941864241637, + "grad_norm": 2.7828550338745117, + "learning_rate": 4.309993961166112e-06, + "loss": 0.5977, + "step": 3059 + }, + { + "epoch": 1.490581357583631, + "grad_norm": 2.9769585132598877, + "learning_rate": 4.3095502641824604e-06, + "loss": 0.6196, + "step": 3060 + }, + { + "epoch": 1.4910685287430985, + "grad_norm": 2.8314366340637207, + "learning_rate": 4.309106447442941e-06, + "loss": 0.5924, + "step": 3061 + }, + { + "epoch": 1.4915556999025656, + "grad_norm": 2.9224462509155273, + "learning_rate": 4.308662510976924e-06, + "loss": 0.5997, + "step": 3062 + }, + { + "epoch": 1.4920428710620333, + "grad_norm": 2.6758038997650146, + "learning_rate": 4.308218454813792e-06, + "loss": 0.5797, + "step": 3063 + }, + { + "epoch": 1.4925300422215004, + "grad_norm": 2.4228193759918213, + "learning_rate": 4.30777427898293e-06, + "loss": 0.5507, + "step": 3064 + }, + { + "epoch": 1.4930172133809678, + "grad_norm": 3.2231552600860596, + "learning_rate": 4.307329983513736e-06, + "loss": 0.5964, + "step": 3065 + }, + { + "epoch": 1.4935043845404352, + "grad_norm": 2.5358312129974365, + "learning_rate": 4.30688556843561e-06, + "loss": 0.6371, + "step": 3066 + }, + { + "epoch": 1.4939915556999026, + "grad_norm": 2.7326571941375732, + "learning_rate": 4.306441033777967e-06, + "loss": 0.5922, + "step": 3067 + }, + { + "epoch": 1.49447872685937, + "grad_norm": 2.797011375427246, + "learning_rate": 4.305996379570224e-06, + "loss": 0.6121, + "step": 3068 + }, + { + "epoch": 1.4949658980188372, + "grad_norm": 3.291520357131958, + "learning_rate": 4.305551605841809e-06, + "loss": 0.6316, + "step": 3069 + }, + { + "epoch": 1.4954530691783046, + "grad_norm": 2.8392884731292725, + "learning_rate": 4.305106712622157e-06, + "loss": 0.5105, + "step": 3070 + }, + { + "epoch": 1.495940240337772, + "grad_norm": 3.00602126121521, + "learning_rate": 4.304661699940709e-06, + "loss": 0.6027, + "step": 3071 + }, + { + "epoch": 1.4964274114972393, + "grad_norm": 2.706852436065674, + "learning_rate": 4.30421656782692e-06, + "loss": 0.5751, + "step": 3072 + }, + { + "epoch": 1.4969145826567067, + "grad_norm": 3.039973258972168, + "learning_rate": 4.303771316310246e-06, + "loss": 0.5983, + "step": 3073 + }, + { + "epoch": 1.4974017538161741, + "grad_norm": 2.940066337585449, + "learning_rate": 4.303325945420154e-06, + "loss": 0.5711, + "step": 3074 + }, + { + "epoch": 1.4978889249756415, + "grad_norm": 2.896599292755127, + "learning_rate": 4.302880455186119e-06, + "loss": 0.518, + "step": 3075 + }, + { + "epoch": 1.4983760961351087, + "grad_norm": 2.4435911178588867, + "learning_rate": 4.302434845637623e-06, + "loss": 0.5321, + "step": 3076 + }, + { + "epoch": 1.498863267294576, + "grad_norm": 2.798417568206787, + "learning_rate": 4.301989116804157e-06, + "loss": 0.5563, + "step": 3077 + }, + { + "epoch": 1.4993504384540435, + "grad_norm": 2.713413715362549, + "learning_rate": 4.3015432687152194e-06, + "loss": 0.6066, + "step": 3078 + }, + { + "epoch": 1.4998376096135109, + "grad_norm": 3.0237364768981934, + "learning_rate": 4.301097301400316e-06, + "loss": 0.6171, + "step": 3079 + }, + { + "epoch": 1.5003247807729783, + "grad_norm": 2.977691411972046, + "learning_rate": 4.30065121488896e-06, + "loss": 0.5247, + "step": 3080 + }, + { + "epoch": 1.5008119519324457, + "grad_norm": 2.7237558364868164, + "learning_rate": 4.300205009210675e-06, + "loss": 0.4997, + "step": 3081 + }, + { + "epoch": 1.501299123091913, + "grad_norm": 2.6600425243377686, + "learning_rate": 4.2997586843949905e-06, + "loss": 0.5174, + "step": 3082 + }, + { + "epoch": 1.5017862942513802, + "grad_norm": 2.3554728031158447, + "learning_rate": 4.2993122404714424e-06, + "loss": 0.6061, + "step": 3083 + }, + { + "epoch": 1.5022734654108478, + "grad_norm": 2.6185548305511475, + "learning_rate": 4.298865677469579e-06, + "loss": 0.5507, + "step": 3084 + }, + { + "epoch": 1.502760636570315, + "grad_norm": 2.4107751846313477, + "learning_rate": 4.2984189954189524e-06, + "loss": 0.5595, + "step": 3085 + }, + { + "epoch": 1.5032478077297824, + "grad_norm": 2.6800644397735596, + "learning_rate": 4.297972194349124e-06, + "loss": 0.6339, + "step": 3086 + }, + { + "epoch": 1.5037349788892498, + "grad_norm": 2.935497522354126, + "learning_rate": 4.297525274289663e-06, + "loss": 0.5326, + "step": 3087 + }, + { + "epoch": 1.504222150048717, + "grad_norm": 2.7044124603271484, + "learning_rate": 4.297078235270148e-06, + "loss": 0.5284, + "step": 3088 + }, + { + "epoch": 1.5047093212081846, + "grad_norm": 2.520364999771118, + "learning_rate": 4.296631077320161e-06, + "loss": 0.5718, + "step": 3089 + }, + { + "epoch": 1.5051964923676517, + "grad_norm": 2.656526803970337, + "learning_rate": 4.296183800469298e-06, + "loss": 0.5605, + "step": 3090 + }, + { + "epoch": 1.5056836635271194, + "grad_norm": 2.8173890113830566, + "learning_rate": 4.295736404747157e-06, + "loss": 0.5818, + "step": 3091 + }, + { + "epoch": 1.5061708346865865, + "grad_norm": 2.798828601837158, + "learning_rate": 4.295288890183349e-06, + "loss": 0.5413, + "step": 3092 + }, + { + "epoch": 1.506658005846054, + "grad_norm": 2.35724139213562, + "learning_rate": 4.294841256807489e-06, + "loss": 0.5912, + "step": 3093 + }, + { + "epoch": 1.5071451770055213, + "grad_norm": 2.831010341644287, + "learning_rate": 4.294393504649202e-06, + "loss": 0.6098, + "step": 3094 + }, + { + "epoch": 1.5076323481649885, + "grad_norm": 2.8556032180786133, + "learning_rate": 4.29394563373812e-06, + "loss": 0.5213, + "step": 3095 + }, + { + "epoch": 1.508119519324456, + "grad_norm": 3.2752230167388916, + "learning_rate": 4.293497644103883e-06, + "loss": 0.5922, + "step": 3096 + }, + { + "epoch": 1.5086066904839233, + "grad_norm": 2.810086965560913, + "learning_rate": 4.293049535776138e-06, + "loss": 0.5874, + "step": 3097 + }, + { + "epoch": 1.5090938616433909, + "grad_norm": 2.8577873706817627, + "learning_rate": 4.292601308784543e-06, + "loss": 0.5762, + "step": 3098 + }, + { + "epoch": 1.509581032802858, + "grad_norm": 2.873450517654419, + "learning_rate": 4.292152963158759e-06, + "loss": 0.6094, + "step": 3099 + }, + { + "epoch": 1.5100682039623254, + "grad_norm": 2.847317934036255, + "learning_rate": 4.29170449892846e-06, + "loss": 0.544, + "step": 3100 + }, + { + "epoch": 1.5105553751217928, + "grad_norm": 2.835008382797241, + "learning_rate": 4.291255916123323e-06, + "loss": 0.6081, + "step": 3101 + }, + { + "epoch": 1.51104254628126, + "grad_norm": 3.2685933113098145, + "learning_rate": 4.290807214773038e-06, + "loss": 0.5789, + "step": 3102 + }, + { + "epoch": 1.5115297174407276, + "grad_norm": 2.927565574645996, + "learning_rate": 4.290358394907297e-06, + "loss": 0.5278, + "step": 3103 + }, + { + "epoch": 1.5120168886001948, + "grad_norm": 2.589284896850586, + "learning_rate": 4.289909456555804e-06, + "loss": 0.4856, + "step": 3104 + }, + { + "epoch": 1.5125040597596622, + "grad_norm": 2.3880138397216797, + "learning_rate": 4.28946039974827e-06, + "loss": 0.5904, + "step": 3105 + }, + { + "epoch": 1.5129912309191296, + "grad_norm": 2.796879529953003, + "learning_rate": 4.289011224514413e-06, + "loss": 0.6086, + "step": 3106 + }, + { + "epoch": 1.513478402078597, + "grad_norm": 2.7096543312072754, + "learning_rate": 4.288561930883961e-06, + "loss": 0.5771, + "step": 3107 + }, + { + "epoch": 1.5139655732380644, + "grad_norm": 2.4158883094787598, + "learning_rate": 4.2881125188866465e-06, + "loss": 0.5936, + "step": 3108 + }, + { + "epoch": 1.5144527443975315, + "grad_norm": 2.5334084033966064, + "learning_rate": 4.287662988552212e-06, + "loss": 0.5885, + "step": 3109 + }, + { + "epoch": 1.5149399155569991, + "grad_norm": 2.7348361015319824, + "learning_rate": 4.287213339910407e-06, + "loss": 0.6427, + "step": 3110 + }, + { + "epoch": 1.5154270867164663, + "grad_norm": 3.142444610595703, + "learning_rate": 4.286763572990989e-06, + "loss": 0.626, + "step": 3111 + }, + { + "epoch": 1.5159142578759337, + "grad_norm": 2.4804580211639404, + "learning_rate": 4.286313687823725e-06, + "loss": 0.5183, + "step": 3112 + }, + { + "epoch": 1.516401429035401, + "grad_norm": 2.5711207389831543, + "learning_rate": 4.285863684438387e-06, + "loss": 0.5772, + "step": 3113 + }, + { + "epoch": 1.5168886001948685, + "grad_norm": 2.5925562381744385, + "learning_rate": 4.2854135628647575e-06, + "loss": 0.6168, + "step": 3114 + }, + { + "epoch": 1.5173757713543359, + "grad_norm": 2.5157370567321777, + "learning_rate": 4.284963323132625e-06, + "loss": 0.5452, + "step": 3115 + }, + { + "epoch": 1.517862942513803, + "grad_norm": 2.7474656105041504, + "learning_rate": 4.284512965271785e-06, + "loss": 0.5924, + "step": 3116 + }, + { + "epoch": 1.5183501136732707, + "grad_norm": 2.835881233215332, + "learning_rate": 4.284062489312043e-06, + "loss": 0.6097, + "step": 3117 + }, + { + "epoch": 1.5188372848327378, + "grad_norm": 3.157529830932617, + "learning_rate": 4.283611895283212e-06, + "loss": 0.5879, + "step": 3118 + }, + { + "epoch": 1.5193244559922052, + "grad_norm": 3.0208873748779297, + "learning_rate": 4.283161183215111e-06, + "loss": 0.5764, + "step": 3119 + }, + { + "epoch": 1.5198116271516726, + "grad_norm": 2.642880439758301, + "learning_rate": 4.28271035313757e-06, + "loss": 0.5577, + "step": 3120 + }, + { + "epoch": 1.52029879831114, + "grad_norm": 2.805220365524292, + "learning_rate": 4.282259405080424e-06, + "loss": 0.5542, + "step": 3121 + }, + { + "epoch": 1.5207859694706074, + "grad_norm": 2.687014579772949, + "learning_rate": 4.281808339073516e-06, + "loss": 0.6032, + "step": 3122 + }, + { + "epoch": 1.5212731406300746, + "grad_norm": 4.119085311889648, + "learning_rate": 4.2813571551466975e-06, + "loss": 0.5454, + "step": 3123 + }, + { + "epoch": 1.5217603117895422, + "grad_norm": 2.8101987838745117, + "learning_rate": 4.280905853329829e-06, + "loss": 0.6158, + "step": 3124 + }, + { + "epoch": 1.5222474829490094, + "grad_norm": 3.0282516479492188, + "learning_rate": 4.280454433652777e-06, + "loss": 0.5633, + "step": 3125 + }, + { + "epoch": 1.5227346541084767, + "grad_norm": 2.5620694160461426, + "learning_rate": 4.280002896145417e-06, + "loss": 0.6196, + "step": 3126 + }, + { + "epoch": 1.5232218252679441, + "grad_norm": 3.341254711151123, + "learning_rate": 4.279551240837629e-06, + "loss": 0.6199, + "step": 3127 + }, + { + "epoch": 1.5237089964274115, + "grad_norm": 2.6418890953063965, + "learning_rate": 4.2790994677593076e-06, + "loss": 0.5765, + "step": 3128 + }, + { + "epoch": 1.524196167586879, + "grad_norm": 2.8307485580444336, + "learning_rate": 4.278647576940349e-06, + "loss": 0.5601, + "step": 3129 + }, + { + "epoch": 1.524683338746346, + "grad_norm": 2.5177135467529297, + "learning_rate": 4.2781955684106585e-06, + "loss": 0.5425, + "step": 3130 + }, + { + "epoch": 1.5251705099058137, + "grad_norm": 2.644340991973877, + "learning_rate": 4.277743442200151e-06, + "loss": 0.6332, + "step": 3131 + }, + { + "epoch": 1.5256576810652809, + "grad_norm": 2.8078386783599854, + "learning_rate": 4.277291198338749e-06, + "loss": 0.5437, + "step": 3132 + }, + { + "epoch": 1.5261448522247483, + "grad_norm": 2.622234582901001, + "learning_rate": 4.276838836856379e-06, + "loss": 0.6024, + "step": 3133 + }, + { + "epoch": 1.5266320233842157, + "grad_norm": 2.782449722290039, + "learning_rate": 4.276386357782983e-06, + "loss": 0.5214, + "step": 3134 + }, + { + "epoch": 1.527119194543683, + "grad_norm": 2.54665470123291, + "learning_rate": 4.275933761148501e-06, + "loss": 0.5813, + "step": 3135 + }, + { + "epoch": 1.5276063657031504, + "grad_norm": 2.7200815677642822, + "learning_rate": 4.275481046982889e-06, + "loss": 0.624, + "step": 3136 + }, + { + "epoch": 1.5280935368626176, + "grad_norm": 3.0766429901123047, + "learning_rate": 4.275028215316106e-06, + "loss": 0.5376, + "step": 3137 + }, + { + "epoch": 1.5285807080220852, + "grad_norm": 2.656080961227417, + "learning_rate": 4.274575266178122e-06, + "loss": 0.6077, + "step": 3138 + }, + { + "epoch": 1.5290678791815524, + "grad_norm": 2.7737817764282227, + "learning_rate": 4.274122199598912e-06, + "loss": 0.6442, + "step": 3139 + }, + { + "epoch": 1.5295550503410198, + "grad_norm": 2.940990447998047, + "learning_rate": 4.2736690156084595e-06, + "loss": 0.5642, + "step": 3140 + }, + { + "epoch": 1.5300422215004872, + "grad_norm": 2.731548547744751, + "learning_rate": 4.273215714236757e-06, + "loss": 0.5845, + "step": 3141 + }, + { + "epoch": 1.5305293926599546, + "grad_norm": 2.765528678894043, + "learning_rate": 4.272762295513803e-06, + "loss": 0.6255, + "step": 3142 + }, + { + "epoch": 1.531016563819422, + "grad_norm": 2.5594301223754883, + "learning_rate": 4.272308759469606e-06, + "loss": 0.5579, + "step": 3143 + }, + { + "epoch": 1.5315037349788891, + "grad_norm": 3.332576274871826, + "learning_rate": 4.27185510613418e-06, + "loss": 0.6074, + "step": 3144 + }, + { + "epoch": 1.5319909061383568, + "grad_norm": 3.04866623878479, + "learning_rate": 4.271401335537548e-06, + "loss": 0.5953, + "step": 3145 + }, + { + "epoch": 1.532478077297824, + "grad_norm": 2.681048631668091, + "learning_rate": 4.2709474477097415e-06, + "loss": 0.4278, + "step": 3146 + }, + { + "epoch": 1.5329652484572913, + "grad_norm": 2.627917766571045, + "learning_rate": 4.2704934426807965e-06, + "loss": 0.6006, + "step": 3147 + }, + { + "epoch": 1.5334524196167587, + "grad_norm": 2.971062660217285, + "learning_rate": 4.270039320480761e-06, + "loss": 0.5353, + "step": 3148 + }, + { + "epoch": 1.533939590776226, + "grad_norm": 2.6734375953674316, + "learning_rate": 4.269585081139689e-06, + "loss": 0.6171, + "step": 3149 + }, + { + "epoch": 1.5344267619356935, + "grad_norm": 3.0422987937927246, + "learning_rate": 4.2691307246876395e-06, + "loss": 0.5816, + "step": 3150 + }, + { + "epoch": 1.5349139330951607, + "grad_norm": 2.6271731853485107, + "learning_rate": 4.268676251154684e-06, + "loss": 0.5547, + "step": 3151 + }, + { + "epoch": 1.5354011042546283, + "grad_norm": 2.5297999382019043, + "learning_rate": 4.2682216605709e-06, + "loss": 0.577, + "step": 3152 + }, + { + "epoch": 1.5358882754140954, + "grad_norm": 2.716858148574829, + "learning_rate": 4.267766952966369e-06, + "loss": 0.5966, + "step": 3153 + }, + { + "epoch": 1.5363754465735628, + "grad_norm": 2.6431267261505127, + "learning_rate": 4.267312128371187e-06, + "loss": 0.5634, + "step": 3154 + }, + { + "epoch": 1.5368626177330302, + "grad_norm": 2.8422627449035645, + "learning_rate": 4.266857186815453e-06, + "loss": 0.6279, + "step": 3155 + }, + { + "epoch": 1.5373497888924974, + "grad_norm": 2.8993144035339355, + "learning_rate": 4.266402128329274e-06, + "loss": 0.6132, + "step": 3156 + }, + { + "epoch": 1.537836960051965, + "grad_norm": 2.637044906616211, + "learning_rate": 4.265946952942767e-06, + "loss": 0.5569, + "step": 3157 + }, + { + "epoch": 1.5383241312114322, + "grad_norm": 2.654338836669922, + "learning_rate": 4.265491660686057e-06, + "loss": 0.5896, + "step": 3158 + }, + { + "epoch": 1.5388113023708998, + "grad_norm": 2.8117828369140625, + "learning_rate": 4.265036251589271e-06, + "loss": 0.5073, + "step": 3159 + }, + { + "epoch": 1.539298473530367, + "grad_norm": 2.7909996509552, + "learning_rate": 4.26458072568255e-06, + "loss": 0.5383, + "step": 3160 + }, + { + "epoch": 1.5397856446898344, + "grad_norm": 2.6417324542999268, + "learning_rate": 4.264125082996043e-06, + "loss": 0.5654, + "step": 3161 + }, + { + "epoch": 1.5402728158493018, + "grad_norm": 2.874533176422119, + "learning_rate": 4.2636693235599e-06, + "loss": 0.5225, + "step": 3162 + }, + { + "epoch": 1.540759987008769, + "grad_norm": 2.6671907901763916, + "learning_rate": 4.2632134474042866e-06, + "loss": 0.6023, + "step": 3163 + }, + { + "epoch": 1.5412471581682365, + "grad_norm": 2.4800264835357666, + "learning_rate": 4.262757454559372e-06, + "loss": 0.5274, + "step": 3164 + }, + { + "epoch": 1.5417343293277037, + "grad_norm": 2.7286581993103027, + "learning_rate": 4.2623013450553315e-06, + "loss": 0.5972, + "step": 3165 + }, + { + "epoch": 1.542221500487171, + "grad_norm": 2.9589953422546387, + "learning_rate": 4.2618451189223545e-06, + "loss": 0.5807, + "step": 3166 + }, + { + "epoch": 1.5427086716466385, + "grad_norm": 2.9651124477386475, + "learning_rate": 4.261388776190629e-06, + "loss": 0.5935, + "step": 3167 + }, + { + "epoch": 1.5431958428061059, + "grad_norm": 2.9246811866760254, + "learning_rate": 4.26093231689036e-06, + "loss": 0.5732, + "step": 3168 + }, + { + "epoch": 1.5436830139655733, + "grad_norm": 2.831141471862793, + "learning_rate": 4.260475741051754e-06, + "loss": 0.5691, + "step": 3169 + }, + { + "epoch": 1.5441701851250405, + "grad_norm": 2.690598964691162, + "learning_rate": 4.2600190487050265e-06, + "loss": 0.5793, + "step": 3170 + }, + { + "epoch": 1.544657356284508, + "grad_norm": 2.6831254959106445, + "learning_rate": 4.259562239880403e-06, + "loss": 0.4803, + "step": 3171 + }, + { + "epoch": 1.5451445274439752, + "grad_norm": 2.6692748069763184, + "learning_rate": 4.259105314608115e-06, + "loss": 0.5924, + "step": 3172 + }, + { + "epoch": 1.5456316986034426, + "grad_norm": 2.755682945251465, + "learning_rate": 4.2586482729184e-06, + "loss": 0.5814, + "step": 3173 + }, + { + "epoch": 1.54611886976291, + "grad_norm": 2.606611728668213, + "learning_rate": 4.258191114841506e-06, + "loss": 0.662, + "step": 3174 + }, + { + "epoch": 1.5466060409223774, + "grad_norm": 2.697746753692627, + "learning_rate": 4.257733840407689e-06, + "loss": 0.5374, + "step": 3175 + }, + { + "epoch": 1.5470932120818448, + "grad_norm": 2.7397172451019287, + "learning_rate": 4.257276449647209e-06, + "loss": 0.525, + "step": 3176 + }, + { + "epoch": 1.547580383241312, + "grad_norm": 2.5191996097564697, + "learning_rate": 4.256818942590337e-06, + "loss": 0.5736, + "step": 3177 + }, + { + "epoch": 1.5480675544007796, + "grad_norm": 2.6975338459014893, + "learning_rate": 4.256361319267351e-06, + "loss": 0.5609, + "step": 3178 + }, + { + "epoch": 1.5485547255602468, + "grad_norm": 2.844977378845215, + "learning_rate": 4.255903579708537e-06, + "loss": 0.5933, + "step": 3179 + }, + { + "epoch": 1.5490418967197142, + "grad_norm": 2.949798583984375, + "learning_rate": 4.2554457239441875e-06, + "loss": 0.5556, + "step": 3180 + }, + { + "epoch": 1.5495290678791815, + "grad_norm": 2.792117118835449, + "learning_rate": 4.254987752004603e-06, + "loss": 0.6114, + "step": 3181 + }, + { + "epoch": 1.550016239038649, + "grad_norm": 2.89709210395813, + "learning_rate": 4.254529663920093e-06, + "loss": 0.6239, + "step": 3182 + }, + { + "epoch": 1.5505034101981163, + "grad_norm": 2.795506000518799, + "learning_rate": 4.254071459720973e-06, + "loss": 0.5545, + "step": 3183 + }, + { + "epoch": 1.5509905813575835, + "grad_norm": 2.7865841388702393, + "learning_rate": 4.253613139437568e-06, + "loss": 0.5941, + "step": 3184 + }, + { + "epoch": 1.5514777525170511, + "grad_norm": 2.751556873321533, + "learning_rate": 4.253154703100207e-06, + "loss": 0.5521, + "step": 3185 + }, + { + "epoch": 1.5519649236765183, + "grad_norm": 2.460225820541382, + "learning_rate": 4.2526961507392325e-06, + "loss": 0.5582, + "step": 3186 + }, + { + "epoch": 1.5524520948359857, + "grad_norm": 2.604228973388672, + "learning_rate": 4.25223748238499e-06, + "loss": 0.5403, + "step": 3187 + }, + { + "epoch": 1.552939265995453, + "grad_norm": 2.625685930252075, + "learning_rate": 4.251778698067833e-06, + "loss": 0.542, + "step": 3188 + }, + { + "epoch": 1.5534264371549205, + "grad_norm": 2.4899420738220215, + "learning_rate": 4.251319797818125e-06, + "loss": 0.552, + "step": 3189 + }, + { + "epoch": 1.5539136083143879, + "grad_norm": 2.7256319522857666, + "learning_rate": 4.250860781666237e-06, + "loss": 0.5431, + "step": 3190 + }, + { + "epoch": 1.554400779473855, + "grad_norm": 2.542541742324829, + "learning_rate": 4.250401649642545e-06, + "loss": 0.4661, + "step": 3191 + }, + { + "epoch": 1.5548879506333226, + "grad_norm": 2.725856304168701, + "learning_rate": 4.2499424017774345e-06, + "loss": 0.577, + "step": 3192 + }, + { + "epoch": 1.5553751217927898, + "grad_norm": 3.061866521835327, + "learning_rate": 4.249483038101299e-06, + "loss": 0.5425, + "step": 3193 + }, + { + "epoch": 1.5558622929522572, + "grad_norm": 2.671903371810913, + "learning_rate": 4.249023558644538e-06, + "loss": 0.6169, + "step": 3194 + }, + { + "epoch": 1.5563494641117246, + "grad_norm": 2.856827974319458, + "learning_rate": 4.248563963437562e-06, + "loss": 0.5666, + "step": 3195 + }, + { + "epoch": 1.556836635271192, + "grad_norm": 2.687849521636963, + "learning_rate": 4.248104252510786e-06, + "loss": 0.527, + "step": 3196 + }, + { + "epoch": 1.5573238064306594, + "grad_norm": 2.850879669189453, + "learning_rate": 4.247644425894632e-06, + "loss": 0.5306, + "step": 3197 + }, + { + "epoch": 1.5578109775901265, + "grad_norm": 2.876729965209961, + "learning_rate": 4.247184483619534e-06, + "loss": 0.5078, + "step": 3198 + }, + { + "epoch": 1.5582981487495942, + "grad_norm": 2.889586925506592, + "learning_rate": 4.246724425715928e-06, + "loss": 0.6093, + "step": 3199 + }, + { + "epoch": 1.5587853199090613, + "grad_norm": 3.115178108215332, + "learning_rate": 4.246264252214264e-06, + "loss": 0.5782, + "step": 3200 + }, + { + "epoch": 1.5592724910685287, + "grad_norm": 2.7944905757904053, + "learning_rate": 4.2458039631449936e-06, + "loss": 0.591, + "step": 3201 + }, + { + "epoch": 1.5597596622279961, + "grad_norm": 3.206939697265625, + "learning_rate": 4.245343558538579e-06, + "loss": 0.6631, + "step": 3202 + }, + { + "epoch": 1.5602468333874635, + "grad_norm": 2.313361406326294, + "learning_rate": 4.24488303842549e-06, + "loss": 0.6508, + "step": 3203 + }, + { + "epoch": 1.560734004546931, + "grad_norm": 3.024705648422241, + "learning_rate": 4.2444224028362055e-06, + "loss": 0.5788, + "step": 3204 + }, + { + "epoch": 1.561221175706398, + "grad_norm": 2.8712470531463623, + "learning_rate": 4.243961651801207e-06, + "loss": 0.6306, + "step": 3205 + }, + { + "epoch": 1.5617083468658657, + "grad_norm": 2.473982095718384, + "learning_rate": 4.24350078535099e-06, + "loss": 0.5914, + "step": 3206 + }, + { + "epoch": 1.5621955180253329, + "grad_norm": 2.780942916870117, + "learning_rate": 4.2430398035160535e-06, + "loss": 0.4636, + "step": 3207 + }, + { + "epoch": 1.5626826891848002, + "grad_norm": 2.88266921043396, + "learning_rate": 4.242578706326904e-06, + "loss": 0.5711, + "step": 3208 + }, + { + "epoch": 1.5631698603442676, + "grad_norm": 5.16276741027832, + "learning_rate": 4.242117493814059e-06, + "loss": 0.5552, + "step": 3209 + }, + { + "epoch": 1.563657031503735, + "grad_norm": 2.872706890106201, + "learning_rate": 4.24165616600804e-06, + "loss": 0.602, + "step": 3210 + }, + { + "epoch": 1.5641442026632024, + "grad_norm": 2.918802261352539, + "learning_rate": 4.2411947229393784e-06, + "loss": 0.5323, + "step": 3211 + }, + { + "epoch": 1.5646313738226696, + "grad_norm": 2.6405222415924072, + "learning_rate": 4.2407331646386115e-06, + "loss": 0.5834, + "step": 3212 + }, + { + "epoch": 1.5651185449821372, + "grad_norm": 3.0796799659729004, + "learning_rate": 4.2402714911362875e-06, + "loss": 0.585, + "step": 3213 + }, + { + "epoch": 1.5656057161416044, + "grad_norm": 2.8584744930267334, + "learning_rate": 4.239809702462957e-06, + "loss": 0.5416, + "step": 3214 + }, + { + "epoch": 1.5660928873010718, + "grad_norm": 2.928452253341675, + "learning_rate": 4.239347798649182e-06, + "loss": 0.5698, + "step": 3215 + }, + { + "epoch": 1.5665800584605392, + "grad_norm": 2.581793785095215, + "learning_rate": 4.238885779725532e-06, + "loss": 0.5268, + "step": 3216 + }, + { + "epoch": 1.5670672296200063, + "grad_norm": 2.943290948867798, + "learning_rate": 4.2384236457225835e-06, + "loss": 0.574, + "step": 3217 + }, + { + "epoch": 1.567554400779474, + "grad_norm": 2.6974973678588867, + "learning_rate": 4.23796139667092e-06, + "loss": 0.4987, + "step": 3218 + }, + { + "epoch": 1.5680415719389411, + "grad_norm": 2.7352731227874756, + "learning_rate": 4.237499032601134e-06, + "loss": 0.5693, + "step": 3219 + }, + { + "epoch": 1.5685287430984087, + "grad_norm": 2.9451699256896973, + "learning_rate": 4.237036553543822e-06, + "loss": 0.6462, + "step": 3220 + }, + { + "epoch": 1.569015914257876, + "grad_norm": 2.7457587718963623, + "learning_rate": 4.236573959529593e-06, + "loss": 0.603, + "step": 3221 + }, + { + "epoch": 1.5695030854173433, + "grad_norm": 2.7832508087158203, + "learning_rate": 4.236111250589061e-06, + "loss": 0.5586, + "step": 3222 + }, + { + "epoch": 1.5699902565768107, + "grad_norm": 3.0336315631866455, + "learning_rate": 4.235648426752849e-06, + "loss": 0.7227, + "step": 3223 + }, + { + "epoch": 1.5704774277362779, + "grad_norm": 2.720015048980713, + "learning_rate": 4.2351854880515856e-06, + "loss": 0.5778, + "step": 3224 + }, + { + "epoch": 1.5709645988957455, + "grad_norm": 2.8658483028411865, + "learning_rate": 4.234722434515908e-06, + "loss": 0.5589, + "step": 3225 + }, + { + "epoch": 1.5714517700552126, + "grad_norm": 2.4310643672943115, + "learning_rate": 4.23425926617646e-06, + "loss": 0.5619, + "step": 3226 + }, + { + "epoch": 1.5719389412146803, + "grad_norm": 3.030565023422241, + "learning_rate": 4.233795983063897e-06, + "loss": 0.5897, + "step": 3227 + }, + { + "epoch": 1.5724261123741474, + "grad_norm": 3.0609638690948486, + "learning_rate": 4.2333325852088755e-06, + "loss": 0.5928, + "step": 3228 + }, + { + "epoch": 1.5729132835336148, + "grad_norm": 2.683995485305786, + "learning_rate": 4.2328690726420664e-06, + "loss": 0.5278, + "step": 3229 + }, + { + "epoch": 1.5734004546930822, + "grad_norm": 2.6248888969421387, + "learning_rate": 4.2324054453941436e-06, + "loss": 0.6203, + "step": 3230 + }, + { + "epoch": 1.5738876258525494, + "grad_norm": 2.6199002265930176, + "learning_rate": 4.231941703495789e-06, + "loss": 0.489, + "step": 3231 + }, + { + "epoch": 1.574374797012017, + "grad_norm": 2.817603588104248, + "learning_rate": 4.231477846977694e-06, + "loss": 0.5644, + "step": 3232 + }, + { + "epoch": 1.5748619681714842, + "grad_norm": 2.7849924564361572, + "learning_rate": 4.231013875870556e-06, + "loss": 0.5799, + "step": 3233 + }, + { + "epoch": 1.5753491393309516, + "grad_norm": 3.1894845962524414, + "learning_rate": 4.230549790205081e-06, + "loss": 0.5914, + "step": 3234 + }, + { + "epoch": 1.575836310490419, + "grad_norm": 2.833083391189575, + "learning_rate": 4.230085590011982e-06, + "loss": 0.6271, + "step": 3235 + }, + { + "epoch": 1.5763234816498863, + "grad_norm": 2.855534553527832, + "learning_rate": 4.22962127532198e-06, + "loss": 0.6005, + "step": 3236 + }, + { + "epoch": 1.5768106528093537, + "grad_norm": 2.949514865875244, + "learning_rate": 4.229156846165804e-06, + "loss": 0.5726, + "step": 3237 + }, + { + "epoch": 1.577297823968821, + "grad_norm": 3.024627685546875, + "learning_rate": 4.228692302574188e-06, + "loss": 0.5986, + "step": 3238 + }, + { + "epoch": 1.5777849951282885, + "grad_norm": 2.5939385890960693, + "learning_rate": 4.228227644577876e-06, + "loss": 0.5727, + "step": 3239 + }, + { + "epoch": 1.5782721662877557, + "grad_norm": 2.839953660964966, + "learning_rate": 4.227762872207619e-06, + "loss": 0.6317, + "step": 3240 + }, + { + "epoch": 1.578759337447223, + "grad_norm": 2.786339044570923, + "learning_rate": 4.227297985494178e-06, + "loss": 0.5139, + "step": 3241 + }, + { + "epoch": 1.5792465086066905, + "grad_norm": 2.823707103729248, + "learning_rate": 4.226832984468315e-06, + "loss": 0.6229, + "step": 3242 + }, + { + "epoch": 1.5797336797661579, + "grad_norm": 3.2037763595581055, + "learning_rate": 4.226367869160807e-06, + "loss": 0.5766, + "step": 3243 + }, + { + "epoch": 1.5802208509256253, + "grad_norm": 2.7368850708007812, + "learning_rate": 4.225902639602434e-06, + "loss": 0.5732, + "step": 3244 + }, + { + "epoch": 1.5807080220850924, + "grad_norm": 2.663905620574951, + "learning_rate": 4.225437295823986e-06, + "loss": 0.5762, + "step": 3245 + }, + { + "epoch": 1.58119519324456, + "grad_norm": 2.8529956340789795, + "learning_rate": 4.224971837856257e-06, + "loss": 0.5682, + "step": 3246 + }, + { + "epoch": 1.5816823644040272, + "grad_norm": 2.9056789875030518, + "learning_rate": 4.224506265730052e-06, + "loss": 0.5998, + "step": 3247 + }, + { + "epoch": 1.5821695355634946, + "grad_norm": 2.408174753189087, + "learning_rate": 4.224040579476184e-06, + "loss": 0.6219, + "step": 3248 + }, + { + "epoch": 1.582656706722962, + "grad_norm": 3.0200865268707275, + "learning_rate": 4.223574779125471e-06, + "loss": 0.5036, + "step": 3249 + }, + { + "epoch": 1.5831438778824294, + "grad_norm": 2.661900520324707, + "learning_rate": 4.223108864708739e-06, + "loss": 0.5144, + "step": 3250 + }, + { + "epoch": 1.5836310490418968, + "grad_norm": 2.5596203804016113, + "learning_rate": 4.222642836256824e-06, + "loss": 0.5892, + "step": 3251 + }, + { + "epoch": 1.584118220201364, + "grad_norm": 2.7041163444519043, + "learning_rate": 4.222176693800565e-06, + "loss": 0.5373, + "step": 3252 + }, + { + "epoch": 1.5846053913608316, + "grad_norm": 2.3467204570770264, + "learning_rate": 4.2217104373708135e-06, + "loss": 0.4844, + "step": 3253 + }, + { + "epoch": 1.5850925625202987, + "grad_norm": 9.276105880737305, + "learning_rate": 4.221244066998426e-06, + "loss": 0.5124, + "step": 3254 + }, + { + "epoch": 1.5855797336797661, + "grad_norm": 2.652268886566162, + "learning_rate": 4.220777582714265e-06, + "loss": 0.568, + "step": 3255 + }, + { + "epoch": 1.5860669048392335, + "grad_norm": 2.5670669078826904, + "learning_rate": 4.220310984549205e-06, + "loss": 0.5321, + "step": 3256 + }, + { + "epoch": 1.586554075998701, + "grad_norm": 2.775440216064453, + "learning_rate": 4.219844272534124e-06, + "loss": 0.5138, + "step": 3257 + }, + { + "epoch": 1.5870412471581683, + "grad_norm": 2.6613101959228516, + "learning_rate": 4.21937744669991e-06, + "loss": 0.5853, + "step": 3258 + }, + { + "epoch": 1.5875284183176355, + "grad_norm": 2.5880744457244873, + "learning_rate": 4.218910507077456e-06, + "loss": 0.5692, + "step": 3259 + }, + { + "epoch": 1.588015589477103, + "grad_norm": 2.550255298614502, + "learning_rate": 4.218443453697664e-06, + "loss": 0.4892, + "step": 3260 + }, + { + "epoch": 1.5885027606365703, + "grad_norm": 3.8956801891326904, + "learning_rate": 4.217976286591445e-06, + "loss": 0.5545, + "step": 3261 + }, + { + "epoch": 1.5889899317960376, + "grad_norm": 2.4626965522766113, + "learning_rate": 4.217509005789715e-06, + "loss": 0.5813, + "step": 3262 + }, + { + "epoch": 1.589477102955505, + "grad_norm": 2.8192484378814697, + "learning_rate": 4.217041611323399e-06, + "loss": 0.5356, + "step": 3263 + }, + { + "epoch": 1.5899642741149724, + "grad_norm": 2.7792978286743164, + "learning_rate": 4.21657410322343e-06, + "loss": 0.5661, + "step": 3264 + }, + { + "epoch": 1.5904514452744398, + "grad_norm": 2.899378538131714, + "learning_rate": 4.216106481520745e-06, + "loss": 0.6018, + "step": 3265 + }, + { + "epoch": 1.590938616433907, + "grad_norm": 2.7237794399261475, + "learning_rate": 4.215638746246295e-06, + "loss": 0.4973, + "step": 3266 + }, + { + "epoch": 1.5914257875933746, + "grad_norm": 2.663090944290161, + "learning_rate": 4.21517089743103e-06, + "loss": 0.5796, + "step": 3267 + }, + { + "epoch": 1.5919129587528418, + "grad_norm": 3.0040125846862793, + "learning_rate": 4.214702935105916e-06, + "loss": 0.547, + "step": 3268 + }, + { + "epoch": 1.5924001299123092, + "grad_norm": 2.776867389678955, + "learning_rate": 4.214234859301921e-06, + "loss": 0.563, + "step": 3269 + }, + { + "epoch": 1.5928873010717766, + "grad_norm": 2.6690456867218018, + "learning_rate": 4.213766670050024e-06, + "loss": 0.6033, + "step": 3270 + }, + { + "epoch": 1.593374472231244, + "grad_norm": 2.8002655506134033, + "learning_rate": 4.213298367381207e-06, + "loss": 0.5644, + "step": 3271 + }, + { + "epoch": 1.5938616433907113, + "grad_norm": 3.041267156600952, + "learning_rate": 4.212829951326464e-06, + "loss": 0.5803, + "step": 3272 + }, + { + "epoch": 1.5943488145501785, + "grad_norm": 3.147545337677002, + "learning_rate": 4.212361421916794e-06, + "loss": 0.5259, + "step": 3273 + }, + { + "epoch": 1.5948359857096461, + "grad_norm": 2.924354314804077, + "learning_rate": 4.211892779183205e-06, + "loss": 0.5882, + "step": 3274 + }, + { + "epoch": 1.5953231568691133, + "grad_norm": 2.633927583694458, + "learning_rate": 4.211424023156712e-06, + "loss": 0.5192, + "step": 3275 + }, + { + "epoch": 1.5958103280285807, + "grad_norm": 2.631333351135254, + "learning_rate": 4.210955153868335e-06, + "loss": 0.6019, + "step": 3276 + }, + { + "epoch": 1.596297499188048, + "grad_norm": 3.127305507659912, + "learning_rate": 4.210486171349106e-06, + "loss": 0.6023, + "step": 3277 + }, + { + "epoch": 1.5967846703475155, + "grad_norm": 2.845425605773926, + "learning_rate": 4.210017075630062e-06, + "loss": 0.6224, + "step": 3278 + }, + { + "epoch": 1.5972718415069829, + "grad_norm": 3.131885528564453, + "learning_rate": 4.2095478667422455e-06, + "loss": 0.6235, + "step": 3279 + }, + { + "epoch": 1.59775901266645, + "grad_norm": 2.678417921066284, + "learning_rate": 4.209078544716711e-06, + "loss": 0.5994, + "step": 3280 + }, + { + "epoch": 1.5982461838259177, + "grad_norm": 2.977229118347168, + "learning_rate": 4.2086091095845185e-06, + "loss": 0.6096, + "step": 3281 + }, + { + "epoch": 1.5987333549853848, + "grad_norm": 3.0648295879364014, + "learning_rate": 4.208139561376734e-06, + "loss": 0.5807, + "step": 3282 + }, + { + "epoch": 1.5992205261448522, + "grad_norm": 2.5693423748016357, + "learning_rate": 4.2076699001244326e-06, + "loss": 0.5009, + "step": 3283 + }, + { + "epoch": 1.5997076973043196, + "grad_norm": 2.6483895778656006, + "learning_rate": 4.207200125858696e-06, + "loss": 0.5712, + "step": 3284 + }, + { + "epoch": 1.6001948684637868, + "grad_norm": 2.7810187339782715, + "learning_rate": 4.206730238610613e-06, + "loss": 0.5712, + "step": 3285 + }, + { + "epoch": 1.6006820396232544, + "grad_norm": 2.8115994930267334, + "learning_rate": 4.206260238411283e-06, + "loss": 0.5709, + "step": 3286 + }, + { + "epoch": 1.6011692107827216, + "grad_norm": 2.833097457885742, + "learning_rate": 4.205790125291808e-06, + "loss": 0.575, + "step": 3287 + }, + { + "epoch": 1.6016563819421892, + "grad_norm": 2.7154653072357178, + "learning_rate": 4.2053198992833025e-06, + "loss": 0.581, + "step": 3288 + }, + { + "epoch": 1.6021435531016563, + "grad_norm": 2.7606492042541504, + "learning_rate": 4.204849560416885e-06, + "loss": 0.5405, + "step": 3289 + }, + { + "epoch": 1.6026307242611237, + "grad_norm": 2.7944493293762207, + "learning_rate": 4.204379108723682e-06, + "loss": 0.5844, + "step": 3290 + }, + { + "epoch": 1.6031178954205911, + "grad_norm": 2.435424327850342, + "learning_rate": 4.203908544234827e-06, + "loss": 0.5491, + "step": 3291 + }, + { + "epoch": 1.6036050665800583, + "grad_norm": 2.873805046081543, + "learning_rate": 4.203437866981465e-06, + "loss": 0.5322, + "step": 3292 + }, + { + "epoch": 1.604092237739526, + "grad_norm": 4.130641460418701, + "learning_rate": 4.202967076994743e-06, + "loss": 0.5687, + "step": 3293 + }, + { + "epoch": 1.604579408898993, + "grad_norm": 2.928755044937134, + "learning_rate": 4.2024961743058194e-06, + "loss": 0.6095, + "step": 3294 + }, + { + "epoch": 1.6050665800584607, + "grad_norm": 3.1397104263305664, + "learning_rate": 4.202025158945855e-06, + "loss": 0.5012, + "step": 3295 + }, + { + "epoch": 1.6055537512179279, + "grad_norm": 2.5146965980529785, + "learning_rate": 4.2015540309460266e-06, + "loss": 0.5615, + "step": 3296 + }, + { + "epoch": 1.6060409223773953, + "grad_norm": 2.64487624168396, + "learning_rate": 4.2010827903375095e-06, + "loss": 0.5309, + "step": 3297 + }, + { + "epoch": 1.6065280935368627, + "grad_norm": 2.573643922805786, + "learning_rate": 4.200611437151493e-06, + "loss": 0.5871, + "step": 3298 + }, + { + "epoch": 1.6070152646963298, + "grad_norm": 2.884523630142212, + "learning_rate": 4.200139971419168e-06, + "loss": 0.4998, + "step": 3299 + }, + { + "epoch": 1.6075024358557974, + "grad_norm": 2.7134389877319336, + "learning_rate": 4.19966839317174e-06, + "loss": 0.6535, + "step": 3300 + }, + { + "epoch": 1.6079896070152646, + "grad_norm": 2.9971165657043457, + "learning_rate": 4.1991967024404144e-06, + "loss": 0.6123, + "step": 3301 + }, + { + "epoch": 1.608476778174732, + "grad_norm": 2.972841739654541, + "learning_rate": 4.19872489925641e-06, + "loss": 0.5574, + "step": 3302 + }, + { + "epoch": 1.6089639493341994, + "grad_norm": 2.6789512634277344, + "learning_rate": 4.19825298365095e-06, + "loss": 0.5662, + "step": 3303 + }, + { + "epoch": 1.6094511204936668, + "grad_norm": 2.8997507095336914, + "learning_rate": 4.197780955655266e-06, + "loss": 0.5976, + "step": 3304 + }, + { + "epoch": 1.6099382916531342, + "grad_norm": 2.5930252075195312, + "learning_rate": 4.197308815300596e-06, + "loss": 0.584, + "step": 3305 + }, + { + "epoch": 1.6104254628126013, + "grad_norm": 2.9115936756134033, + "learning_rate": 4.196836562618186e-06, + "loss": 0.6113, + "step": 3306 + }, + { + "epoch": 1.610912633972069, + "grad_norm": 2.603524923324585, + "learning_rate": 4.196364197639291e-06, + "loss": 0.6547, + "step": 3307 + }, + { + "epoch": 1.6113998051315361, + "grad_norm": 2.838644027709961, + "learning_rate": 4.195891720395172e-06, + "loss": 0.5831, + "step": 3308 + }, + { + "epoch": 1.6118869762910035, + "grad_norm": 2.3865370750427246, + "learning_rate": 4.195419130917096e-06, + "loss": 0.5641, + "step": 3309 + }, + { + "epoch": 1.612374147450471, + "grad_norm": 2.5818798542022705, + "learning_rate": 4.194946429236339e-06, + "loss": 0.5507, + "step": 3310 + }, + { + "epoch": 1.6128613186099383, + "grad_norm": 2.6210646629333496, + "learning_rate": 4.194473615384187e-06, + "loss": 0.5567, + "step": 3311 + }, + { + "epoch": 1.6133484897694057, + "grad_norm": 2.867835283279419, + "learning_rate": 4.194000689391928e-06, + "loss": 0.5666, + "step": 3312 + }, + { + "epoch": 1.6138356609288729, + "grad_norm": 2.8483073711395264, + "learning_rate": 4.193527651290862e-06, + "loss": 0.5694, + "step": 3313 + }, + { + "epoch": 1.6143228320883405, + "grad_norm": 2.9525628089904785, + "learning_rate": 4.193054501112293e-06, + "loss": 0.6648, + "step": 3314 + }, + { + "epoch": 1.6148100032478077, + "grad_norm": 3.064037799835205, + "learning_rate": 4.192581238887534e-06, + "loss": 0.587, + "step": 3315 + }, + { + "epoch": 1.615297174407275, + "grad_norm": 2.8851442337036133, + "learning_rate": 4.192107864647907e-06, + "loss": 0.5584, + "step": 3316 + }, + { + "epoch": 1.6157843455667424, + "grad_norm": 2.527228355407715, + "learning_rate": 4.191634378424739e-06, + "loss": 0.5306, + "step": 3317 + }, + { + "epoch": 1.6162715167262098, + "grad_norm": 2.6249191761016846, + "learning_rate": 4.191160780249365e-06, + "loss": 0.6075, + "step": 3318 + }, + { + "epoch": 1.6167586878856772, + "grad_norm": 2.6993727684020996, + "learning_rate": 4.190687070153129e-06, + "loss": 0.5151, + "step": 3319 + }, + { + "epoch": 1.6172458590451444, + "grad_norm": 2.6867518424987793, + "learning_rate": 4.1902132481673795e-06, + "loss": 0.6049, + "step": 3320 + }, + { + "epoch": 1.617733030204612, + "grad_norm": 2.993861198425293, + "learning_rate": 4.189739314323475e-06, + "loss": 0.4925, + "step": 3321 + }, + { + "epoch": 1.6182202013640792, + "grad_norm": 2.474911689758301, + "learning_rate": 4.189265268652779e-06, + "loss": 0.5521, + "step": 3322 + }, + { + "epoch": 1.6187073725235466, + "grad_norm": 5.671990871429443, + "learning_rate": 4.188791111186667e-06, + "loss": 0.5646, + "step": 3323 + }, + { + "epoch": 1.619194543683014, + "grad_norm": 2.801070213317871, + "learning_rate": 4.188316841956514e-06, + "loss": 0.5576, + "step": 3324 + }, + { + "epoch": 1.6196817148424814, + "grad_norm": 2.7575573921203613, + "learning_rate": 4.1878424609937105e-06, + "loss": 0.5728, + "step": 3325 + }, + { + "epoch": 1.6201688860019487, + "grad_norm": 2.678051233291626, + "learning_rate": 4.187367968329651e-06, + "loss": 0.5521, + "step": 3326 + }, + { + "epoch": 1.620656057161416, + "grad_norm": 2.9211230278015137, + "learning_rate": 4.186893363995736e-06, + "loss": 0.5591, + "step": 3327 + }, + { + "epoch": 1.6211432283208835, + "grad_norm": 2.5851290225982666, + "learning_rate": 4.186418648023374e-06, + "loss": 0.5387, + "step": 3328 + }, + { + "epoch": 1.6216303994803507, + "grad_norm": 2.7441940307617188, + "learning_rate": 4.185943820443984e-06, + "loss": 0.5353, + "step": 3329 + }, + { + "epoch": 1.622117570639818, + "grad_norm": 2.9284749031066895, + "learning_rate": 4.185468881288989e-06, + "loss": 0.5868, + "step": 3330 + }, + { + "epoch": 1.6226047417992855, + "grad_norm": 2.5898826122283936, + "learning_rate": 4.184993830589821e-06, + "loss": 0.5242, + "step": 3331 + }, + { + "epoch": 1.6230919129587529, + "grad_norm": 2.6005148887634277, + "learning_rate": 4.184518668377917e-06, + "loss": 0.5612, + "step": 3332 + }, + { + "epoch": 1.6235790841182203, + "grad_norm": 2.7945094108581543, + "learning_rate": 4.184043394684725e-06, + "loss": 0.6251, + "step": 3333 + }, + { + "epoch": 1.6240662552776874, + "grad_norm": 2.864809274673462, + "learning_rate": 4.1835680095416974e-06, + "loss": 0.5509, + "step": 3334 + }, + { + "epoch": 1.624553426437155, + "grad_norm": 2.601548671722412, + "learning_rate": 4.183092512980297e-06, + "loss": 0.5599, + "step": 3335 + }, + { + "epoch": 1.6250405975966222, + "grad_norm": 2.629581928253174, + "learning_rate": 4.18261690503199e-06, + "loss": 0.5417, + "step": 3336 + }, + { + "epoch": 1.6255277687560896, + "grad_norm": 2.8625199794769287, + "learning_rate": 4.182141185728253e-06, + "loss": 0.5318, + "step": 3337 + }, + { + "epoch": 1.626014939915557, + "grad_norm": 2.4364137649536133, + "learning_rate": 4.181665355100568e-06, + "loss": 0.4868, + "step": 3338 + }, + { + "epoch": 1.6265021110750244, + "grad_norm": 2.5217819213867188, + "learning_rate": 4.181189413180428e-06, + "loss": 0.5215, + "step": 3339 + }, + { + "epoch": 1.6269892822344918, + "grad_norm": 2.546539068222046, + "learning_rate": 4.180713359999328e-06, + "loss": 0.5764, + "step": 3340 + }, + { + "epoch": 1.627476453393959, + "grad_norm": 2.64532470703125, + "learning_rate": 4.180237195588774e-06, + "loss": 0.6116, + "step": 3341 + }, + { + "epoch": 1.6279636245534266, + "grad_norm": 2.6714344024658203, + "learning_rate": 4.17976091998028e-06, + "loss": 0.5553, + "step": 3342 + }, + { + "epoch": 1.6284507957128938, + "grad_norm": 2.57908034324646, + "learning_rate": 4.1792845332053645e-06, + "loss": 0.5395, + "step": 3343 + }, + { + "epoch": 1.6289379668723611, + "grad_norm": 2.5601069927215576, + "learning_rate": 4.178808035295555e-06, + "loss": 0.5171, + "step": 3344 + }, + { + "epoch": 1.6294251380318285, + "grad_norm": 3.078604221343994, + "learning_rate": 4.178331426282386e-06, + "loss": 0.5982, + "step": 3345 + }, + { + "epoch": 1.629912309191296, + "grad_norm": 3.1431527137756348, + "learning_rate": 4.177854706197399e-06, + "loss": 0.5992, + "step": 3346 + }, + { + "epoch": 1.6303994803507633, + "grad_norm": 2.7161014080047607, + "learning_rate": 4.177377875072145e-06, + "loss": 0.458, + "step": 3347 + }, + { + "epoch": 1.6308866515102305, + "grad_norm": 2.636216878890991, + "learning_rate": 4.176900932938178e-06, + "loss": 0.556, + "step": 3348 + }, + { + "epoch": 1.631373822669698, + "grad_norm": 2.559230327606201, + "learning_rate": 4.1764238798270636e-06, + "loss": 0.6446, + "step": 3349 + }, + { + "epoch": 1.6318609938291653, + "grad_norm": 2.9695942401885986, + "learning_rate": 4.175946715770373e-06, + "loss": 0.5461, + "step": 3350 + }, + { + "epoch": 1.6323481649886327, + "grad_norm": 2.832136392593384, + "learning_rate": 4.175469440799686e-06, + "loss": 0.5705, + "step": 3351 + }, + { + "epoch": 1.6328353361481, + "grad_norm": 2.483391523361206, + "learning_rate": 4.1749920549465864e-06, + "loss": 0.5689, + "step": 3352 + }, + { + "epoch": 1.6333225073075672, + "grad_norm": 2.6945841312408447, + "learning_rate": 4.174514558242669e-06, + "loss": 0.5595, + "step": 3353 + }, + { + "epoch": 1.6338096784670348, + "grad_norm": 2.7737228870391846, + "learning_rate": 4.174036950719533e-06, + "loss": 0.5183, + "step": 3354 + }, + { + "epoch": 1.634296849626502, + "grad_norm": 2.567755937576294, + "learning_rate": 4.17355923240879e-06, + "loss": 0.5116, + "step": 3355 + }, + { + "epoch": 1.6347840207859696, + "grad_norm": 3.0059571266174316, + "learning_rate": 4.17308140334205e-06, + "loss": 0.6051, + "step": 3356 + }, + { + "epoch": 1.6352711919454368, + "grad_norm": 2.8292553424835205, + "learning_rate": 4.17260346355094e-06, + "loss": 0.6582, + "step": 3357 + }, + { + "epoch": 1.6357583631049042, + "grad_norm": 2.9498016834259033, + "learning_rate": 4.1721254130670876e-06, + "loss": 0.4696, + "step": 3358 + }, + { + "epoch": 1.6362455342643716, + "grad_norm": 2.5704987049102783, + "learning_rate": 4.171647251922132e-06, + "loss": 0.6549, + "step": 3359 + }, + { + "epoch": 1.6367327054238388, + "grad_norm": 2.9304192066192627, + "learning_rate": 4.171168980147717e-06, + "loss": 0.5224, + "step": 3360 + }, + { + "epoch": 1.6372198765833064, + "grad_norm": 2.7806193828582764, + "learning_rate": 4.1706905977754945e-06, + "loss": 0.5293, + "step": 3361 + }, + { + "epoch": 1.6377070477427735, + "grad_norm": 2.7159295082092285, + "learning_rate": 4.170212104837123e-06, + "loss": 0.6094, + "step": 3362 + }, + { + "epoch": 1.6381942189022412, + "grad_norm": 2.564626693725586, + "learning_rate": 4.16973350136427e-06, + "loss": 0.5906, + "step": 3363 + }, + { + "epoch": 1.6386813900617083, + "grad_norm": 2.7731282711029053, + "learning_rate": 4.16925478738861e-06, + "loss": 0.591, + "step": 3364 + }, + { + "epoch": 1.6391685612211757, + "grad_norm": 2.6886796951293945, + "learning_rate": 4.168775962941823e-06, + "loss": 0.5905, + "step": 3365 + }, + { + "epoch": 1.639655732380643, + "grad_norm": 2.76434326171875, + "learning_rate": 4.168297028055599e-06, + "loss": 0.5574, + "step": 3366 + }, + { + "epoch": 1.6401429035401103, + "grad_norm": 2.8353686332702637, + "learning_rate": 4.167817982761633e-06, + "loss": 0.6602, + "step": 3367 + }, + { + "epoch": 1.640630074699578, + "grad_norm": 2.8459970951080322, + "learning_rate": 4.167338827091627e-06, + "loss": 0.6171, + "step": 3368 + }, + { + "epoch": 1.641117245859045, + "grad_norm": 3.0377402305603027, + "learning_rate": 4.1668595610772935e-06, + "loss": 0.5669, + "step": 3369 + }, + { + "epoch": 1.6416044170185125, + "grad_norm": 2.683342456817627, + "learning_rate": 4.166380184750349e-06, + "loss": 0.6239, + "step": 3370 + }, + { + "epoch": 1.6420915881779798, + "grad_norm": 2.7734808921813965, + "learning_rate": 4.16590069814252e-06, + "loss": 0.5557, + "step": 3371 + }, + { + "epoch": 1.6425787593374472, + "grad_norm": 3.255354404449463, + "learning_rate": 4.1654211012855375e-06, + "loss": 0.6674, + "step": 3372 + }, + { + "epoch": 1.6430659304969146, + "grad_norm": 2.835949420928955, + "learning_rate": 4.164941394211141e-06, + "loss": 0.5511, + "step": 3373 + }, + { + "epoch": 1.6435531016563818, + "grad_norm": 2.715554714202881, + "learning_rate": 4.164461576951079e-06, + "loss": 0.5505, + "step": 3374 + }, + { + "epoch": 1.6440402728158494, + "grad_norm": 3.0140764713287354, + "learning_rate": 4.1639816495371045e-06, + "loss": 0.5708, + "step": 3375 + }, + { + "epoch": 1.6445274439753166, + "grad_norm": 3.0127716064453125, + "learning_rate": 4.1635016120009795e-06, + "loss": 0.6402, + "step": 3376 + }, + { + "epoch": 1.645014615134784, + "grad_norm": 2.9547345638275146, + "learning_rate": 4.163021464374472e-06, + "loss": 0.5814, + "step": 3377 + }, + { + "epoch": 1.6455017862942514, + "grad_norm": 2.6071574687957764, + "learning_rate": 4.16254120668936e-06, + "loss": 0.6067, + "step": 3378 + }, + { + "epoch": 1.6459889574537188, + "grad_norm": 2.9443747997283936, + "learning_rate": 4.162060838977425e-06, + "loss": 0.6146, + "step": 3379 + }, + { + "epoch": 1.6464761286131862, + "grad_norm": 2.8236746788024902, + "learning_rate": 4.161580361270459e-06, + "loss": 0.5485, + "step": 3380 + }, + { + "epoch": 1.6469632997726533, + "grad_norm": 2.522688627243042, + "learning_rate": 4.161099773600258e-06, + "loss": 0.5326, + "step": 3381 + }, + { + "epoch": 1.647450470932121, + "grad_norm": 2.6465022563934326, + "learning_rate": 4.16061907599863e-06, + "loss": 0.5723, + "step": 3382 + }, + { + "epoch": 1.647937642091588, + "grad_norm": 3.1461374759674072, + "learning_rate": 4.160138268497386e-06, + "loss": 0.5963, + "step": 3383 + }, + { + "epoch": 1.6484248132510555, + "grad_norm": 2.9780805110931396, + "learning_rate": 4.159657351128346e-06, + "loss": 0.5772, + "step": 3384 + }, + { + "epoch": 1.648911984410523, + "grad_norm": 2.934481620788574, + "learning_rate": 4.159176323923336e-06, + "loss": 0.4949, + "step": 3385 + }, + { + "epoch": 1.6493991555699903, + "grad_norm": 2.5095863342285156, + "learning_rate": 4.158695186914193e-06, + "loss": 0.5683, + "step": 3386 + }, + { + "epoch": 1.6498863267294577, + "grad_norm": 2.559163808822632, + "learning_rate": 4.158213940132756e-06, + "loss": 0.5134, + "step": 3387 + }, + { + "epoch": 1.6503734978889248, + "grad_norm": 2.4394471645355225, + "learning_rate": 4.157732583610875e-06, + "loss": 0.6101, + "step": 3388 + }, + { + "epoch": 1.6508606690483925, + "grad_norm": 2.393348455429077, + "learning_rate": 4.1572511173804055e-06, + "loss": 0.5944, + "step": 3389 + }, + { + "epoch": 1.6513478402078596, + "grad_norm": 2.895052194595337, + "learning_rate": 4.156769541473212e-06, + "loss": 0.6153, + "step": 3390 + }, + { + "epoch": 1.651835011367327, + "grad_norm": 2.906785726547241, + "learning_rate": 4.156287855921165e-06, + "loss": 0.6202, + "step": 3391 + }, + { + "epoch": 1.6523221825267944, + "grad_norm": 2.744677782058716, + "learning_rate": 4.155806060756141e-06, + "loss": 0.6715, + "step": 3392 + }, + { + "epoch": 1.6528093536862618, + "grad_norm": 3.020148754119873, + "learning_rate": 4.155324156010027e-06, + "loss": 0.5668, + "step": 3393 + }, + { + "epoch": 1.6532965248457292, + "grad_norm": 2.7424566745758057, + "learning_rate": 4.154842141714714e-06, + "loss": 0.5637, + "step": 3394 + }, + { + "epoch": 1.6537836960051964, + "grad_norm": 2.7196364402770996, + "learning_rate": 4.154360017902102e-06, + "loss": 0.5993, + "step": 3395 + }, + { + "epoch": 1.654270867164664, + "grad_norm": 2.9526455402374268, + "learning_rate": 4.153877784604098e-06, + "loss": 0.5629, + "step": 3396 + }, + { + "epoch": 1.6547580383241312, + "grad_norm": 2.6582565307617188, + "learning_rate": 4.153395441852616e-06, + "loss": 0.629, + "step": 3397 + }, + { + "epoch": 1.6552452094835985, + "grad_norm": 2.9556033611297607, + "learning_rate": 4.152912989679579e-06, + "loss": 0.5198, + "step": 3398 + }, + { + "epoch": 1.655732380643066, + "grad_norm": 2.667788028717041, + "learning_rate": 4.152430428116913e-06, + "loss": 0.6584, + "step": 3399 + }, + { + "epoch": 1.6562195518025333, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.151947757196555e-06, + "loss": 0.6059, + "step": 3400 + }, + { + "epoch": 1.6567067229620007, + "grad_norm": 2.9306116104125977, + "learning_rate": 4.151464976950449e-06, + "loss": 0.5118, + "step": 3401 + }, + { + "epoch": 1.657193894121468, + "grad_norm": 2.810116767883301, + "learning_rate": 4.150982087410545e-06, + "loss": 0.5842, + "step": 3402 + }, + { + "epoch": 1.6576810652809355, + "grad_norm": 2.768503427505493, + "learning_rate": 4.150499088608799e-06, + "loss": 0.5594, + "step": 3403 + }, + { + "epoch": 1.6581682364404027, + "grad_norm": 3.5141708850860596, + "learning_rate": 4.150015980577178e-06, + "loss": 0.5973, + "step": 3404 + }, + { + "epoch": 1.65865540759987, + "grad_norm": 2.8957207202911377, + "learning_rate": 4.149532763347652e-06, + "loss": 0.5792, + "step": 3405 + }, + { + "epoch": 1.6591425787593375, + "grad_norm": 2.60613751411438, + "learning_rate": 4.149049436952202e-06, + "loss": 0.6003, + "step": 3406 + }, + { + "epoch": 1.6596297499188049, + "grad_norm": 2.724092960357666, + "learning_rate": 4.148566001422814e-06, + "loss": 0.5078, + "step": 3407 + }, + { + "epoch": 1.6601169210782722, + "grad_norm": 2.723714590072632, + "learning_rate": 4.148082456791481e-06, + "loss": 0.5886, + "step": 3408 + }, + { + "epoch": 1.6606040922377394, + "grad_norm": 2.569852352142334, + "learning_rate": 4.147598803090205e-06, + "loss": 0.5119, + "step": 3409 + }, + { + "epoch": 1.661091263397207, + "grad_norm": 2.895564079284668, + "learning_rate": 4.147115040350993e-06, + "loss": 0.6552, + "step": 3410 + }, + { + "epoch": 1.6615784345566742, + "grad_norm": 2.842747926712036, + "learning_rate": 4.146631168605862e-06, + "loss": 0.6212, + "step": 3411 + }, + { + "epoch": 1.6620656057161416, + "grad_norm": 2.5898427963256836, + "learning_rate": 4.1461471878868335e-06, + "loss": 0.5251, + "step": 3412 + }, + { + "epoch": 1.662552776875609, + "grad_norm": 2.5918283462524414, + "learning_rate": 4.1456630982259364e-06, + "loss": 0.5451, + "step": 3413 + }, + { + "epoch": 1.6630399480350764, + "grad_norm": 2.8487656116485596, + "learning_rate": 4.145178899655209e-06, + "loss": 0.5718, + "step": 3414 + }, + { + "epoch": 1.6635271191945438, + "grad_norm": 2.4873006343841553, + "learning_rate": 4.144694592206696e-06, + "loss": 0.6088, + "step": 3415 + }, + { + "epoch": 1.664014290354011, + "grad_norm": 2.7987208366394043, + "learning_rate": 4.144210175912447e-06, + "loss": 0.6556, + "step": 3416 + }, + { + "epoch": 1.6645014615134786, + "grad_norm": 2.766481876373291, + "learning_rate": 4.143725650804522e-06, + "loss": 0.5444, + "step": 3417 + }, + { + "epoch": 1.6649886326729457, + "grad_norm": 2.570225238800049, + "learning_rate": 4.143241016914986e-06, + "loss": 0.5014, + "step": 3418 + }, + { + "epoch": 1.6654758038324131, + "grad_norm": 2.94392991065979, + "learning_rate": 4.142756274275914e-06, + "loss": 0.5425, + "step": 3419 + }, + { + "epoch": 1.6659629749918805, + "grad_norm": 2.3863625526428223, + "learning_rate": 4.142271422919384e-06, + "loss": 0.585, + "step": 3420 + }, + { + "epoch": 1.6664501461513477, + "grad_norm": 2.5245611667633057, + "learning_rate": 4.141786462877484e-06, + "loss": 0.5523, + "step": 3421 + }, + { + "epoch": 1.6669373173108153, + "grad_norm": 2.493165969848633, + "learning_rate": 4.141301394182309e-06, + "loss": 0.5182, + "step": 3422 + }, + { + "epoch": 1.6674244884702825, + "grad_norm": 2.6159884929656982, + "learning_rate": 4.1408162168659605e-06, + "loss": 0.6314, + "step": 3423 + }, + { + "epoch": 1.66791165962975, + "grad_norm": 2.7267534732818604, + "learning_rate": 4.1403309309605465e-06, + "loss": 0.6079, + "step": 3424 + }, + { + "epoch": 1.6683988307892172, + "grad_norm": 2.791677951812744, + "learning_rate": 4.139845536498186e-06, + "loss": 0.5618, + "step": 3425 + }, + { + "epoch": 1.6688860019486846, + "grad_norm": 2.819544792175293, + "learning_rate": 4.139360033511e-06, + "loss": 0.5768, + "step": 3426 + }, + { + "epoch": 1.669373173108152, + "grad_norm": 2.782263994216919, + "learning_rate": 4.138874422031119e-06, + "loss": 0.6015, + "step": 3427 + }, + { + "epoch": 1.6698603442676192, + "grad_norm": 3.2371370792388916, + "learning_rate": 4.138388702090682e-06, + "loss": 0.5957, + "step": 3428 + }, + { + "epoch": 1.6703475154270868, + "grad_norm": 2.6818947792053223, + "learning_rate": 4.137902873721833e-06, + "loss": 0.6599, + "step": 3429 + }, + { + "epoch": 1.670834686586554, + "grad_norm": 2.9291129112243652, + "learning_rate": 4.137416936956724e-06, + "loss": 0.5309, + "step": 3430 + }, + { + "epoch": 1.6713218577460216, + "grad_norm": 3.579249620437622, + "learning_rate": 4.136930891827515e-06, + "loss": 0.6188, + "step": 3431 + }, + { + "epoch": 1.6718090289054888, + "grad_norm": 3.0972745418548584, + "learning_rate": 4.136444738366372e-06, + "loss": 0.5691, + "step": 3432 + }, + { + "epoch": 1.6722962000649562, + "grad_norm": 2.5151031017303467, + "learning_rate": 4.135958476605468e-06, + "loss": 0.6284, + "step": 3433 + }, + { + "epoch": 1.6727833712244236, + "grad_norm": 2.735560655593872, + "learning_rate": 4.135472106576985e-06, + "loss": 0.5202, + "step": 3434 + }, + { + "epoch": 1.6732705423838907, + "grad_norm": 2.5465259552001953, + "learning_rate": 4.134985628313111e-06, + "loss": 0.5589, + "step": 3435 + }, + { + "epoch": 1.6737577135433583, + "grad_norm": 2.557966947555542, + "learning_rate": 4.1344990418460396e-06, + "loss": 0.5362, + "step": 3436 + }, + { + "epoch": 1.6742448847028255, + "grad_norm": 2.584271192550659, + "learning_rate": 4.134012347207974e-06, + "loss": 0.5677, + "step": 3437 + }, + { + "epoch": 1.674732055862293, + "grad_norm": 2.6422011852264404, + "learning_rate": 4.133525544431123e-06, + "loss": 0.5451, + "step": 3438 + }, + { + "epoch": 1.6752192270217603, + "grad_norm": 3.1068367958068848, + "learning_rate": 4.133038633547703e-06, + "loss": 0.601, + "step": 3439 + }, + { + "epoch": 1.6757063981812277, + "grad_norm": 2.3515284061431885, + "learning_rate": 4.13255161458994e-06, + "loss": 0.5283, + "step": 3440 + }, + { + "epoch": 1.676193569340695, + "grad_norm": 2.3953030109405518, + "learning_rate": 4.132064487590062e-06, + "loss": 0.5359, + "step": 3441 + }, + { + "epoch": 1.6766807405001622, + "grad_norm": 2.6727242469787598, + "learning_rate": 4.131577252580309e-06, + "loss": 0.5272, + "step": 3442 + }, + { + "epoch": 1.6771679116596299, + "grad_norm": 2.4612390995025635, + "learning_rate": 4.131089909592925e-06, + "loss": 0.6131, + "step": 3443 + }, + { + "epoch": 1.677655082819097, + "grad_norm": 2.631747245788574, + "learning_rate": 4.1306024586601615e-06, + "loss": 0.558, + "step": 3444 + }, + { + "epoch": 1.6781422539785644, + "grad_norm": 2.6208763122558594, + "learning_rate": 4.13011489981428e-06, + "loss": 0.6194, + "step": 3445 + }, + { + "epoch": 1.6786294251380318, + "grad_norm": 2.953594923019409, + "learning_rate": 4.129627233087546e-06, + "loss": 0.4995, + "step": 3446 + }, + { + "epoch": 1.6791165962974992, + "grad_norm": 2.586111545562744, + "learning_rate": 4.1291394585122345e-06, + "loss": 0.5351, + "step": 3447 + }, + { + "epoch": 1.6796037674569666, + "grad_norm": 2.802863836288452, + "learning_rate": 4.128651576120625e-06, + "loss": 0.5327, + "step": 3448 + }, + { + "epoch": 1.6800909386164338, + "grad_norm": 2.5796217918395996, + "learning_rate": 4.128163585945005e-06, + "loss": 0.553, + "step": 3449 + }, + { + "epoch": 1.6805781097759014, + "grad_norm": 2.2969865798950195, + "learning_rate": 4.127675488017671e-06, + "loss": 0.5576, + "step": 3450 + }, + { + "epoch": 1.6810652809353686, + "grad_norm": 2.6603236198425293, + "learning_rate": 4.1271872823709245e-06, + "loss": 0.571, + "step": 3451 + }, + { + "epoch": 1.681552452094836, + "grad_norm": 2.773592710494995, + "learning_rate": 4.126698969037076e-06, + "loss": 0.6515, + "step": 3452 + }, + { + "epoch": 1.6820396232543033, + "grad_norm": 2.484332799911499, + "learning_rate": 4.12621054804844e-06, + "loss": 0.5403, + "step": 3453 + }, + { + "epoch": 1.6825267944137707, + "grad_norm": 2.4813613891601562, + "learning_rate": 4.125722019437342e-06, + "loss": 0.5503, + "step": 3454 + }, + { + "epoch": 1.6830139655732381, + "grad_norm": 2.532541036605835, + "learning_rate": 4.1252333832361125e-06, + "loss": 0.5591, + "step": 3455 + }, + { + "epoch": 1.6835011367327053, + "grad_norm": 3.251736879348755, + "learning_rate": 4.124744639477089e-06, + "loss": 0.6305, + "step": 3456 + }, + { + "epoch": 1.683988307892173, + "grad_norm": 2.809274673461914, + "learning_rate": 4.124255788192616e-06, + "loss": 0.5815, + "step": 3457 + }, + { + "epoch": 1.68447547905164, + "grad_norm": 2.6177356243133545, + "learning_rate": 4.123766829415047e-06, + "loss": 0.5187, + "step": 3458 + }, + { + "epoch": 1.6849626502111075, + "grad_norm": 3.9054031372070312, + "learning_rate": 4.123277763176739e-06, + "loss": 0.6229, + "step": 3459 + }, + { + "epoch": 1.6854498213705749, + "grad_norm": 2.7984418869018555, + "learning_rate": 4.122788589510061e-06, + "loss": 0.5984, + "step": 3460 + }, + { + "epoch": 1.6859369925300423, + "grad_norm": 2.608931064605713, + "learning_rate": 4.122299308447385e-06, + "loss": 0.6391, + "step": 3461 + }, + { + "epoch": 1.6864241636895096, + "grad_norm": 2.8415615558624268, + "learning_rate": 4.121809920021092e-06, + "loss": 0.4649, + "step": 3462 + }, + { + "epoch": 1.6869113348489768, + "grad_norm": 2.6110219955444336, + "learning_rate": 4.12132042426357e-06, + "loss": 0.5072, + "step": 3463 + }, + { + "epoch": 1.6873985060084444, + "grad_norm": 2.600817918777466, + "learning_rate": 4.1208308212072125e-06, + "loss": 0.509, + "step": 3464 + }, + { + "epoch": 1.6878856771679116, + "grad_norm": 2.4254424571990967, + "learning_rate": 4.120341110884421e-06, + "loss": 0.5737, + "step": 3465 + }, + { + "epoch": 1.688372848327379, + "grad_norm": 3.2630977630615234, + "learning_rate": 4.119851293327608e-06, + "loss": 0.5579, + "step": 3466 + }, + { + "epoch": 1.6888600194868464, + "grad_norm": 2.7368524074554443, + "learning_rate": 4.119361368569186e-06, + "loss": 0.5265, + "step": 3467 + }, + { + "epoch": 1.6893471906463138, + "grad_norm": 2.70279598236084, + "learning_rate": 4.11887133664158e-06, + "loss": 0.5407, + "step": 3468 + }, + { + "epoch": 1.6898343618057812, + "grad_norm": 2.6759424209594727, + "learning_rate": 4.11838119757722e-06, + "loss": 0.6376, + "step": 3469 + }, + { + "epoch": 1.6903215329652483, + "grad_norm": 3.126561403274536, + "learning_rate": 4.117890951408541e-06, + "loss": 0.5205, + "step": 3470 + }, + { + "epoch": 1.690808704124716, + "grad_norm": 2.8274970054626465, + "learning_rate": 4.117400598167991e-06, + "loss": 0.595, + "step": 3471 + }, + { + "epoch": 1.6912958752841831, + "grad_norm": 2.699551582336426, + "learning_rate": 4.11691013788802e-06, + "loss": 0.5065, + "step": 3472 + }, + { + "epoch": 1.6917830464436505, + "grad_norm": 2.3264496326446533, + "learning_rate": 4.1164195706010865e-06, + "loss": 0.5415, + "step": 3473 + }, + { + "epoch": 1.692270217603118, + "grad_norm": 2.6289865970611572, + "learning_rate": 4.115928896339656e-06, + "loss": 0.6425, + "step": 3474 + }, + { + "epoch": 1.6927573887625853, + "grad_norm": 2.9631142616271973, + "learning_rate": 4.115438115136202e-06, + "loss": 0.5394, + "step": 3475 + }, + { + "epoch": 1.6932445599220527, + "grad_norm": 2.651186943054199, + "learning_rate": 4.114947227023204e-06, + "loss": 0.5718, + "step": 3476 + }, + { + "epoch": 1.6937317310815199, + "grad_norm": 2.6379668712615967, + "learning_rate": 4.1144562320331484e-06, + "loss": 0.6036, + "step": 3477 + }, + { + "epoch": 1.6942189022409875, + "grad_norm": 2.818809986114502, + "learning_rate": 4.11396513019853e-06, + "loss": 0.5847, + "step": 3478 + }, + { + "epoch": 1.6947060734004546, + "grad_norm": 2.7259182929992676, + "learning_rate": 4.113473921551851e-06, + "loss": 0.513, + "step": 3479 + }, + { + "epoch": 1.695193244559922, + "grad_norm": 2.9294209480285645, + "learning_rate": 4.112982606125616e-06, + "loss": 0.6515, + "step": 3480 + }, + { + "epoch": 1.6956804157193894, + "grad_norm": 2.7942540645599365, + "learning_rate": 4.112491183952343e-06, + "loss": 0.5778, + "step": 3481 + }, + { + "epoch": 1.6961675868788568, + "grad_norm": 2.6468088626861572, + "learning_rate": 4.111999655064555e-06, + "loss": 0.5934, + "step": 3482 + }, + { + "epoch": 1.6966547580383242, + "grad_norm": 2.959172010421753, + "learning_rate": 4.111508019494777e-06, + "loss": 0.5465, + "step": 3483 + }, + { + "epoch": 1.6971419291977914, + "grad_norm": 2.5928571224212646, + "learning_rate": 4.1110162772755514e-06, + "loss": 0.517, + "step": 3484 + }, + { + "epoch": 1.697629100357259, + "grad_norm": 2.5758743286132812, + "learning_rate": 4.110524428439416e-06, + "loss": 0.528, + "step": 3485 + }, + { + "epoch": 1.6981162715167262, + "grad_norm": 2.7446212768554688, + "learning_rate": 4.110032473018925e-06, + "loss": 0.5803, + "step": 3486 + }, + { + "epoch": 1.6986034426761936, + "grad_norm": 2.6879448890686035, + "learning_rate": 4.1095404110466355e-06, + "loss": 0.4997, + "step": 3487 + }, + { + "epoch": 1.699090613835661, + "grad_norm": 2.6972532272338867, + "learning_rate": 4.109048242555112e-06, + "loss": 0.6284, + "step": 3488 + }, + { + "epoch": 1.6995777849951281, + "grad_norm": 2.581244945526123, + "learning_rate": 4.1085559675769234e-06, + "loss": 0.4762, + "step": 3489 + }, + { + "epoch": 1.7000649561545957, + "grad_norm": 2.4377031326293945, + "learning_rate": 4.108063586144652e-06, + "loss": 0.4937, + "step": 3490 + }, + { + "epoch": 1.700552127314063, + "grad_norm": 2.6759238243103027, + "learning_rate": 4.107571098290882e-06, + "loss": 0.5255, + "step": 3491 + }, + { + "epoch": 1.7010392984735305, + "grad_norm": 2.6579198837280273, + "learning_rate": 4.107078504048207e-06, + "loss": 0.4869, + "step": 3492 + }, + { + "epoch": 1.7015264696329977, + "grad_norm": 2.7661707401275635, + "learning_rate": 4.106585803449226e-06, + "loss": 0.6199, + "step": 3493 + }, + { + "epoch": 1.702013640792465, + "grad_norm": 3.0343830585479736, + "learning_rate": 4.106092996526545e-06, + "loss": 0.5258, + "step": 3494 + }, + { + "epoch": 1.7025008119519325, + "grad_norm": 2.611494541168213, + "learning_rate": 4.10560008331278e-06, + "loss": 0.5864, + "step": 3495 + }, + { + "epoch": 1.7029879831113997, + "grad_norm": 2.8035476207733154, + "learning_rate": 4.105107063840551e-06, + "loss": 0.6018, + "step": 3496 + }, + { + "epoch": 1.7034751542708673, + "grad_norm": 2.6956100463867188, + "learning_rate": 4.104613938142486e-06, + "loss": 0.5888, + "step": 3497 + }, + { + "epoch": 1.7039623254303344, + "grad_norm": 2.8631019592285156, + "learning_rate": 4.10412070625122e-06, + "loss": 0.6257, + "step": 3498 + }, + { + "epoch": 1.704449496589802, + "grad_norm": 2.6559078693389893, + "learning_rate": 4.103627368199394e-06, + "loss": 0.5421, + "step": 3499 + }, + { + "epoch": 1.7049366677492692, + "grad_norm": 2.907332420349121, + "learning_rate": 4.10313392401966e-06, + "loss": 0.6005, + "step": 3500 + }, + { + "epoch": 1.7054238389087366, + "grad_norm": 2.988414764404297, + "learning_rate": 4.102640373744671e-06, + "loss": 0.5746, + "step": 3501 + }, + { + "epoch": 1.705911010068204, + "grad_norm": 2.760931968688965, + "learning_rate": 4.102146717407092e-06, + "loss": 0.6065, + "step": 3502 + }, + { + "epoch": 1.7063981812276712, + "grad_norm": 2.7284786701202393, + "learning_rate": 4.101652955039591e-06, + "loss": 0.5671, + "step": 3503 + }, + { + "epoch": 1.7068853523871388, + "grad_norm": 2.5884008407592773, + "learning_rate": 4.101159086674849e-06, + "loss": 0.4869, + "step": 3504 + }, + { + "epoch": 1.707372523546606, + "grad_norm": 2.3752825260162354, + "learning_rate": 4.1006651123455454e-06, + "loss": 0.5551, + "step": 3505 + }, + { + "epoch": 1.7078596947060734, + "grad_norm": 2.4725584983825684, + "learning_rate": 4.100171032084375e-06, + "loss": 0.5553, + "step": 3506 + }, + { + "epoch": 1.7083468658655407, + "grad_norm": 2.6752941608428955, + "learning_rate": 4.099676845924034e-06, + "loss": 0.6, + "step": 3507 + }, + { + "epoch": 1.7088340370250081, + "grad_norm": 2.792651653289795, + "learning_rate": 4.099182553897228e-06, + "loss": 0.5844, + "step": 3508 + }, + { + "epoch": 1.7093212081844755, + "grad_norm": 2.8370890617370605, + "learning_rate": 4.098688156036671e-06, + "loss": 0.5705, + "step": 3509 + }, + { + "epoch": 1.7098083793439427, + "grad_norm": 2.6105809211730957, + "learning_rate": 4.09819365237508e-06, + "loss": 0.5061, + "step": 3510 + }, + { + "epoch": 1.7102955505034103, + "grad_norm": 2.4274134635925293, + "learning_rate": 4.0976990429451815e-06, + "loss": 0.5475, + "step": 3511 + }, + { + "epoch": 1.7107827216628775, + "grad_norm": 2.6595070362091064, + "learning_rate": 4.097204327779709e-06, + "loss": 0.6133, + "step": 3512 + }, + { + "epoch": 1.7112698928223449, + "grad_norm": 2.726830244064331, + "learning_rate": 4.096709506911403e-06, + "loss": 0.5112, + "step": 3513 + }, + { + "epoch": 1.7117570639818123, + "grad_norm": 2.5401391983032227, + "learning_rate": 4.096214580373011e-06, + "loss": 0.5555, + "step": 3514 + }, + { + "epoch": 1.7122442351412797, + "grad_norm": 2.945483684539795, + "learning_rate": 4.0957195481972865e-06, + "loss": 0.545, + "step": 3515 + }, + { + "epoch": 1.712731406300747, + "grad_norm": 2.679966688156128, + "learning_rate": 4.0952244104169905e-06, + "loss": 0.5454, + "step": 3516 + }, + { + "epoch": 1.7132185774602142, + "grad_norm": 3.069246530532837, + "learning_rate": 4.094729167064892e-06, + "loss": 0.6096, + "step": 3517 + }, + { + "epoch": 1.7137057486196818, + "grad_norm": 2.528883457183838, + "learning_rate": 4.094233818173765e-06, + "loss": 0.4817, + "step": 3518 + }, + { + "epoch": 1.714192919779149, + "grad_norm": 2.708413600921631, + "learning_rate": 4.093738363776394e-06, + "loss": 0.5005, + "step": 3519 + }, + { + "epoch": 1.7146800909386164, + "grad_norm": 2.7678704261779785, + "learning_rate": 4.093242803905565e-06, + "loss": 0.5484, + "step": 3520 + }, + { + "epoch": 1.7151672620980838, + "grad_norm": 2.6171722412109375, + "learning_rate": 4.092747138594076e-06, + "loss": 0.5603, + "step": 3521 + }, + { + "epoch": 1.7156544332575512, + "grad_norm": 2.841115951538086, + "learning_rate": 4.092251367874729e-06, + "loss": 0.5702, + "step": 3522 + }, + { + "epoch": 1.7161416044170186, + "grad_norm": 2.7243807315826416, + "learning_rate": 4.091755491780336e-06, + "loss": 0.4909, + "step": 3523 + }, + { + "epoch": 1.7166287755764857, + "grad_norm": 2.7989039421081543, + "learning_rate": 4.091259510343712e-06, + "loss": 0.5579, + "step": 3524 + }, + { + "epoch": 1.7171159467359534, + "grad_norm": 2.556154727935791, + "learning_rate": 4.090763423597682e-06, + "loss": 0.55, + "step": 3525 + }, + { + "epoch": 1.7176031178954205, + "grad_norm": 2.8683319091796875, + "learning_rate": 4.090267231575077e-06, + "loss": 0.5646, + "step": 3526 + }, + { + "epoch": 1.718090289054888, + "grad_norm": 3.3118138313293457, + "learning_rate": 4.089770934308734e-06, + "loss": 0.5422, + "step": 3527 + }, + { + "epoch": 1.7185774602143553, + "grad_norm": 2.979445695877075, + "learning_rate": 4.0892745318315e-06, + "loss": 0.5255, + "step": 3528 + }, + { + "epoch": 1.7190646313738227, + "grad_norm": 2.625051975250244, + "learning_rate": 4.088778024176224e-06, + "loss": 0.4874, + "step": 3529 + }, + { + "epoch": 1.71955180253329, + "grad_norm": 2.561131238937378, + "learning_rate": 4.088281411375767e-06, + "loss": 0.6228, + "step": 3530 + }, + { + "epoch": 1.7200389736927573, + "grad_norm": 2.8475935459136963, + "learning_rate": 4.087784693462994e-06, + "loss": 0.5535, + "step": 3531 + }, + { + "epoch": 1.7205261448522249, + "grad_norm": 3.705240488052368, + "learning_rate": 4.087287870470778e-06, + "loss": 0.55, + "step": 3532 + }, + { + "epoch": 1.721013316011692, + "grad_norm": 2.4695534706115723, + "learning_rate": 4.086790942431999e-06, + "loss": 0.6723, + "step": 3533 + }, + { + "epoch": 1.7215004871711594, + "grad_norm": 3.0023293495178223, + "learning_rate": 4.086293909379542e-06, + "loss": 0.604, + "step": 3534 + }, + { + "epoch": 1.7219876583306268, + "grad_norm": 3.1760218143463135, + "learning_rate": 4.085796771346302e-06, + "loss": 0.5196, + "step": 3535 + }, + { + "epoch": 1.7224748294900942, + "grad_norm": 2.682563304901123, + "learning_rate": 4.08529952836518e-06, + "loss": 0.613, + "step": 3536 + }, + { + "epoch": 1.7229620006495616, + "grad_norm": 2.977410078048706, + "learning_rate": 4.084802180469083e-06, + "loss": 0.5644, + "step": 3537 + }, + { + "epoch": 1.7234491718090288, + "grad_norm": 2.9583170413970947, + "learning_rate": 4.084304727690924e-06, + "loss": 0.4778, + "step": 3538 + }, + { + "epoch": 1.7239363429684964, + "grad_norm": 2.5104904174804688, + "learning_rate": 4.083807170063626e-06, + "loss": 0.5299, + "step": 3539 + }, + { + "epoch": 1.7244235141279636, + "grad_norm": 2.7810721397399902, + "learning_rate": 4.083309507620118e-06, + "loss": 0.5319, + "step": 3540 + }, + { + "epoch": 1.724910685287431, + "grad_norm": 2.5644497871398926, + "learning_rate": 4.082811740393333e-06, + "loss": 0.5212, + "step": 3541 + }, + { + "epoch": 1.7253978564468984, + "grad_norm": 2.532104253768921, + "learning_rate": 4.082313868416216e-06, + "loss": 0.5712, + "step": 3542 + }, + { + "epoch": 1.7258850276063658, + "grad_norm": 2.6546361446380615, + "learning_rate": 4.081815891721713e-06, + "loss": 0.5856, + "step": 3543 + }, + { + "epoch": 1.7263721987658331, + "grad_norm": 2.808546304702759, + "learning_rate": 4.081317810342782e-06, + "loss": 0.6218, + "step": 3544 + }, + { + "epoch": 1.7268593699253003, + "grad_norm": 2.5367674827575684, + "learning_rate": 4.080819624312386e-06, + "loss": 0.5529, + "step": 3545 + }, + { + "epoch": 1.727346541084768, + "grad_norm": 2.5021345615386963, + "learning_rate": 4.0803213336634935e-06, + "loss": 0.4876, + "step": 3546 + }, + { + "epoch": 1.727833712244235, + "grad_norm": 2.436018228530884, + "learning_rate": 4.0798229384290835e-06, + "loss": 0.5831, + "step": 3547 + }, + { + "epoch": 1.7283208834037025, + "grad_norm": 2.8141026496887207, + "learning_rate": 4.079324438642138e-06, + "loss": 0.5731, + "step": 3548 + }, + { + "epoch": 1.7288080545631699, + "grad_norm": 2.603200912475586, + "learning_rate": 4.078825834335649e-06, + "loss": 0.5292, + "step": 3549 + }, + { + "epoch": 1.7292952257226373, + "grad_norm": 2.4841625690460205, + "learning_rate": 4.0783271255426114e-06, + "loss": 0.5284, + "step": 3550 + }, + { + "epoch": 1.7297823968821047, + "grad_norm": 2.679086208343506, + "learning_rate": 4.077828312296033e-06, + "loss": 0.6006, + "step": 3551 + }, + { + "epoch": 1.7302695680415718, + "grad_norm": 2.9750185012817383, + "learning_rate": 4.077329394628924e-06, + "loss": 0.6533, + "step": 3552 + }, + { + "epoch": 1.7307567392010395, + "grad_norm": 3.337836980819702, + "learning_rate": 4.076830372574302e-06, + "loss": 0.5134, + "step": 3553 + }, + { + "epoch": 1.7312439103605066, + "grad_norm": 2.631493330001831, + "learning_rate": 4.076331246165193e-06, + "loss": 0.6026, + "step": 3554 + }, + { + "epoch": 1.731731081519974, + "grad_norm": 2.7962563037872314, + "learning_rate": 4.0758320154346285e-06, + "loss": 0.5944, + "step": 3555 + }, + { + "epoch": 1.7322182526794414, + "grad_norm": 2.6876614093780518, + "learning_rate": 4.075332680415648e-06, + "loss": 0.5304, + "step": 3556 + }, + { + "epoch": 1.7327054238389086, + "grad_norm": 2.6729347705841064, + "learning_rate": 4.074833241141298e-06, + "loss": 0.5843, + "step": 3557 + }, + { + "epoch": 1.7331925949983762, + "grad_norm": 2.3833906650543213, + "learning_rate": 4.07433369764463e-06, + "loss": 0.5426, + "step": 3558 + }, + { + "epoch": 1.7336797661578434, + "grad_norm": 2.7500956058502197, + "learning_rate": 4.073834049958706e-06, + "loss": 0.5965, + "step": 3559 + }, + { + "epoch": 1.734166937317311, + "grad_norm": 3.0097146034240723, + "learning_rate": 4.073334298116589e-06, + "loss": 0.6123, + "step": 3560 + }, + { + "epoch": 1.7346541084767781, + "grad_norm": 2.6718244552612305, + "learning_rate": 4.072834442151357e-06, + "loss": 0.5434, + "step": 3561 + }, + { + "epoch": 1.7351412796362455, + "grad_norm": 2.5617003440856934, + "learning_rate": 4.072334482096086e-06, + "loss": 0.5217, + "step": 3562 + }, + { + "epoch": 1.735628450795713, + "grad_norm": 2.51305890083313, + "learning_rate": 4.071834417983866e-06, + "loss": 0.589, + "step": 3563 + }, + { + "epoch": 1.73611562195518, + "grad_norm": 2.6714465618133545, + "learning_rate": 4.0713342498477905e-06, + "loss": 0.5318, + "step": 3564 + }, + { + "epoch": 1.7366027931146477, + "grad_norm": 2.6084814071655273, + "learning_rate": 4.070833977720961e-06, + "loss": 0.5942, + "step": 3565 + }, + { + "epoch": 1.7370899642741149, + "grad_norm": 2.85494327545166, + "learning_rate": 4.070333601636486e-06, + "loss": 0.556, + "step": 3566 + }, + { + "epoch": 1.7375771354335825, + "grad_norm": 2.7308731079101562, + "learning_rate": 4.0698331216274775e-06, + "loss": 0.561, + "step": 3567 + }, + { + "epoch": 1.7380643065930497, + "grad_norm": 2.960197687149048, + "learning_rate": 4.069332537727061e-06, + "loss": 0.5024, + "step": 3568 + }, + { + "epoch": 1.738551477752517, + "grad_norm": 2.6680002212524414, + "learning_rate": 4.0688318499683625e-06, + "loss": 0.6001, + "step": 3569 + }, + { + "epoch": 1.7390386489119845, + "grad_norm": 2.9886019229888916, + "learning_rate": 4.0683310583845185e-06, + "loss": 0.5097, + "step": 3570 + }, + { + "epoch": 1.7395258200714516, + "grad_norm": 2.7394731044769287, + "learning_rate": 4.06783016300867e-06, + "loss": 0.6494, + "step": 3571 + }, + { + "epoch": 1.7400129912309192, + "grad_norm": 2.759565591812134, + "learning_rate": 4.067329163873969e-06, + "loss": 0.4778, + "step": 3572 + }, + { + "epoch": 1.7405001623903864, + "grad_norm": 2.523904800415039, + "learning_rate": 4.066828061013569e-06, + "loss": 0.5538, + "step": 3573 + }, + { + "epoch": 1.7409873335498538, + "grad_norm": 2.980621337890625, + "learning_rate": 4.066326854460634e-06, + "loss": 0.5837, + "step": 3574 + }, + { + "epoch": 1.7414745047093212, + "grad_norm": 2.8498117923736572, + "learning_rate": 4.065825544248333e-06, + "loss": 0.5653, + "step": 3575 + }, + { + "epoch": 1.7419616758687886, + "grad_norm": 2.6277196407318115, + "learning_rate": 4.0653241304098444e-06, + "loss": 0.6537, + "step": 3576 + }, + { + "epoch": 1.742448847028256, + "grad_norm": 2.68266224861145, + "learning_rate": 4.06482261297835e-06, + "loss": 0.5324, + "step": 3577 + }, + { + "epoch": 1.7429360181877231, + "grad_norm": 2.8085360527038574, + "learning_rate": 4.0643209919870405e-06, + "loss": 0.6257, + "step": 3578 + }, + { + "epoch": 1.7434231893471908, + "grad_norm": 2.7985727787017822, + "learning_rate": 4.063819267469114e-06, + "loss": 0.5242, + "step": 3579 + }, + { + "epoch": 1.743910360506658, + "grad_norm": 2.570469379425049, + "learning_rate": 4.063317439457774e-06, + "loss": 0.5131, + "step": 3580 + }, + { + "epoch": 1.7443975316661253, + "grad_norm": 2.7271926403045654, + "learning_rate": 4.062815507986231e-06, + "loss": 0.5691, + "step": 3581 + }, + { + "epoch": 1.7448847028255927, + "grad_norm": 3.106187582015991, + "learning_rate": 4.062313473087704e-06, + "loss": 0.688, + "step": 3582 + }, + { + "epoch": 1.74537187398506, + "grad_norm": 3.0519919395446777, + "learning_rate": 4.061811334795415e-06, + "loss": 0.5988, + "step": 3583 + }, + { + "epoch": 1.7458590451445275, + "grad_norm": 2.8341562747955322, + "learning_rate": 4.061309093142599e-06, + "loss": 0.5414, + "step": 3584 + }, + { + "epoch": 1.7463462163039947, + "grad_norm": 2.51139235496521, + "learning_rate": 4.060806748162492e-06, + "loss": 0.5679, + "step": 3585 + }, + { + "epoch": 1.7468333874634623, + "grad_norm": 2.9550156593322754, + "learning_rate": 4.0603042998883406e-06, + "loss": 0.5268, + "step": 3586 + }, + { + "epoch": 1.7473205586229295, + "grad_norm": 2.8222901821136475, + "learning_rate": 4.0598017483533945e-06, + "loss": 0.5569, + "step": 3587 + }, + { + "epoch": 1.7478077297823968, + "grad_norm": 3.205789089202881, + "learning_rate": 4.059299093590915e-06, + "loss": 0.5457, + "step": 3588 + }, + { + "epoch": 1.7482949009418642, + "grad_norm": 2.558666229248047, + "learning_rate": 4.058796335634167e-06, + "loss": 0.5746, + "step": 3589 + }, + { + "epoch": 1.7487820721013316, + "grad_norm": 2.635920524597168, + "learning_rate": 4.058293474516421e-06, + "loss": 0.5316, + "step": 3590 + }, + { + "epoch": 1.749269243260799, + "grad_norm": 2.313603639602661, + "learning_rate": 4.057790510270959e-06, + "loss": 0.6218, + "step": 3591 + }, + { + "epoch": 1.7497564144202662, + "grad_norm": 2.8847153186798096, + "learning_rate": 4.057287442931066e-06, + "loss": 0.6155, + "step": 3592 + }, + { + "epoch": 1.7502435855797338, + "grad_norm": 2.913799285888672, + "learning_rate": 4.056784272530035e-06, + "loss": 0.58, + "step": 3593 + }, + { + "epoch": 1.750730756739201, + "grad_norm": 2.6671814918518066, + "learning_rate": 4.056280999101166e-06, + "loss": 0.5705, + "step": 3594 + }, + { + "epoch": 1.7512179278986684, + "grad_norm": 3.4759914875030518, + "learning_rate": 4.055777622677765e-06, + "loss": 0.5068, + "step": 3595 + }, + { + "epoch": 1.7517050990581358, + "grad_norm": 2.7371785640716553, + "learning_rate": 4.055274143293146e-06, + "loss": 0.6268, + "step": 3596 + }, + { + "epoch": 1.7521922702176032, + "grad_norm": 2.948265790939331, + "learning_rate": 4.05477056098063e-06, + "loss": 0.6209, + "step": 3597 + }, + { + "epoch": 1.7526794413770705, + "grad_norm": 3.1506989002227783, + "learning_rate": 4.0542668757735415e-06, + "loss": 0.6277, + "step": 3598 + }, + { + "epoch": 1.7531666125365377, + "grad_norm": 2.6934406757354736, + "learning_rate": 4.053763087705217e-06, + "loss": 0.5584, + "step": 3599 + }, + { + "epoch": 1.7536537836960053, + "grad_norm": 3.046090602874756, + "learning_rate": 4.053259196808995e-06, + "loss": 0.6058, + "step": 3600 + }, + { + "epoch": 1.7541409548554725, + "grad_norm": 2.8628621101379395, + "learning_rate": 4.052755203118225e-06, + "loss": 0.542, + "step": 3601 + }, + { + "epoch": 1.75462812601494, + "grad_norm": 2.6298980712890625, + "learning_rate": 4.0522511066662605e-06, + "loss": 0.5696, + "step": 3602 + }, + { + "epoch": 1.7551152971744073, + "grad_norm": 2.541990280151367, + "learning_rate": 4.051746907486461e-06, + "loss": 0.5377, + "step": 3603 + }, + { + "epoch": 1.7556024683338747, + "grad_norm": 3.080709934234619, + "learning_rate": 4.051242605612197e-06, + "loss": 0.5934, + "step": 3604 + }, + { + "epoch": 1.756089639493342, + "grad_norm": 2.597303628921509, + "learning_rate": 4.05073820107684e-06, + "loss": 0.4976, + "step": 3605 + }, + { + "epoch": 1.7565768106528092, + "grad_norm": 2.48854923248291, + "learning_rate": 4.050233693913776e-06, + "loss": 0.5513, + "step": 3606 + }, + { + "epoch": 1.7570639818122769, + "grad_norm": 2.8782408237457275, + "learning_rate": 4.049729084156387e-06, + "loss": 0.5509, + "step": 3607 + }, + { + "epoch": 1.757551152971744, + "grad_norm": 2.7845404148101807, + "learning_rate": 4.0492243718380744e-06, + "loss": 0.5406, + "step": 3608 + }, + { + "epoch": 1.7580383241312114, + "grad_norm": 2.3833770751953125, + "learning_rate": 4.048719556992236e-06, + "loss": 0.5406, + "step": 3609 + }, + { + "epoch": 1.7585254952906788, + "grad_norm": 2.9030301570892334, + "learning_rate": 4.048214639652281e-06, + "loss": 0.5993, + "step": 3610 + }, + { + "epoch": 1.7590126664501462, + "grad_norm": 3.0937981605529785, + "learning_rate": 4.047709619851627e-06, + "loss": 0.5159, + "step": 3611 + }, + { + "epoch": 1.7594998376096136, + "grad_norm": 2.5525565147399902, + "learning_rate": 4.047204497623694e-06, + "loss": 0.5608, + "step": 3612 + }, + { + "epoch": 1.7599870087690808, + "grad_norm": 2.593475580215454, + "learning_rate": 4.046699273001911e-06, + "loss": 0.5469, + "step": 3613 + }, + { + "epoch": 1.7604741799285484, + "grad_norm": 2.560565948486328, + "learning_rate": 4.046193946019714e-06, + "loss": 0.5913, + "step": 3614 + }, + { + "epoch": 1.7609613510880155, + "grad_norm": 2.6186089515686035, + "learning_rate": 4.045688516710545e-06, + "loss": 0.5198, + "step": 3615 + }, + { + "epoch": 1.761448522247483, + "grad_norm": 2.9824182987213135, + "learning_rate": 4.045182985107855e-06, + "loss": 0.4813, + "step": 3616 + }, + { + "epoch": 1.7619356934069503, + "grad_norm": 2.8743841648101807, + "learning_rate": 4.0446773512450976e-06, + "loss": 0.539, + "step": 3617 + }, + { + "epoch": 1.7624228645664175, + "grad_norm": 2.785275936126709, + "learning_rate": 4.044171615155738e-06, + "loss": 0.5703, + "step": 3618 + }, + { + "epoch": 1.7629100357258851, + "grad_norm": 2.570192813873291, + "learning_rate": 4.0436657768732455e-06, + "loss": 0.5659, + "step": 3619 + }, + { + "epoch": 1.7633972068853523, + "grad_norm": 2.7672812938690186, + "learning_rate": 4.043159836431094e-06, + "loss": 0.5784, + "step": 3620 + }, + { + "epoch": 1.76388437804482, + "grad_norm": 2.830672264099121, + "learning_rate": 4.04265379386277e-06, + "loss": 0.588, + "step": 3621 + }, + { + "epoch": 1.764371549204287, + "grad_norm": 2.5020179748535156, + "learning_rate": 4.04214764920176e-06, + "loss": 0.5395, + "step": 3622 + }, + { + "epoch": 1.7648587203637545, + "grad_norm": 3.08685564994812, + "learning_rate": 4.041641402481562e-06, + "loss": 0.5126, + "step": 3623 + }, + { + "epoch": 1.7653458915232219, + "grad_norm": 2.8891379833221436, + "learning_rate": 4.0411350537356804e-06, + "loss": 0.5986, + "step": 3624 + }, + { + "epoch": 1.765833062682689, + "grad_norm": 2.7828311920166016, + "learning_rate": 4.0406286029976245e-06, + "loss": 0.645, + "step": 3625 + }, + { + "epoch": 1.7663202338421566, + "grad_norm": 2.768310785293579, + "learning_rate": 4.040122050300911e-06, + "loss": 0.5479, + "step": 3626 + }, + { + "epoch": 1.7668074050016238, + "grad_norm": 2.529905080795288, + "learning_rate": 4.039615395679063e-06, + "loss": 0.5027, + "step": 3627 + }, + { + "epoch": 1.7672945761610914, + "grad_norm": 2.6459131240844727, + "learning_rate": 4.039108639165612e-06, + "loss": 0.5915, + "step": 3628 + }, + { + "epoch": 1.7677817473205586, + "grad_norm": 2.4202349185943604, + "learning_rate": 4.038601780794095e-06, + "loss": 0.5393, + "step": 3629 + }, + { + "epoch": 1.768268918480026, + "grad_norm": 2.818711519241333, + "learning_rate": 4.038094820598055e-06, + "loss": 0.5503, + "step": 3630 + }, + { + "epoch": 1.7687560896394934, + "grad_norm": 2.718536615371704, + "learning_rate": 4.037587758611043e-06, + "loss": 0.6548, + "step": 3631 + }, + { + "epoch": 1.7692432607989605, + "grad_norm": 2.905808687210083, + "learning_rate": 4.037080594866616e-06, + "loss": 0.5141, + "step": 3632 + }, + { + "epoch": 1.7697304319584282, + "grad_norm": 3.3221209049224854, + "learning_rate": 4.036573329398339e-06, + "loss": 0.5299, + "step": 3633 + }, + { + "epoch": 1.7702176031178953, + "grad_norm": 2.633625030517578, + "learning_rate": 4.036065962239782e-06, + "loss": 0.5695, + "step": 3634 + }, + { + "epoch": 1.7707047742773627, + "grad_norm": 2.4503443241119385, + "learning_rate": 4.035558493424522e-06, + "loss": 0.6106, + "step": 3635 + }, + { + "epoch": 1.7711919454368301, + "grad_norm": 2.679819345474243, + "learning_rate": 4.035050922986145e-06, + "loss": 0.5926, + "step": 3636 + }, + { + "epoch": 1.7716791165962975, + "grad_norm": 3.0640997886657715, + "learning_rate": 4.03454325095824e-06, + "loss": 0.6328, + "step": 3637 + }, + { + "epoch": 1.772166287755765, + "grad_norm": 3.317160129547119, + "learning_rate": 4.034035477374405e-06, + "loss": 0.5731, + "step": 3638 + }, + { + "epoch": 1.772653458915232, + "grad_norm": 2.8214242458343506, + "learning_rate": 4.033527602268246e-06, + "loss": 0.5507, + "step": 3639 + }, + { + "epoch": 1.7731406300746997, + "grad_norm": 2.7408955097198486, + "learning_rate": 4.033019625673373e-06, + "loss": 0.5747, + "step": 3640 + }, + { + "epoch": 1.7736278012341669, + "grad_norm": 2.701714277267456, + "learning_rate": 4.032511547623405e-06, + "loss": 0.481, + "step": 3641 + }, + { + "epoch": 1.7741149723936342, + "grad_norm": 2.537984609603882, + "learning_rate": 4.032003368151963e-06, + "loss": 0.5281, + "step": 3642 + }, + { + "epoch": 1.7746021435531016, + "grad_norm": 2.812865972518921, + "learning_rate": 4.031495087292682e-06, + "loss": 0.58, + "step": 3643 + }, + { + "epoch": 1.775089314712569, + "grad_norm": 2.6684865951538086, + "learning_rate": 4.030986705079199e-06, + "loss": 0.6164, + "step": 3644 + }, + { + "epoch": 1.7755764858720364, + "grad_norm": 2.477912425994873, + "learning_rate": 4.030478221545158e-06, + "loss": 0.5203, + "step": 3645 + }, + { + "epoch": 1.7760636570315036, + "grad_norm": 2.471618413925171, + "learning_rate": 4.029969636724211e-06, + "loss": 0.4297, + "step": 3646 + }, + { + "epoch": 1.7765508281909712, + "grad_norm": 2.509519338607788, + "learning_rate": 4.029460950650016e-06, + "loss": 0.5427, + "step": 3647 + }, + { + "epoch": 1.7770379993504384, + "grad_norm": 2.7352983951568604, + "learning_rate": 4.028952163356238e-06, + "loss": 0.6642, + "step": 3648 + }, + { + "epoch": 1.7775251705099058, + "grad_norm": 2.5060534477233887, + "learning_rate": 4.028443274876548e-06, + "loss": 0.6012, + "step": 3649 + }, + { + "epoch": 1.7780123416693732, + "grad_norm": 2.847782611846924, + "learning_rate": 4.027934285244624e-06, + "loss": 0.5166, + "step": 3650 + }, + { + "epoch": 1.7784995128288406, + "grad_norm": 2.341780424118042, + "learning_rate": 4.027425194494151e-06, + "loss": 0.6263, + "step": 3651 + }, + { + "epoch": 1.778986683988308, + "grad_norm": 2.7414677143096924, + "learning_rate": 4.026916002658822e-06, + "loss": 0.6472, + "step": 3652 + }, + { + "epoch": 1.7794738551477751, + "grad_norm": 2.7797791957855225, + "learning_rate": 4.026406709772333e-06, + "loss": 0.5454, + "step": 3653 + }, + { + "epoch": 1.7799610263072427, + "grad_norm": 2.4833016395568848, + "learning_rate": 4.0258973158683904e-06, + "loss": 0.5961, + "step": 3654 + }, + { + "epoch": 1.78044819746671, + "grad_norm": 2.874354600906372, + "learning_rate": 4.025387820980706e-06, + "loss": 0.5788, + "step": 3655 + }, + { + "epoch": 1.7809353686261773, + "grad_norm": 2.7428860664367676, + "learning_rate": 4.024878225142997e-06, + "loss": 0.5359, + "step": 3656 + }, + { + "epoch": 1.7814225397856447, + "grad_norm": 2.726670026779175, + "learning_rate": 4.02436852838899e-06, + "loss": 0.5453, + "step": 3657 + }, + { + "epoch": 1.781909710945112, + "grad_norm": 2.8144540786743164, + "learning_rate": 4.023858730752415e-06, + "loss": 0.5421, + "step": 3658 + }, + { + "epoch": 1.7823968821045795, + "grad_norm": 2.6493568420410156, + "learning_rate": 4.023348832267011e-06, + "loss": 0.5295, + "step": 3659 + }, + { + "epoch": 1.7828840532640466, + "grad_norm": 2.635338306427002, + "learning_rate": 4.022838832966524e-06, + "loss": 0.5516, + "step": 3660 + }, + { + "epoch": 1.7833712244235143, + "grad_norm": 2.6433022022247314, + "learning_rate": 4.022328732884704e-06, + "loss": 0.5798, + "step": 3661 + }, + { + "epoch": 1.7838583955829814, + "grad_norm": 2.847393751144409, + "learning_rate": 4.02181853205531e-06, + "loss": 0.5508, + "step": 3662 + }, + { + "epoch": 1.7843455667424488, + "grad_norm": 2.6131222248077393, + "learning_rate": 4.021308230512108e-06, + "loss": 0.5695, + "step": 3663 + }, + { + "epoch": 1.7848327379019162, + "grad_norm": 2.710886240005493, + "learning_rate": 4.020797828288869e-06, + "loss": 0.6104, + "step": 3664 + }, + { + "epoch": 1.7853199090613836, + "grad_norm": 2.7314648628234863, + "learning_rate": 4.02028732541937e-06, + "loss": 0.5645, + "step": 3665 + }, + { + "epoch": 1.785807080220851, + "grad_norm": 2.7500553131103516, + "learning_rate": 4.019776721937398e-06, + "loss": 0.548, + "step": 3666 + }, + { + "epoch": 1.7862942513803182, + "grad_norm": 2.715768337249756, + "learning_rate": 4.019266017876745e-06, + "loss": 0.6208, + "step": 3667 + }, + { + "epoch": 1.7867814225397858, + "grad_norm": 2.490062713623047, + "learning_rate": 4.0187552132712085e-06, + "loss": 0.6328, + "step": 3668 + }, + { + "epoch": 1.787268593699253, + "grad_norm": 2.829493999481201, + "learning_rate": 4.018244308154592e-06, + "loss": 0.5871, + "step": 3669 + }, + { + "epoch": 1.7877557648587203, + "grad_norm": 2.6276321411132812, + "learning_rate": 4.017733302560709e-06, + "loss": 0.571, + "step": 3670 + }, + { + "epoch": 1.7882429360181877, + "grad_norm": 2.7779736518859863, + "learning_rate": 4.017222196523378e-06, + "loss": 0.5315, + "step": 3671 + }, + { + "epoch": 1.7887301071776551, + "grad_norm": 2.7108263969421387, + "learning_rate": 4.016710990076423e-06, + "loss": 0.6288, + "step": 3672 + }, + { + "epoch": 1.7892172783371225, + "grad_norm": 2.822370767593384, + "learning_rate": 4.016199683253677e-06, + "loss": 0.5719, + "step": 3673 + }, + { + "epoch": 1.7897044494965897, + "grad_norm": 2.768446445465088, + "learning_rate": 4.0156882760889755e-06, + "loss": 0.5042, + "step": 3674 + }, + { + "epoch": 1.7901916206560573, + "grad_norm": 2.6918692588806152, + "learning_rate": 4.015176768616165e-06, + "loss": 0.4959, + "step": 3675 + }, + { + "epoch": 1.7906787918155245, + "grad_norm": 2.5921950340270996, + "learning_rate": 4.014665160869098e-06, + "loss": 0.5876, + "step": 3676 + }, + { + "epoch": 1.7911659629749919, + "grad_norm": 2.728764533996582, + "learning_rate": 4.0141534528816314e-06, + "loss": 0.6, + "step": 3677 + }, + { + "epoch": 1.7916531341344593, + "grad_norm": 2.913738250732422, + "learning_rate": 4.0136416446876305e-06, + "loss": 0.576, + "step": 3678 + }, + { + "epoch": 1.7921403052939266, + "grad_norm": 2.7775771617889404, + "learning_rate": 4.013129736320966e-06, + "loss": 0.5038, + "step": 3679 + }, + { + "epoch": 1.792627476453394, + "grad_norm": 2.6094629764556885, + "learning_rate": 4.0126177278155165e-06, + "loss": 0.6095, + "step": 3680 + }, + { + "epoch": 1.7931146476128612, + "grad_norm": 2.687839984893799, + "learning_rate": 4.012105619205167e-06, + "loss": 0.5791, + "step": 3681 + }, + { + "epoch": 1.7936018187723288, + "grad_norm": 2.9464669227600098, + "learning_rate": 4.011593410523809e-06, + "loss": 0.5343, + "step": 3682 + }, + { + "epoch": 1.794088989931796, + "grad_norm": 2.6441264152526855, + "learning_rate": 4.011081101805339e-06, + "loss": 0.5904, + "step": 3683 + }, + { + "epoch": 1.7945761610912634, + "grad_norm": 2.477090835571289, + "learning_rate": 4.0105686930836615e-06, + "loss": 0.5478, + "step": 3684 + }, + { + "epoch": 1.7950633322507308, + "grad_norm": 2.8917462825775146, + "learning_rate": 4.01005618439269e-06, + "loss": 0.5447, + "step": 3685 + }, + { + "epoch": 1.795550503410198, + "grad_norm": 2.7658700942993164, + "learning_rate": 4.00954357576634e-06, + "loss": 0.5452, + "step": 3686 + }, + { + "epoch": 1.7960376745696656, + "grad_norm": 2.6096315383911133, + "learning_rate": 4.009030867238537e-06, + "loss": 0.5397, + "step": 3687 + }, + { + "epoch": 1.7965248457291327, + "grad_norm": 2.441429853439331, + "learning_rate": 4.008518058843213e-06, + "loss": 0.5278, + "step": 3688 + }, + { + "epoch": 1.7970120168886003, + "grad_norm": 2.661720037460327, + "learning_rate": 4.008005150614303e-06, + "loss": 0.6464, + "step": 3689 + }, + { + "epoch": 1.7974991880480675, + "grad_norm": 2.684058666229248, + "learning_rate": 4.007492142585752e-06, + "loss": 0.5323, + "step": 3690 + }, + { + "epoch": 1.797986359207535, + "grad_norm": 2.7097833156585693, + "learning_rate": 4.006979034791512e-06, + "loss": 0.5094, + "step": 3691 + }, + { + "epoch": 1.7984735303670023, + "grad_norm": 2.5989110469818115, + "learning_rate": 4.00646582726554e-06, + "loss": 0.5688, + "step": 3692 + }, + { + "epoch": 1.7989607015264695, + "grad_norm": 2.5368595123291016, + "learning_rate": 4.0059525200418e-06, + "loss": 0.6894, + "step": 3693 + }, + { + "epoch": 1.799447872685937, + "grad_norm": 2.783780336380005, + "learning_rate": 4.005439113154262e-06, + "loss": 0.5453, + "step": 3694 + }, + { + "epoch": 1.7999350438454043, + "grad_norm": 4.393093585968018, + "learning_rate": 4.004925606636904e-06, + "loss": 0.5337, + "step": 3695 + }, + { + "epoch": 1.8004222150048719, + "grad_norm": 2.6027729511260986, + "learning_rate": 4.00441200052371e-06, + "loss": 0.5392, + "step": 3696 + }, + { + "epoch": 1.800909386164339, + "grad_norm": 2.6217074394226074, + "learning_rate": 4.003898294848668e-06, + "loss": 0.5564, + "step": 3697 + }, + { + "epoch": 1.8013965573238064, + "grad_norm": 3.105201005935669, + "learning_rate": 4.003384489645779e-06, + "loss": 0.6892, + "step": 3698 + }, + { + "epoch": 1.8018837284832738, + "grad_norm": 3.079582691192627, + "learning_rate": 4.002870584949043e-06, + "loss": 0.5631, + "step": 3699 + }, + { + "epoch": 1.802370899642741, + "grad_norm": 2.7025372982025146, + "learning_rate": 4.002356580792473e-06, + "loss": 0.477, + "step": 3700 + }, + { + "epoch": 1.8028580708022086, + "grad_norm": 2.4770452976226807, + "learning_rate": 4.001842477210084e-06, + "loss": 0.5438, + "step": 3701 + }, + { + "epoch": 1.8033452419616758, + "grad_norm": 2.9677629470825195, + "learning_rate": 4.0013282742358995e-06, + "loss": 0.5462, + "step": 3702 + }, + { + "epoch": 1.8038324131211432, + "grad_norm": 2.5698862075805664, + "learning_rate": 4.000813971903948e-06, + "loss": 0.5431, + "step": 3703 + }, + { + "epoch": 1.8043195842806106, + "grad_norm": 2.6690902709960938, + "learning_rate": 4.000299570248271e-06, + "loss": 0.637, + "step": 3704 + }, + { + "epoch": 1.804806755440078, + "grad_norm": 2.528665542602539, + "learning_rate": 3.999785069302905e-06, + "loss": 0.5003, + "step": 3705 + }, + { + "epoch": 1.8052939265995454, + "grad_norm": 2.352771282196045, + "learning_rate": 3.999270469101904e-06, + "loss": 0.5178, + "step": 3706 + }, + { + "epoch": 1.8057810977590125, + "grad_norm": 2.5314178466796875, + "learning_rate": 3.9987557696793235e-06, + "loss": 0.5308, + "step": 3707 + }, + { + "epoch": 1.8062682689184801, + "grad_norm": 2.4018614292144775, + "learning_rate": 3.998240971069225e-06, + "loss": 0.4994, + "step": 3708 + }, + { + "epoch": 1.8067554400779473, + "grad_norm": 2.696160316467285, + "learning_rate": 3.997726073305679e-06, + "loss": 0.5804, + "step": 3709 + }, + { + "epoch": 1.8072426112374147, + "grad_norm": 2.7129645347595215, + "learning_rate": 3.9972110764227605e-06, + "loss": 0.6023, + "step": 3710 + }, + { + "epoch": 1.807729782396882, + "grad_norm": 2.730910539627075, + "learning_rate": 3.996695980454552e-06, + "loss": 0.5041, + "step": 3711 + }, + { + "epoch": 1.8082169535563495, + "grad_norm": 2.4620232582092285, + "learning_rate": 3.9961807854351446e-06, + "loss": 0.5826, + "step": 3712 + }, + { + "epoch": 1.8087041247158169, + "grad_norm": 3.3586537837982178, + "learning_rate": 3.995665491398631e-06, + "loss": 0.5745, + "step": 3713 + }, + { + "epoch": 1.809191295875284, + "grad_norm": 2.6740267276763916, + "learning_rate": 3.995150098379114e-06, + "loss": 0.6134, + "step": 3714 + }, + { + "epoch": 1.8096784670347517, + "grad_norm": 2.8954389095306396, + "learning_rate": 3.994634606410703e-06, + "loss": 0.5808, + "step": 3715 + }, + { + "epoch": 1.8101656381942188, + "grad_norm": 2.686300754547119, + "learning_rate": 3.994119015527513e-06, + "loss": 0.5717, + "step": 3716 + }, + { + "epoch": 1.8106528093536862, + "grad_norm": 2.445624589920044, + "learning_rate": 3.993603325763665e-06, + "loss": 0.567, + "step": 3717 + }, + { + "epoch": 1.8111399805131536, + "grad_norm": 3.427170991897583, + "learning_rate": 3.993087537153289e-06, + "loss": 0.6046, + "step": 3718 + }, + { + "epoch": 1.811627151672621, + "grad_norm": 2.822009563446045, + "learning_rate": 3.992571649730519e-06, + "loss": 0.5508, + "step": 3719 + }, + { + "epoch": 1.8121143228320884, + "grad_norm": 2.7795636653900146, + "learning_rate": 3.992055663529496e-06, + "loss": 0.5513, + "step": 3720 + }, + { + "epoch": 1.8126014939915556, + "grad_norm": 2.709470272064209, + "learning_rate": 3.991539578584368e-06, + "loss": 0.5066, + "step": 3721 + }, + { + "epoch": 1.8130886651510232, + "grad_norm": 2.7912895679473877, + "learning_rate": 3.9910233949292895e-06, + "loss": 0.5046, + "step": 3722 + }, + { + "epoch": 1.8135758363104904, + "grad_norm": 2.6567256450653076, + "learning_rate": 3.990507112598422e-06, + "loss": 0.6186, + "step": 3723 + }, + { + "epoch": 1.8140630074699577, + "grad_norm": 2.9707512855529785, + "learning_rate": 3.989990731625931e-06, + "loss": 0.5168, + "step": 3724 + }, + { + "epoch": 1.8145501786294251, + "grad_norm": 2.838886260986328, + "learning_rate": 3.989474252045994e-06, + "loss": 0.6528, + "step": 3725 + }, + { + "epoch": 1.8150373497888925, + "grad_norm": 2.7959048748016357, + "learning_rate": 3.988957673892789e-06, + "loss": 0.5603, + "step": 3726 + }, + { + "epoch": 1.81552452094836, + "grad_norm": 2.7106127738952637, + "learning_rate": 3.988440997200504e-06, + "loss": 0.625, + "step": 3727 + }, + { + "epoch": 1.816011692107827, + "grad_norm": 2.5937204360961914, + "learning_rate": 3.987924222003332e-06, + "loss": 0.6196, + "step": 3728 + }, + { + "epoch": 1.8164988632672947, + "grad_norm": 2.878628730773926, + "learning_rate": 3.9874073483354734e-06, + "loss": 0.5381, + "step": 3729 + }, + { + "epoch": 1.8169860344267619, + "grad_norm": 2.829792022705078, + "learning_rate": 3.9868903762311355e-06, + "loss": 0.636, + "step": 3730 + }, + { + "epoch": 1.8174732055862293, + "grad_norm": 2.366971969604492, + "learning_rate": 3.986373305724531e-06, + "loss": 0.5837, + "step": 3731 + }, + { + "epoch": 1.8179603767456967, + "grad_norm": 2.74899959564209, + "learning_rate": 3.985856136849879e-06, + "loss": 0.5594, + "step": 3732 + }, + { + "epoch": 1.818447547905164, + "grad_norm": 2.6797924041748047, + "learning_rate": 3.985338869641407e-06, + "loss": 0.5059, + "step": 3733 + }, + { + "epoch": 1.8189347190646314, + "grad_norm": 2.5211496353149414, + "learning_rate": 3.984821504133346e-06, + "loss": 0.5395, + "step": 3734 + }, + { + "epoch": 1.8194218902240986, + "grad_norm": 2.514956474304199, + "learning_rate": 3.984304040359937e-06, + "loss": 0.6063, + "step": 3735 + }, + { + "epoch": 1.8199090613835662, + "grad_norm": 2.900834321975708, + "learning_rate": 3.9837864783554245e-06, + "loss": 0.5076, + "step": 3736 + }, + { + "epoch": 1.8203962325430334, + "grad_norm": 2.5016028881073, + "learning_rate": 3.9832688181540615e-06, + "loss": 0.5495, + "step": 3737 + }, + { + "epoch": 1.8208834037025008, + "grad_norm": 2.548081159591675, + "learning_rate": 3.982751059790105e-06, + "loss": 0.5128, + "step": 3738 + }, + { + "epoch": 1.8213705748619682, + "grad_norm": 2.638485908508301, + "learning_rate": 3.982233203297822e-06, + "loss": 0.5605, + "step": 3739 + }, + { + "epoch": 1.8218577460214356, + "grad_norm": 2.50589656829834, + "learning_rate": 3.981715248711484e-06, + "loss": 0.6125, + "step": 3740 + }, + { + "epoch": 1.822344917180903, + "grad_norm": 3.151634931564331, + "learning_rate": 3.981197196065369e-06, + "loss": 0.6765, + "step": 3741 + }, + { + "epoch": 1.8228320883403701, + "grad_norm": 3.055067539215088, + "learning_rate": 3.98067904539376e-06, + "loss": 0.6016, + "step": 3742 + }, + { + "epoch": 1.8233192594998378, + "grad_norm": 2.940098285675049, + "learning_rate": 3.9801607967309514e-06, + "loss": 0.6122, + "step": 3743 + }, + { + "epoch": 1.823806430659305, + "grad_norm": 2.733792543411255, + "learning_rate": 3.979642450111239e-06, + "loss": 0.635, + "step": 3744 + }, + { + "epoch": 1.8242936018187723, + "grad_norm": 3.4877665042877197, + "learning_rate": 3.979124005568926e-06, + "loss": 0.5524, + "step": 3745 + }, + { + "epoch": 1.8247807729782397, + "grad_norm": 2.6130478382110596, + "learning_rate": 3.978605463138324e-06, + "loss": 0.5015, + "step": 3746 + }, + { + "epoch": 1.825267944137707, + "grad_norm": 2.5077874660491943, + "learning_rate": 3.9780868228537515e-06, + "loss": 0.5186, + "step": 3747 + }, + { + "epoch": 1.8257551152971745, + "grad_norm": 2.8253588676452637, + "learning_rate": 3.977568084749529e-06, + "loss": 0.5759, + "step": 3748 + }, + { + "epoch": 1.8262422864566417, + "grad_norm": 2.5087783336639404, + "learning_rate": 3.97704924885999e-06, + "loss": 0.5549, + "step": 3749 + }, + { + "epoch": 1.8267294576161093, + "grad_norm": 2.6556265354156494, + "learning_rate": 3.976530315219468e-06, + "loss": 0.5618, + "step": 3750 + }, + { + "epoch": 1.8272166287755764, + "grad_norm": 2.8637988567352295, + "learning_rate": 3.976011283862308e-06, + "loss": 0.5341, + "step": 3751 + }, + { + "epoch": 1.8277037999350438, + "grad_norm": 2.4723544120788574, + "learning_rate": 3.975492154822857e-06, + "loss": 0.532, + "step": 3752 + }, + { + "epoch": 1.8281909710945112, + "grad_norm": 2.7158608436584473, + "learning_rate": 3.9749729281354746e-06, + "loss": 0.6119, + "step": 3753 + }, + { + "epoch": 1.8286781422539784, + "grad_norm": 2.628722667694092, + "learning_rate": 3.974453603834521e-06, + "loss": 0.5891, + "step": 3754 + }, + { + "epoch": 1.829165313413446, + "grad_norm": 2.803321599960327, + "learning_rate": 3.973934181954364e-06, + "loss": 0.5975, + "step": 3755 + }, + { + "epoch": 1.8296524845729132, + "grad_norm": 2.7356879711151123, + "learning_rate": 3.97341466252938e-06, + "loss": 0.6035, + "step": 3756 + }, + { + "epoch": 1.8301396557323808, + "grad_norm": 2.8311376571655273, + "learning_rate": 3.9728950455939525e-06, + "loss": 0.6266, + "step": 3757 + }, + { + "epoch": 1.830626826891848, + "grad_norm": 2.856192111968994, + "learning_rate": 3.972375331182467e-06, + "loss": 0.6179, + "step": 3758 + }, + { + "epoch": 1.8311139980513154, + "grad_norm": 2.694915533065796, + "learning_rate": 3.97185551932932e-06, + "loss": 0.5513, + "step": 3759 + }, + { + "epoch": 1.8316011692107828, + "grad_norm": 4.754314422607422, + "learning_rate": 3.97133561006891e-06, + "loss": 0.5684, + "step": 3760 + }, + { + "epoch": 1.83208834037025, + "grad_norm": 2.5785293579101562, + "learning_rate": 3.970815603435648e-06, + "loss": 0.5557, + "step": 3761 + }, + { + "epoch": 1.8325755115297175, + "grad_norm": 2.6831061840057373, + "learning_rate": 3.970295499463945e-06, + "loss": 0.5546, + "step": 3762 + }, + { + "epoch": 1.8330626826891847, + "grad_norm": 2.54059100151062, + "learning_rate": 3.969775298188224e-06, + "loss": 0.5102, + "step": 3763 + }, + { + "epoch": 1.8335498538486523, + "grad_norm": 2.570732831954956, + "learning_rate": 3.96925499964291e-06, + "loss": 0.5818, + "step": 3764 + }, + { + "epoch": 1.8340370250081195, + "grad_norm": 2.5637364387512207, + "learning_rate": 3.968734603862437e-06, + "loss": 0.5178, + "step": 3765 + }, + { + "epoch": 1.8345241961675869, + "grad_norm": 4.659806728363037, + "learning_rate": 3.968214110881245e-06, + "loss": 0.4951, + "step": 3766 + }, + { + "epoch": 1.8350113673270543, + "grad_norm": 2.6411194801330566, + "learning_rate": 3.96769352073378e-06, + "loss": 0.5256, + "step": 3767 + }, + { + "epoch": 1.8354985384865214, + "grad_norm": 2.9015774726867676, + "learning_rate": 3.967172833454493e-06, + "loss": 0.5971, + "step": 3768 + }, + { + "epoch": 1.835985709645989, + "grad_norm": 2.6830785274505615, + "learning_rate": 3.966652049077846e-06, + "loss": 0.5353, + "step": 3769 + }, + { + "epoch": 1.8364728808054562, + "grad_norm": 2.4615464210510254, + "learning_rate": 3.966131167638303e-06, + "loss": 0.5685, + "step": 3770 + }, + { + "epoch": 1.8369600519649236, + "grad_norm": 2.7616403102874756, + "learning_rate": 3.965610189170335e-06, + "loss": 0.6464, + "step": 3771 + }, + { + "epoch": 1.837447223124391, + "grad_norm": 2.641857624053955, + "learning_rate": 3.965089113708421e-06, + "loss": 0.5456, + "step": 3772 + }, + { + "epoch": 1.8379343942838584, + "grad_norm": 2.988710641860962, + "learning_rate": 3.964567941287047e-06, + "loss": 0.6489, + "step": 3773 + }, + { + "epoch": 1.8384215654433258, + "grad_norm": 2.477329730987549, + "learning_rate": 3.964046671940703e-06, + "loss": 0.5895, + "step": 3774 + }, + { + "epoch": 1.838908736602793, + "grad_norm": 2.655695915222168, + "learning_rate": 3.963525305703886e-06, + "loss": 0.5233, + "step": 3775 + }, + { + "epoch": 1.8393959077622606, + "grad_norm": 2.5568549633026123, + "learning_rate": 3.9630038426111006e-06, + "loss": 0.5147, + "step": 3776 + }, + { + "epoch": 1.8398830789217278, + "grad_norm": 2.7559549808502197, + "learning_rate": 3.962482282696857e-06, + "loss": 0.5531, + "step": 3777 + }, + { + "epoch": 1.8403702500811951, + "grad_norm": 2.864713668823242, + "learning_rate": 3.961960625995672e-06, + "loss": 0.6366, + "step": 3778 + }, + { + "epoch": 1.8408574212406625, + "grad_norm": 2.902045249938965, + "learning_rate": 3.961438872542069e-06, + "loss": 0.5372, + "step": 3779 + }, + { + "epoch": 1.84134459240013, + "grad_norm": 2.772477865219116, + "learning_rate": 3.960917022370578e-06, + "loss": 0.6824, + "step": 3780 + }, + { + "epoch": 1.8418317635595973, + "grad_norm": 2.681943893432617, + "learning_rate": 3.960395075515734e-06, + "loss": 0.5416, + "step": 3781 + }, + { + "epoch": 1.8423189347190645, + "grad_norm": 2.7607176303863525, + "learning_rate": 3.959873032012081e-06, + "loss": 0.5861, + "step": 3782 + }, + { + "epoch": 1.842806105878532, + "grad_norm": 2.7329723834991455, + "learning_rate": 3.959350891894166e-06, + "loss": 0.5309, + "step": 3783 + }, + { + "epoch": 1.8432932770379993, + "grad_norm": 2.2759571075439453, + "learning_rate": 3.958828655196543e-06, + "loss": 0.5518, + "step": 3784 + }, + { + "epoch": 1.8437804481974667, + "grad_norm": 2.606476306915283, + "learning_rate": 3.958306321953778e-06, + "loss": 0.6406, + "step": 3785 + }, + { + "epoch": 1.844267619356934, + "grad_norm": 2.630976438522339, + "learning_rate": 3.957783892200435e-06, + "loss": 0.5185, + "step": 3786 + }, + { + "epoch": 1.8447547905164015, + "grad_norm": 2.5183181762695312, + "learning_rate": 3.95726136597109e-06, + "loss": 0.4916, + "step": 3787 + }, + { + "epoch": 1.8452419616758688, + "grad_norm": 2.870701551437378, + "learning_rate": 3.956738743300324e-06, + "loss": 0.5838, + "step": 3788 + }, + { + "epoch": 1.845729132835336, + "grad_norm": 2.8152642250061035, + "learning_rate": 3.956216024222724e-06, + "loss": 0.6565, + "step": 3789 + }, + { + "epoch": 1.8462163039948036, + "grad_norm": 2.9225592613220215, + "learning_rate": 3.955693208772882e-06, + "loss": 0.5513, + "step": 3790 + }, + { + "epoch": 1.8467034751542708, + "grad_norm": 2.35239577293396, + "learning_rate": 3.9551702969854e-06, + "loss": 0.5166, + "step": 3791 + }, + { + "epoch": 1.8471906463137382, + "grad_norm": 2.673705816268921, + "learning_rate": 3.9546472888948825e-06, + "loss": 0.5056, + "step": 3792 + }, + { + "epoch": 1.8476778174732056, + "grad_norm": 2.599918842315674, + "learning_rate": 3.954124184535943e-06, + "loss": 0.544, + "step": 3793 + }, + { + "epoch": 1.848164988632673, + "grad_norm": 2.4357337951660156, + "learning_rate": 3.953600983943201e-06, + "loss": 0.5629, + "step": 3794 + }, + { + "epoch": 1.8486521597921404, + "grad_norm": 2.8911664485931396, + "learning_rate": 3.95307768715128e-06, + "loss": 0.5871, + "step": 3795 + }, + { + "epoch": 1.8491393309516075, + "grad_norm": 2.9353554248809814, + "learning_rate": 3.952554294194814e-06, + "loss": 0.5721, + "step": 3796 + }, + { + "epoch": 1.8496265021110752, + "grad_norm": 2.486661911010742, + "learning_rate": 3.95203080510844e-06, + "loss": 0.5298, + "step": 3797 + }, + { + "epoch": 1.8501136732705423, + "grad_norm": 2.6324374675750732, + "learning_rate": 3.9515072199268025e-06, + "loss": 0.5556, + "step": 3798 + }, + { + "epoch": 1.8506008444300097, + "grad_norm": 3.288942337036133, + "learning_rate": 3.950983538684552e-06, + "loss": 0.4808, + "step": 3799 + }, + { + "epoch": 1.851088015589477, + "grad_norm": 3.0199837684631348, + "learning_rate": 3.950459761416346e-06, + "loss": 0.6161, + "step": 3800 + }, + { + "epoch": 1.8515751867489445, + "grad_norm": 2.8229451179504395, + "learning_rate": 3.949935888156848e-06, + "loss": 0.5786, + "step": 3801 + }, + { + "epoch": 1.852062357908412, + "grad_norm": 2.843040704727173, + "learning_rate": 3.949411918940728e-06, + "loss": 0.5399, + "step": 3802 + }, + { + "epoch": 1.852549529067879, + "grad_norm": 2.874274969100952, + "learning_rate": 3.948887853802662e-06, + "loss": 0.6112, + "step": 3803 + }, + { + "epoch": 1.8530367002273467, + "grad_norm": 2.774353265762329, + "learning_rate": 3.948363692777334e-06, + "loss": 0.5582, + "step": 3804 + }, + { + "epoch": 1.8535238713868138, + "grad_norm": 2.579308271408081, + "learning_rate": 3.94783943589943e-06, + "loss": 0.5752, + "step": 3805 + }, + { + "epoch": 1.8540110425462812, + "grad_norm": 2.8006269931793213, + "learning_rate": 3.947315083203649e-06, + "loss": 0.5769, + "step": 3806 + }, + { + "epoch": 1.8544982137057486, + "grad_norm": 2.94958758354187, + "learning_rate": 3.946790634724689e-06, + "loss": 0.5595, + "step": 3807 + }, + { + "epoch": 1.854985384865216, + "grad_norm": 2.489267587661743, + "learning_rate": 3.946266090497259e-06, + "loss": 0.5121, + "step": 3808 + }, + { + "epoch": 1.8554725560246834, + "grad_norm": 2.6524910926818848, + "learning_rate": 3.945741450556075e-06, + "loss": 0.5407, + "step": 3809 + }, + { + "epoch": 1.8559597271841506, + "grad_norm": 2.701378345489502, + "learning_rate": 3.9452167149358564e-06, + "loss": 0.5372, + "step": 3810 + }, + { + "epoch": 1.8564468983436182, + "grad_norm": 2.489466428756714, + "learning_rate": 3.94469188367133e-06, + "loss": 0.5138, + "step": 3811 + }, + { + "epoch": 1.8569340695030854, + "grad_norm": 2.4081358909606934, + "learning_rate": 3.944166956797229e-06, + "loss": 0.5097, + "step": 3812 + }, + { + "epoch": 1.8574212406625528, + "grad_norm": 2.5874602794647217, + "learning_rate": 3.943641934348294e-06, + "loss": 0.6018, + "step": 3813 + }, + { + "epoch": 1.8579084118220202, + "grad_norm": 2.492863655090332, + "learning_rate": 3.9431168163592695e-06, + "loss": 0.4934, + "step": 3814 + }, + { + "epoch": 1.8583955829814875, + "grad_norm": 2.6260204315185547, + "learning_rate": 3.942591602864908e-06, + "loss": 0.5409, + "step": 3815 + }, + { + "epoch": 1.858882754140955, + "grad_norm": 2.907440185546875, + "learning_rate": 3.9420662938999695e-06, + "loss": 0.6089, + "step": 3816 + }, + { + "epoch": 1.859369925300422, + "grad_norm": 3.2597124576568604, + "learning_rate": 3.9415408894992165e-06, + "loss": 0.606, + "step": 3817 + }, + { + "epoch": 1.8598570964598897, + "grad_norm": 2.4870548248291016, + "learning_rate": 3.941015389697423e-06, + "loss": 0.5848, + "step": 3818 + }, + { + "epoch": 1.860344267619357, + "grad_norm": 2.51237416267395, + "learning_rate": 3.940489794529365e-06, + "loss": 0.553, + "step": 3819 + }, + { + "epoch": 1.8608314387788243, + "grad_norm": 3.029294490814209, + "learning_rate": 3.939964104029827e-06, + "loss": 0.4755, + "step": 3820 + }, + { + "epoch": 1.8613186099382917, + "grad_norm": 2.2713613510131836, + "learning_rate": 3.939438318233598e-06, + "loss": 0.5853, + "step": 3821 + }, + { + "epoch": 1.8618057810977588, + "grad_norm": 2.5592446327209473, + "learning_rate": 3.938912437175475e-06, + "loss": 0.5324, + "step": 3822 + }, + { + "epoch": 1.8622929522572265, + "grad_norm": 2.5847463607788086, + "learning_rate": 3.938386460890262e-06, + "loss": 0.5648, + "step": 3823 + }, + { + "epoch": 1.8627801234166936, + "grad_norm": 2.745863676071167, + "learning_rate": 3.937860389412766e-06, + "loss": 0.5658, + "step": 3824 + }, + { + "epoch": 1.8632672945761612, + "grad_norm": 2.757666826248169, + "learning_rate": 3.9373342227778035e-06, + "loss": 0.5685, + "step": 3825 + }, + { + "epoch": 1.8637544657356284, + "grad_norm": 2.8285820484161377, + "learning_rate": 3.9368079610201955e-06, + "loss": 0.5863, + "step": 3826 + }, + { + "epoch": 1.8642416368950958, + "grad_norm": 2.857015371322632, + "learning_rate": 3.936281604174772e-06, + "loss": 0.5601, + "step": 3827 + }, + { + "epoch": 1.8647288080545632, + "grad_norm": 2.4290153980255127, + "learning_rate": 3.935755152276364e-06, + "loss": 0.5676, + "step": 3828 + }, + { + "epoch": 1.8652159792140304, + "grad_norm": 2.951913595199585, + "learning_rate": 3.935228605359814e-06, + "loss": 0.5739, + "step": 3829 + }, + { + "epoch": 1.865703150373498, + "grad_norm": 2.783059597015381, + "learning_rate": 3.93470196345997e-06, + "loss": 0.5676, + "step": 3830 + }, + { + "epoch": 1.8661903215329652, + "grad_norm": 2.6889355182647705, + "learning_rate": 3.9341752266116815e-06, + "loss": 0.596, + "step": 3831 + }, + { + "epoch": 1.8666774926924328, + "grad_norm": 2.82362699508667, + "learning_rate": 3.933648394849812e-06, + "loss": 0.5494, + "step": 3832 + }, + { + "epoch": 1.8671646638519, + "grad_norm": 3.109005928039551, + "learning_rate": 3.933121468209224e-06, + "loss": 0.5253, + "step": 3833 + }, + { + "epoch": 1.8676518350113673, + "grad_norm": 2.5883805751800537, + "learning_rate": 3.9325944467247915e-06, + "loss": 0.5242, + "step": 3834 + }, + { + "epoch": 1.8681390061708347, + "grad_norm": 2.730119466781616, + "learning_rate": 3.93206733043139e-06, + "loss": 0.5973, + "step": 3835 + }, + { + "epoch": 1.868626177330302, + "grad_norm": 2.735896587371826, + "learning_rate": 3.931540119363908e-06, + "loss": 0.6131, + "step": 3836 + }, + { + "epoch": 1.8691133484897695, + "grad_norm": 2.695265531539917, + "learning_rate": 3.931012813557234e-06, + "loss": 0.5903, + "step": 3837 + }, + { + "epoch": 1.8696005196492367, + "grad_norm": 3.01832914352417, + "learning_rate": 3.930485413046264e-06, + "loss": 0.5299, + "step": 3838 + }, + { + "epoch": 1.870087690808704, + "grad_norm": 2.722229242324829, + "learning_rate": 3.929957917865904e-06, + "loss": 0.5343, + "step": 3839 + }, + { + "epoch": 1.8705748619681715, + "grad_norm": 2.530130386352539, + "learning_rate": 3.929430328051062e-06, + "loss": 0.5406, + "step": 3840 + }, + { + "epoch": 1.8710620331276389, + "grad_norm": 2.865084171295166, + "learning_rate": 3.928902643636653e-06, + "loss": 0.4932, + "step": 3841 + }, + { + "epoch": 1.8715492042871062, + "grad_norm": 2.6955807209014893, + "learning_rate": 3.928374864657601e-06, + "loss": 0.612, + "step": 3842 + }, + { + "epoch": 1.8720363754465734, + "grad_norm": 2.7329964637756348, + "learning_rate": 3.927846991148833e-06, + "loss": 0.6323, + "step": 3843 + }, + { + "epoch": 1.872523546606041, + "grad_norm": 2.6305251121520996, + "learning_rate": 3.927319023145284e-06, + "loss": 0.596, + "step": 3844 + }, + { + "epoch": 1.8730107177655082, + "grad_norm": 2.860774517059326, + "learning_rate": 3.926790960681895e-06, + "loss": 0.5104, + "step": 3845 + }, + { + "epoch": 1.8734978889249756, + "grad_norm": 2.490156650543213, + "learning_rate": 3.926262803793614e-06, + "loss": 0.6371, + "step": 3846 + }, + { + "epoch": 1.873985060084443, + "grad_norm": 3.082805871963501, + "learning_rate": 3.925734552515392e-06, + "loss": 0.544, + "step": 3847 + }, + { + "epoch": 1.8744722312439104, + "grad_norm": 2.5260424613952637, + "learning_rate": 3.925206206882191e-06, + "loss": 0.5338, + "step": 3848 + }, + { + "epoch": 1.8749594024033778, + "grad_norm": 2.434368848800659, + "learning_rate": 3.924677766928976e-06, + "loss": 0.5927, + "step": 3849 + }, + { + "epoch": 1.875446573562845, + "grad_norm": 2.5369224548339844, + "learning_rate": 3.924149232690718e-06, + "loss": 0.6021, + "step": 3850 + }, + { + "epoch": 1.8759337447223126, + "grad_norm": 2.541764974594116, + "learning_rate": 3.9236206042023975e-06, + "loss": 0.543, + "step": 3851 + }, + { + "epoch": 1.8764209158817797, + "grad_norm": 2.6162946224212646, + "learning_rate": 3.923091881498997e-06, + "loss": 0.5745, + "step": 3852 + }, + { + "epoch": 1.8769080870412471, + "grad_norm": 2.543821096420288, + "learning_rate": 3.92256306461551e-06, + "loss": 0.4635, + "step": 3853 + }, + { + "epoch": 1.8773952582007145, + "grad_norm": 2.687148332595825, + "learning_rate": 3.92203415358693e-06, + "loss": 0.5616, + "step": 3854 + }, + { + "epoch": 1.877882429360182, + "grad_norm": 2.834524154663086, + "learning_rate": 3.9215051484482635e-06, + "loss": 0.5231, + "step": 3855 + }, + { + "epoch": 1.8783696005196493, + "grad_norm": 2.3833818435668945, + "learning_rate": 3.920976049234518e-06, + "loss": 0.5115, + "step": 3856 + }, + { + "epoch": 1.8788567716791165, + "grad_norm": 2.638138771057129, + "learning_rate": 3.92044685598071e-06, + "loss": 0.5439, + "step": 3857 + }, + { + "epoch": 1.879343942838584, + "grad_norm": 2.2533390522003174, + "learning_rate": 3.919917568721861e-06, + "loss": 0.549, + "step": 3858 + }, + { + "epoch": 1.8798311139980513, + "grad_norm": 2.925020217895508, + "learning_rate": 3.919388187493e-06, + "loss": 0.5385, + "step": 3859 + }, + { + "epoch": 1.8803182851575186, + "grad_norm": 2.47186017036438, + "learning_rate": 3.918858712329161e-06, + "loss": 0.575, + "step": 3860 + }, + { + "epoch": 1.880805456316986, + "grad_norm": 2.5011940002441406, + "learning_rate": 3.9183291432653845e-06, + "loss": 0.6117, + "step": 3861 + }, + { + "epoch": 1.8812926274764534, + "grad_norm": 2.8381502628326416, + "learning_rate": 3.9177994803367166e-06, + "loss": 0.5805, + "step": 3862 + }, + { + "epoch": 1.8817797986359208, + "grad_norm": 2.7815113067626953, + "learning_rate": 3.917269723578212e-06, + "loss": 0.5419, + "step": 3863 + }, + { + "epoch": 1.882266969795388, + "grad_norm": 2.8004941940307617, + "learning_rate": 3.916739873024928e-06, + "loss": 0.5696, + "step": 3864 + }, + { + "epoch": 1.8827541409548556, + "grad_norm": 2.9849517345428467, + "learning_rate": 3.916209928711932e-06, + "loss": 0.6022, + "step": 3865 + }, + { + "epoch": 1.8832413121143228, + "grad_norm": 3.2580738067626953, + "learning_rate": 3.915679890674295e-06, + "loss": 0.5138, + "step": 3866 + }, + { + "epoch": 1.8837284832737902, + "grad_norm": 2.5855023860931396, + "learning_rate": 3.915149758947094e-06, + "loss": 0.5073, + "step": 3867 + }, + { + "epoch": 1.8842156544332576, + "grad_norm": 2.6170833110809326, + "learning_rate": 3.914619533565415e-06, + "loss": 0.5485, + "step": 3868 + }, + { + "epoch": 1.884702825592725, + "grad_norm": 2.880341053009033, + "learning_rate": 3.914089214564346e-06, + "loss": 0.5976, + "step": 3869 + }, + { + "epoch": 1.8851899967521923, + "grad_norm": 2.839975595474243, + "learning_rate": 3.913558801978984e-06, + "loss": 0.4717, + "step": 3870 + }, + { + "epoch": 1.8856771679116595, + "grad_norm": 2.721665143966675, + "learning_rate": 3.913028295844433e-06, + "loss": 0.514, + "step": 3871 + }, + { + "epoch": 1.8861643390711271, + "grad_norm": 2.7208662033081055, + "learning_rate": 3.9124976961958005e-06, + "loss": 0.5208, + "step": 3872 + }, + { + "epoch": 1.8866515102305943, + "grad_norm": 3.0455174446105957, + "learning_rate": 3.911967003068202e-06, + "loss": 0.6267, + "step": 3873 + }, + { + "epoch": 1.8871386813900617, + "grad_norm": 2.7235302925109863, + "learning_rate": 3.911436216496759e-06, + "loss": 0.5408, + "step": 3874 + }, + { + "epoch": 1.887625852549529, + "grad_norm": 2.6077191829681396, + "learning_rate": 3.910905336516599e-06, + "loss": 0.5558, + "step": 3875 + }, + { + "epoch": 1.8881130237089965, + "grad_norm": 2.8556571006774902, + "learning_rate": 3.910374363162854e-06, + "loss": 0.5621, + "step": 3876 + }, + { + "epoch": 1.8886001948684639, + "grad_norm": 2.6332430839538574, + "learning_rate": 3.9098432964706655e-06, + "loss": 0.6641, + "step": 3877 + }, + { + "epoch": 1.889087366027931, + "grad_norm": 2.935696840286255, + "learning_rate": 3.9093121364751795e-06, + "loss": 0.5649, + "step": 3878 + }, + { + "epoch": 1.8895745371873987, + "grad_norm": 2.6958210468292236, + "learning_rate": 3.908780883211547e-06, + "loss": 0.5051, + "step": 3879 + }, + { + "epoch": 1.8900617083468658, + "grad_norm": 2.6187562942504883, + "learning_rate": 3.908249536714927e-06, + "loss": 0.538, + "step": 3880 + }, + { + "epoch": 1.8905488795063332, + "grad_norm": 2.74429988861084, + "learning_rate": 3.907718097020484e-06, + "loss": 0.5597, + "step": 3881 + }, + { + "epoch": 1.8910360506658006, + "grad_norm": 2.74322772026062, + "learning_rate": 3.907186564163388e-06, + "loss": 0.5454, + "step": 3882 + }, + { + "epoch": 1.891523221825268, + "grad_norm": 2.4631192684173584, + "learning_rate": 3.906654938178816e-06, + "loss": 0.5875, + "step": 3883 + }, + { + "epoch": 1.8920103929847354, + "grad_norm": 2.9246342182159424, + "learning_rate": 3.9061232191019525e-06, + "loss": 0.5431, + "step": 3884 + }, + { + "epoch": 1.8924975641442026, + "grad_norm": 2.5993199348449707, + "learning_rate": 3.905591406967983e-06, + "loss": 0.6021, + "step": 3885 + }, + { + "epoch": 1.8929847353036702, + "grad_norm": 2.8829572200775146, + "learning_rate": 3.905059501812108e-06, + "loss": 0.5456, + "step": 3886 + }, + { + "epoch": 1.8934719064631373, + "grad_norm": 2.765469789505005, + "learning_rate": 3.9045275036695245e-06, + "loss": 0.5816, + "step": 3887 + }, + { + "epoch": 1.8939590776226047, + "grad_norm": 2.584289073944092, + "learning_rate": 3.903995412575441e-06, + "loss": 0.5951, + "step": 3888 + }, + { + "epoch": 1.8944462487820721, + "grad_norm": 2.9744045734405518, + "learning_rate": 3.903463228565072e-06, + "loss": 0.5538, + "step": 3889 + }, + { + "epoch": 1.8949334199415393, + "grad_norm": 2.5713858604431152, + "learning_rate": 3.902930951673638e-06, + "loss": 0.586, + "step": 3890 + }, + { + "epoch": 1.895420591101007, + "grad_norm": 2.7413175106048584, + "learning_rate": 3.902398581936364e-06, + "loss": 0.5518, + "step": 3891 + }, + { + "epoch": 1.895907762260474, + "grad_norm": 2.8488223552703857, + "learning_rate": 3.901866119388482e-06, + "loss": 0.533, + "step": 3892 + }, + { + "epoch": 1.8963949334199417, + "grad_norm": 2.598935127258301, + "learning_rate": 3.901333564065231e-06, + "loss": 0.5804, + "step": 3893 + }, + { + "epoch": 1.8968821045794089, + "grad_norm": 2.359686851501465, + "learning_rate": 3.900800916001855e-06, + "loss": 0.5971, + "step": 3894 + }, + { + "epoch": 1.8973692757388763, + "grad_norm": 2.7632925510406494, + "learning_rate": 3.900268175233606e-06, + "loss": 0.5784, + "step": 3895 + }, + { + "epoch": 1.8978564468983437, + "grad_norm": 2.328245162963867, + "learning_rate": 3.899735341795739e-06, + "loss": 0.5553, + "step": 3896 + }, + { + "epoch": 1.8983436180578108, + "grad_norm": 2.4601657390594482, + "learning_rate": 3.899202415723517e-06, + "loss": 0.5324, + "step": 3897 + }, + { + "epoch": 1.8988307892172784, + "grad_norm": 2.7745378017425537, + "learning_rate": 3.89866939705221e-06, + "loss": 0.5236, + "step": 3898 + }, + { + "epoch": 1.8993179603767456, + "grad_norm": 2.8490071296691895, + "learning_rate": 3.898136285817091e-06, + "loss": 0.5725, + "step": 3899 + }, + { + "epoch": 1.8998051315362132, + "grad_norm": 2.5595412254333496, + "learning_rate": 3.897603082053444e-06, + "loss": 0.5993, + "step": 3900 + }, + { + "epoch": 1.9002923026956804, + "grad_norm": 2.671517848968506, + "learning_rate": 3.897069785796555e-06, + "loss": 0.5973, + "step": 3901 + }, + { + "epoch": 1.9007794738551478, + "grad_norm": 2.656132698059082, + "learning_rate": 3.896536397081718e-06, + "loss": 0.5343, + "step": 3902 + }, + { + "epoch": 1.9012666450146152, + "grad_norm": 2.7595198154449463, + "learning_rate": 3.896002915944232e-06, + "loss": 0.6046, + "step": 3903 + }, + { + "epoch": 1.9017538161740823, + "grad_norm": 2.7796361446380615, + "learning_rate": 3.895469342419403e-06, + "loss": 0.5251, + "step": 3904 + }, + { + "epoch": 1.90224098733355, + "grad_norm": 2.345346450805664, + "learning_rate": 3.894935676542542e-06, + "loss": 0.5798, + "step": 3905 + }, + { + "epoch": 1.9027281584930171, + "grad_norm": 2.552638053894043, + "learning_rate": 3.894401918348969e-06, + "loss": 0.5675, + "step": 3906 + }, + { + "epoch": 1.9032153296524845, + "grad_norm": 2.567073106765747, + "learning_rate": 3.893868067874007e-06, + "loss": 0.541, + "step": 3907 + }, + { + "epoch": 1.903702500811952, + "grad_norm": 2.6803741455078125, + "learning_rate": 3.893334125152986e-06, + "loss": 0.6095, + "step": 3908 + }, + { + "epoch": 1.9041896719714193, + "grad_norm": 2.7989964485168457, + "learning_rate": 3.892800090221241e-06, + "loss": 0.5343, + "step": 3909 + }, + { + "epoch": 1.9046768431308867, + "grad_norm": 2.679039478302002, + "learning_rate": 3.892265963114117e-06, + "loss": 0.5626, + "step": 3910 + }, + { + "epoch": 1.9051640142903539, + "grad_norm": 2.7830324172973633, + "learning_rate": 3.891731743866961e-06, + "loss": 0.5267, + "step": 3911 + }, + { + "epoch": 1.9056511854498215, + "grad_norm": 3.1504108905792236, + "learning_rate": 3.891197432515128e-06, + "loss": 0.5381, + "step": 3912 + }, + { + "epoch": 1.9061383566092887, + "grad_norm": 3.9695630073547363, + "learning_rate": 3.890663029093979e-06, + "loss": 0.4563, + "step": 3913 + }, + { + "epoch": 1.906625527768756, + "grad_norm": 2.50885272026062, + "learning_rate": 3.890128533638878e-06, + "loss": 0.5156, + "step": 3914 + }, + { + "epoch": 1.9071126989282234, + "grad_norm": 2.563903331756592, + "learning_rate": 3.889593946185202e-06, + "loss": 0.6299, + "step": 3915 + }, + { + "epoch": 1.9075998700876908, + "grad_norm": 2.997382164001465, + "learning_rate": 3.889059266768328e-06, + "loss": 0.6407, + "step": 3916 + }, + { + "epoch": 1.9080870412471582, + "grad_norm": 2.7532966136932373, + "learning_rate": 3.88852449542364e-06, + "loss": 0.4891, + "step": 3917 + }, + { + "epoch": 1.9085742124066254, + "grad_norm": 2.7831599712371826, + "learning_rate": 3.8879896321865314e-06, + "loss": 0.5742, + "step": 3918 + }, + { + "epoch": 1.909061383566093, + "grad_norm": 2.555772542953491, + "learning_rate": 3.887454677092397e-06, + "loss": 0.4943, + "step": 3919 + }, + { + "epoch": 1.9095485547255602, + "grad_norm": 2.333188056945801, + "learning_rate": 3.886919630176642e-06, + "loss": 0.4882, + "step": 3920 + }, + { + "epoch": 1.9100357258850276, + "grad_norm": 2.8428099155426025, + "learning_rate": 3.886384491474675e-06, + "loss": 0.6112, + "step": 3921 + }, + { + "epoch": 1.910522897044495, + "grad_norm": 2.6305603981018066, + "learning_rate": 3.885849261021911e-06, + "loss": 0.5971, + "step": 3922 + }, + { + "epoch": 1.9110100682039624, + "grad_norm": 2.734218120574951, + "learning_rate": 3.885313938853773e-06, + "loss": 0.5994, + "step": 3923 + }, + { + "epoch": 1.9114972393634297, + "grad_norm": 2.7949609756469727, + "learning_rate": 3.8847785250056865e-06, + "loss": 0.5401, + "step": 3924 + }, + { + "epoch": 1.911984410522897, + "grad_norm": 2.96761417388916, + "learning_rate": 3.884243019513086e-06, + "loss": 0.5859, + "step": 3925 + }, + { + "epoch": 1.9124715816823645, + "grad_norm": 4.071978569030762, + "learning_rate": 3.883707422411412e-06, + "loss": 0.5712, + "step": 3926 + }, + { + "epoch": 1.9129587528418317, + "grad_norm": 2.323817491531372, + "learning_rate": 3.88317173373611e-06, + "loss": 0.5106, + "step": 3927 + }, + { + "epoch": 1.913445924001299, + "grad_norm": 2.7381339073181152, + "learning_rate": 3.882635953522631e-06, + "loss": 0.5312, + "step": 3928 + }, + { + "epoch": 1.9139330951607665, + "grad_norm": 2.6686501502990723, + "learning_rate": 3.882100081806433e-06, + "loss": 0.565, + "step": 3929 + }, + { + "epoch": 1.9144202663202339, + "grad_norm": 2.6687331199645996, + "learning_rate": 3.881564118622982e-06, + "loss": 0.5351, + "step": 3930 + }, + { + "epoch": 1.9149074374797013, + "grad_norm": 2.4137821197509766, + "learning_rate": 3.881028064007744e-06, + "loss": 0.5482, + "step": 3931 + }, + { + "epoch": 1.9153946086391684, + "grad_norm": 2.769444704055786, + "learning_rate": 3.880491917996198e-06, + "loss": 0.5366, + "step": 3932 + }, + { + "epoch": 1.915881779798636, + "grad_norm": 2.2127931118011475, + "learning_rate": 3.879955680623826e-06, + "loss": 0.5306, + "step": 3933 + }, + { + "epoch": 1.9163689509581032, + "grad_norm": 2.6008830070495605, + "learning_rate": 3.879419351926115e-06, + "loss": 0.6103, + "step": 3934 + }, + { + "epoch": 1.9168561221175706, + "grad_norm": 2.8267016410827637, + "learning_rate": 3.87888293193856e-06, + "loss": 0.6109, + "step": 3935 + }, + { + "epoch": 1.917343293277038, + "grad_norm": 2.707613229751587, + "learning_rate": 3.878346420696662e-06, + "loss": 0.5918, + "step": 3936 + }, + { + "epoch": 1.9178304644365054, + "grad_norm": 2.605958938598633, + "learning_rate": 3.877809818235925e-06, + "loss": 0.4898, + "step": 3937 + }, + { + "epoch": 1.9183176355959728, + "grad_norm": 2.6516034603118896, + "learning_rate": 3.877273124591861e-06, + "loss": 0.5306, + "step": 3938 + }, + { + "epoch": 1.91880480675544, + "grad_norm": 2.6875784397125244, + "learning_rate": 3.876736339799992e-06, + "loss": 0.5289, + "step": 3939 + }, + { + "epoch": 1.9192919779149076, + "grad_norm": 2.787963390350342, + "learning_rate": 3.87619946389584e-06, + "loss": 0.5061, + "step": 3940 + }, + { + "epoch": 1.9197791490743747, + "grad_norm": 2.4011902809143066, + "learning_rate": 3.875662496914936e-06, + "loss": 0.6008, + "step": 3941 + }, + { + "epoch": 1.9202663202338421, + "grad_norm": 2.5158510208129883, + "learning_rate": 3.875125438892815e-06, + "loss": 0.6183, + "step": 3942 + }, + { + "epoch": 1.9207534913933095, + "grad_norm": 2.8680953979492188, + "learning_rate": 3.874588289865021e-06, + "loss": 0.5627, + "step": 3943 + }, + { + "epoch": 1.921240662552777, + "grad_norm": 2.823232412338257, + "learning_rate": 3.874051049867102e-06, + "loss": 0.5882, + "step": 3944 + }, + { + "epoch": 1.9217278337122443, + "grad_norm": 2.8826253414154053, + "learning_rate": 3.873513718934612e-06, + "loss": 0.5301, + "step": 3945 + }, + { + "epoch": 1.9222150048717115, + "grad_norm": 3.0836703777313232, + "learning_rate": 3.872976297103113e-06, + "loss": 0.5342, + "step": 3946 + }, + { + "epoch": 1.922702176031179, + "grad_norm": 2.5970635414123535, + "learning_rate": 3.87243878440817e-06, + "loss": 0.4995, + "step": 3947 + }, + { + "epoch": 1.9231893471906463, + "grad_norm": 2.7458243370056152, + "learning_rate": 3.871901180885356e-06, + "loss": 0.6336, + "step": 3948 + }, + { + "epoch": 1.9236765183501137, + "grad_norm": 3.0131773948669434, + "learning_rate": 3.8713634865702495e-06, + "loss": 0.6095, + "step": 3949 + }, + { + "epoch": 1.924163689509581, + "grad_norm": 2.7261264324188232, + "learning_rate": 3.870825701498435e-06, + "loss": 0.5977, + "step": 3950 + }, + { + "epoch": 1.9246508606690484, + "grad_norm": 2.7491812705993652, + "learning_rate": 3.870287825705503e-06, + "loss": 0.6706, + "step": 3951 + }, + { + "epoch": 1.9251380318285158, + "grad_norm": 2.7679789066314697, + "learning_rate": 3.869749859227051e-06, + "loss": 0.5574, + "step": 3952 + }, + { + "epoch": 1.925625202987983, + "grad_norm": 2.488060235977173, + "learning_rate": 3.869211802098681e-06, + "loss": 0.4403, + "step": 3953 + }, + { + "epoch": 1.9261123741474506, + "grad_norm": 2.4192111492156982, + "learning_rate": 3.8686736543560014e-06, + "loss": 0.5521, + "step": 3954 + }, + { + "epoch": 1.9265995453069178, + "grad_norm": 2.7069408893585205, + "learning_rate": 3.868135416034626e-06, + "loss": 0.6188, + "step": 3955 + }, + { + "epoch": 1.9270867164663852, + "grad_norm": 2.7756688594818115, + "learning_rate": 3.867597087170176e-06, + "loss": 0.5359, + "step": 3956 + }, + { + "epoch": 1.9275738876258526, + "grad_norm": 2.6092567443847656, + "learning_rate": 3.867058667798279e-06, + "loss": 0.5438, + "step": 3957 + }, + { + "epoch": 1.9280610587853197, + "grad_norm": 2.650907039642334, + "learning_rate": 3.866520157954567e-06, + "loss": 0.5163, + "step": 3958 + }, + { + "epoch": 1.9285482299447874, + "grad_norm": 2.363525390625, + "learning_rate": 3.865981557674678e-06, + "loss": 0.5533, + "step": 3959 + }, + { + "epoch": 1.9290354011042545, + "grad_norm": 2.6469719409942627, + "learning_rate": 3.8654428669942555e-06, + "loss": 0.5348, + "step": 3960 + }, + { + "epoch": 1.9295225722637221, + "grad_norm": 2.2777156829833984, + "learning_rate": 3.864904085948952e-06, + "loss": 0.4919, + "step": 3961 + }, + { + "epoch": 1.9300097434231893, + "grad_norm": 2.5946340560913086, + "learning_rate": 3.864365214574423e-06, + "loss": 0.5634, + "step": 3962 + }, + { + "epoch": 1.9304969145826567, + "grad_norm": 2.5277652740478516, + "learning_rate": 3.863826252906332e-06, + "loss": 0.4958, + "step": 3963 + }, + { + "epoch": 1.930984085742124, + "grad_norm": 3.081791639328003, + "learning_rate": 3.863287200980346e-06, + "loss": 0.6114, + "step": 3964 + }, + { + "epoch": 1.9314712569015913, + "grad_norm": 3.1243503093719482, + "learning_rate": 3.8627480588321395e-06, + "loss": 0.5551, + "step": 3965 + }, + { + "epoch": 1.9319584280610589, + "grad_norm": 2.8798038959503174, + "learning_rate": 3.8622088264973935e-06, + "loss": 0.5805, + "step": 3966 + }, + { + "epoch": 1.932445599220526, + "grad_norm": 2.8669912815093994, + "learning_rate": 3.861669504011794e-06, + "loss": 0.5635, + "step": 3967 + }, + { + "epoch": 1.9329327703799937, + "grad_norm": 2.55116605758667, + "learning_rate": 3.861130091411035e-06, + "loss": 0.5585, + "step": 3968 + }, + { + "epoch": 1.9334199415394608, + "grad_norm": 2.5538766384124756, + "learning_rate": 3.8605905887308125e-06, + "loss": 0.5843, + "step": 3969 + }, + { + "epoch": 1.9339071126989282, + "grad_norm": 2.7259774208068848, + "learning_rate": 3.860050996006831e-06, + "loss": 0.5413, + "step": 3970 + }, + { + "epoch": 1.9343942838583956, + "grad_norm": 2.8235344886779785, + "learning_rate": 3.859511313274802e-06, + "loss": 0.5296, + "step": 3971 + }, + { + "epoch": 1.9348814550178628, + "grad_norm": 3.024791717529297, + "learning_rate": 3.85897154057044e-06, + "loss": 0.5895, + "step": 3972 + }, + { + "epoch": 1.9353686261773304, + "grad_norm": 2.891852617263794, + "learning_rate": 3.8584316779294685e-06, + "loss": 0.5597, + "step": 3973 + }, + { + "epoch": 1.9358557973367976, + "grad_norm": 2.5621566772460938, + "learning_rate": 3.8578917253876144e-06, + "loss": 0.5572, + "step": 3974 + }, + { + "epoch": 1.936342968496265, + "grad_norm": 2.540397882461548, + "learning_rate": 3.857351682980614e-06, + "loss": 0.641, + "step": 3975 + }, + { + "epoch": 1.9368301396557324, + "grad_norm": 2.5695202350616455, + "learning_rate": 3.856811550744205e-06, + "loss": 0.6427, + "step": 3976 + }, + { + "epoch": 1.9373173108151998, + "grad_norm": 2.796025037765503, + "learning_rate": 3.856271328714133e-06, + "loss": 0.5029, + "step": 3977 + }, + { + "epoch": 1.9378044819746671, + "grad_norm": 2.5731561183929443, + "learning_rate": 3.855731016926151e-06, + "loss": 0.551, + "step": 3978 + }, + { + "epoch": 1.9382916531341343, + "grad_norm": 2.600590229034424, + "learning_rate": 3.855190615416017e-06, + "loss": 0.6331, + "step": 3979 + }, + { + "epoch": 1.938778824293602, + "grad_norm": 3.1081740856170654, + "learning_rate": 3.854650124219494e-06, + "loss": 0.5505, + "step": 3980 + }, + { + "epoch": 1.939265995453069, + "grad_norm": 3.2125911712646484, + "learning_rate": 3.854109543372352e-06, + "loss": 0.5491, + "step": 3981 + }, + { + "epoch": 1.9397531666125365, + "grad_norm": 2.9952292442321777, + "learning_rate": 3.853568872910367e-06, + "loss": 0.5778, + "step": 3982 + }, + { + "epoch": 1.9402403377720039, + "grad_norm": 2.5337440967559814, + "learning_rate": 3.853028112869319e-06, + "loss": 0.5887, + "step": 3983 + }, + { + "epoch": 1.9407275089314713, + "grad_norm": 2.694032907485962, + "learning_rate": 3.852487263284998e-06, + "loss": 0.4839, + "step": 3984 + }, + { + "epoch": 1.9412146800909387, + "grad_norm": 2.6822099685668945, + "learning_rate": 3.851946324193194e-06, + "loss": 0.6168, + "step": 3985 + }, + { + "epoch": 1.9417018512504058, + "grad_norm": 2.5131537914276123, + "learning_rate": 3.85140529562971e-06, + "loss": 0.6101, + "step": 3986 + }, + { + "epoch": 1.9421890224098735, + "grad_norm": 2.714430093765259, + "learning_rate": 3.850864177630348e-06, + "loss": 0.5352, + "step": 3987 + }, + { + "epoch": 1.9426761935693406, + "grad_norm": 2.6270272731781006, + "learning_rate": 3.850322970230921e-06, + "loss": 0.5429, + "step": 3988 + }, + { + "epoch": 1.943163364728808, + "grad_norm": 2.254110813140869, + "learning_rate": 3.849781673467245e-06, + "loss": 0.5845, + "step": 3989 + }, + { + "epoch": 1.9436505358882754, + "grad_norm": 2.648325204849243, + "learning_rate": 3.849240287375145e-06, + "loss": 0.4815, + "step": 3990 + }, + { + "epoch": 1.9441377070477428, + "grad_norm": 2.495920181274414, + "learning_rate": 3.848698811990447e-06, + "loss": 0.6284, + "step": 3991 + }, + { + "epoch": 1.9446248782072102, + "grad_norm": 2.8269145488739014, + "learning_rate": 3.848157247348988e-06, + "loss": 0.5462, + "step": 3992 + }, + { + "epoch": 1.9451120493666774, + "grad_norm": 2.691204309463501, + "learning_rate": 3.847615593486608e-06, + "loss": 0.5832, + "step": 3993 + }, + { + "epoch": 1.945599220526145, + "grad_norm": 2.425912857055664, + "learning_rate": 3.8470738504391525e-06, + "loss": 0.5723, + "step": 3994 + }, + { + "epoch": 1.9460863916856121, + "grad_norm": 2.6491940021514893, + "learning_rate": 3.846532018242476e-06, + "loss": 0.5684, + "step": 3995 + }, + { + "epoch": 1.9465735628450795, + "grad_norm": 3.0151240825653076, + "learning_rate": 3.845990096932436e-06, + "loss": 0.5753, + "step": 3996 + }, + { + "epoch": 1.947060734004547, + "grad_norm": 2.9167349338531494, + "learning_rate": 3.845448086544895e-06, + "loss": 0.4995, + "step": 3997 + }, + { + "epoch": 1.9475479051640143, + "grad_norm": 2.669302463531494, + "learning_rate": 3.844905987115727e-06, + "loss": 0.5691, + "step": 3998 + }, + { + "epoch": 1.9480350763234817, + "grad_norm": 2.8752777576446533, + "learning_rate": 3.844363798680805e-06, + "loss": 0.5467, + "step": 3999 + }, + { + "epoch": 1.9485222474829489, + "grad_norm": 2.8266351222991943, + "learning_rate": 3.8438215212760124e-06, + "loss": 0.502, + "step": 4000 + }, + { + "epoch": 1.9490094186424165, + "grad_norm": 2.5316617488861084, + "learning_rate": 3.8432791549372365e-06, + "loss": 0.5632, + "step": 4001 + }, + { + "epoch": 1.9494965898018837, + "grad_norm": 2.695655107498169, + "learning_rate": 3.842736699700372e-06, + "loss": 0.5214, + "step": 4002 + }, + { + "epoch": 1.949983760961351, + "grad_norm": 2.6155834197998047, + "learning_rate": 3.842194155601316e-06, + "loss": 0.5091, + "step": 4003 + }, + { + "epoch": 1.9504709321208185, + "grad_norm": 2.495690107345581, + "learning_rate": 3.8416515226759765e-06, + "loss": 0.5399, + "step": 4004 + }, + { + "epoch": 1.9509581032802858, + "grad_norm": 2.613554000854492, + "learning_rate": 3.841108800960264e-06, + "loss": 0.5275, + "step": 4005 + }, + { + "epoch": 1.9514452744397532, + "grad_norm": 2.955801248550415, + "learning_rate": 3.840565990490097e-06, + "loss": 0.6201, + "step": 4006 + }, + { + "epoch": 1.9519324455992204, + "grad_norm": 2.8022279739379883, + "learning_rate": 3.840023091301397e-06, + "loss": 0.589, + "step": 4007 + }, + { + "epoch": 1.952419616758688, + "grad_norm": 2.776289701461792, + "learning_rate": 3.8394801034300934e-06, + "loss": 0.6095, + "step": 4008 + }, + { + "epoch": 1.9529067879181552, + "grad_norm": 2.434342861175537, + "learning_rate": 3.8389370269121216e-06, + "loss": 0.5884, + "step": 4009 + }, + { + "epoch": 1.9533939590776226, + "grad_norm": 2.770944595336914, + "learning_rate": 3.838393861783422e-06, + "loss": 0.5973, + "step": 4010 + }, + { + "epoch": 1.95388113023709, + "grad_norm": 2.5228683948516846, + "learning_rate": 3.83785060807994e-06, + "loss": 0.4657, + "step": 4011 + }, + { + "epoch": 1.9543683013965574, + "grad_norm": 2.9553472995758057, + "learning_rate": 3.837307265837632e-06, + "loss": 0.5354, + "step": 4012 + }, + { + "epoch": 1.9548554725560248, + "grad_norm": 2.912256956100464, + "learning_rate": 3.8367638350924516e-06, + "loss": 0.5783, + "step": 4013 + }, + { + "epoch": 1.955342643715492, + "grad_norm": 2.713261365890503, + "learning_rate": 3.836220315880365e-06, + "loss": 0.5696, + "step": 4014 + }, + { + "epoch": 1.9558298148749595, + "grad_norm": 2.6525304317474365, + "learning_rate": 3.835676708237343e-06, + "loss": 0.5354, + "step": 4015 + }, + { + "epoch": 1.9563169860344267, + "grad_norm": 2.569162130355835, + "learning_rate": 3.835133012199361e-06, + "loss": 0.6218, + "step": 4016 + }, + { + "epoch": 1.9568041571938941, + "grad_norm": 2.6171157360076904, + "learning_rate": 3.8345892278024e-06, + "loss": 0.6155, + "step": 4017 + }, + { + "epoch": 1.9572913283533615, + "grad_norm": 2.8048603534698486, + "learning_rate": 3.834045355082449e-06, + "loss": 0.5465, + "step": 4018 + }, + { + "epoch": 1.957778499512829, + "grad_norm": 3.0477547645568848, + "learning_rate": 3.8335013940755e-06, + "loss": 0.5042, + "step": 4019 + }, + { + "epoch": 1.9582656706722963, + "grad_norm": 2.9167733192443848, + "learning_rate": 3.832957344817553e-06, + "loss": 0.5566, + "step": 4020 + }, + { + "epoch": 1.9587528418317635, + "grad_norm": 2.8164005279541016, + "learning_rate": 3.832413207344613e-06, + "loss": 0.6458, + "step": 4021 + }, + { + "epoch": 1.959240012991231, + "grad_norm": 2.9297287464141846, + "learning_rate": 3.831868981692691e-06, + "loss": 0.5427, + "step": 4022 + }, + { + "epoch": 1.9597271841506982, + "grad_norm": 2.371713638305664, + "learning_rate": 3.831324667897803e-06, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 1.9602143553101656, + "grad_norm": 2.860522508621216, + "learning_rate": 3.830780265995974e-06, + "loss": 0.5615, + "step": 4024 + }, + { + "epoch": 1.960701526469633, + "grad_norm": 2.4928250312805176, + "learning_rate": 3.83023577602323e-06, + "loss": 0.5411, + "step": 4025 + }, + { + "epoch": 1.9611886976291002, + "grad_norm": 2.844817638397217, + "learning_rate": 3.829691198015607e-06, + "loss": 0.571, + "step": 4026 + }, + { + "epoch": 1.9616758687885678, + "grad_norm": 3.1283679008483887, + "learning_rate": 3.829146532009144e-06, + "loss": 0.5837, + "step": 4027 + }, + { + "epoch": 1.962163039948035, + "grad_norm": 2.823026657104492, + "learning_rate": 3.828601778039888e-06, + "loss": 0.5683, + "step": 4028 + }, + { + "epoch": 1.9626502111075026, + "grad_norm": 2.3089606761932373, + "learning_rate": 3.828056936143888e-06, + "loss": 0.556, + "step": 4029 + }, + { + "epoch": 1.9631373822669698, + "grad_norm": 2.7951600551605225, + "learning_rate": 3.8275120063572056e-06, + "loss": 0.597, + "step": 4030 + }, + { + "epoch": 1.9636245534264372, + "grad_norm": 2.766152858734131, + "learning_rate": 3.826966988715901e-06, + "loss": 0.5521, + "step": 4031 + }, + { + "epoch": 1.9641117245859046, + "grad_norm": 2.6200571060180664, + "learning_rate": 3.8264218832560455e-06, + "loss": 0.5098, + "step": 4032 + }, + { + "epoch": 1.9645988957453717, + "grad_norm": 2.57442307472229, + "learning_rate": 3.825876690013713e-06, + "loss": 0.5835, + "step": 4033 + }, + { + "epoch": 1.9650860669048393, + "grad_norm": 2.957221746444702, + "learning_rate": 3.825331409024984e-06, + "loss": 0.5561, + "step": 4034 + }, + { + "epoch": 1.9655732380643065, + "grad_norm": 2.5536842346191406, + "learning_rate": 3.824786040325946e-06, + "loss": 0.5918, + "step": 4035 + }, + { + "epoch": 1.966060409223774, + "grad_norm": 2.3754079341888428, + "learning_rate": 3.824240583952692e-06, + "loss": 0.5561, + "step": 4036 + }, + { + "epoch": 1.9665475803832413, + "grad_norm": 2.5870532989501953, + "learning_rate": 3.8236950399413185e-06, + "loss": 0.5431, + "step": 4037 + }, + { + "epoch": 1.9670347515427087, + "grad_norm": 2.457977056503296, + "learning_rate": 3.823149408327931e-06, + "loss": 0.5975, + "step": 4038 + }, + { + "epoch": 1.967521922702176, + "grad_norm": 2.9539217948913574, + "learning_rate": 3.822603689148639e-06, + "loss": 0.5285, + "step": 4039 + }, + { + "epoch": 1.9680090938616432, + "grad_norm": 2.5681440830230713, + "learning_rate": 3.822057882439557e-06, + "loss": 0.4735, + "step": 4040 + }, + { + "epoch": 1.9684962650211109, + "grad_norm": 2.7848384380340576, + "learning_rate": 3.821511988236809e-06, + "loss": 0.593, + "step": 4041 + }, + { + "epoch": 1.968983436180578, + "grad_norm": 2.612391471862793, + "learning_rate": 3.8209660065765195e-06, + "loss": 0.5827, + "step": 4042 + }, + { + "epoch": 1.9694706073400454, + "grad_norm": 2.992610454559326, + "learning_rate": 3.820419937494824e-06, + "loss": 0.5752, + "step": 4043 + }, + { + "epoch": 1.9699577784995128, + "grad_norm": 2.5408904552459717, + "learning_rate": 3.819873781027858e-06, + "loss": 0.5563, + "step": 4044 + }, + { + "epoch": 1.9704449496589802, + "grad_norm": 2.7924864292144775, + "learning_rate": 3.81932753721177e-06, + "loss": 0.5079, + "step": 4045 + }, + { + "epoch": 1.9709321208184476, + "grad_norm": 2.483452558517456, + "learning_rate": 3.818781206082707e-06, + "loss": 0.5313, + "step": 4046 + }, + { + "epoch": 1.9714192919779148, + "grad_norm": 2.5602762699127197, + "learning_rate": 3.818234787676828e-06, + "loss": 0.667, + "step": 4047 + }, + { + "epoch": 1.9719064631373824, + "grad_norm": 3.2333872318267822, + "learning_rate": 3.817688282030293e-06, + "loss": 0.6858, + "step": 4048 + }, + { + "epoch": 1.9723936342968496, + "grad_norm": 2.944188117980957, + "learning_rate": 3.81714168917927e-06, + "loss": 0.54, + "step": 4049 + }, + { + "epoch": 1.972880805456317, + "grad_norm": 3.150712728500366, + "learning_rate": 3.8165950091599325e-06, + "loss": 0.4948, + "step": 4050 + }, + { + "epoch": 1.9733679766157843, + "grad_norm": 2.479444742202759, + "learning_rate": 3.81604824200846e-06, + "loss": 0.4901, + "step": 4051 + }, + { + "epoch": 1.9738551477752517, + "grad_norm": 2.817929744720459, + "learning_rate": 3.815501387761037e-06, + "loss": 0.62, + "step": 4052 + }, + { + "epoch": 1.9743423189347191, + "grad_norm": 3.0161068439483643, + "learning_rate": 3.8149544464538555e-06, + "loss": 0.677, + "step": 4053 + }, + { + "epoch": 1.9748294900941863, + "grad_norm": 3.2868332862854004, + "learning_rate": 3.8144074181231106e-06, + "loss": 0.6061, + "step": 4054 + }, + { + "epoch": 1.975316661253654, + "grad_norm": 2.602590560913086, + "learning_rate": 3.813860302805006e-06, + "loss": 0.5307, + "step": 4055 + }, + { + "epoch": 1.975803832413121, + "grad_norm": 2.579686403274536, + "learning_rate": 3.813313100535747e-06, + "loss": 0.5429, + "step": 4056 + }, + { + "epoch": 1.9762910035725885, + "grad_norm": 2.6081044673919678, + "learning_rate": 3.8127658113515508e-06, + "loss": 0.5581, + "step": 4057 + }, + { + "epoch": 1.9767781747320559, + "grad_norm": 2.768775463104248, + "learning_rate": 3.8122184352886355e-06, + "loss": 0.54, + "step": 4058 + }, + { + "epoch": 1.9772653458915233, + "grad_norm": 2.903865098953247, + "learning_rate": 3.8116709723832257e-06, + "loss": 0.5813, + "step": 4059 + }, + { + "epoch": 1.9777525170509906, + "grad_norm": 2.6727731227874756, + "learning_rate": 3.8111234226715534e-06, + "loss": 0.5697, + "step": 4060 + }, + { + "epoch": 1.9782396882104578, + "grad_norm": 2.5407192707061768, + "learning_rate": 3.8105757861898553e-06, + "loss": 0.5731, + "step": 4061 + }, + { + "epoch": 1.9787268593699254, + "grad_norm": 2.347877025604248, + "learning_rate": 3.8100280629743736e-06, + "loss": 0.5109, + "step": 4062 + }, + { + "epoch": 1.9792140305293926, + "grad_norm": 2.8380916118621826, + "learning_rate": 3.809480253061356e-06, + "loss": 0.595, + "step": 4063 + }, + { + "epoch": 1.97970120168886, + "grad_norm": 2.996250629425049, + "learning_rate": 3.8089323564870585e-06, + "loss": 0.6073, + "step": 4064 + }, + { + "epoch": 1.9801883728483274, + "grad_norm": 2.7606618404388428, + "learning_rate": 3.808384373287739e-06, + "loss": 0.5477, + "step": 4065 + }, + { + "epoch": 1.9806755440077948, + "grad_norm": 2.700167655944824, + "learning_rate": 3.8078363034996625e-06, + "loss": 0.591, + "step": 4066 + }, + { + "epoch": 1.9811627151672622, + "grad_norm": 2.804687738418579, + "learning_rate": 3.8072881471591027e-06, + "loss": 0.5638, + "step": 4067 + }, + { + "epoch": 1.9816498863267293, + "grad_norm": 2.775604486465454, + "learning_rate": 3.806739904302334e-06, + "loss": 0.58, + "step": 4068 + }, + { + "epoch": 1.982137057486197, + "grad_norm": 2.5980257987976074, + "learning_rate": 3.8061915749656407e-06, + "loss": 0.5448, + "step": 4069 + }, + { + "epoch": 1.9826242286456641, + "grad_norm": 2.7944118976593018, + "learning_rate": 3.8056431591853105e-06, + "loss": 0.5484, + "step": 4070 + }, + { + "epoch": 1.9831113998051315, + "grad_norm": 2.421776533126831, + "learning_rate": 3.805094656997638e-06, + "loss": 0.4655, + "step": 4071 + }, + { + "epoch": 1.983598570964599, + "grad_norm": 2.838672399520874, + "learning_rate": 3.804546068438922e-06, + "loss": 0.5151, + "step": 4072 + }, + { + "epoch": 1.9840857421240663, + "grad_norm": 2.336343765258789, + "learning_rate": 3.803997393545469e-06, + "loss": 0.4764, + "step": 4073 + }, + { + "epoch": 1.9845729132835337, + "grad_norm": 2.659315586090088, + "learning_rate": 3.803448632353589e-06, + "loss": 0.6289, + "step": 4074 + }, + { + "epoch": 1.9850600844430009, + "grad_norm": 2.7740836143493652, + "learning_rate": 3.8028997848996008e-06, + "loss": 0.5846, + "step": 4075 + }, + { + "epoch": 1.9855472556024685, + "grad_norm": 2.604374647140503, + "learning_rate": 3.802350851219826e-06, + "loss": 0.5702, + "step": 4076 + }, + { + "epoch": 1.9860344267619356, + "grad_norm": 2.6634740829467773, + "learning_rate": 3.8018018313505934e-06, + "loss": 0.575, + "step": 4077 + }, + { + "epoch": 1.986521597921403, + "grad_norm": 2.6927435398101807, + "learning_rate": 3.801252725328236e-06, + "loss": 0.5629, + "step": 4078 + }, + { + "epoch": 1.9870087690808704, + "grad_norm": 3.303144931793213, + "learning_rate": 3.8007035331890953e-06, + "loss": 0.5299, + "step": 4079 + }, + { + "epoch": 1.9874959402403378, + "grad_norm": 2.5135080814361572, + "learning_rate": 3.800154254969516e-06, + "loss": 0.5826, + "step": 4080 + }, + { + "epoch": 1.9879831113998052, + "grad_norm": 2.9041879177093506, + "learning_rate": 3.7996048907058484e-06, + "loss": 0.5393, + "step": 4081 + }, + { + "epoch": 1.9884702825592724, + "grad_norm": 2.7708168029785156, + "learning_rate": 3.7990554404344503e-06, + "loss": 0.6102, + "step": 4082 + }, + { + "epoch": 1.98895745371874, + "grad_norm": 2.813361406326294, + "learning_rate": 3.7985059041916848e-06, + "loss": 0.5301, + "step": 4083 + }, + { + "epoch": 1.9894446248782072, + "grad_norm": 2.7773990631103516, + "learning_rate": 3.7979562820139184e-06, + "loss": 0.5967, + "step": 4084 + }, + { + "epoch": 1.9899317960376746, + "grad_norm": 2.826681613922119, + "learning_rate": 3.7974065739375273e-06, + "loss": 0.5956, + "step": 4085 + }, + { + "epoch": 1.990418967197142, + "grad_norm": 2.7071690559387207, + "learning_rate": 3.796856779998889e-06, + "loss": 0.5434, + "step": 4086 + }, + { + "epoch": 1.9909061383566091, + "grad_norm": 2.927908420562744, + "learning_rate": 3.79630690023439e-06, + "loss": 0.5605, + "step": 4087 + }, + { + "epoch": 1.9913933095160767, + "grad_norm": 2.6933958530426025, + "learning_rate": 3.795756934680421e-06, + "loss": 0.4814, + "step": 4088 + }, + { + "epoch": 1.991880480675544, + "grad_norm": 2.4048454761505127, + "learning_rate": 3.795206883373379e-06, + "loss": 0.5434, + "step": 4089 + }, + { + "epoch": 1.9923676518350115, + "grad_norm": 2.8842344284057617, + "learning_rate": 3.7946567463496665e-06, + "loss": 0.5234, + "step": 4090 + }, + { + "epoch": 1.9928548229944787, + "grad_norm": 3.378779888153076, + "learning_rate": 3.79410652364569e-06, + "loss": 0.6106, + "step": 4091 + }, + { + "epoch": 1.993341994153946, + "grad_norm": 2.5222651958465576, + "learning_rate": 3.7935562152978656e-06, + "loss": 0.5433, + "step": 4092 + }, + { + "epoch": 1.9938291653134135, + "grad_norm": 2.7866013050079346, + "learning_rate": 3.79300582134261e-06, + "loss": 0.468, + "step": 4093 + }, + { + "epoch": 1.9943163364728806, + "grad_norm": 2.625844955444336, + "learning_rate": 3.7924553418163502e-06, + "loss": 0.6964, + "step": 4094 + }, + { + "epoch": 1.9948035076323483, + "grad_norm": 2.7554235458374023, + "learning_rate": 3.791904776755516e-06, + "loss": 0.5636, + "step": 4095 + }, + { + "epoch": 1.9952906787918154, + "grad_norm": 2.521775484085083, + "learning_rate": 3.7913541261965447e-06, + "loss": 0.5673, + "step": 4096 + }, + { + "epoch": 1.995777849951283, + "grad_norm": 2.5199427604675293, + "learning_rate": 3.7908033901758766e-06, + "loss": 0.5188, + "step": 4097 + }, + { + "epoch": 1.9962650211107502, + "grad_norm": 2.500396728515625, + "learning_rate": 3.7902525687299614e-06, + "loss": 0.554, + "step": 4098 + }, + { + "epoch": 1.9967521922702176, + "grad_norm": 2.88364315032959, + "learning_rate": 3.78970166189525e-06, + "loss": 0.607, + "step": 4099 + }, + { + "epoch": 1.997239363429685, + "grad_norm": 2.730677843093872, + "learning_rate": 3.7891506697082036e-06, + "loss": 0.4792, + "step": 4100 + }, + { + "epoch": 1.9977265345891522, + "grad_norm": 2.713953971862793, + "learning_rate": 3.7885995922052855e-06, + "loss": 0.545, + "step": 4101 + }, + { + "epoch": 1.9982137057486198, + "grad_norm": 2.8309590816497803, + "learning_rate": 3.788048429422967e-06, + "loss": 0.5263, + "step": 4102 + }, + { + "epoch": 1.998700876908087, + "grad_norm": 2.833064317703247, + "learning_rate": 3.787497181397723e-06, + "loss": 0.5649, + "step": 4103 + }, + { + "epoch": 1.9991880480675543, + "grad_norm": 2.704881191253662, + "learning_rate": 3.7869458481660354e-06, + "loss": 0.5552, + "step": 4104 + }, + { + "epoch": 1.9996752192270217, + "grad_norm": 2.7926690578460693, + "learning_rate": 3.786394429764391e-06, + "loss": 0.5167, + "step": 4105 + }, + { + "epoch": 2.0, + "grad_norm": 3.084559679031372, + "learning_rate": 3.785842926229284e-06, + "loss": 0.4913, + "step": 4106 + }, + { + "epoch": 2.000487171159467, + "grad_norm": 2.850564956665039, + "learning_rate": 3.785291337597211e-06, + "loss": 0.4732, + "step": 4107 + }, + { + "epoch": 2.000974342318935, + "grad_norm": 2.476039171218872, + "learning_rate": 3.7847396639046773e-06, + "loss": 0.5375, + "step": 4108 + }, + { + "epoch": 2.001461513478402, + "grad_norm": 2.5955593585968018, + "learning_rate": 3.7841879051881923e-06, + "loss": 0.5128, + "step": 4109 + }, + { + "epoch": 2.0019486846378696, + "grad_norm": 2.5337600708007812, + "learning_rate": 3.7836360614842715e-06, + "loss": 0.4577, + "step": 4110 + }, + { + "epoch": 2.0024358557973367, + "grad_norm": 2.3397531509399414, + "learning_rate": 3.7830841328294353e-06, + "loss": 0.4646, + "step": 4111 + }, + { + "epoch": 2.0029230269568044, + "grad_norm": 2.884005308151245, + "learning_rate": 3.782532119260211e-06, + "loss": 0.456, + "step": 4112 + }, + { + "epoch": 2.0034101981162715, + "grad_norm": 2.2972798347473145, + "learning_rate": 3.7819800208131295e-06, + "loss": 0.501, + "step": 4113 + }, + { + "epoch": 2.0038973692757387, + "grad_norm": 2.3659110069274902, + "learning_rate": 3.7814278375247314e-06, + "loss": 0.4478, + "step": 4114 + }, + { + "epoch": 2.0043845404352063, + "grad_norm": 2.6785452365875244, + "learning_rate": 3.7808755694315572e-06, + "loss": 0.4852, + "step": 4115 + }, + { + "epoch": 2.0048717115946735, + "grad_norm": 2.4957656860351562, + "learning_rate": 3.7803232165701578e-06, + "loss": 0.5312, + "step": 4116 + }, + { + "epoch": 2.005358882754141, + "grad_norm": 2.729583978652954, + "learning_rate": 3.7797707789770867e-06, + "loss": 0.4272, + "step": 4117 + }, + { + "epoch": 2.0058460539136083, + "grad_norm": 2.8516693115234375, + "learning_rate": 3.779218256688904e-06, + "loss": 0.5008, + "step": 4118 + }, + { + "epoch": 2.006333225073076, + "grad_norm": 3.015984535217285, + "learning_rate": 3.7786656497421768e-06, + "loss": 0.52, + "step": 4119 + }, + { + "epoch": 2.006820396232543, + "grad_norm": 2.5246012210845947, + "learning_rate": 3.778112958173476e-06, + "loss": 0.421, + "step": 4120 + }, + { + "epoch": 2.00730756739201, + "grad_norm": 2.7968804836273193, + "learning_rate": 3.7775601820193787e-06, + "loss": 0.5447, + "step": 4121 + }, + { + "epoch": 2.007794738551478, + "grad_norm": 2.7564141750335693, + "learning_rate": 3.777007321316467e-06, + "loss": 0.4796, + "step": 4122 + }, + { + "epoch": 2.008281909710945, + "grad_norm": 2.6016016006469727, + "learning_rate": 3.77645437610133e-06, + "loss": 0.4792, + "step": 4123 + }, + { + "epoch": 2.0087690808704126, + "grad_norm": 2.744614362716675, + "learning_rate": 3.7759013464105617e-06, + "loss": 0.5354, + "step": 4124 + }, + { + "epoch": 2.00925625202988, + "grad_norm": 2.7202248573303223, + "learning_rate": 3.77534823228076e-06, + "loss": 0.4928, + "step": 4125 + }, + { + "epoch": 2.0097434231893474, + "grad_norm": 2.791593313217163, + "learning_rate": 3.774795033748532e-06, + "loss": 0.5061, + "step": 4126 + }, + { + "epoch": 2.0102305943488146, + "grad_norm": 2.6086275577545166, + "learning_rate": 3.774241750850487e-06, + "loss": 0.4765, + "step": 4127 + }, + { + "epoch": 2.0107177655082817, + "grad_norm": 2.686204671859741, + "learning_rate": 3.773688383623242e-06, + "loss": 0.4319, + "step": 4128 + }, + { + "epoch": 2.0112049366677494, + "grad_norm": 2.5604143142700195, + "learning_rate": 3.7731349321034176e-06, + "loss": 0.4911, + "step": 4129 + }, + { + "epoch": 2.0116921078272165, + "grad_norm": 2.853247880935669, + "learning_rate": 3.7725813963276425e-06, + "loss": 0.4603, + "step": 4130 + }, + { + "epoch": 2.012179278986684, + "grad_norm": 2.5197513103485107, + "learning_rate": 3.772027776332549e-06, + "loss": 0.4771, + "step": 4131 + }, + { + "epoch": 2.0126664501461513, + "grad_norm": 2.8205978870391846, + "learning_rate": 3.771474072154776e-06, + "loss": 0.5369, + "step": 4132 + }, + { + "epoch": 2.013153621305619, + "grad_norm": 2.290468215942383, + "learning_rate": 3.7709202838309666e-06, + "loss": 0.5017, + "step": 4133 + }, + { + "epoch": 2.013640792465086, + "grad_norm": 3.154484987258911, + "learning_rate": 3.7703664113977723e-06, + "loss": 0.5372, + "step": 4134 + }, + { + "epoch": 2.0141279636245533, + "grad_norm": 2.8973188400268555, + "learning_rate": 3.7698124548918456e-06, + "loss": 0.4989, + "step": 4135 + }, + { + "epoch": 2.014615134784021, + "grad_norm": 2.548788070678711, + "learning_rate": 3.76925841434985e-06, + "loss": 0.4905, + "step": 4136 + }, + { + "epoch": 2.015102305943488, + "grad_norm": 2.967341899871826, + "learning_rate": 3.7687042898084512e-06, + "loss": 0.5262, + "step": 4137 + }, + { + "epoch": 2.0155894771029557, + "grad_norm": 2.99800968170166, + "learning_rate": 3.7681500813043206e-06, + "loss": 0.589, + "step": 4138 + }, + { + "epoch": 2.016076648262423, + "grad_norm": 2.7011587619781494, + "learning_rate": 3.7675957888741343e-06, + "loss": 0.524, + "step": 4139 + }, + { + "epoch": 2.01656381942189, + "grad_norm": 2.385652542114258, + "learning_rate": 3.767041412554578e-06, + "loss": 0.4874, + "step": 4140 + }, + { + "epoch": 2.0170509905813576, + "grad_norm": 2.5123062133789062, + "learning_rate": 3.7664869523823388e-06, + "loss": 0.4684, + "step": 4141 + }, + { + "epoch": 2.017538161740825, + "grad_norm": 2.5153071880340576, + "learning_rate": 3.76593240839411e-06, + "loss": 0.5063, + "step": 4142 + }, + { + "epoch": 2.0180253329002924, + "grad_norm": 2.694793224334717, + "learning_rate": 3.7653777806265933e-06, + "loss": 0.426, + "step": 4143 + }, + { + "epoch": 2.0185125040597596, + "grad_norm": 2.7028350830078125, + "learning_rate": 3.764823069116492e-06, + "loss": 0.5116, + "step": 4144 + }, + { + "epoch": 2.018999675219227, + "grad_norm": 2.663517951965332, + "learning_rate": 3.764268273900519e-06, + "loss": 0.4937, + "step": 4145 + }, + { + "epoch": 2.0194868463786944, + "grad_norm": 2.6547679901123047, + "learning_rate": 3.7637133950153885e-06, + "loss": 0.4671, + "step": 4146 + }, + { + "epoch": 2.0199740175381615, + "grad_norm": 2.8323938846588135, + "learning_rate": 3.763158432497824e-06, + "loss": 0.562, + "step": 4147 + }, + { + "epoch": 2.020461188697629, + "grad_norm": 2.871096611022949, + "learning_rate": 3.7626033863845512e-06, + "loss": 0.4489, + "step": 4148 + }, + { + "epoch": 2.0209483598570963, + "grad_norm": 2.903681755065918, + "learning_rate": 3.762048256712304e-06, + "loss": 0.581, + "step": 4149 + }, + { + "epoch": 2.021435531016564, + "grad_norm": 3.402867555618286, + "learning_rate": 3.761493043517821e-06, + "loss": 0.5485, + "step": 4150 + }, + { + "epoch": 2.021922702176031, + "grad_norm": 2.6488025188446045, + "learning_rate": 3.7609377468378462e-06, + "loss": 0.5316, + "step": 4151 + }, + { + "epoch": 2.0224098733354987, + "grad_norm": 2.7657289505004883, + "learning_rate": 3.7603823667091277e-06, + "loss": 0.498, + "step": 4152 + }, + { + "epoch": 2.022897044494966, + "grad_norm": 2.7289319038391113, + "learning_rate": 3.759826903168422e-06, + "loss": 0.4882, + "step": 4153 + }, + { + "epoch": 2.023384215654433, + "grad_norm": 2.9240870475769043, + "learning_rate": 3.7592713562524895e-06, + "loss": 0.5672, + "step": 4154 + }, + { + "epoch": 2.0238713868139007, + "grad_norm": 3.0976932048797607, + "learning_rate": 3.7587157259980956e-06, + "loss": 0.4654, + "step": 4155 + }, + { + "epoch": 2.024358557973368, + "grad_norm": 2.7981746196746826, + "learning_rate": 3.758160012442012e-06, + "loss": 0.4958, + "step": 4156 + }, + { + "epoch": 2.0248457291328354, + "grad_norm": 2.392773389816284, + "learning_rate": 3.757604215621017e-06, + "loss": 0.5045, + "step": 4157 + }, + { + "epoch": 2.0253329002923026, + "grad_norm": 2.604619264602661, + "learning_rate": 3.7570483355718903e-06, + "loss": 0.4498, + "step": 4158 + }, + { + "epoch": 2.0258200714517702, + "grad_norm": 2.757903575897217, + "learning_rate": 3.7564923723314226e-06, + "loss": 0.4856, + "step": 4159 + }, + { + "epoch": 2.0263072426112374, + "grad_norm": 2.5656306743621826, + "learning_rate": 3.755936325936408e-06, + "loss": 0.4302, + "step": 4160 + }, + { + "epoch": 2.0267944137707046, + "grad_norm": 2.6091902256011963, + "learning_rate": 3.755380196423643e-06, + "loss": 0.4738, + "step": 4161 + }, + { + "epoch": 2.027281584930172, + "grad_norm": 2.532905101776123, + "learning_rate": 3.7548239838299338e-06, + "loss": 0.4875, + "step": 4162 + }, + { + "epoch": 2.0277687560896394, + "grad_norm": 2.7344136238098145, + "learning_rate": 3.7542676881920904e-06, + "loss": 0.4566, + "step": 4163 + }, + { + "epoch": 2.028255927249107, + "grad_norm": 2.393876791000366, + "learning_rate": 3.7537113095469275e-06, + "loss": 0.4299, + "step": 4164 + }, + { + "epoch": 2.028743098408574, + "grad_norm": 2.46380352973938, + "learning_rate": 3.7531548479312675e-06, + "loss": 0.4784, + "step": 4165 + }, + { + "epoch": 2.0292302695680418, + "grad_norm": 2.555689811706543, + "learning_rate": 3.752598303381936e-06, + "loss": 0.5123, + "step": 4166 + }, + { + "epoch": 2.029717440727509, + "grad_norm": 2.9859821796417236, + "learning_rate": 3.752041675935766e-06, + "loss": 0.4971, + "step": 4167 + }, + { + "epoch": 2.030204611886976, + "grad_norm": 2.91348934173584, + "learning_rate": 3.7514849656295932e-06, + "loss": 0.5613, + "step": 4168 + }, + { + "epoch": 2.0306917830464437, + "grad_norm": 3.4017155170440674, + "learning_rate": 3.7509281725002632e-06, + "loss": 0.4995, + "step": 4169 + }, + { + "epoch": 2.031178954205911, + "grad_norm": 3.0212981700897217, + "learning_rate": 3.7503712965846223e-06, + "loss": 0.4691, + "step": 4170 + }, + { + "epoch": 2.0316661253653785, + "grad_norm": 3.01584792137146, + "learning_rate": 3.749814337919526e-06, + "loss": 0.4493, + "step": 4171 + }, + { + "epoch": 2.0321532965248457, + "grad_norm": 2.950549840927124, + "learning_rate": 3.749257296541833e-06, + "loss": 0.5027, + "step": 4172 + }, + { + "epoch": 2.0326404676843133, + "grad_norm": 2.7535736560821533, + "learning_rate": 3.7487001724884085e-06, + "loss": 0.5454, + "step": 4173 + }, + { + "epoch": 2.0331276388437804, + "grad_norm": 2.8979673385620117, + "learning_rate": 3.7481429657961234e-06, + "loss": 0.4997, + "step": 4174 + }, + { + "epoch": 2.0336148100032476, + "grad_norm": 2.733107566833496, + "learning_rate": 3.7475856765018527e-06, + "loss": 0.5013, + "step": 4175 + }, + { + "epoch": 2.0341019811627152, + "grad_norm": 2.721879720687866, + "learning_rate": 3.747028304642477e-06, + "loss": 0.5189, + "step": 4176 + }, + { + "epoch": 2.0345891523221824, + "grad_norm": 2.8012185096740723, + "learning_rate": 3.7464708502548857e-06, + "loss": 0.5345, + "step": 4177 + }, + { + "epoch": 2.03507632348165, + "grad_norm": 2.703606367111206, + "learning_rate": 3.745913313375969e-06, + "loss": 0.4509, + "step": 4178 + }, + { + "epoch": 2.035563494641117, + "grad_norm": 2.4452390670776367, + "learning_rate": 3.7453556940426254e-06, + "loss": 0.4237, + "step": 4179 + }, + { + "epoch": 2.036050665800585, + "grad_norm": 2.4822800159454346, + "learning_rate": 3.7447979922917584e-06, + "loss": 0.4375, + "step": 4180 + }, + { + "epoch": 2.036537836960052, + "grad_norm": 2.57785964012146, + "learning_rate": 3.7442402081602754e-06, + "loss": 0.418, + "step": 4181 + }, + { + "epoch": 2.037025008119519, + "grad_norm": 2.5463404655456543, + "learning_rate": 3.7436823416850913e-06, + "loss": 0.4749, + "step": 4182 + }, + { + "epoch": 2.0375121792789868, + "grad_norm": 2.5854551792144775, + "learning_rate": 3.743124392903126e-06, + "loss": 0.5068, + "step": 4183 + }, + { + "epoch": 2.037999350438454, + "grad_norm": 2.6252593994140625, + "learning_rate": 3.7425663618513043e-06, + "loss": 0.4476, + "step": 4184 + }, + { + "epoch": 2.0384865215979215, + "grad_norm": 2.909916877746582, + "learning_rate": 3.742008248566556e-06, + "loss": 0.5304, + "step": 4185 + }, + { + "epoch": 2.0389736927573887, + "grad_norm": 2.5122175216674805, + "learning_rate": 3.7414500530858176e-06, + "loss": 0.4203, + "step": 4186 + }, + { + "epoch": 2.0394608639168563, + "grad_norm": 3.254516363143921, + "learning_rate": 3.7408917754460306e-06, + "loss": 0.5755, + "step": 4187 + }, + { + "epoch": 2.0399480350763235, + "grad_norm": 2.842618703842163, + "learning_rate": 3.7403334156841415e-06, + "loss": 0.4942, + "step": 4188 + }, + { + "epoch": 2.0404352062357907, + "grad_norm": 2.695615768432617, + "learning_rate": 3.7397749738371014e-06, + "loss": 0.4457, + "step": 4189 + }, + { + "epoch": 2.0409223773952583, + "grad_norm": 2.739612579345703, + "learning_rate": 3.7392164499418705e-06, + "loss": 0.5014, + "step": 4190 + }, + { + "epoch": 2.0414095485547255, + "grad_norm": 2.6991183757781982, + "learning_rate": 3.7386578440354092e-06, + "loss": 0.4782, + "step": 4191 + }, + { + "epoch": 2.041896719714193, + "grad_norm": 2.5455782413482666, + "learning_rate": 3.7380991561546874e-06, + "loss": 0.4303, + "step": 4192 + }, + { + "epoch": 2.0423838908736602, + "grad_norm": 2.711796283721924, + "learning_rate": 3.7375403863366786e-06, + "loss": 0.4285, + "step": 4193 + }, + { + "epoch": 2.042871062033128, + "grad_norm": 2.8235585689544678, + "learning_rate": 3.736981534618363e-06, + "loss": 0.4776, + "step": 4194 + }, + { + "epoch": 2.043358233192595, + "grad_norm": 2.9734556674957275, + "learning_rate": 3.736422601036724e-06, + "loss": 0.5239, + "step": 4195 + }, + { + "epoch": 2.043845404352062, + "grad_norm": 2.8931407928466797, + "learning_rate": 3.735863585628753e-06, + "loss": 0.4932, + "step": 4196 + }, + { + "epoch": 2.04433257551153, + "grad_norm": 2.565134286880493, + "learning_rate": 3.7353044884314437e-06, + "loss": 0.4402, + "step": 4197 + }, + { + "epoch": 2.044819746670997, + "grad_norm": 3.070289134979248, + "learning_rate": 3.7347453094818e-06, + "loss": 0.56, + "step": 4198 + }, + { + "epoch": 2.0453069178304646, + "grad_norm": 2.834174394607544, + "learning_rate": 3.734186048816825e-06, + "loss": 0.5829, + "step": 4199 + }, + { + "epoch": 2.0457940889899318, + "grad_norm": 2.7784981727600098, + "learning_rate": 3.7336267064735336e-06, + "loss": 0.4838, + "step": 4200 + }, + { + "epoch": 2.0462812601493994, + "grad_norm": 2.719743490219116, + "learning_rate": 3.733067282488941e-06, + "loss": 0.5521, + "step": 4201 + }, + { + "epoch": 2.0467684313088665, + "grad_norm": 2.8914308547973633, + "learning_rate": 3.7325077769000704e-06, + "loss": 0.4638, + "step": 4202 + }, + { + "epoch": 2.0472556024683337, + "grad_norm": 2.667454242706299, + "learning_rate": 3.7319481897439503e-06, + "loss": 0.4927, + "step": 4203 + }, + { + "epoch": 2.0477427736278013, + "grad_norm": 2.9163436889648438, + "learning_rate": 3.731388521057614e-06, + "loss": 0.4739, + "step": 4204 + }, + { + "epoch": 2.0482299447872685, + "grad_norm": 2.57807993888855, + "learning_rate": 3.7308287708780995e-06, + "loss": 0.4933, + "step": 4205 + }, + { + "epoch": 2.048717115946736, + "grad_norm": 2.921420097351074, + "learning_rate": 3.730268939242452e-06, + "loss": 0.4873, + "step": 4206 + }, + { + "epoch": 2.0492042871062033, + "grad_norm": 2.986844778060913, + "learning_rate": 3.72970902618772e-06, + "loss": 0.4817, + "step": 4207 + }, + { + "epoch": 2.049691458265671, + "grad_norm": 2.5805258750915527, + "learning_rate": 3.729149031750959e-06, + "loss": 0.513, + "step": 4208 + }, + { + "epoch": 2.050178629425138, + "grad_norm": 2.6489763259887695, + "learning_rate": 3.72858895596923e-06, + "loss": 0.5609, + "step": 4209 + }, + { + "epoch": 2.0506658005846052, + "grad_norm": 2.7703919410705566, + "learning_rate": 3.7280287988795986e-06, + "loss": 0.4923, + "step": 4210 + }, + { + "epoch": 2.051152971744073, + "grad_norm": 2.6408400535583496, + "learning_rate": 3.7274685605191352e-06, + "loss": 0.4972, + "step": 4211 + }, + { + "epoch": 2.05164014290354, + "grad_norm": 2.7354073524475098, + "learning_rate": 3.7269082409249173e-06, + "loss": 0.49, + "step": 4212 + }, + { + "epoch": 2.0521273140630076, + "grad_norm": 2.507707118988037, + "learning_rate": 3.7263478401340257e-06, + "loss": 0.5108, + "step": 4213 + }, + { + "epoch": 2.052614485222475, + "grad_norm": 2.8346614837646484, + "learning_rate": 3.7257873581835487e-06, + "loss": 0.4759, + "step": 4214 + }, + { + "epoch": 2.053101656381942, + "grad_norm": 2.695061683654785, + "learning_rate": 3.725226795110578e-06, + "loss": 0.5313, + "step": 4215 + }, + { + "epoch": 2.0535888275414096, + "grad_norm": 2.6383893489837646, + "learning_rate": 3.7246661509522123e-06, + "loss": 0.4656, + "step": 4216 + }, + { + "epoch": 2.0540759987008768, + "grad_norm": 3.0851900577545166, + "learning_rate": 3.7241054257455543e-06, + "loss": 0.5536, + "step": 4217 + }, + { + "epoch": 2.0545631698603444, + "grad_norm": 2.72951078414917, + "learning_rate": 3.723544619527714e-06, + "loss": 0.4884, + "step": 4218 + }, + { + "epoch": 2.0550503410198115, + "grad_norm": 2.6348977088928223, + "learning_rate": 3.722983732335804e-06, + "loss": 0.4534, + "step": 4219 + }, + { + "epoch": 2.055537512179279, + "grad_norm": 2.530121088027954, + "learning_rate": 3.7224227642069455e-06, + "loss": 0.4628, + "step": 4220 + }, + { + "epoch": 2.0560246833387463, + "grad_norm": 2.9828946590423584, + "learning_rate": 3.721861715178261e-06, + "loss": 0.5085, + "step": 4221 + }, + { + "epoch": 2.0565118544982135, + "grad_norm": 3.4527506828308105, + "learning_rate": 3.7213005852868834e-06, + "loss": 0.4728, + "step": 4222 + }, + { + "epoch": 2.056999025657681, + "grad_norm": 2.926607847213745, + "learning_rate": 3.7207393745699453e-06, + "loss": 0.5542, + "step": 4223 + }, + { + "epoch": 2.0574861968171483, + "grad_norm": 3.143350124359131, + "learning_rate": 3.7201780830645907e-06, + "loss": 0.4515, + "step": 4224 + }, + { + "epoch": 2.057973367976616, + "grad_norm": 2.8731844425201416, + "learning_rate": 3.719616710807963e-06, + "loss": 0.456, + "step": 4225 + }, + { + "epoch": 2.058460539136083, + "grad_norm": 3.029998302459717, + "learning_rate": 3.7190552578372153e-06, + "loss": 0.5764, + "step": 4226 + }, + { + "epoch": 2.0589477102955507, + "grad_norm": 2.5995895862579346, + "learning_rate": 3.7184937241895043e-06, + "loss": 0.505, + "step": 4227 + }, + { + "epoch": 2.059434881455018, + "grad_norm": 2.7564008235931396, + "learning_rate": 3.7179321099019917e-06, + "loss": 0.5059, + "step": 4228 + }, + { + "epoch": 2.059922052614485, + "grad_norm": 3.000349283218384, + "learning_rate": 3.7173704150118455e-06, + "loss": 0.5766, + "step": 4229 + }, + { + "epoch": 2.0604092237739526, + "grad_norm": 2.9793319702148438, + "learning_rate": 3.716808639556239e-06, + "loss": 0.507, + "step": 4230 + }, + { + "epoch": 2.06089639493342, + "grad_norm": 2.5879807472229004, + "learning_rate": 3.71624678357235e-06, + "loss": 0.4827, + "step": 4231 + }, + { + "epoch": 2.0613835660928874, + "grad_norm": 3.017690420150757, + "learning_rate": 3.715684847097362e-06, + "loss": 0.5707, + "step": 4232 + }, + { + "epoch": 2.0618707372523546, + "grad_norm": 3.060126543045044, + "learning_rate": 3.7151228301684637e-06, + "loss": 0.5154, + "step": 4233 + }, + { + "epoch": 2.062357908411822, + "grad_norm": 3.000890016555786, + "learning_rate": 3.71456073282285e-06, + "loss": 0.4768, + "step": 4234 + }, + { + "epoch": 2.0628450795712894, + "grad_norm": 2.686298370361328, + "learning_rate": 3.71399855509772e-06, + "loss": 0.448, + "step": 4235 + }, + { + "epoch": 2.0633322507307565, + "grad_norm": 2.6077170372009277, + "learning_rate": 3.7134362970302786e-06, + "loss": 0.5174, + "step": 4236 + }, + { + "epoch": 2.063819421890224, + "grad_norm": 2.9532368183135986, + "learning_rate": 3.7128739586577357e-06, + "loss": 0.5011, + "step": 4237 + }, + { + "epoch": 2.0643065930496913, + "grad_norm": 2.704524278640747, + "learning_rate": 3.7123115400173082e-06, + "loss": 0.491, + "step": 4238 + }, + { + "epoch": 2.064793764209159, + "grad_norm": 2.817486047744751, + "learning_rate": 3.7117490411462147e-06, + "loss": 0.5469, + "step": 4239 + }, + { + "epoch": 2.065280935368626, + "grad_norm": 2.785038948059082, + "learning_rate": 3.7111864620816836e-06, + "loss": 0.4841, + "step": 4240 + }, + { + "epoch": 2.0657681065280937, + "grad_norm": 3.065769672393799, + "learning_rate": 3.710623802860945e-06, + "loss": 0.4699, + "step": 4241 + }, + { + "epoch": 2.066255277687561, + "grad_norm": 2.5411338806152344, + "learning_rate": 3.7100610635212354e-06, + "loss": 0.5643, + "step": 4242 + }, + { + "epoch": 2.066742448847028, + "grad_norm": 2.7806694507598877, + "learning_rate": 3.709498244099797e-06, + "loss": 0.4762, + "step": 4243 + }, + { + "epoch": 2.0672296200064957, + "grad_norm": 2.7232422828674316, + "learning_rate": 3.7089353446338776e-06, + "loss": 0.4737, + "step": 4244 + }, + { + "epoch": 2.067716791165963, + "grad_norm": 2.7071354389190674, + "learning_rate": 3.708372365160731e-06, + "loss": 0.502, + "step": 4245 + }, + { + "epoch": 2.0682039623254305, + "grad_norm": 2.7851455211639404, + "learning_rate": 3.707809305717612e-06, + "loss": 0.4992, + "step": 4246 + }, + { + "epoch": 2.0686911334848976, + "grad_norm": 2.8055579662323, + "learning_rate": 3.7072461663417865e-06, + "loss": 0.4035, + "step": 4247 + }, + { + "epoch": 2.0691783046443653, + "grad_norm": 2.4945871829986572, + "learning_rate": 3.7066829470705224e-06, + "loss": 0.4828, + "step": 4248 + }, + { + "epoch": 2.0696654758038324, + "grad_norm": 3.011873245239258, + "learning_rate": 3.7061196479410923e-06, + "loss": 0.5127, + "step": 4249 + }, + { + "epoch": 2.0701526469632996, + "grad_norm": 2.9066483974456787, + "learning_rate": 3.705556268990777e-06, + "loss": 0.4826, + "step": 4250 + }, + { + "epoch": 2.070639818122767, + "grad_norm": 2.353536367416382, + "learning_rate": 3.70499281025686e-06, + "loss": 0.4462, + "step": 4251 + }, + { + "epoch": 2.0711269892822344, + "grad_norm": 3.402890682220459, + "learning_rate": 3.7044292717766306e-06, + "loss": 0.4759, + "step": 4252 + }, + { + "epoch": 2.071614160441702, + "grad_norm": 2.7642486095428467, + "learning_rate": 3.7038656535873848e-06, + "loss": 0.4155, + "step": 4253 + }, + { + "epoch": 2.072101331601169, + "grad_norm": 2.4162635803222656, + "learning_rate": 3.7033019557264216e-06, + "loss": 0.4978, + "step": 4254 + }, + { + "epoch": 2.0725885027606368, + "grad_norm": 2.755091905593872, + "learning_rate": 3.7027381782310472e-06, + "loss": 0.5132, + "step": 4255 + }, + { + "epoch": 2.073075673920104, + "grad_norm": 2.8467538356781006, + "learning_rate": 3.702174321138572e-06, + "loss": 0.4761, + "step": 4256 + }, + { + "epoch": 2.073562845079571, + "grad_norm": 2.655385732650757, + "learning_rate": 3.7016103844863124e-06, + "loss": 0.4745, + "step": 4257 + }, + { + "epoch": 2.0740500162390387, + "grad_norm": 2.7069857120513916, + "learning_rate": 3.7010463683115894e-06, + "loss": 0.4938, + "step": 4258 + }, + { + "epoch": 2.074537187398506, + "grad_norm": 3.00911545753479, + "learning_rate": 3.7004822726517292e-06, + "loss": 0.5389, + "step": 4259 + }, + { + "epoch": 2.0750243585579735, + "grad_norm": 2.862018585205078, + "learning_rate": 3.699918097544064e-06, + "loss": 0.4984, + "step": 4260 + }, + { + "epoch": 2.0755115297174407, + "grad_norm": 2.7818422317504883, + "learning_rate": 3.699353843025932e-06, + "loss": 0.5161, + "step": 4261 + }, + { + "epoch": 2.0759987008769083, + "grad_norm": 3.0376882553100586, + "learning_rate": 3.6987895091346727e-06, + "loss": 0.4999, + "step": 4262 + }, + { + "epoch": 2.0764858720363755, + "grad_norm": 2.6812143325805664, + "learning_rate": 3.6982250959076366e-06, + "loss": 0.5343, + "step": 4263 + }, + { + "epoch": 2.0769730431958426, + "grad_norm": 2.6568527221679688, + "learning_rate": 3.697660603382174e-06, + "loss": 0.4653, + "step": 4264 + }, + { + "epoch": 2.0774602143553103, + "grad_norm": 2.7283434867858887, + "learning_rate": 3.6970960315956457e-06, + "loss": 0.4881, + "step": 4265 + }, + { + "epoch": 2.0779473855147774, + "grad_norm": 2.4544310569763184, + "learning_rate": 3.6965313805854124e-06, + "loss": 0.4334, + "step": 4266 + }, + { + "epoch": 2.078434556674245, + "grad_norm": 2.589125633239746, + "learning_rate": 3.6959666503888442e-06, + "loss": 0.5048, + "step": 4267 + }, + { + "epoch": 2.078921727833712, + "grad_norm": 2.450631618499756, + "learning_rate": 3.6954018410433147e-06, + "loss": 0.4826, + "step": 4268 + }, + { + "epoch": 2.0794088989931794, + "grad_norm": 2.7049777507781982, + "learning_rate": 3.6948369525862023e-06, + "loss": 0.4762, + "step": 4269 + }, + { + "epoch": 2.079896070152647, + "grad_norm": 2.7815120220184326, + "learning_rate": 3.6942719850548914e-06, + "loss": 0.5529, + "step": 4270 + }, + { + "epoch": 2.080383241312114, + "grad_norm": 2.724656343460083, + "learning_rate": 3.693706938486772e-06, + "loss": 0.5095, + "step": 4271 + }, + { + "epoch": 2.0808704124715818, + "grad_norm": 2.8838353157043457, + "learning_rate": 3.6931418129192386e-06, + "loss": 0.449, + "step": 4272 + }, + { + "epoch": 2.081357583631049, + "grad_norm": 2.7331600189208984, + "learning_rate": 3.6925766083896918e-06, + "loss": 0.4303, + "step": 4273 + }, + { + "epoch": 2.0818447547905166, + "grad_norm": 2.9291152954101562, + "learning_rate": 3.6920113249355347e-06, + "loss": 0.4762, + "step": 4274 + }, + { + "epoch": 2.0823319259499837, + "grad_norm": 2.6823654174804688, + "learning_rate": 3.69144596259418e-06, + "loss": 0.4978, + "step": 4275 + }, + { + "epoch": 2.082819097109451, + "grad_norm": 2.493927240371704, + "learning_rate": 3.6908805214030425e-06, + "loss": 0.4838, + "step": 4276 + }, + { + "epoch": 2.0833062682689185, + "grad_norm": 2.5744917392730713, + "learning_rate": 3.6903150013995426e-06, + "loss": 0.4881, + "step": 4277 + }, + { + "epoch": 2.0837934394283857, + "grad_norm": 2.5448977947235107, + "learning_rate": 3.6897494026211067e-06, + "loss": 0.5176, + "step": 4278 + }, + { + "epoch": 2.0842806105878533, + "grad_norm": 2.7847838401794434, + "learning_rate": 3.6891837251051667e-06, + "loss": 0.454, + "step": 4279 + }, + { + "epoch": 2.0847677817473205, + "grad_norm": 2.7775795459747314, + "learning_rate": 3.6886179688891577e-06, + "loss": 0.4902, + "step": 4280 + }, + { + "epoch": 2.085254952906788, + "grad_norm": 2.8158462047576904, + "learning_rate": 3.688052134010523e-06, + "loss": 0.5607, + "step": 4281 + }, + { + "epoch": 2.0857421240662553, + "grad_norm": 2.979435443878174, + "learning_rate": 3.687486220506708e-06, + "loss": 0.4794, + "step": 4282 + }, + { + "epoch": 2.0862292952257224, + "grad_norm": 2.6575722694396973, + "learning_rate": 3.6869202284151663e-06, + "loss": 0.4485, + "step": 4283 + }, + { + "epoch": 2.08671646638519, + "grad_norm": 2.9644830226898193, + "learning_rate": 3.686354157773354e-06, + "loss": 0.5424, + "step": 4284 + }, + { + "epoch": 2.087203637544657, + "grad_norm": 3.0263850688934326, + "learning_rate": 3.6857880086187335e-06, + "loss": 0.446, + "step": 4285 + }, + { + "epoch": 2.087690808704125, + "grad_norm": 3.8754138946533203, + "learning_rate": 3.6852217809887734e-06, + "loss": 0.4675, + "step": 4286 + }, + { + "epoch": 2.088177979863592, + "grad_norm": 2.782728672027588, + "learning_rate": 3.684655474920947e-06, + "loss": 0.4337, + "step": 4287 + }, + { + "epoch": 2.0886651510230596, + "grad_norm": 2.7191359996795654, + "learning_rate": 3.6840890904527303e-06, + "loss": 0.5155, + "step": 4288 + }, + { + "epoch": 2.0891523221825268, + "grad_norm": 2.632497787475586, + "learning_rate": 3.6835226276216087e-06, + "loss": 0.4414, + "step": 4289 + }, + { + "epoch": 2.089639493341994, + "grad_norm": 2.620565414428711, + "learning_rate": 3.6829560864650692e-06, + "loss": 0.4617, + "step": 4290 + }, + { + "epoch": 2.0901266645014616, + "grad_norm": 2.6551387310028076, + "learning_rate": 3.6823894670206067e-06, + "loss": 0.4972, + "step": 4291 + }, + { + "epoch": 2.0906138356609287, + "grad_norm": 2.7276132106781006, + "learning_rate": 3.6818227693257193e-06, + "loss": 0.4998, + "step": 4292 + }, + { + "epoch": 2.0911010068203963, + "grad_norm": 2.8919425010681152, + "learning_rate": 3.681255993417911e-06, + "loss": 0.5068, + "step": 4293 + }, + { + "epoch": 2.0915881779798635, + "grad_norm": 2.87068247795105, + "learning_rate": 3.680689139334691e-06, + "loss": 0.4197, + "step": 4294 + }, + { + "epoch": 2.092075349139331, + "grad_norm": 2.5613327026367188, + "learning_rate": 3.6801222071135735e-06, + "loss": 0.4515, + "step": 4295 + }, + { + "epoch": 2.0925625202987983, + "grad_norm": 3.054976224899292, + "learning_rate": 3.6795551967920783e-06, + "loss": 0.5055, + "step": 4296 + }, + { + "epoch": 2.0930496914582655, + "grad_norm": 2.486490249633789, + "learning_rate": 3.6789881084077302e-06, + "loss": 0.4377, + "step": 4297 + }, + { + "epoch": 2.093536862617733, + "grad_norm": 2.950897693634033, + "learning_rate": 3.6784209419980588e-06, + "loss": 0.5331, + "step": 4298 + }, + { + "epoch": 2.0940240337772003, + "grad_norm": 3.348738431930542, + "learning_rate": 3.677853697600599e-06, + "loss": 0.4734, + "step": 4299 + }, + { + "epoch": 2.094511204936668, + "grad_norm": 3.1199443340301514, + "learning_rate": 3.6772863752528916e-06, + "loss": 0.4926, + "step": 4300 + }, + { + "epoch": 2.094998376096135, + "grad_norm": 2.63073992729187, + "learning_rate": 3.6767189749924807e-06, + "loss": 0.4389, + "step": 4301 + }, + { + "epoch": 2.0954855472556027, + "grad_norm": 2.608527660369873, + "learning_rate": 3.6761514968569185e-06, + "loss": 0.455, + "step": 4302 + }, + { + "epoch": 2.09597271841507, + "grad_norm": 3.067507266998291, + "learning_rate": 3.6755839408837592e-06, + "loss": 0.5143, + "step": 4303 + }, + { + "epoch": 2.096459889574537, + "grad_norm": 2.8484692573547363, + "learning_rate": 3.6750163071105644e-06, + "loss": 0.4935, + "step": 4304 + }, + { + "epoch": 2.0969470607340046, + "grad_norm": 2.88215970993042, + "learning_rate": 3.6744485955749e-06, + "loss": 0.5378, + "step": 4305 + }, + { + "epoch": 2.097434231893472, + "grad_norm": 2.3850388526916504, + "learning_rate": 3.6738808063143366e-06, + "loss": 0.502, + "step": 4306 + }, + { + "epoch": 2.0979214030529394, + "grad_norm": 2.8434207439422607, + "learning_rate": 3.6733129393664503e-06, + "loss": 0.5269, + "step": 4307 + }, + { + "epoch": 2.0984085742124066, + "grad_norm": 2.6615958213806152, + "learning_rate": 3.672744994768824e-06, + "loss": 0.4231, + "step": 4308 + }, + { + "epoch": 2.098895745371874, + "grad_norm": 2.8757357597351074, + "learning_rate": 3.6721769725590422e-06, + "loss": 0.4706, + "step": 4309 + }, + { + "epoch": 2.0993829165313413, + "grad_norm": 2.883018970489502, + "learning_rate": 3.6716088727746977e-06, + "loss": 0.5042, + "step": 4310 + }, + { + "epoch": 2.0998700876908085, + "grad_norm": 2.6068105697631836, + "learning_rate": 3.671040695453387e-06, + "loss": 0.5512, + "step": 4311 + }, + { + "epoch": 2.100357258850276, + "grad_norm": 2.6428964138031006, + "learning_rate": 3.670472440632713e-06, + "loss": 0.4518, + "step": 4312 + }, + { + "epoch": 2.1008444300097433, + "grad_norm": 3.3737220764160156, + "learning_rate": 3.669904108350281e-06, + "loss": 0.5271, + "step": 4313 + }, + { + "epoch": 2.101331601169211, + "grad_norm": 2.686514139175415, + "learning_rate": 3.6693356986437047e-06, + "loss": 0.527, + "step": 4314 + }, + { + "epoch": 2.101818772328678, + "grad_norm": 2.8012311458587646, + "learning_rate": 3.6687672115506007e-06, + "loss": 0.4621, + "step": 4315 + }, + { + "epoch": 2.1023059434881457, + "grad_norm": 3.1327314376831055, + "learning_rate": 3.668198647108592e-06, + "loss": 0.4936, + "step": 4316 + }, + { + "epoch": 2.102793114647613, + "grad_norm": 2.4637444019317627, + "learning_rate": 3.667630005355304e-06, + "loss": 0.4787, + "step": 4317 + }, + { + "epoch": 2.10328028580708, + "grad_norm": 3.020857572555542, + "learning_rate": 3.6670612863283718e-06, + "loss": 0.528, + "step": 4318 + }, + { + "epoch": 2.1037674569665477, + "grad_norm": 2.944711685180664, + "learning_rate": 3.666492490065432e-06, + "loss": 0.4531, + "step": 4319 + }, + { + "epoch": 2.104254628126015, + "grad_norm": 3.0212109088897705, + "learning_rate": 3.665923616604129e-06, + "loss": 0.4469, + "step": 4320 + }, + { + "epoch": 2.1047417992854824, + "grad_norm": 3.0230743885040283, + "learning_rate": 3.6653546659821083e-06, + "loss": 0.4335, + "step": 4321 + }, + { + "epoch": 2.1052289704449496, + "grad_norm": 2.712663412094116, + "learning_rate": 3.664785638237025e-06, + "loss": 0.5068, + "step": 4322 + }, + { + "epoch": 2.1057161416044172, + "grad_norm": 2.9053194522857666, + "learning_rate": 3.664216533406536e-06, + "loss": 0.4812, + "step": 4323 + }, + { + "epoch": 2.1062033127638844, + "grad_norm": 2.855036497116089, + "learning_rate": 3.663647351528306e-06, + "loss": 0.5043, + "step": 4324 + }, + { + "epoch": 2.1066904839233516, + "grad_norm": 2.6325438022613525, + "learning_rate": 3.6630780926400023e-06, + "loss": 0.533, + "step": 4325 + }, + { + "epoch": 2.107177655082819, + "grad_norm": 3.2508974075317383, + "learning_rate": 3.6625087567792988e-06, + "loss": 0.4915, + "step": 4326 + }, + { + "epoch": 2.1076648262422863, + "grad_norm": 2.831705331802368, + "learning_rate": 3.6619393439838734e-06, + "loss": 0.4918, + "step": 4327 + }, + { + "epoch": 2.108151997401754, + "grad_norm": 2.94759202003479, + "learning_rate": 3.661369854291411e-06, + "loss": 0.4607, + "step": 4328 + }, + { + "epoch": 2.108639168561221, + "grad_norm": 3.1438193321228027, + "learning_rate": 3.6608002877396e-06, + "loss": 0.5171, + "step": 4329 + }, + { + "epoch": 2.1091263397206887, + "grad_norm": 3.212348461151123, + "learning_rate": 3.660230644366134e-06, + "loss": 0.5483, + "step": 4330 + }, + { + "epoch": 2.109613510880156, + "grad_norm": 2.685136079788208, + "learning_rate": 3.6596609242087115e-06, + "loss": 0.4485, + "step": 4331 + }, + { + "epoch": 2.110100682039623, + "grad_norm": 2.725832462310791, + "learning_rate": 3.6590911273050377e-06, + "loss": 0.4967, + "step": 4332 + }, + { + "epoch": 2.1105878531990907, + "grad_norm": 2.794126510620117, + "learning_rate": 3.6585212536928206e-06, + "loss": 0.4983, + "step": 4333 + }, + { + "epoch": 2.111075024358558, + "grad_norm": 3.09554386138916, + "learning_rate": 3.6579513034097747e-06, + "loss": 0.5465, + "step": 4334 + }, + { + "epoch": 2.1115621955180255, + "grad_norm": 2.670095443725586, + "learning_rate": 3.65738127649362e-06, + "loss": 0.5087, + "step": 4335 + }, + { + "epoch": 2.1120493666774927, + "grad_norm": 2.596989393234253, + "learning_rate": 3.6568111729820794e-06, + "loss": 0.4947, + "step": 4336 + }, + { + "epoch": 2.1125365378369603, + "grad_norm": 2.9184696674346924, + "learning_rate": 3.6562409929128833e-06, + "loss": 0.5211, + "step": 4337 + }, + { + "epoch": 2.1130237089964274, + "grad_norm": 2.9508652687072754, + "learning_rate": 3.6556707363237666e-06, + "loss": 0.4228, + "step": 4338 + }, + { + "epoch": 2.1135108801558946, + "grad_norm": 2.6787772178649902, + "learning_rate": 3.6551004032524673e-06, + "loss": 0.4745, + "step": 4339 + }, + { + "epoch": 2.1139980513153622, + "grad_norm": 2.8314406871795654, + "learning_rate": 3.6545299937367317e-06, + "loss": 0.4857, + "step": 4340 + }, + { + "epoch": 2.1144852224748294, + "grad_norm": 3.137101888656616, + "learning_rate": 3.6539595078143077e-06, + "loss": 0.5188, + "step": 4341 + }, + { + "epoch": 2.114972393634297, + "grad_norm": 2.5379550457000732, + "learning_rate": 3.6533889455229523e-06, + "loss": 0.5128, + "step": 4342 + }, + { + "epoch": 2.115459564793764, + "grad_norm": 2.487180471420288, + "learning_rate": 3.652818306900422e-06, + "loss": 0.5495, + "step": 4343 + }, + { + "epoch": 2.115946735953232, + "grad_norm": 2.826078414916992, + "learning_rate": 3.6522475919844847e-06, + "loss": 0.5168, + "step": 4344 + }, + { + "epoch": 2.116433907112699, + "grad_norm": 2.6808269023895264, + "learning_rate": 3.6516768008129096e-06, + "loss": 0.5168, + "step": 4345 + }, + { + "epoch": 2.116921078272166, + "grad_norm": 2.58339786529541, + "learning_rate": 3.6511059334234698e-06, + "loss": 0.477, + "step": 4346 + }, + { + "epoch": 2.1174082494316337, + "grad_norm": 2.838092803955078, + "learning_rate": 3.650534989853947e-06, + "loss": 0.4749, + "step": 4347 + }, + { + "epoch": 2.117895420591101, + "grad_norm": 3.0403878688812256, + "learning_rate": 3.649963970142126e-06, + "loss": 0.4999, + "step": 4348 + }, + { + "epoch": 2.1183825917505685, + "grad_norm": 2.6701467037200928, + "learning_rate": 3.6493928743257963e-06, + "loss": 0.5027, + "step": 4349 + }, + { + "epoch": 2.1188697629100357, + "grad_norm": 2.4554028511047363, + "learning_rate": 3.648821702442753e-06, + "loss": 0.4757, + "step": 4350 + }, + { + "epoch": 2.119356934069503, + "grad_norm": 2.978755474090576, + "learning_rate": 3.648250454530797e-06, + "loss": 0.4932, + "step": 4351 + }, + { + "epoch": 2.1198441052289705, + "grad_norm": 2.4069864749908447, + "learning_rate": 3.647679130627732e-06, + "loss": 0.4084, + "step": 4352 + }, + { + "epoch": 2.1203312763884377, + "grad_norm": 2.772094488143921, + "learning_rate": 3.6471077307713697e-06, + "loss": 0.4933, + "step": 4353 + }, + { + "epoch": 2.1208184475479053, + "grad_norm": 2.9927825927734375, + "learning_rate": 3.646536254999524e-06, + "loss": 0.4801, + "step": 4354 + }, + { + "epoch": 2.1213056187073724, + "grad_norm": 2.2702834606170654, + "learning_rate": 3.6459647033500163e-06, + "loss": 0.3564, + "step": 4355 + }, + { + "epoch": 2.12179278986684, + "grad_norm": 2.611454725265503, + "learning_rate": 3.645393075860671e-06, + "loss": 0.4491, + "step": 4356 + }, + { + "epoch": 2.1222799610263072, + "grad_norm": 2.560591459274292, + "learning_rate": 3.6448213725693186e-06, + "loss": 0.4569, + "step": 4357 + }, + { + "epoch": 2.1227671321857744, + "grad_norm": 2.6585261821746826, + "learning_rate": 3.6442495935137945e-06, + "loss": 0.5417, + "step": 4358 + }, + { + "epoch": 2.123254303345242, + "grad_norm": 3.056579351425171, + "learning_rate": 3.643677738731939e-06, + "loss": 0.4525, + "step": 4359 + }, + { + "epoch": 2.123741474504709, + "grad_norm": 2.695089817047119, + "learning_rate": 3.6431058082615966e-06, + "loss": 0.484, + "step": 4360 + }, + { + "epoch": 2.124228645664177, + "grad_norm": 2.7861056327819824, + "learning_rate": 3.642533802140619e-06, + "loss": 0.4485, + "step": 4361 + }, + { + "epoch": 2.124715816823644, + "grad_norm": 2.9242191314697266, + "learning_rate": 3.64196172040686e-06, + "loss": 0.5257, + "step": 4362 + }, + { + "epoch": 2.1252029879831116, + "grad_norm": 2.6898584365844727, + "learning_rate": 3.6413895630981816e-06, + "loss": 0.4505, + "step": 4363 + }, + { + "epoch": 2.1256901591425788, + "grad_norm": 2.946091651916504, + "learning_rate": 3.6408173302524475e-06, + "loss": 0.5213, + "step": 4364 + }, + { + "epoch": 2.126177330302046, + "grad_norm": 3.24550724029541, + "learning_rate": 3.6402450219075294e-06, + "loss": 0.5191, + "step": 4365 + }, + { + "epoch": 2.1266645014615135, + "grad_norm": 2.854672431945801, + "learning_rate": 3.639672638101301e-06, + "loss": 0.5323, + "step": 4366 + }, + { + "epoch": 2.1271516726209807, + "grad_norm": 2.821807384490967, + "learning_rate": 3.639100178871644e-06, + "loss": 0.4799, + "step": 4367 + }, + { + "epoch": 2.1276388437804483, + "grad_norm": 2.8042616844177246, + "learning_rate": 3.6385276442564433e-06, + "loss": 0.5079, + "step": 4368 + }, + { + "epoch": 2.1281260149399155, + "grad_norm": 2.9045660495758057, + "learning_rate": 3.6379550342935887e-06, + "loss": 0.5148, + "step": 4369 + }, + { + "epoch": 2.128613186099383, + "grad_norm": 2.606698751449585, + "learning_rate": 3.637382349020976e-06, + "loss": 0.4564, + "step": 4370 + }, + { + "epoch": 2.1291003572588503, + "grad_norm": 3.0447838306427, + "learning_rate": 3.6368095884765057e-06, + "loss": 0.5205, + "step": 4371 + }, + { + "epoch": 2.1295875284183174, + "grad_norm": 2.962946653366089, + "learning_rate": 3.6362367526980826e-06, + "loss": 0.5315, + "step": 4372 + }, + { + "epoch": 2.130074699577785, + "grad_norm": 2.81286883354187, + "learning_rate": 3.6356638417236175e-06, + "loss": 0.4535, + "step": 4373 + }, + { + "epoch": 2.1305618707372522, + "grad_norm": 2.5967020988464355, + "learning_rate": 3.635090855591024e-06, + "loss": 0.4796, + "step": 4374 + }, + { + "epoch": 2.13104904189672, + "grad_norm": 2.5718231201171875, + "learning_rate": 3.634517794338224e-06, + "loss": 0.4646, + "step": 4375 + }, + { + "epoch": 2.131536213056187, + "grad_norm": 3.9905638694763184, + "learning_rate": 3.633944658003142e-06, + "loss": 0.4307, + "step": 4376 + }, + { + "epoch": 2.1320233842156546, + "grad_norm": 2.9422457218170166, + "learning_rate": 3.6333714466237087e-06, + "loss": 0.5364, + "step": 4377 + }, + { + "epoch": 2.132510555375122, + "grad_norm": 2.667640447616577, + "learning_rate": 3.6327981602378577e-06, + "loss": 0.5138, + "step": 4378 + }, + { + "epoch": 2.132997726534589, + "grad_norm": 2.7287583351135254, + "learning_rate": 3.63222479888353e-06, + "loss": 0.4283, + "step": 4379 + }, + { + "epoch": 2.1334848976940566, + "grad_norm": 2.7172036170959473, + "learning_rate": 3.6316513625986717e-06, + "loss": 0.4281, + "step": 4380 + }, + { + "epoch": 2.1339720688535238, + "grad_norm": 2.638378381729126, + "learning_rate": 3.6310778514212296e-06, + "loss": 0.4593, + "step": 4381 + }, + { + "epoch": 2.1344592400129914, + "grad_norm": 2.5749123096466064, + "learning_rate": 3.6305042653891624e-06, + "loss": 0.4551, + "step": 4382 + }, + { + "epoch": 2.1349464111724585, + "grad_norm": 2.865609645843506, + "learning_rate": 3.6299306045404276e-06, + "loss": 0.5079, + "step": 4383 + }, + { + "epoch": 2.135433582331926, + "grad_norm": 2.761932134628296, + "learning_rate": 3.6293568689129903e-06, + "loss": 0.4615, + "step": 4384 + }, + { + "epoch": 2.1359207534913933, + "grad_norm": 2.7411038875579834, + "learning_rate": 3.6287830585448214e-06, + "loss": 0.4334, + "step": 4385 + }, + { + "epoch": 2.1364079246508605, + "grad_norm": 2.7998313903808594, + "learning_rate": 3.6282091734738933e-06, + "loss": 0.4837, + "step": 4386 + }, + { + "epoch": 2.136895095810328, + "grad_norm": 2.758434295654297, + "learning_rate": 3.6276352137381887e-06, + "loss": 0.5332, + "step": 4387 + }, + { + "epoch": 2.1373822669697953, + "grad_norm": 3.0076277256011963, + "learning_rate": 3.62706117937569e-06, + "loss": 0.4547, + "step": 4388 + }, + { + "epoch": 2.137869438129263, + "grad_norm": 2.693559169769287, + "learning_rate": 3.626487070424387e-06, + "loss": 0.5279, + "step": 4389 + }, + { + "epoch": 2.13835660928873, + "grad_norm": 2.947437047958374, + "learning_rate": 3.6259128869222747e-06, + "loss": 0.5385, + "step": 4390 + }, + { + "epoch": 2.1388437804481977, + "grad_norm": 2.6336095333099365, + "learning_rate": 3.6253386289073533e-06, + "loss": 0.4919, + "step": 4391 + }, + { + "epoch": 2.139330951607665, + "grad_norm": 3.012072801589966, + "learning_rate": 3.6247642964176254e-06, + "loss": 0.4633, + "step": 4392 + }, + { + "epoch": 2.139818122767132, + "grad_norm": 3.023101329803467, + "learning_rate": 3.6241898894911017e-06, + "loss": 0.5133, + "step": 4393 + }, + { + "epoch": 2.1403052939265996, + "grad_norm": 2.6522035598754883, + "learning_rate": 3.6236154081657947e-06, + "loss": 0.4807, + "step": 4394 + }, + { + "epoch": 2.140792465086067, + "grad_norm": 2.8969695568084717, + "learning_rate": 3.6230408524797254e-06, + "loss": 0.5263, + "step": 4395 + }, + { + "epoch": 2.1412796362455344, + "grad_norm": 2.6455883979797363, + "learning_rate": 3.622466222470918e-06, + "loss": 0.5033, + "step": 4396 + }, + { + "epoch": 2.1417668074050016, + "grad_norm": 2.885035753250122, + "learning_rate": 3.621891518177399e-06, + "loss": 0.4562, + "step": 4397 + }, + { + "epoch": 2.1422539785644688, + "grad_norm": 2.9905335903167725, + "learning_rate": 3.6213167396372045e-06, + "loss": 0.4774, + "step": 4398 + }, + { + "epoch": 2.1427411497239364, + "grad_norm": 2.5854625701904297, + "learning_rate": 3.620741886888372e-06, + "loss": 0.4998, + "step": 4399 + }, + { + "epoch": 2.1432283208834035, + "grad_norm": 2.918337106704712, + "learning_rate": 3.6201669599689466e-06, + "loss": 0.5144, + "step": 4400 + }, + { + "epoch": 2.143715492042871, + "grad_norm": 2.712831735610962, + "learning_rate": 3.6195919589169753e-06, + "loss": 0.4923, + "step": 4401 + }, + { + "epoch": 2.1442026632023383, + "grad_norm": 2.7098748683929443, + "learning_rate": 3.619016883770513e-06, + "loss": 0.4566, + "step": 4402 + }, + { + "epoch": 2.144689834361806, + "grad_norm": 3.2962710857391357, + "learning_rate": 3.6184417345676175e-06, + "loss": 0.5614, + "step": 4403 + }, + { + "epoch": 2.145177005521273, + "grad_norm": 3.095551013946533, + "learning_rate": 3.6178665113463516e-06, + "loss": 0.6611, + "step": 4404 + }, + { + "epoch": 2.1456641766807403, + "grad_norm": 2.878652334213257, + "learning_rate": 3.6172912141447852e-06, + "loss": 0.5201, + "step": 4405 + }, + { + "epoch": 2.146151347840208, + "grad_norm": 3.0722856521606445, + "learning_rate": 3.6167158430009898e-06, + "loss": 0.5103, + "step": 4406 + }, + { + "epoch": 2.146638518999675, + "grad_norm": 3.029651165008545, + "learning_rate": 3.6161403979530437e-06, + "loss": 0.4321, + "step": 4407 + }, + { + "epoch": 2.1471256901591427, + "grad_norm": 2.674102544784546, + "learning_rate": 3.6155648790390295e-06, + "loss": 0.5015, + "step": 4408 + }, + { + "epoch": 2.14761286131861, + "grad_norm": 2.65828800201416, + "learning_rate": 3.614989286297036e-06, + "loss": 0.5447, + "step": 4409 + }, + { + "epoch": 2.1481000324780775, + "grad_norm": 3.070140838623047, + "learning_rate": 3.614413619765156e-06, + "loss": 0.5384, + "step": 4410 + }, + { + "epoch": 2.1485872036375446, + "grad_norm": 3.3104982376098633, + "learning_rate": 3.6138378794814848e-06, + "loss": 0.5327, + "step": 4411 + }, + { + "epoch": 2.149074374797012, + "grad_norm": 2.575892686843872, + "learning_rate": 3.6132620654841277e-06, + "loss": 0.4646, + "step": 4412 + }, + { + "epoch": 2.1495615459564794, + "grad_norm": 3.196704387664795, + "learning_rate": 3.6126861778111906e-06, + "loss": 0.5281, + "step": 4413 + }, + { + "epoch": 2.1500487171159466, + "grad_norm": 2.9957242012023926, + "learning_rate": 3.612110216500786e-06, + "loss": 0.5313, + "step": 4414 + }, + { + "epoch": 2.150535888275414, + "grad_norm": 3.1709470748901367, + "learning_rate": 3.61153418159103e-06, + "loss": 0.5226, + "step": 4415 + }, + { + "epoch": 2.1510230594348814, + "grad_norm": 2.851456880569458, + "learning_rate": 3.6109580731200463e-06, + "loss": 0.5142, + "step": 4416 + }, + { + "epoch": 2.151510230594349, + "grad_norm": 2.9415717124938965, + "learning_rate": 3.6103818911259604e-06, + "loss": 0.4759, + "step": 4417 + }, + { + "epoch": 2.151997401753816, + "grad_norm": 2.675926923751831, + "learning_rate": 3.609805635646904e-06, + "loss": 0.4928, + "step": 4418 + }, + { + "epoch": 2.1524845729132833, + "grad_norm": 2.7974865436553955, + "learning_rate": 3.6092293067210142e-06, + "loss": 0.5163, + "step": 4419 + }, + { + "epoch": 2.152971744072751, + "grad_norm": 2.807515859603882, + "learning_rate": 3.6086529043864326e-06, + "loss": 0.5408, + "step": 4420 + }, + { + "epoch": 2.153458915232218, + "grad_norm": 2.9632177352905273, + "learning_rate": 3.608076428681304e-06, + "loss": 0.5426, + "step": 4421 + }, + { + "epoch": 2.1539460863916857, + "grad_norm": 2.8776237964630127, + "learning_rate": 3.607499879643782e-06, + "loss": 0.5085, + "step": 4422 + }, + { + "epoch": 2.154433257551153, + "grad_norm": 2.5461790561676025, + "learning_rate": 3.60692325731202e-06, + "loss": 0.4854, + "step": 4423 + }, + { + "epoch": 2.1549204287106205, + "grad_norm": 2.8912887573242188, + "learning_rate": 3.6063465617241805e-06, + "loss": 0.5213, + "step": 4424 + }, + { + "epoch": 2.1554075998700877, + "grad_norm": 2.9726369380950928, + "learning_rate": 3.605769792918429e-06, + "loss": 0.46, + "step": 4425 + }, + { + "epoch": 2.155894771029555, + "grad_norm": 2.930936813354492, + "learning_rate": 3.6051929509329342e-06, + "loss": 0.5209, + "step": 4426 + }, + { + "epoch": 2.1563819421890225, + "grad_norm": 2.9343669414520264, + "learning_rate": 3.6046160358058735e-06, + "loss": 0.5104, + "step": 4427 + }, + { + "epoch": 2.1568691133484896, + "grad_norm": 2.4199423789978027, + "learning_rate": 3.6040390475754268e-06, + "loss": 0.419, + "step": 4428 + }, + { + "epoch": 2.1573562845079572, + "grad_norm": 3.0537431240081787, + "learning_rate": 3.6034619862797782e-06, + "loss": 0.5294, + "step": 4429 + }, + { + "epoch": 2.1578434556674244, + "grad_norm": 2.5616729259490967, + "learning_rate": 3.6028848519571185e-06, + "loss": 0.4406, + "step": 4430 + }, + { + "epoch": 2.158330626826892, + "grad_norm": 2.6613330841064453, + "learning_rate": 3.6023076446456415e-06, + "loss": 0.4396, + "step": 4431 + }, + { + "epoch": 2.158817797986359, + "grad_norm": 3.086515426635742, + "learning_rate": 3.6017303643835477e-06, + "loss": 0.5584, + "step": 4432 + }, + { + "epoch": 2.1593049691458264, + "grad_norm": 2.6705808639526367, + "learning_rate": 3.6011530112090404e-06, + "loss": 0.5533, + "step": 4433 + }, + { + "epoch": 2.159792140305294, + "grad_norm": 3.3201041221618652, + "learning_rate": 3.6005755851603304e-06, + "loss": 0.4508, + "step": 4434 + }, + { + "epoch": 2.160279311464761, + "grad_norm": 3.2788400650024414, + "learning_rate": 3.5999980862756295e-06, + "loss": 0.3883, + "step": 4435 + }, + { + "epoch": 2.1607664826242288, + "grad_norm": 2.6732161045074463, + "learning_rate": 3.5994205145931584e-06, + "loss": 0.5058, + "step": 4436 + }, + { + "epoch": 2.161253653783696, + "grad_norm": 3.690370798110962, + "learning_rate": 3.598842870151139e-06, + "loss": 0.5392, + "step": 4437 + }, + { + "epoch": 2.1617408249431636, + "grad_norm": 2.8008575439453125, + "learning_rate": 3.598265152987802e-06, + "loss": 0.5386, + "step": 4438 + }, + { + "epoch": 2.1622279961026307, + "grad_norm": 3.092862367630005, + "learning_rate": 3.597687363141379e-06, + "loss": 0.5208, + "step": 4439 + }, + { + "epoch": 2.162715167262098, + "grad_norm": 3.1549675464630127, + "learning_rate": 3.5971095006501088e-06, + "loss": 0.4573, + "step": 4440 + }, + { + "epoch": 2.1632023384215655, + "grad_norm": 2.83447003364563, + "learning_rate": 3.5965315655522336e-06, + "loss": 0.4998, + "step": 4441 + }, + { + "epoch": 2.1636895095810327, + "grad_norm": 2.7137694358825684, + "learning_rate": 3.5959535578860015e-06, + "loss": 0.502, + "step": 4442 + }, + { + "epoch": 2.1641766807405003, + "grad_norm": 3.12485408782959, + "learning_rate": 3.5953754776896655e-06, + "loss": 0.5032, + "step": 4443 + }, + { + "epoch": 2.1646638518999675, + "grad_norm": 2.564269542694092, + "learning_rate": 3.5947973250014817e-06, + "loss": 0.4479, + "step": 4444 + }, + { + "epoch": 2.165151023059435, + "grad_norm": 2.6541554927825928, + "learning_rate": 3.5942190998597136e-06, + "loss": 0.4947, + "step": 4445 + }, + { + "epoch": 2.1656381942189022, + "grad_norm": 2.6468052864074707, + "learning_rate": 3.593640802302627e-06, + "loss": 0.4576, + "step": 4446 + }, + { + "epoch": 2.1661253653783694, + "grad_norm": 2.6442832946777344, + "learning_rate": 3.5930624323684937e-06, + "loss": 0.4728, + "step": 4447 + }, + { + "epoch": 2.166612536537837, + "grad_norm": 2.988839864730835, + "learning_rate": 3.592483990095591e-06, + "loss": 0.489, + "step": 4448 + }, + { + "epoch": 2.167099707697304, + "grad_norm": 3.051064968109131, + "learning_rate": 3.5919054755221987e-06, + "loss": 0.4419, + "step": 4449 + }, + { + "epoch": 2.167586878856772, + "grad_norm": 2.514317035675049, + "learning_rate": 3.591326888686605e-06, + "loss": 0.4703, + "step": 4450 + }, + { + "epoch": 2.168074050016239, + "grad_norm": 2.754297971725464, + "learning_rate": 3.5907482296270984e-06, + "loss": 0.4896, + "step": 4451 + }, + { + "epoch": 2.1685612211757066, + "grad_norm": 2.7264604568481445, + "learning_rate": 3.590169498381976e-06, + "loss": 0.5486, + "step": 4452 + }, + { + "epoch": 2.1690483923351738, + "grad_norm": 2.8918862342834473, + "learning_rate": 3.5895906949895376e-06, + "loss": 0.4322, + "step": 4453 + }, + { + "epoch": 2.169535563494641, + "grad_norm": 2.7569665908813477, + "learning_rate": 3.589011819488088e-06, + "loss": 0.5688, + "step": 4454 + }, + { + "epoch": 2.1700227346541086, + "grad_norm": 2.8935327529907227, + "learning_rate": 3.588432871915938e-06, + "loss": 0.4721, + "step": 4455 + }, + { + "epoch": 2.1705099058135757, + "grad_norm": 3.5845940113067627, + "learning_rate": 3.5878538523114014e-06, + "loss": 0.5098, + "step": 4456 + }, + { + "epoch": 2.1709970769730433, + "grad_norm": 2.6703667640686035, + "learning_rate": 3.587274760712799e-06, + "loss": 0.484, + "step": 4457 + }, + { + "epoch": 2.1714842481325105, + "grad_norm": 2.9543745517730713, + "learning_rate": 3.586695597158454e-06, + "loss": 0.5423, + "step": 4458 + }, + { + "epoch": 2.171971419291978, + "grad_norm": 2.8635902404785156, + "learning_rate": 3.586116361686695e-06, + "loss": 0.4594, + "step": 4459 + }, + { + "epoch": 2.1724585904514453, + "grad_norm": 3.0032060146331787, + "learning_rate": 3.5855370543358568e-06, + "loss": 0.521, + "step": 4460 + }, + { + "epoch": 2.1729457616109125, + "grad_norm": 2.8970842361450195, + "learning_rate": 3.584957675144277e-06, + "loss": 0.5485, + "step": 4461 + }, + { + "epoch": 2.17343293277038, + "grad_norm": 2.9949817657470703, + "learning_rate": 3.5843782241503e-06, + "loss": 0.5396, + "step": 4462 + }, + { + "epoch": 2.1739201039298472, + "grad_norm": 2.7731127738952637, + "learning_rate": 3.5837987013922725e-06, + "loss": 0.4942, + "step": 4463 + }, + { + "epoch": 2.174407275089315, + "grad_norm": 3.040935516357422, + "learning_rate": 3.583219106908548e-06, + "loss": 0.4799, + "step": 4464 + }, + { + "epoch": 2.174894446248782, + "grad_norm": 3.3745691776275635, + "learning_rate": 3.582639440737484e-06, + "loss": 0.4713, + "step": 4465 + }, + { + "epoch": 2.1753816174082496, + "grad_norm": 2.8348093032836914, + "learning_rate": 3.5820597029174427e-06, + "loss": 0.5302, + "step": 4466 + }, + { + "epoch": 2.175868788567717, + "grad_norm": 4.331133842468262, + "learning_rate": 3.5814798934867913e-06, + "loss": 0.5092, + "step": 4467 + }, + { + "epoch": 2.176355959727184, + "grad_norm": 2.812955379486084, + "learning_rate": 3.5809000124839e-06, + "loss": 0.5219, + "step": 4468 + }, + { + "epoch": 2.1768431308866516, + "grad_norm": 3.010068416595459, + "learning_rate": 3.580320059947148e-06, + "loss": 0.5774, + "step": 4469 + }, + { + "epoch": 2.1773303020461188, + "grad_norm": 2.6530487537384033, + "learning_rate": 3.5797400359149145e-06, + "loss": 0.4934, + "step": 4470 + }, + { + "epoch": 2.1778174732055864, + "grad_norm": 2.7940609455108643, + "learning_rate": 3.579159940425586e-06, + "loss": 0.4544, + "step": 4471 + }, + { + "epoch": 2.1783046443650536, + "grad_norm": 2.6620137691497803, + "learning_rate": 3.5785797735175525e-06, + "loss": 0.4311, + "step": 4472 + }, + { + "epoch": 2.178791815524521, + "grad_norm": 3.0246665477752686, + "learning_rate": 3.577999535229211e-06, + "loss": 0.4953, + "step": 4473 + }, + { + "epoch": 2.1792789866839883, + "grad_norm": 2.7668168544769287, + "learning_rate": 3.57741922559896e-06, + "loss": 0.5962, + "step": 4474 + }, + { + "epoch": 2.1797661578434555, + "grad_norm": 2.6627769470214844, + "learning_rate": 3.576838844665205e-06, + "loss": 0.4309, + "step": 4475 + }, + { + "epoch": 2.180253329002923, + "grad_norm": 2.673954725265503, + "learning_rate": 3.576258392466356e-06, + "loss": 0.5091, + "step": 4476 + }, + { + "epoch": 2.1807405001623903, + "grad_norm": 2.6338489055633545, + "learning_rate": 3.5756778690408266e-06, + "loss": 0.5251, + "step": 4477 + }, + { + "epoch": 2.181227671321858, + "grad_norm": 2.620751142501831, + "learning_rate": 3.575097274427036e-06, + "loss": 0.4755, + "step": 4478 + }, + { + "epoch": 2.181714842481325, + "grad_norm": 2.6879405975341797, + "learning_rate": 3.5745166086634076e-06, + "loss": 0.5047, + "step": 4479 + }, + { + "epoch": 2.1822020136407927, + "grad_norm": 3.0310332775115967, + "learning_rate": 3.57393587178837e-06, + "loss": 0.539, + "step": 4480 + }, + { + "epoch": 2.18268918480026, + "grad_norm": 2.9451887607574463, + "learning_rate": 3.5733550638403567e-06, + "loss": 0.5317, + "step": 4481 + }, + { + "epoch": 2.183176355959727, + "grad_norm": 2.816837787628174, + "learning_rate": 3.572774184857805e-06, + "loss": 0.4813, + "step": 4482 + }, + { + "epoch": 2.1836635271191946, + "grad_norm": 2.4933583736419678, + "learning_rate": 3.5721932348791585e-06, + "loss": 0.4515, + "step": 4483 + }, + { + "epoch": 2.184150698278662, + "grad_norm": 2.871957302093506, + "learning_rate": 3.571612213942862e-06, + "loss": 0.519, + "step": 4484 + }, + { + "epoch": 2.1846378694381294, + "grad_norm": 3.189760208129883, + "learning_rate": 3.5710311220873704e-06, + "loss": 0.5737, + "step": 4485 + }, + { + "epoch": 2.1851250405975966, + "grad_norm": 3.6641581058502197, + "learning_rate": 3.570449959351138e-06, + "loss": 0.5221, + "step": 4486 + }, + { + "epoch": 2.185612211757064, + "grad_norm": 2.8270201683044434, + "learning_rate": 3.5698687257726277e-06, + "loss": 0.5093, + "step": 4487 + }, + { + "epoch": 2.1860993829165314, + "grad_norm": 2.916764259338379, + "learning_rate": 3.5692874213903038e-06, + "loss": 0.547, + "step": 4488 + }, + { + "epoch": 2.1865865540759986, + "grad_norm": 2.9753408432006836, + "learning_rate": 3.5687060462426393e-06, + "loss": 0.49, + "step": 4489 + }, + { + "epoch": 2.187073725235466, + "grad_norm": 2.9213318824768066, + "learning_rate": 3.5681246003681075e-06, + "loss": 0.4609, + "step": 4490 + }, + { + "epoch": 2.1875608963949333, + "grad_norm": 2.596428155899048, + "learning_rate": 3.567543083805189e-06, + "loss": 0.4916, + "step": 4491 + }, + { + "epoch": 2.188048067554401, + "grad_norm": 2.8073172569274902, + "learning_rate": 3.5669614965923682e-06, + "loss": 0.5033, + "step": 4492 + }, + { + "epoch": 2.188535238713868, + "grad_norm": 2.5855720043182373, + "learning_rate": 3.566379838768136e-06, + "loss": 0.4362, + "step": 4493 + }, + { + "epoch": 2.1890224098733353, + "grad_norm": 2.9274489879608154, + "learning_rate": 3.5657981103709843e-06, + "loss": 0.4937, + "step": 4494 + }, + { + "epoch": 2.189509581032803, + "grad_norm": 2.914775848388672, + "learning_rate": 3.5652163114394133e-06, + "loss": 0.4872, + "step": 4495 + }, + { + "epoch": 2.18999675219227, + "grad_norm": 2.6423606872558594, + "learning_rate": 3.564634442011926e-06, + "loss": 0.5234, + "step": 4496 + }, + { + "epoch": 2.1904839233517377, + "grad_norm": 2.910019874572754, + "learning_rate": 3.5640525021270306e-06, + "loss": 0.5243, + "step": 4497 + }, + { + "epoch": 2.190971094511205, + "grad_norm": 2.771986484527588, + "learning_rate": 3.5634704918232394e-06, + "loss": 0.5519, + "step": 4498 + }, + { + "epoch": 2.1914582656706725, + "grad_norm": 2.634202718734741, + "learning_rate": 3.5628884111390703e-06, + "loss": 0.4285, + "step": 4499 + }, + { + "epoch": 2.1919454368301396, + "grad_norm": 2.742609739303589, + "learning_rate": 3.5623062601130455e-06, + "loss": 0.4712, + "step": 4500 + }, + { + "epoch": 2.192432607989607, + "grad_norm": 2.74979829788208, + "learning_rate": 3.5617240387836904e-06, + "loss": 0.4204, + "step": 4501 + }, + { + "epoch": 2.1929197791490744, + "grad_norm": 2.5955967903137207, + "learning_rate": 3.561141747189538e-06, + "loss": 0.4654, + "step": 4502 + }, + { + "epoch": 2.1934069503085416, + "grad_norm": 3.018862247467041, + "learning_rate": 3.560559385369123e-06, + "loss": 0.4951, + "step": 4503 + }, + { + "epoch": 2.193894121468009, + "grad_norm": 2.911381721496582, + "learning_rate": 3.559976953360987e-06, + "loss": 0.4855, + "step": 4504 + }, + { + "epoch": 2.1943812926274764, + "grad_norm": 2.803682327270508, + "learning_rate": 3.5593944512036755e-06, + "loss": 0.4438, + "step": 4505 + }, + { + "epoch": 2.194868463786944, + "grad_norm": 2.7295100688934326, + "learning_rate": 3.5588118789357374e-06, + "loss": 0.4174, + "step": 4506 + }, + { + "epoch": 2.195355634946411, + "grad_norm": 2.447697639465332, + "learning_rate": 3.558229236595727e-06, + "loss": 0.4973, + "step": 4507 + }, + { + "epoch": 2.1958428061058783, + "grad_norm": 2.6199934482574463, + "learning_rate": 3.5576465242222058e-06, + "loss": 0.4677, + "step": 4508 + }, + { + "epoch": 2.196329977265346, + "grad_norm": 3.1598267555236816, + "learning_rate": 3.5570637418537345e-06, + "loss": 0.4952, + "step": 4509 + }, + { + "epoch": 2.196817148424813, + "grad_norm": 2.7081451416015625, + "learning_rate": 3.556480889528884e-06, + "loss": 0.4561, + "step": 4510 + }, + { + "epoch": 2.1973043195842807, + "grad_norm": 2.678269863128662, + "learning_rate": 3.555897967286227e-06, + "loss": 0.4991, + "step": 4511 + }, + { + "epoch": 2.197791490743748, + "grad_norm": 2.8975696563720703, + "learning_rate": 3.55531497516434e-06, + "loss": 0.5001, + "step": 4512 + }, + { + "epoch": 2.1982786619032155, + "grad_norm": 2.9188361167907715, + "learning_rate": 3.554731913201807e-06, + "loss": 0.4955, + "step": 4513 + }, + { + "epoch": 2.1987658330626827, + "grad_norm": 2.7102572917938232, + "learning_rate": 3.554148781437214e-06, + "loss": 0.5054, + "step": 4514 + }, + { + "epoch": 2.19925300422215, + "grad_norm": 2.650454044342041, + "learning_rate": 3.553565579909153e-06, + "loss": 0.507, + "step": 4515 + }, + { + "epoch": 2.1997401753816175, + "grad_norm": 2.8193018436431885, + "learning_rate": 3.552982308656219e-06, + "loss": 0.4765, + "step": 4516 + }, + { + "epoch": 2.2002273465410846, + "grad_norm": 2.790181875228882, + "learning_rate": 3.5523989677170145e-06, + "loss": 0.5012, + "step": 4517 + }, + { + "epoch": 2.2007145177005523, + "grad_norm": 2.9711427688598633, + "learning_rate": 3.551815557130145e-06, + "loss": 0.5457, + "step": 4518 + }, + { + "epoch": 2.2012016888600194, + "grad_norm": 2.575284004211426, + "learning_rate": 3.551232076934219e-06, + "loss": 0.4731, + "step": 4519 + }, + { + "epoch": 2.201688860019487, + "grad_norm": 2.683988332748413, + "learning_rate": 3.550648527167852e-06, + "loss": 0.4707, + "step": 4520 + }, + { + "epoch": 2.202176031178954, + "grad_norm": 2.6439878940582275, + "learning_rate": 3.5500649078696635e-06, + "loss": 0.4996, + "step": 4521 + }, + { + "epoch": 2.2026632023384214, + "grad_norm": 2.835228681564331, + "learning_rate": 3.5494812190782775e-06, + "loss": 0.4875, + "step": 4522 + }, + { + "epoch": 2.203150373497889, + "grad_norm": 2.7210466861724854, + "learning_rate": 3.548897460832322e-06, + "loss": 0.4378, + "step": 4523 + }, + { + "epoch": 2.203637544657356, + "grad_norm": 2.664034128189087, + "learning_rate": 3.54831363317043e-06, + "loss": 0.5328, + "step": 4524 + }, + { + "epoch": 2.204124715816824, + "grad_norm": 3.0616071224212646, + "learning_rate": 3.547729736131239e-06, + "loss": 0.4973, + "step": 4525 + }, + { + "epoch": 2.204611886976291, + "grad_norm": 2.7511696815490723, + "learning_rate": 3.5471457697533924e-06, + "loss": 0.5036, + "step": 4526 + }, + { + "epoch": 2.205099058135758, + "grad_norm": 2.7003698348999023, + "learning_rate": 3.546561734075536e-06, + "loss": 0.4496, + "step": 4527 + }, + { + "epoch": 2.2055862292952257, + "grad_norm": 2.8163836002349854, + "learning_rate": 3.545977629136321e-06, + "loss": 0.5557, + "step": 4528 + }, + { + "epoch": 2.206073400454693, + "grad_norm": 2.8132212162017822, + "learning_rate": 3.5453934549744044e-06, + "loss": 0.5005, + "step": 4529 + }, + { + "epoch": 2.2065605716141605, + "grad_norm": 2.642232894897461, + "learning_rate": 3.5448092116284468e-06, + "loss": 0.4397, + "step": 4530 + }, + { + "epoch": 2.2070477427736277, + "grad_norm": 3.021250009536743, + "learning_rate": 3.544224899137112e-06, + "loss": 0.5128, + "step": 4531 + }, + { + "epoch": 2.2075349139330953, + "grad_norm": 3.011951446533203, + "learning_rate": 3.543640517539071e-06, + "loss": 0.5222, + "step": 4532 + }, + { + "epoch": 2.2080220850925625, + "grad_norm": 2.469424247741699, + "learning_rate": 3.5430560668729976e-06, + "loss": 0.5101, + "step": 4533 + }, + { + "epoch": 2.2085092562520297, + "grad_norm": 3.0700132846832275, + "learning_rate": 3.5424715471775713e-06, + "loss": 0.5075, + "step": 4534 + }, + { + "epoch": 2.2089964274114973, + "grad_norm": 2.936687469482422, + "learning_rate": 3.541886958491475e-06, + "loss": 0.555, + "step": 4535 + }, + { + "epoch": 2.2094835985709644, + "grad_norm": 2.7110939025878906, + "learning_rate": 3.541302300853396e-06, + "loss": 0.5426, + "step": 4536 + }, + { + "epoch": 2.209970769730432, + "grad_norm": 2.893054246902466, + "learning_rate": 3.5407175743020285e-06, + "loss": 0.5096, + "step": 4537 + }, + { + "epoch": 2.210457940889899, + "grad_norm": 3.041964292526245, + "learning_rate": 3.5401327788760687e-06, + "loss": 0.5141, + "step": 4538 + }, + { + "epoch": 2.210945112049367, + "grad_norm": 2.626317262649536, + "learning_rate": 3.539547914614218e-06, + "loss": 0.4478, + "step": 4539 + }, + { + "epoch": 2.211432283208834, + "grad_norm": 2.6341707706451416, + "learning_rate": 3.5389629815551833e-06, + "loss": 0.5144, + "step": 4540 + }, + { + "epoch": 2.211919454368301, + "grad_norm": 2.57910418510437, + "learning_rate": 3.5383779797376756e-06, + "loss": 0.5187, + "step": 4541 + }, + { + "epoch": 2.212406625527769, + "grad_norm": 2.881862163543701, + "learning_rate": 3.537792909200409e-06, + "loss": 0.5189, + "step": 4542 + }, + { + "epoch": 2.212893796687236, + "grad_norm": 2.469923973083496, + "learning_rate": 3.537207769982105e-06, + "loss": 0.492, + "step": 4543 + }, + { + "epoch": 2.2133809678467036, + "grad_norm": 2.677886724472046, + "learning_rate": 3.536622562121488e-06, + "loss": 0.4717, + "step": 4544 + }, + { + "epoch": 2.2138681390061707, + "grad_norm": 2.6866440773010254, + "learning_rate": 3.5360372856572854e-06, + "loss": 0.4774, + "step": 4545 + }, + { + "epoch": 2.2143553101656384, + "grad_norm": 3.2308244705200195, + "learning_rate": 3.5354519406282316e-06, + "loss": 0.4592, + "step": 4546 + }, + { + "epoch": 2.2148424813251055, + "grad_norm": 2.789896011352539, + "learning_rate": 3.5348665270730655e-06, + "loss": 0.5094, + "step": 4547 + }, + { + "epoch": 2.2153296524845727, + "grad_norm": 2.6344151496887207, + "learning_rate": 3.5342810450305287e-06, + "loss": 0.4312, + "step": 4548 + }, + { + "epoch": 2.2158168236440403, + "grad_norm": 2.737929344177246, + "learning_rate": 3.5336954945393687e-06, + "loss": 0.458, + "step": 4549 + }, + { + "epoch": 2.2163039948035075, + "grad_norm": 2.5745885372161865, + "learning_rate": 3.5331098756383368e-06, + "loss": 0.4327, + "step": 4550 + }, + { + "epoch": 2.216791165962975, + "grad_norm": 2.968714952468872, + "learning_rate": 3.5325241883661903e-06, + "loss": 0.5619, + "step": 4551 + }, + { + "epoch": 2.2172783371224423, + "grad_norm": 2.731976270675659, + "learning_rate": 3.5319384327616885e-06, + "loss": 0.4625, + "step": 4552 + }, + { + "epoch": 2.21776550828191, + "grad_norm": 3.441816568374634, + "learning_rate": 3.5313526088635973e-06, + "loss": 0.5354, + "step": 4553 + }, + { + "epoch": 2.218252679441377, + "grad_norm": 2.9708988666534424, + "learning_rate": 3.530766716710686e-06, + "loss": 0.4699, + "step": 4554 + }, + { + "epoch": 2.218739850600844, + "grad_norm": 2.7039647102355957, + "learning_rate": 3.5301807563417305e-06, + "loss": 0.4522, + "step": 4555 + }, + { + "epoch": 2.219227021760312, + "grad_norm": 2.7281079292297363, + "learning_rate": 3.529594727795508e-06, + "loss": 0.4837, + "step": 4556 + }, + { + "epoch": 2.219714192919779, + "grad_norm": 2.6223602294921875, + "learning_rate": 3.5290086311108026e-06, + "loss": 0.5018, + "step": 4557 + }, + { + "epoch": 2.2202013640792466, + "grad_norm": 2.8410654067993164, + "learning_rate": 3.5284224663264015e-06, + "loss": 0.4897, + "step": 4558 + }, + { + "epoch": 2.220688535238714, + "grad_norm": 2.6856606006622314, + "learning_rate": 3.5278362334810973e-06, + "loss": 0.4835, + "step": 4559 + }, + { + "epoch": 2.2211757063981814, + "grad_norm": 3.01238751411438, + "learning_rate": 3.5272499326136874e-06, + "loss": 0.5454, + "step": 4560 + }, + { + "epoch": 2.2216628775576486, + "grad_norm": 3.063958168029785, + "learning_rate": 3.5266635637629722e-06, + "loss": 0.4905, + "step": 4561 + }, + { + "epoch": 2.2221500487171157, + "grad_norm": 2.8374414443969727, + "learning_rate": 3.5260771269677574e-06, + "loss": 0.4863, + "step": 4562 + }, + { + "epoch": 2.2226372198765834, + "grad_norm": 2.6955907344818115, + "learning_rate": 3.5254906222668552e-06, + "loss": 0.5084, + "step": 4563 + }, + { + "epoch": 2.2231243910360505, + "grad_norm": 3.0629312992095947, + "learning_rate": 3.5249040496990776e-06, + "loss": 0.5012, + "step": 4564 + }, + { + "epoch": 2.223611562195518, + "grad_norm": 2.823390483856201, + "learning_rate": 3.524317409303246e-06, + "loss": 0.4688, + "step": 4565 + }, + { + "epoch": 2.2240987333549853, + "grad_norm": 2.961671829223633, + "learning_rate": 3.5237307011181837e-06, + "loss": 0.498, + "step": 4566 + }, + { + "epoch": 2.224585904514453, + "grad_norm": 3.0859618186950684, + "learning_rate": 3.523143925182719e-06, + "loss": 0.4782, + "step": 4567 + }, + { + "epoch": 2.22507307567392, + "grad_norm": 2.865743637084961, + "learning_rate": 3.522557081535685e-06, + "loss": 0.5066, + "step": 4568 + }, + { + "epoch": 2.2255602468333873, + "grad_norm": 2.989136219024658, + "learning_rate": 3.5219701702159183e-06, + "loss": 0.5711, + "step": 4569 + }, + { + "epoch": 2.226047417992855, + "grad_norm": 2.843966245651245, + "learning_rate": 3.5213831912622604e-06, + "loss": 0.5101, + "step": 4570 + }, + { + "epoch": 2.226534589152322, + "grad_norm": 4.366330146789551, + "learning_rate": 3.5207961447135586e-06, + "loss": 0.501, + "step": 4571 + }, + { + "epoch": 2.2270217603117897, + "grad_norm": 2.9404377937316895, + "learning_rate": 3.5202090306086623e-06, + "loss": 0.5009, + "step": 4572 + }, + { + "epoch": 2.227508931471257, + "grad_norm": 3.0508015155792236, + "learning_rate": 3.519621848986428e-06, + "loss": 0.5453, + "step": 4573 + }, + { + "epoch": 2.2279961026307245, + "grad_norm": 2.521761894226074, + "learning_rate": 3.5190345998857143e-06, + "loss": 0.4858, + "step": 4574 + }, + { + "epoch": 2.2284832737901916, + "grad_norm": 3.150726795196533, + "learning_rate": 3.5184472833453863e-06, + "loss": 0.4698, + "step": 4575 + }, + { + "epoch": 2.228970444949659, + "grad_norm": 2.7355453968048096, + "learning_rate": 3.517859899404311e-06, + "loss": 0.5178, + "step": 4576 + }, + { + "epoch": 2.2294576161091264, + "grad_norm": 2.7601161003112793, + "learning_rate": 3.5172724481013634e-06, + "loss": 0.5625, + "step": 4577 + }, + { + "epoch": 2.2299447872685936, + "grad_norm": 2.6712398529052734, + "learning_rate": 3.5166849294754193e-06, + "loss": 0.4957, + "step": 4578 + }, + { + "epoch": 2.230431958428061, + "grad_norm": 2.528444766998291, + "learning_rate": 3.5160973435653613e-06, + "loss": 0.446, + "step": 4579 + }, + { + "epoch": 2.2309191295875284, + "grad_norm": 2.8723628520965576, + "learning_rate": 3.515509690410076e-06, + "loss": 0.4908, + "step": 4580 + }, + { + "epoch": 2.231406300746996, + "grad_norm": 2.867809295654297, + "learning_rate": 3.514921970048454e-06, + "loss": 0.5028, + "step": 4581 + }, + { + "epoch": 2.231893471906463, + "grad_norm": 2.6727194786071777, + "learning_rate": 3.51433418251939e-06, + "loss": 0.5283, + "step": 4582 + }, + { + "epoch": 2.2323806430659303, + "grad_norm": 2.799304246902466, + "learning_rate": 3.5137463278617844e-06, + "loss": 0.4825, + "step": 4583 + }, + { + "epoch": 2.232867814225398, + "grad_norm": 2.8652420043945312, + "learning_rate": 3.5131584061145415e-06, + "loss": 0.5314, + "step": 4584 + }, + { + "epoch": 2.233354985384865, + "grad_norm": 3.045840263366699, + "learning_rate": 3.512570417316569e-06, + "loss": 0.4418, + "step": 4585 + }, + { + "epoch": 2.2338421565443327, + "grad_norm": 2.908982753753662, + "learning_rate": 3.511982361506781e-06, + "loss": 0.4743, + "step": 4586 + }, + { + "epoch": 2.2343293277038, + "grad_norm": 2.6584510803222656, + "learning_rate": 3.511394238724095e-06, + "loss": 0.557, + "step": 4587 + }, + { + "epoch": 2.2348164988632675, + "grad_norm": 2.784823179244995, + "learning_rate": 3.5108060490074316e-06, + "loss": 0.525, + "step": 4588 + }, + { + "epoch": 2.2353036700227347, + "grad_norm": 2.8223965167999268, + "learning_rate": 3.5102177923957186e-06, + "loss": 0.4688, + "step": 4589 + }, + { + "epoch": 2.235790841182202, + "grad_norm": 3.081470251083374, + "learning_rate": 3.509629468927886e-06, + "loss": 0.5007, + "step": 4590 + }, + { + "epoch": 2.2362780123416695, + "grad_norm": 3.0024373531341553, + "learning_rate": 3.509041078642869e-06, + "loss": 0.5681, + "step": 4591 + }, + { + "epoch": 2.2367651835011366, + "grad_norm": 3.030625343322754, + "learning_rate": 3.508452621579607e-06, + "loss": 0.5019, + "step": 4592 + }, + { + "epoch": 2.2372523546606042, + "grad_norm": 2.7712018489837646, + "learning_rate": 3.5078640977770456e-06, + "loss": 0.5237, + "step": 4593 + }, + { + "epoch": 2.2377395258200714, + "grad_norm": 3.069735527038574, + "learning_rate": 3.507275507274131e-06, + "loss": 0.4621, + "step": 4594 + }, + { + "epoch": 2.238226696979539, + "grad_norm": 2.813508987426758, + "learning_rate": 3.506686850109818e-06, + "loss": 0.5011, + "step": 4595 + }, + { + "epoch": 2.238713868139006, + "grad_norm": 3.380228281021118, + "learning_rate": 3.506098126323062e-06, + "loss": 0.4895, + "step": 4596 + }, + { + "epoch": 2.2392010392984734, + "grad_norm": 2.666691780090332, + "learning_rate": 3.5055093359528263e-06, + "loss": 0.4386, + "step": 4597 + }, + { + "epoch": 2.239688210457941, + "grad_norm": 2.959169626235962, + "learning_rate": 3.504920479038076e-06, + "loss": 0.4768, + "step": 4598 + }, + { + "epoch": 2.240175381617408, + "grad_norm": 2.8701632022857666, + "learning_rate": 3.5043315556177825e-06, + "loss": 0.4652, + "step": 4599 + }, + { + "epoch": 2.2406625527768758, + "grad_norm": 2.855640172958374, + "learning_rate": 3.5037425657309206e-06, + "loss": 0.5391, + "step": 4600 + }, + { + "epoch": 2.241149723936343, + "grad_norm": 2.9458768367767334, + "learning_rate": 3.5031535094164683e-06, + "loss": 0.4756, + "step": 4601 + }, + { + "epoch": 2.2416368950958105, + "grad_norm": 3.2615034580230713, + "learning_rate": 3.502564386713411e-06, + "loss": 0.5255, + "step": 4602 + }, + { + "epoch": 2.2421240662552777, + "grad_norm": 2.8661746978759766, + "learning_rate": 3.501975197660735e-06, + "loss": 0.4753, + "step": 4603 + }, + { + "epoch": 2.242611237414745, + "grad_norm": 2.8027403354644775, + "learning_rate": 3.501385942297434e-06, + "loss": 0.4735, + "step": 4604 + }, + { + "epoch": 2.2430984085742125, + "grad_norm": 2.6795337200164795, + "learning_rate": 3.5007966206625043e-06, + "loss": 0.5095, + "step": 4605 + }, + { + "epoch": 2.2435855797336797, + "grad_norm": 2.7157886028289795, + "learning_rate": 3.5002072327949484e-06, + "loss": 0.4476, + "step": 4606 + }, + { + "epoch": 2.2440727508931473, + "grad_norm": 2.6660141944885254, + "learning_rate": 3.4996177787337696e-06, + "loss": 0.5443, + "step": 4607 + }, + { + "epoch": 2.2445599220526145, + "grad_norm": 2.7986795902252197, + "learning_rate": 3.4990282585179807e-06, + "loss": 0.5147, + "step": 4608 + }, + { + "epoch": 2.245047093212082, + "grad_norm": 2.864588975906372, + "learning_rate": 3.4984386721865937e-06, + "loss": 0.4299, + "step": 4609 + }, + { + "epoch": 2.2455342643715492, + "grad_norm": 2.4917662143707275, + "learning_rate": 3.4978490197786285e-06, + "loss": 0.4281, + "step": 4610 + }, + { + "epoch": 2.2460214355310164, + "grad_norm": 2.7943227291107178, + "learning_rate": 3.497259301333108e-06, + "loss": 0.4252, + "step": 4611 + }, + { + "epoch": 2.246508606690484, + "grad_norm": 2.7810559272766113, + "learning_rate": 3.496669516889061e-06, + "loss": 0.5199, + "step": 4612 + }, + { + "epoch": 2.246995777849951, + "grad_norm": 3.071948766708374, + "learning_rate": 3.4960796664855168e-06, + "loss": 0.4551, + "step": 4613 + }, + { + "epoch": 2.247482949009419, + "grad_norm": 2.945141315460205, + "learning_rate": 3.4954897501615136e-06, + "loss": 0.4375, + "step": 4614 + }, + { + "epoch": 2.247970120168886, + "grad_norm": 2.5513408184051514, + "learning_rate": 3.494899767956091e-06, + "loss": 0.474, + "step": 4615 + }, + { + "epoch": 2.2484572913283536, + "grad_norm": 2.8266680240631104, + "learning_rate": 3.494309719908295e-06, + "loss": 0.5073, + "step": 4616 + }, + { + "epoch": 2.2489444624878208, + "grad_norm": 2.907445192337036, + "learning_rate": 3.4937196060571737e-06, + "loss": 0.4786, + "step": 4617 + }, + { + "epoch": 2.249431633647288, + "grad_norm": 3.0620858669281006, + "learning_rate": 3.493129426441782e-06, + "loss": 0.4223, + "step": 4618 + }, + { + "epoch": 2.2499188048067555, + "grad_norm": 2.9959359169006348, + "learning_rate": 3.4925391811011766e-06, + "loss": 0.5281, + "step": 4619 + }, + { + "epoch": 2.2504059759662227, + "grad_norm": 2.775637626647949, + "learning_rate": 3.4919488700744216e-06, + "loss": 0.5028, + "step": 4620 + }, + { + "epoch": 2.2508931471256903, + "grad_norm": 3.0066885948181152, + "learning_rate": 3.4913584934005816e-06, + "loss": 0.4781, + "step": 4621 + }, + { + "epoch": 2.2513803182851575, + "grad_norm": 3.028994560241699, + "learning_rate": 3.49076805111873e-06, + "loss": 0.5272, + "step": 4622 + }, + { + "epoch": 2.251867489444625, + "grad_norm": 2.6257710456848145, + "learning_rate": 3.49017754326794e-06, + "loss": 0.4384, + "step": 4623 + }, + { + "epoch": 2.2523546606040923, + "grad_norm": 2.7644240856170654, + "learning_rate": 3.489586969887293e-06, + "loss": 0.4979, + "step": 4624 + }, + { + "epoch": 2.2528418317635595, + "grad_norm": 2.8596699237823486, + "learning_rate": 3.4889963310158724e-06, + "loss": 0.4856, + "step": 4625 + }, + { + "epoch": 2.253329002923027, + "grad_norm": 2.844372272491455, + "learning_rate": 3.4884056266927673e-06, + "loss": 0.5082, + "step": 4626 + }, + { + "epoch": 2.2538161740824942, + "grad_norm": 2.9946563243865967, + "learning_rate": 3.4878148569570693e-06, + "loss": 0.5948, + "step": 4627 + }, + { + "epoch": 2.254303345241962, + "grad_norm": 2.874232530593872, + "learning_rate": 3.4872240218478758e-06, + "loss": 0.4568, + "step": 4628 + }, + { + "epoch": 2.254790516401429, + "grad_norm": 2.6639490127563477, + "learning_rate": 3.486633121404288e-06, + "loss": 0.5017, + "step": 4629 + }, + { + "epoch": 2.2552776875608966, + "grad_norm": 3.1084206104278564, + "learning_rate": 3.4860421556654133e-06, + "loss": 0.5143, + "step": 4630 + }, + { + "epoch": 2.255764858720364, + "grad_norm": 2.7405521869659424, + "learning_rate": 3.4854511246703597e-06, + "loss": 0.4253, + "step": 4631 + }, + { + "epoch": 2.256252029879831, + "grad_norm": 2.584904432296753, + "learning_rate": 3.4848600284582433e-06, + "loss": 0.4843, + "step": 4632 + }, + { + "epoch": 2.2567392010392986, + "grad_norm": 3.1494741439819336, + "learning_rate": 3.484268867068181e-06, + "loss": 0.5207, + "step": 4633 + }, + { + "epoch": 2.2572263721987658, + "grad_norm": 2.702606439590454, + "learning_rate": 3.4836776405392974e-06, + "loss": 0.4459, + "step": 4634 + }, + { + "epoch": 2.2577135433582334, + "grad_norm": 2.7520642280578613, + "learning_rate": 3.4830863489107186e-06, + "loss": 0.4369, + "step": 4635 + }, + { + "epoch": 2.2582007145177005, + "grad_norm": 2.8171541690826416, + "learning_rate": 3.482494992221577e-06, + "loss": 0.4933, + "step": 4636 + }, + { + "epoch": 2.258687885677168, + "grad_norm": 2.9766886234283447, + "learning_rate": 3.4819035705110083e-06, + "loss": 0.4424, + "step": 4637 + }, + { + "epoch": 2.2591750568366353, + "grad_norm": 3.31911301612854, + "learning_rate": 3.481312083818153e-06, + "loss": 0.5412, + "step": 4638 + }, + { + "epoch": 2.2596622279961025, + "grad_norm": 2.7976982593536377, + "learning_rate": 3.4807205321821548e-06, + "loss": 0.5507, + "step": 4639 + }, + { + "epoch": 2.26014939915557, + "grad_norm": 2.6459238529205322, + "learning_rate": 3.4801289156421635e-06, + "loss": 0.5232, + "step": 4640 + }, + { + "epoch": 2.2606365703150373, + "grad_norm": 2.6313600540161133, + "learning_rate": 3.479537234237332e-06, + "loss": 0.4623, + "step": 4641 + }, + { + "epoch": 2.261123741474505, + "grad_norm": 2.82721209526062, + "learning_rate": 3.4789454880068174e-06, + "loss": 0.4704, + "step": 4642 + }, + { + "epoch": 2.261610912633972, + "grad_norm": 3.7340073585510254, + "learning_rate": 3.4783536769897814e-06, + "loss": 0.5596, + "step": 4643 + }, + { + "epoch": 2.2620980837934392, + "grad_norm": 2.790832757949829, + "learning_rate": 3.47776180122539e-06, + "loss": 0.5435, + "step": 4644 + }, + { + "epoch": 2.262585254952907, + "grad_norm": 2.801017999649048, + "learning_rate": 3.4771698607528144e-06, + "loss": 0.4856, + "step": 4645 + }, + { + "epoch": 2.263072426112374, + "grad_norm": 2.95294451713562, + "learning_rate": 3.476577855611228e-06, + "loss": 0.515, + "step": 4646 + }, + { + "epoch": 2.2635595972718416, + "grad_norm": 2.6182284355163574, + "learning_rate": 3.47598578583981e-06, + "loss": 0.4817, + "step": 4647 + }, + { + "epoch": 2.264046768431309, + "grad_norm": 2.5613629817962646, + "learning_rate": 3.4753936514777437e-06, + "loss": 0.4863, + "step": 4648 + }, + { + "epoch": 2.264533939590776, + "grad_norm": 2.7036960124969482, + "learning_rate": 3.4748014525642158e-06, + "loss": 0.5126, + "step": 4649 + }, + { + "epoch": 2.2650211107502436, + "grad_norm": 2.7504937648773193, + "learning_rate": 3.4742091891384195e-06, + "loss": 0.5174, + "step": 4650 + }, + { + "epoch": 2.2655082819097108, + "grad_norm": 2.720510959625244, + "learning_rate": 3.4736168612395487e-06, + "loss": 0.4694, + "step": 4651 + }, + { + "epoch": 2.2659954530691784, + "grad_norm": 2.9687302112579346, + "learning_rate": 3.4730244689068054e-06, + "loss": 0.464, + "step": 4652 + }, + { + "epoch": 2.2664826242286455, + "grad_norm": 2.823554515838623, + "learning_rate": 3.472432012179393e-06, + "loss": 0.5178, + "step": 4653 + }, + { + "epoch": 2.266969795388113, + "grad_norm": 2.7636804580688477, + "learning_rate": 3.4718394910965213e-06, + "loss": 0.4954, + "step": 4654 + }, + { + "epoch": 2.2674569665475803, + "grad_norm": 3.7709856033325195, + "learning_rate": 3.4712469056974017e-06, + "loss": 0.5504, + "step": 4655 + }, + { + "epoch": 2.2679441377070475, + "grad_norm": 2.8794710636138916, + "learning_rate": 3.470654256021253e-06, + "loss": 0.4876, + "step": 4656 + }, + { + "epoch": 2.268431308866515, + "grad_norm": 2.779230833053589, + "learning_rate": 3.470061542107296e-06, + "loss": 0.4785, + "step": 4657 + }, + { + "epoch": 2.2689184800259823, + "grad_norm": 2.529388666152954, + "learning_rate": 3.4694687639947554e-06, + "loss": 0.4501, + "step": 4658 + }, + { + "epoch": 2.26940565118545, + "grad_norm": 2.601867198944092, + "learning_rate": 3.4688759217228636e-06, + "loss": 0.4683, + "step": 4659 + }, + { + "epoch": 2.269892822344917, + "grad_norm": 2.589755058288574, + "learning_rate": 3.4682830153308526e-06, + "loss": 0.5055, + "step": 4660 + }, + { + "epoch": 2.2703799935043847, + "grad_norm": 2.76754093170166, + "learning_rate": 3.4676900448579624e-06, + "loss": 0.4709, + "step": 4661 + }, + { + "epoch": 2.270867164663852, + "grad_norm": 2.607717752456665, + "learning_rate": 3.4670970103434348e-06, + "loss": 0.442, + "step": 4662 + }, + { + "epoch": 2.271354335823319, + "grad_norm": 2.3772764205932617, + "learning_rate": 3.4665039118265175e-06, + "loss": 0.5093, + "step": 4663 + }, + { + "epoch": 2.2718415069827866, + "grad_norm": 3.314368724822998, + "learning_rate": 3.4659107493464607e-06, + "loss": 0.5245, + "step": 4664 + }, + { + "epoch": 2.272328678142254, + "grad_norm": 2.57985258102417, + "learning_rate": 3.4653175229425214e-06, + "loss": 0.4652, + "step": 4665 + }, + { + "epoch": 2.2728158493017214, + "grad_norm": 3.116896152496338, + "learning_rate": 3.464724232653957e-06, + "loss": 0.4703, + "step": 4666 + }, + { + "epoch": 2.2733030204611886, + "grad_norm": 2.9368183612823486, + "learning_rate": 3.4641308785200338e-06, + "loss": 0.4234, + "step": 4667 + }, + { + "epoch": 2.273790191620656, + "grad_norm": 2.8671035766601562, + "learning_rate": 3.463537460580019e-06, + "loss": 0.4591, + "step": 4668 + }, + { + "epoch": 2.2742773627801234, + "grad_norm": 3.1812009811401367, + "learning_rate": 3.462943978873184e-06, + "loss": 0.5266, + "step": 4669 + }, + { + "epoch": 2.2747645339395905, + "grad_norm": 2.8821401596069336, + "learning_rate": 3.4623504334388064e-06, + "loss": 0.54, + "step": 4670 + }, + { + "epoch": 2.275251705099058, + "grad_norm": 2.7370338439941406, + "learning_rate": 3.461756824316167e-06, + "loss": 0.4719, + "step": 4671 + }, + { + "epoch": 2.2757388762585253, + "grad_norm": 3.0458381175994873, + "learning_rate": 3.4611631515445497e-06, + "loss": 0.5075, + "step": 4672 + }, + { + "epoch": 2.276226047417993, + "grad_norm": 3.0067741870880127, + "learning_rate": 3.4605694151632458e-06, + "loss": 0.5355, + "step": 4673 + }, + { + "epoch": 2.27671321857746, + "grad_norm": 2.613693952560425, + "learning_rate": 3.459975615211546e-06, + "loss": 0.4205, + "step": 4674 + }, + { + "epoch": 2.2772003897369277, + "grad_norm": 2.8855104446411133, + "learning_rate": 3.459381751728751e-06, + "loss": 0.5204, + "step": 4675 + }, + { + "epoch": 2.277687560896395, + "grad_norm": 2.5899040699005127, + "learning_rate": 3.4587878247541597e-06, + "loss": 0.4268, + "step": 4676 + }, + { + "epoch": 2.278174732055862, + "grad_norm": 2.9011282920837402, + "learning_rate": 3.4581938343270797e-06, + "loss": 0.4754, + "step": 4677 + }, + { + "epoch": 2.2786619032153297, + "grad_norm": 2.911308526992798, + "learning_rate": 3.457599780486821e-06, + "loss": 0.5455, + "step": 4678 + }, + { + "epoch": 2.279149074374797, + "grad_norm": 2.8463711738586426, + "learning_rate": 3.4570056632726984e-06, + "loss": 0.4645, + "step": 4679 + }, + { + "epoch": 2.2796362455342645, + "grad_norm": 3.034926176071167, + "learning_rate": 3.4564114827240298e-06, + "loss": 0.5107, + "step": 4680 + }, + { + "epoch": 2.2801234166937316, + "grad_norm": 3.261223793029785, + "learning_rate": 3.4558172388801385e-06, + "loss": 0.5475, + "step": 4681 + }, + { + "epoch": 2.2806105878531993, + "grad_norm": 3.0357613563537598, + "learning_rate": 3.455222931780351e-06, + "loss": 0.4849, + "step": 4682 + }, + { + "epoch": 2.2810977590126664, + "grad_norm": 2.547177791595459, + "learning_rate": 3.4546285614639997e-06, + "loss": 0.4955, + "step": 4683 + }, + { + "epoch": 2.2815849301721336, + "grad_norm": 2.859006881713867, + "learning_rate": 3.4540341279704182e-06, + "loss": 0.5119, + "step": 4684 + }, + { + "epoch": 2.282072101331601, + "grad_norm": 3.0357272624969482, + "learning_rate": 3.453439631338947e-06, + "loss": 0.4983, + "step": 4685 + }, + { + "epoch": 2.2825592724910684, + "grad_norm": 2.938251495361328, + "learning_rate": 3.45284507160893e-06, + "loss": 0.4315, + "step": 4686 + }, + { + "epoch": 2.283046443650536, + "grad_norm": 2.8082849979400635, + "learning_rate": 3.452250448819715e-06, + "loss": 0.4201, + "step": 4687 + }, + { + "epoch": 2.283533614810003, + "grad_norm": 2.98063063621521, + "learning_rate": 3.451655763010654e-06, + "loss": 0.4946, + "step": 4688 + }, + { + "epoch": 2.284020785969471, + "grad_norm": 2.8726632595062256, + "learning_rate": 3.4510610142211032e-06, + "loss": 0.5436, + "step": 4689 + }, + { + "epoch": 2.284507957128938, + "grad_norm": 2.7350964546203613, + "learning_rate": 3.450466202490423e-06, + "loss": 0.4759, + "step": 4690 + }, + { + "epoch": 2.284995128288405, + "grad_norm": 2.5872802734375, + "learning_rate": 3.4498713278579775e-06, + "loss": 0.4657, + "step": 4691 + }, + { + "epoch": 2.2854822994478727, + "grad_norm": 2.76338791847229, + "learning_rate": 3.4492763903631364e-06, + "loss": 0.511, + "step": 4692 + }, + { + "epoch": 2.28596947060734, + "grad_norm": 2.64436674118042, + "learning_rate": 3.4486813900452732e-06, + "loss": 0.471, + "step": 4693 + }, + { + "epoch": 2.2864566417668075, + "grad_norm": 2.9908878803253174, + "learning_rate": 3.4480863269437626e-06, + "loss": 0.4559, + "step": 4694 + }, + { + "epoch": 2.2869438129262747, + "grad_norm": 3.2762796878814697, + "learning_rate": 3.447491201097988e-06, + "loss": 0.5275, + "step": 4695 + }, + { + "epoch": 2.2874309840857423, + "grad_norm": 2.865098714828491, + "learning_rate": 3.4468960125473327e-06, + "loss": 0.5047, + "step": 4696 + }, + { + "epoch": 2.2879181552452095, + "grad_norm": 2.8759093284606934, + "learning_rate": 3.4463007613311882e-06, + "loss": 0.5476, + "step": 4697 + }, + { + "epoch": 2.2884053264046766, + "grad_norm": 3.045262336730957, + "learning_rate": 3.4457054474889475e-06, + "loss": 0.5237, + "step": 4698 + }, + { + "epoch": 2.2888924975641443, + "grad_norm": 2.882073163986206, + "learning_rate": 3.445110071060008e-06, + "loss": 0.4951, + "step": 4699 + }, + { + "epoch": 2.2893796687236114, + "grad_norm": 2.956418037414551, + "learning_rate": 3.4445146320837723e-06, + "loss": 0.5218, + "step": 4700 + }, + { + "epoch": 2.289866839883079, + "grad_norm": 2.7497122287750244, + "learning_rate": 3.4439191305996467e-06, + "loss": 0.4925, + "step": 4701 + }, + { + "epoch": 2.290354011042546, + "grad_norm": 2.8745243549346924, + "learning_rate": 3.4433235666470403e-06, + "loss": 0.5404, + "step": 4702 + }, + { + "epoch": 2.290841182202014, + "grad_norm": 2.9155197143554688, + "learning_rate": 3.4427279402653684e-06, + "loss": 0.5422, + "step": 4703 + }, + { + "epoch": 2.291328353361481, + "grad_norm": 2.884990692138672, + "learning_rate": 3.442132251494049e-06, + "loss": 0.48, + "step": 4704 + }, + { + "epoch": 2.291815524520948, + "grad_norm": 2.740682363510132, + "learning_rate": 3.4415365003725055e-06, + "loss": 0.4832, + "step": 4705 + }, + { + "epoch": 2.292302695680416, + "grad_norm": 2.799664258956909, + "learning_rate": 3.4409406869401633e-06, + "loss": 0.4462, + "step": 4706 + }, + { + "epoch": 2.292789866839883, + "grad_norm": 2.7629191875457764, + "learning_rate": 3.440344811236454e-06, + "loss": 0.4595, + "step": 4707 + }, + { + "epoch": 2.2932770379993506, + "grad_norm": 2.8170125484466553, + "learning_rate": 3.439748873300813e-06, + "loss": 0.5149, + "step": 4708 + }, + { + "epoch": 2.2937642091588177, + "grad_norm": 3.1976020336151123, + "learning_rate": 3.439152873172679e-06, + "loss": 0.4776, + "step": 4709 + }, + { + "epoch": 2.2942513803182853, + "grad_norm": 2.8030104637145996, + "learning_rate": 3.4385568108914956e-06, + "loss": 0.4773, + "step": 4710 + }, + { + "epoch": 2.2947385514777525, + "grad_norm": 2.4697139263153076, + "learning_rate": 3.4379606864967097e-06, + "loss": 0.4911, + "step": 4711 + }, + { + "epoch": 2.2952257226372197, + "grad_norm": 2.514052152633667, + "learning_rate": 3.437364500027773e-06, + "loss": 0.4307, + "step": 4712 + }, + { + "epoch": 2.2957128937966873, + "grad_norm": 2.834912061691284, + "learning_rate": 3.4367682515241406e-06, + "loss": 0.5346, + "step": 4713 + }, + { + "epoch": 2.2962000649561545, + "grad_norm": 2.878016710281372, + "learning_rate": 3.436171941025273e-06, + "loss": 0.4511, + "step": 4714 + }, + { + "epoch": 2.296687236115622, + "grad_norm": 3.1669058799743652, + "learning_rate": 3.435575568570633e-06, + "loss": 0.5189, + "step": 4715 + }, + { + "epoch": 2.2971744072750893, + "grad_norm": 2.846921920776367, + "learning_rate": 3.4349791341996898e-06, + "loss": 0.4892, + "step": 4716 + }, + { + "epoch": 2.297661578434557, + "grad_norm": 3.068450927734375, + "learning_rate": 3.434382637951914e-06, + "loss": 0.4835, + "step": 4717 + }, + { + "epoch": 2.298148749594024, + "grad_norm": 2.8725171089172363, + "learning_rate": 3.433786079866782e-06, + "loss": 0.5557, + "step": 4718 + }, + { + "epoch": 2.298635920753491, + "grad_norm": 2.7806577682495117, + "learning_rate": 3.4331894599837746e-06, + "loss": 0.5025, + "step": 4719 + }, + { + "epoch": 2.299123091912959, + "grad_norm": 2.762519121170044, + "learning_rate": 3.4325927783423756e-06, + "loss": 0.5279, + "step": 4720 + }, + { + "epoch": 2.299610263072426, + "grad_norm": 3.0543296337127686, + "learning_rate": 3.431996034982073e-06, + "loss": 0.4388, + "step": 4721 + }, + { + "epoch": 2.3000974342318936, + "grad_norm": 2.949754238128662, + "learning_rate": 3.4313992299423605e-06, + "loss": 0.5523, + "step": 4722 + }, + { + "epoch": 2.300584605391361, + "grad_norm": 2.3672969341278076, + "learning_rate": 3.4308023632627323e-06, + "loss": 0.4164, + "step": 4723 + }, + { + "epoch": 2.3010717765508284, + "grad_norm": 2.872985363006592, + "learning_rate": 3.4302054349826914e-06, + "loss": 0.5176, + "step": 4724 + }, + { + "epoch": 2.3015589477102956, + "grad_norm": 2.410581111907959, + "learning_rate": 3.429608445141741e-06, + "loss": 0.4239, + "step": 4725 + }, + { + "epoch": 2.3020461188697627, + "grad_norm": 2.7117886543273926, + "learning_rate": 3.4290113937793904e-06, + "loss": 0.4936, + "step": 4726 + }, + { + "epoch": 2.3025332900292304, + "grad_norm": 2.6863553524017334, + "learning_rate": 3.428414280935152e-06, + "loss": 0.4653, + "step": 4727 + }, + { + "epoch": 2.3030204611886975, + "grad_norm": 2.9253945350646973, + "learning_rate": 3.427817106648544e-06, + "loss": 0.5231, + "step": 4728 + }, + { + "epoch": 2.303507632348165, + "grad_norm": 2.7304561138153076, + "learning_rate": 3.4272198709590847e-06, + "loss": 0.4532, + "step": 4729 + }, + { + "epoch": 2.3039948035076323, + "grad_norm": 2.6174840927124023, + "learning_rate": 3.426622573906302e-06, + "loss": 0.4804, + "step": 4730 + }, + { + "epoch": 2.3044819746671, + "grad_norm": 2.54313588142395, + "learning_rate": 3.426025215529723e-06, + "loss": 0.447, + "step": 4731 + }, + { + "epoch": 2.304969145826567, + "grad_norm": 2.56872820854187, + "learning_rate": 3.425427795868882e-06, + "loss": 0.5383, + "step": 4732 + }, + { + "epoch": 2.3054563169860343, + "grad_norm": 3.0327818393707275, + "learning_rate": 3.4248303149633143e-06, + "loss": 0.4918, + "step": 4733 + }, + { + "epoch": 2.305943488145502, + "grad_norm": 2.6781275272369385, + "learning_rate": 3.424232772852564e-06, + "loss": 0.4438, + "step": 4734 + }, + { + "epoch": 2.306430659304969, + "grad_norm": 2.7787013053894043, + "learning_rate": 3.423635169576175e-06, + "loss": 0.4991, + "step": 4735 + }, + { + "epoch": 2.3069178304644367, + "grad_norm": 3.500683069229126, + "learning_rate": 3.4230375051736956e-06, + "loss": 0.482, + "step": 4736 + }, + { + "epoch": 2.307405001623904, + "grad_norm": 2.854097843170166, + "learning_rate": 3.4224397796846796e-06, + "loss": 0.518, + "step": 4737 + }, + { + "epoch": 2.3078921727833714, + "grad_norm": 2.749216079711914, + "learning_rate": 3.4218419931486862e-06, + "loss": 0.4861, + "step": 4738 + }, + { + "epoch": 2.3083793439428386, + "grad_norm": 2.8126566410064697, + "learning_rate": 3.421244145605275e-06, + "loss": 0.495, + "step": 4739 + }, + { + "epoch": 2.308866515102306, + "grad_norm": 2.9717981815338135, + "learning_rate": 3.420646237094012e-06, + "loss": 0.5472, + "step": 4740 + }, + { + "epoch": 2.3093536862617734, + "grad_norm": 2.868683338165283, + "learning_rate": 3.420048267654466e-06, + "loss": 0.4489, + "step": 4741 + }, + { + "epoch": 2.3098408574212406, + "grad_norm": 2.895473003387451, + "learning_rate": 3.419450237326212e-06, + "loss": 0.4699, + "step": 4742 + }, + { + "epoch": 2.310328028580708, + "grad_norm": 2.723144054412842, + "learning_rate": 3.4188521461488265e-06, + "loss": 0.5486, + "step": 4743 + }, + { + "epoch": 2.3108151997401754, + "grad_norm": 2.987853765487671, + "learning_rate": 3.4182539941618927e-06, + "loss": 0.482, + "step": 4744 + }, + { + "epoch": 2.311302370899643, + "grad_norm": 2.7651946544647217, + "learning_rate": 3.417655781404994e-06, + "loss": 0.5223, + "step": 4745 + }, + { + "epoch": 2.31178954205911, + "grad_norm": 2.8067495822906494, + "learning_rate": 3.4170575079177216e-06, + "loss": 0.4611, + "step": 4746 + }, + { + "epoch": 2.3122767132185773, + "grad_norm": 2.819025993347168, + "learning_rate": 3.416459173739668e-06, + "loss": 0.5683, + "step": 4747 + }, + { + "epoch": 2.312763884378045, + "grad_norm": 2.8385531902313232, + "learning_rate": 3.4158607789104325e-06, + "loss": 0.5263, + "step": 4748 + }, + { + "epoch": 2.313251055537512, + "grad_norm": 2.904379367828369, + "learning_rate": 3.4152623234696153e-06, + "loss": 0.4835, + "step": 4749 + }, + { + "epoch": 2.3137382266969797, + "grad_norm": 2.5574305057525635, + "learning_rate": 3.4146638074568233e-06, + "loss": 0.4933, + "step": 4750 + }, + { + "epoch": 2.314225397856447, + "grad_norm": 2.5474233627319336, + "learning_rate": 3.414065230911665e-06, + "loss": 0.4856, + "step": 4751 + }, + { + "epoch": 2.3147125690159145, + "grad_norm": 2.388049840927124, + "learning_rate": 3.4134665938737554e-06, + "loss": 0.4567, + "step": 4752 + }, + { + "epoch": 2.3151997401753817, + "grad_norm": 2.926577568054199, + "learning_rate": 3.4128678963827115e-06, + "loss": 0.5356, + "step": 4753 + }, + { + "epoch": 2.315686911334849, + "grad_norm": 3.0220086574554443, + "learning_rate": 3.412269138478156e-06, + "loss": 0.4853, + "step": 4754 + }, + { + "epoch": 2.3161740824943164, + "grad_norm": 2.5254244804382324, + "learning_rate": 3.4116703201997132e-06, + "loss": 0.485, + "step": 4755 + }, + { + "epoch": 2.3166612536537836, + "grad_norm": 3.3095173835754395, + "learning_rate": 3.411071441587013e-06, + "loss": 0.4598, + "step": 4756 + }, + { + "epoch": 2.3171484248132512, + "grad_norm": 2.595795154571533, + "learning_rate": 3.4104725026796904e-06, + "loss": 0.4263, + "step": 4757 + }, + { + "epoch": 2.3176355959727184, + "grad_norm": 2.845784902572632, + "learning_rate": 3.4098735035173824e-06, + "loss": 0.5049, + "step": 4758 + }, + { + "epoch": 2.318122767132186, + "grad_norm": 3.025113105773926, + "learning_rate": 3.40927444413973e-06, + "loss": 0.4784, + "step": 4759 + }, + { + "epoch": 2.318609938291653, + "grad_norm": 2.688746213912964, + "learning_rate": 3.40867532458638e-06, + "loss": 0.5322, + "step": 4760 + }, + { + "epoch": 2.3190971094511204, + "grad_norm": 2.8863282203674316, + "learning_rate": 3.408076144896982e-06, + "loss": 0.5493, + "step": 4761 + }, + { + "epoch": 2.319584280610588, + "grad_norm": 2.91318678855896, + "learning_rate": 3.4074769051111883e-06, + "loss": 0.4452, + "step": 4762 + }, + { + "epoch": 2.320071451770055, + "grad_norm": 2.8999791145324707, + "learning_rate": 3.406877605268658e-06, + "loss": 0.5403, + "step": 4763 + }, + { + "epoch": 2.3205586229295228, + "grad_norm": 3.1531059741973877, + "learning_rate": 3.4062782454090526e-06, + "loss": 0.5584, + "step": 4764 + }, + { + "epoch": 2.32104579408899, + "grad_norm": 2.609755039215088, + "learning_rate": 3.405678825572037e-06, + "loss": 0.471, + "step": 4765 + }, + { + "epoch": 2.3215329652484575, + "grad_norm": 2.840317964553833, + "learning_rate": 3.405079345797281e-06, + "loss": 0.4743, + "step": 4766 + }, + { + "epoch": 2.3220201364079247, + "grad_norm": 2.5985989570617676, + "learning_rate": 3.4044798061244583e-06, + "loss": 0.5344, + "step": 4767 + }, + { + "epoch": 2.322507307567392, + "grad_norm": 2.9198036193847656, + "learning_rate": 3.403880206593246e-06, + "loss": 0.5034, + "step": 4768 + }, + { + "epoch": 2.3229944787268595, + "grad_norm": 2.705465793609619, + "learning_rate": 3.4032805472433262e-06, + "loss": 0.4739, + "step": 4769 + }, + { + "epoch": 2.3234816498863267, + "grad_norm": 2.731149435043335, + "learning_rate": 3.402680828114383e-06, + "loss": 0.4681, + "step": 4770 + }, + { + "epoch": 2.3239688210457943, + "grad_norm": 2.712531328201294, + "learning_rate": 3.4020810492461077e-06, + "loss": 0.4701, + "step": 4771 + }, + { + "epoch": 2.3244559922052614, + "grad_norm": 2.902679681777954, + "learning_rate": 3.401481210678192e-06, + "loss": 0.485, + "step": 4772 + }, + { + "epoch": 2.3249431633647286, + "grad_norm": 2.676387071609497, + "learning_rate": 3.4008813124503343e-06, + "loss": 0.5316, + "step": 4773 + }, + { + "epoch": 2.3254303345241962, + "grad_norm": 2.608091354370117, + "learning_rate": 3.4002813546022347e-06, + "loss": 0.4325, + "step": 4774 + }, + { + "epoch": 2.3259175056836634, + "grad_norm": 2.913146734237671, + "learning_rate": 3.3996813371735988e-06, + "loss": 0.5653, + "step": 4775 + }, + { + "epoch": 2.326404676843131, + "grad_norm": 3.098907232284546, + "learning_rate": 3.3990812602041358e-06, + "loss": 0.5354, + "step": 4776 + }, + { + "epoch": 2.326891848002598, + "grad_norm": 2.7560901641845703, + "learning_rate": 3.3984811237335593e-06, + "loss": 0.4838, + "step": 4777 + }, + { + "epoch": 2.327379019162066, + "grad_norm": 2.5429182052612305, + "learning_rate": 3.397880927801585e-06, + "loss": 0.4829, + "step": 4778 + }, + { + "epoch": 2.327866190321533, + "grad_norm": 2.849395275115967, + "learning_rate": 3.397280672447935e-06, + "loss": 0.4662, + "step": 4779 + }, + { + "epoch": 2.328353361481, + "grad_norm": 3.0449886322021484, + "learning_rate": 3.3966803577123338e-06, + "loss": 0.4954, + "step": 4780 + }, + { + "epoch": 2.3288405326404678, + "grad_norm": 3.000946521759033, + "learning_rate": 3.3960799836345097e-06, + "loss": 0.4751, + "step": 4781 + }, + { + "epoch": 2.329327703799935, + "grad_norm": 2.890658378601074, + "learning_rate": 3.395479550254196e-06, + "loss": 0.4647, + "step": 4782 + }, + { + "epoch": 2.3298148749594025, + "grad_norm": 3.0716395378112793, + "learning_rate": 3.394879057611129e-06, + "loss": 0.4675, + "step": 4783 + }, + { + "epoch": 2.3303020461188697, + "grad_norm": 3.3942928314208984, + "learning_rate": 3.394278505745048e-06, + "loss": 0.4536, + "step": 4784 + }, + { + "epoch": 2.330789217278337, + "grad_norm": 2.2425591945648193, + "learning_rate": 3.3936778946957006e-06, + "loss": 0.4304, + "step": 4785 + }, + { + "epoch": 2.3312763884378045, + "grad_norm": 2.9157750606536865, + "learning_rate": 3.393077224502832e-06, + "loss": 0.5523, + "step": 4786 + }, + { + "epoch": 2.3317635595972717, + "grad_norm": 3.288100242614746, + "learning_rate": 3.392476495206197e-06, + "loss": 0.4788, + "step": 4787 + }, + { + "epoch": 2.3322507307567393, + "grad_norm": 3.2894532680511475, + "learning_rate": 3.39187570684555e-06, + "loss": 0.5255, + "step": 4788 + }, + { + "epoch": 2.3327379019162064, + "grad_norm": 2.6688737869262695, + "learning_rate": 3.391274859460652e-06, + "loss": 0.4704, + "step": 4789 + }, + { + "epoch": 2.333225073075674, + "grad_norm": 2.986572742462158, + "learning_rate": 3.3906739530912664e-06, + "loss": 0.5016, + "step": 4790 + }, + { + "epoch": 2.3337122442351412, + "grad_norm": 2.415895700454712, + "learning_rate": 3.390072987777162e-06, + "loss": 0.489, + "step": 4791 + }, + { + "epoch": 2.3341994153946084, + "grad_norm": 2.8499348163604736, + "learning_rate": 3.3894719635581095e-06, + "loss": 0.5658, + "step": 4792 + }, + { + "epoch": 2.334686586554076, + "grad_norm": 2.874480724334717, + "learning_rate": 3.3888708804738862e-06, + "loss": 0.4351, + "step": 4793 + }, + { + "epoch": 2.335173757713543, + "grad_norm": 2.583704948425293, + "learning_rate": 3.38826973856427e-06, + "loss": 0.4841, + "step": 4794 + }, + { + "epoch": 2.335660928873011, + "grad_norm": 3.1467511653900146, + "learning_rate": 3.3876685378690455e-06, + "loss": 0.4933, + "step": 4795 + }, + { + "epoch": 2.336148100032478, + "grad_norm": 3.1442062854766846, + "learning_rate": 3.387067278428e-06, + "loss": 0.4759, + "step": 4796 + }, + { + "epoch": 2.3366352711919456, + "grad_norm": 2.7520458698272705, + "learning_rate": 3.3864659602809247e-06, + "loss": 0.4732, + "step": 4797 + }, + { + "epoch": 2.3371224423514128, + "grad_norm": 2.9483630657196045, + "learning_rate": 3.385864583467614e-06, + "loss": 0.5272, + "step": 4798 + }, + { + "epoch": 2.33760961351088, + "grad_norm": 2.698939561843872, + "learning_rate": 3.3852631480278684e-06, + "loss": 0.4825, + "step": 4799 + }, + { + "epoch": 2.3380967846703475, + "grad_norm": 2.752411365509033, + "learning_rate": 3.38466165400149e-06, + "loss": 0.5239, + "step": 4800 + }, + { + "epoch": 2.3385839558298147, + "grad_norm": 2.820643901824951, + "learning_rate": 3.384060101428285e-06, + "loss": 0.5152, + "step": 4801 + }, + { + "epoch": 2.3390711269892823, + "grad_norm": 2.63757586479187, + "learning_rate": 3.383458490348066e-06, + "loss": 0.549, + "step": 4802 + }, + { + "epoch": 2.3395582981487495, + "grad_norm": 2.8368823528289795, + "learning_rate": 3.3828568208006464e-06, + "loss": 0.5171, + "step": 4803 + }, + { + "epoch": 2.340045469308217, + "grad_norm": 2.507859945297241, + "learning_rate": 3.3822550928258435e-06, + "loss": 0.4702, + "step": 4804 + }, + { + "epoch": 2.3405326404676843, + "grad_norm": 3.0417778491973877, + "learning_rate": 3.381653306463481e-06, + "loss": 0.4961, + "step": 4805 + }, + { + "epoch": 2.3410198116271514, + "grad_norm": 2.6051299571990967, + "learning_rate": 3.3810514617533847e-06, + "loss": 0.4445, + "step": 4806 + }, + { + "epoch": 2.341506982786619, + "grad_norm": 2.8561184406280518, + "learning_rate": 3.3804495587353854e-06, + "loss": 0.4783, + "step": 4807 + }, + { + "epoch": 2.3419941539460862, + "grad_norm": 2.946282148361206, + "learning_rate": 3.3798475974493154e-06, + "loss": 0.5556, + "step": 4808 + }, + { + "epoch": 2.342481325105554, + "grad_norm": 2.802497386932373, + "learning_rate": 3.379245577935014e-06, + "loss": 0.4425, + "step": 4809 + }, + { + "epoch": 2.342968496265021, + "grad_norm": 2.7620420455932617, + "learning_rate": 3.378643500232322e-06, + "loss": 0.512, + "step": 4810 + }, + { + "epoch": 2.3434556674244886, + "grad_norm": 2.980036973953247, + "learning_rate": 3.378041364381085e-06, + "loss": 0.5308, + "step": 4811 + }, + { + "epoch": 2.343942838583956, + "grad_norm": 2.641921281814575, + "learning_rate": 3.3774391704211525e-06, + "loss": 0.5389, + "step": 4812 + }, + { + "epoch": 2.344430009743423, + "grad_norm": 2.741140842437744, + "learning_rate": 3.3768369183923765e-06, + "loss": 0.5945, + "step": 4813 + }, + { + "epoch": 2.3449171809028906, + "grad_norm": 3.2558796405792236, + "learning_rate": 3.3762346083346154e-06, + "loss": 0.5311, + "step": 4814 + }, + { + "epoch": 2.3454043520623578, + "grad_norm": 3.766115188598633, + "learning_rate": 3.3756322402877294e-06, + "loss": 0.524, + "step": 4815 + }, + { + "epoch": 2.3458915232218254, + "grad_norm": 2.511063575744629, + "learning_rate": 3.3750298142915837e-06, + "loss": 0.4292, + "step": 4816 + }, + { + "epoch": 2.3463786943812925, + "grad_norm": 2.6447761058807373, + "learning_rate": 3.374427330386046e-06, + "loss": 0.4308, + "step": 4817 + }, + { + "epoch": 2.34686586554076, + "grad_norm": 2.8254201412200928, + "learning_rate": 3.3738247886109893e-06, + "loss": 0.4465, + "step": 4818 + }, + { + "epoch": 2.3473530367002273, + "grad_norm": 2.398416519165039, + "learning_rate": 3.373222189006289e-06, + "loss": 0.5037, + "step": 4819 + }, + { + "epoch": 2.3478402078596945, + "grad_norm": 2.916090726852417, + "learning_rate": 3.372619531611826e-06, + "loss": 0.4232, + "step": 4820 + }, + { + "epoch": 2.348327379019162, + "grad_norm": 3.0066187381744385, + "learning_rate": 3.372016816467483e-06, + "loss": 0.4613, + "step": 4821 + }, + { + "epoch": 2.3488145501786293, + "grad_norm": 4.653550148010254, + "learning_rate": 3.371414043613149e-06, + "loss": 0.4466, + "step": 4822 + }, + { + "epoch": 2.349301721338097, + "grad_norm": 2.8329684734344482, + "learning_rate": 3.370811213088715e-06, + "loss": 0.4859, + "step": 4823 + }, + { + "epoch": 2.349788892497564, + "grad_norm": 2.829331874847412, + "learning_rate": 3.370208324934075e-06, + "loss": 0.4736, + "step": 4824 + }, + { + "epoch": 2.3502760636570317, + "grad_norm": 2.6873667240142822, + "learning_rate": 3.36960537918913e-06, + "loss": 0.4964, + "step": 4825 + }, + { + "epoch": 2.350763234816499, + "grad_norm": 2.8880157470703125, + "learning_rate": 3.3690023758937816e-06, + "loss": 0.514, + "step": 4826 + }, + { + "epoch": 2.351250405975966, + "grad_norm": 3.170520544052124, + "learning_rate": 3.368399315087937e-06, + "loss": 0.4769, + "step": 4827 + }, + { + "epoch": 2.3517375771354336, + "grad_norm": 3.573953866958618, + "learning_rate": 3.367796196811506e-06, + "loss": 0.587, + "step": 4828 + }, + { + "epoch": 2.352224748294901, + "grad_norm": 2.906156063079834, + "learning_rate": 3.367193021104404e-06, + "loss": 0.4964, + "step": 4829 + }, + { + "epoch": 2.3527119194543684, + "grad_norm": 2.9541258811950684, + "learning_rate": 3.366589788006549e-06, + "loss": 0.5318, + "step": 4830 + }, + { + "epoch": 2.3531990906138356, + "grad_norm": 2.8751585483551025, + "learning_rate": 3.3659864975578615e-06, + "loss": 0.504, + "step": 4831 + }, + { + "epoch": 2.353686261773303, + "grad_norm": 3.1626923084259033, + "learning_rate": 3.365383149798269e-06, + "loss": 0.4386, + "step": 4832 + }, + { + "epoch": 2.3541734329327704, + "grad_norm": 2.9847404956817627, + "learning_rate": 3.3647797447677e-06, + "loss": 0.5232, + "step": 4833 + }, + { + "epoch": 2.3546606040922375, + "grad_norm": 3.1189193725585938, + "learning_rate": 3.3641762825060887e-06, + "loss": 0.4573, + "step": 4834 + }, + { + "epoch": 2.355147775251705, + "grad_norm": 2.6455206871032715, + "learning_rate": 3.3635727630533705e-06, + "loss": 0.4655, + "step": 4835 + }, + { + "epoch": 2.3556349464111723, + "grad_norm": 3.2090907096862793, + "learning_rate": 3.362969186449488e-06, + "loss": 0.4711, + "step": 4836 + }, + { + "epoch": 2.35612211757064, + "grad_norm": 2.609435796737671, + "learning_rate": 3.3623655527343845e-06, + "loss": 0.4806, + "step": 4837 + }, + { + "epoch": 2.356609288730107, + "grad_norm": 3.031477212905884, + "learning_rate": 3.361761861948009e-06, + "loss": 0.4189, + "step": 4838 + }, + { + "epoch": 2.3570964598895747, + "grad_norm": 2.648103713989258, + "learning_rate": 3.361158114130314e-06, + "loss": 0.4642, + "step": 4839 + }, + { + "epoch": 2.357583631049042, + "grad_norm": 2.738205671310425, + "learning_rate": 3.3605543093212557e-06, + "loss": 0.4834, + "step": 4840 + }, + { + "epoch": 2.358070802208509, + "grad_norm": 2.594766855239868, + "learning_rate": 3.3599504475607925e-06, + "loss": 0.4339, + "step": 4841 + }, + { + "epoch": 2.3585579733679767, + "grad_norm": 2.945929765701294, + "learning_rate": 3.3593465288888895e-06, + "loss": 0.475, + "step": 4842 + }, + { + "epoch": 2.359045144527444, + "grad_norm": 3.1223437786102295, + "learning_rate": 3.358742553345512e-06, + "loss": 0.5537, + "step": 4843 + }, + { + "epoch": 2.3595323156869115, + "grad_norm": 2.799546957015991, + "learning_rate": 3.3581385209706333e-06, + "loss": 0.4818, + "step": 4844 + }, + { + "epoch": 2.3600194868463786, + "grad_norm": 2.9776782989501953, + "learning_rate": 3.3575344318042274e-06, + "loss": 0.522, + "step": 4845 + }, + { + "epoch": 2.3605066580058462, + "grad_norm": 2.599419593811035, + "learning_rate": 3.356930285886272e-06, + "loss": 0.5021, + "step": 4846 + }, + { + "epoch": 2.3609938291653134, + "grad_norm": 3.2288858890533447, + "learning_rate": 3.3563260832567507e-06, + "loss": 0.5229, + "step": 4847 + }, + { + "epoch": 2.3614810003247806, + "grad_norm": 2.7305753231048584, + "learning_rate": 3.355721823955649e-06, + "loss": 0.4769, + "step": 4848 + }, + { + "epoch": 2.361968171484248, + "grad_norm": 3.0682332515716553, + "learning_rate": 3.3551175080229557e-06, + "loss": 0.4914, + "step": 4849 + }, + { + "epoch": 2.3624553426437154, + "grad_norm": 2.563206911087036, + "learning_rate": 3.354513135498666e-06, + "loss": 0.5149, + "step": 4850 + }, + { + "epoch": 2.362942513803183, + "grad_norm": 3.0224971771240234, + "learning_rate": 3.3539087064227764e-06, + "loss": 0.5215, + "step": 4851 + }, + { + "epoch": 2.36342968496265, + "grad_norm": 2.756260395050049, + "learning_rate": 3.353304220835288e-06, + "loss": 0.5122, + "step": 4852 + }, + { + "epoch": 2.3639168561221178, + "grad_norm": 2.6839091777801514, + "learning_rate": 3.352699678776206e-06, + "loss": 0.4656, + "step": 4853 + }, + { + "epoch": 2.364404027281585, + "grad_norm": 2.768988609313965, + "learning_rate": 3.352095080285539e-06, + "loss": 0.4691, + "step": 4854 + }, + { + "epoch": 2.364891198441052, + "grad_norm": 2.812485456466675, + "learning_rate": 3.351490425403298e-06, + "loss": 0.5011, + "step": 4855 + }, + { + "epoch": 2.3653783696005197, + "grad_norm": 2.675126552581787, + "learning_rate": 3.350885714169501e-06, + "loss": 0.5476, + "step": 4856 + }, + { + "epoch": 2.365865540759987, + "grad_norm": 2.565175771713257, + "learning_rate": 3.350280946624166e-06, + "loss": 0.4792, + "step": 4857 + }, + { + "epoch": 2.3663527119194545, + "grad_norm": 2.9544591903686523, + "learning_rate": 3.349676122807317e-06, + "loss": 0.3886, + "step": 4858 + }, + { + "epoch": 2.3668398830789217, + "grad_norm": 2.8084137439727783, + "learning_rate": 3.3490712427589817e-06, + "loss": 0.4512, + "step": 4859 + }, + { + "epoch": 2.3673270542383893, + "grad_norm": 2.8425779342651367, + "learning_rate": 3.3484663065191907e-06, + "loss": 0.5163, + "step": 4860 + }, + { + "epoch": 2.3678142253978565, + "grad_norm": 2.665637254714966, + "learning_rate": 3.347861314127979e-06, + "loss": 0.49, + "step": 4861 + }, + { + "epoch": 2.3683013965573236, + "grad_norm": 2.89898419380188, + "learning_rate": 3.3472562656253832e-06, + "loss": 0.5407, + "step": 4862 + }, + { + "epoch": 2.3687885677167912, + "grad_norm": 3.1703124046325684, + "learning_rate": 3.3466511610514473e-06, + "loss": 0.4574, + "step": 4863 + }, + { + "epoch": 2.3692757388762584, + "grad_norm": 2.866380214691162, + "learning_rate": 3.346046000446216e-06, + "loss": 0.4459, + "step": 4864 + }, + { + "epoch": 2.369762910035726, + "grad_norm": 3.1817736625671387, + "learning_rate": 3.34544078384974e-06, + "loss": 0.5267, + "step": 4865 + }, + { + "epoch": 2.370250081195193, + "grad_norm": 2.9903225898742676, + "learning_rate": 3.344835511302071e-06, + "loss": 0.5465, + "step": 4866 + }, + { + "epoch": 2.370737252354661, + "grad_norm": 2.7510061264038086, + "learning_rate": 3.3442301828432667e-06, + "loss": 0.5049, + "step": 4867 + }, + { + "epoch": 2.371224423514128, + "grad_norm": 3.132601737976074, + "learning_rate": 3.343624798513388e-06, + "loss": 0.4758, + "step": 4868 + }, + { + "epoch": 2.371711594673595, + "grad_norm": 2.6407487392425537, + "learning_rate": 3.343019358352498e-06, + "loss": 0.4225, + "step": 4869 + }, + { + "epoch": 2.3721987658330628, + "grad_norm": 2.7301299571990967, + "learning_rate": 3.3424138624006657e-06, + "loss": 0.4963, + "step": 4870 + }, + { + "epoch": 2.37268593699253, + "grad_norm": 2.7061080932617188, + "learning_rate": 3.3418083106979626e-06, + "loss": 0.5243, + "step": 4871 + }, + { + "epoch": 2.3731731081519976, + "grad_norm": 2.775007486343384, + "learning_rate": 3.3412027032844634e-06, + "loss": 0.5751, + "step": 4872 + }, + { + "epoch": 2.3736602793114647, + "grad_norm": 3.3890554904937744, + "learning_rate": 3.340597040200248e-06, + "loss": 0.5087, + "step": 4873 + }, + { + "epoch": 2.3741474504709323, + "grad_norm": 2.6459481716156006, + "learning_rate": 3.3399913214853985e-06, + "loss": 0.4866, + "step": 4874 + }, + { + "epoch": 2.3746346216303995, + "grad_norm": 2.7732648849487305, + "learning_rate": 3.339385547180002e-06, + "loss": 0.4594, + "step": 4875 + }, + { + "epoch": 2.3751217927898667, + "grad_norm": 2.855118989944458, + "learning_rate": 3.3387797173241478e-06, + "loss": 0.4498, + "step": 4876 + }, + { + "epoch": 2.3756089639493343, + "grad_norm": 2.81585955619812, + "learning_rate": 3.33817383195793e-06, + "loss": 0.4579, + "step": 4877 + }, + { + "epoch": 2.3760961351088015, + "grad_norm": 2.73844575881958, + "learning_rate": 3.3375678911214456e-06, + "loss": 0.5139, + "step": 4878 + }, + { + "epoch": 2.376583306268269, + "grad_norm": 2.8706214427948, + "learning_rate": 3.3369618948547965e-06, + "loss": 0.5665, + "step": 4879 + }, + { + "epoch": 2.3770704774277363, + "grad_norm": 2.8700785636901855, + "learning_rate": 3.3363558431980875e-06, + "loss": 0.546, + "step": 4880 + }, + { + "epoch": 2.377557648587204, + "grad_norm": 2.792531728744507, + "learning_rate": 3.3357497361914266e-06, + "loss": 0.5019, + "step": 4881 + }, + { + "epoch": 2.378044819746671, + "grad_norm": 2.771303176879883, + "learning_rate": 3.3351435738749254e-06, + "loss": 0.4715, + "step": 4882 + }, + { + "epoch": 2.378531990906138, + "grad_norm": 2.796982765197754, + "learning_rate": 3.3345373562887014e-06, + "loss": 0.5216, + "step": 4883 + }, + { + "epoch": 2.379019162065606, + "grad_norm": 2.76552152633667, + "learning_rate": 3.3339310834728716e-06, + "loss": 0.5551, + "step": 4884 + }, + { + "epoch": 2.379506333225073, + "grad_norm": 2.822509765625, + "learning_rate": 3.3333247554675614e-06, + "loss": 0.4647, + "step": 4885 + }, + { + "epoch": 2.3799935043845406, + "grad_norm": 2.488123893737793, + "learning_rate": 3.3327183723128953e-06, + "loss": 0.4748, + "step": 4886 + }, + { + "epoch": 2.3804806755440078, + "grad_norm": 2.89906907081604, + "learning_rate": 3.332111934049006e-06, + "loss": 0.4599, + "step": 4887 + }, + { + "epoch": 2.3809678467034754, + "grad_norm": 2.6358110904693604, + "learning_rate": 3.3315054407160258e-06, + "loss": 0.4947, + "step": 4888 + }, + { + "epoch": 2.3814550178629426, + "grad_norm": 2.8021843433380127, + "learning_rate": 3.330898892354093e-06, + "loss": 0.4851, + "step": 4889 + }, + { + "epoch": 2.3819421890224097, + "grad_norm": 2.8026981353759766, + "learning_rate": 3.3302922890033496e-06, + "loss": 0.5006, + "step": 4890 + }, + { + "epoch": 2.3824293601818773, + "grad_norm": 3.0108635425567627, + "learning_rate": 3.3296856307039393e-06, + "loss": 0.5111, + "step": 4891 + }, + { + "epoch": 2.3829165313413445, + "grad_norm": 3.086035966873169, + "learning_rate": 3.3290789174960107e-06, + "loss": 0.5166, + "step": 4892 + }, + { + "epoch": 2.383403702500812, + "grad_norm": 2.781773567199707, + "learning_rate": 3.3284721494197176e-06, + "loss": 0.4422, + "step": 4893 + }, + { + "epoch": 2.3838908736602793, + "grad_norm": 2.951860189437866, + "learning_rate": 3.327865326515214e-06, + "loss": 0.5062, + "step": 4894 + }, + { + "epoch": 2.384378044819747, + "grad_norm": 2.8264997005462646, + "learning_rate": 3.32725844882266e-06, + "loss": 0.4526, + "step": 4895 + }, + { + "epoch": 2.384865215979214, + "grad_norm": 2.8569893836975098, + "learning_rate": 3.3266515163822195e-06, + "loss": 0.5077, + "step": 4896 + }, + { + "epoch": 2.3853523871386813, + "grad_norm": 2.911604642868042, + "learning_rate": 3.326044529234058e-06, + "loss": 0.4814, + "step": 4897 + }, + { + "epoch": 2.385839558298149, + "grad_norm": 2.6469521522521973, + "learning_rate": 3.325437487418347e-06, + "loss": 0.4918, + "step": 4898 + }, + { + "epoch": 2.386326729457616, + "grad_norm": 2.828535795211792, + "learning_rate": 3.32483039097526e-06, + "loss": 0.5352, + "step": 4899 + }, + { + "epoch": 2.3868139006170837, + "grad_norm": 2.8664731979370117, + "learning_rate": 3.324223239944974e-06, + "loss": 0.5197, + "step": 4900 + }, + { + "epoch": 2.387301071776551, + "grad_norm": 2.653696060180664, + "learning_rate": 3.3236160343676713e-06, + "loss": 0.4809, + "step": 4901 + }, + { + "epoch": 2.3877882429360184, + "grad_norm": 2.905880928039551, + "learning_rate": 3.323008774283536e-06, + "loss": 0.549, + "step": 4902 + }, + { + "epoch": 2.3882754140954856, + "grad_norm": 3.1142570972442627, + "learning_rate": 3.3224014597327564e-06, + "loss": 0.5864, + "step": 4903 + }, + { + "epoch": 2.3887625852549528, + "grad_norm": 3.417051315307617, + "learning_rate": 3.3217940907555247e-06, + "loss": 0.5057, + "step": 4904 + }, + { + "epoch": 2.3892497564144204, + "grad_norm": 2.8400466442108154, + "learning_rate": 3.3211866673920374e-06, + "loss": 0.4928, + "step": 4905 + }, + { + "epoch": 2.3897369275738876, + "grad_norm": 3.048341989517212, + "learning_rate": 3.320579189682492e-06, + "loss": 0.5365, + "step": 4906 + }, + { + "epoch": 2.390224098733355, + "grad_norm": 3.149044990539551, + "learning_rate": 3.3199716576670926e-06, + "loss": 0.5611, + "step": 4907 + }, + { + "epoch": 2.3907112698928223, + "grad_norm": 2.9628660678863525, + "learning_rate": 3.319364071386045e-06, + "loss": 0.5118, + "step": 4908 + }, + { + "epoch": 2.3911984410522895, + "grad_norm": 2.8347904682159424, + "learning_rate": 3.31875643087956e-06, + "loss": 0.4916, + "step": 4909 + }, + { + "epoch": 2.391685612211757, + "grad_norm": 2.619783878326416, + "learning_rate": 3.3181487361878505e-06, + "loss": 0.5013, + "step": 4910 + }, + { + "epoch": 2.3921727833712243, + "grad_norm": 2.918168067932129, + "learning_rate": 3.317540987351134e-06, + "loss": 0.5373, + "step": 4911 + }, + { + "epoch": 2.392659954530692, + "grad_norm": 2.892745018005371, + "learning_rate": 3.3169331844096307e-06, + "loss": 0.4573, + "step": 4912 + }, + { + "epoch": 2.393147125690159, + "grad_norm": 3.432353973388672, + "learning_rate": 3.3163253274035654e-06, + "loss": 0.4704, + "step": 4913 + }, + { + "epoch": 2.3936342968496263, + "grad_norm": 2.602836847305298, + "learning_rate": 3.315717416373166e-06, + "loss": 0.5012, + "step": 4914 + }, + { + "epoch": 2.394121468009094, + "grad_norm": 2.850623607635498, + "learning_rate": 3.315109451358664e-06, + "loss": 0.436, + "step": 4915 + }, + { + "epoch": 2.394608639168561, + "grad_norm": 2.4548869132995605, + "learning_rate": 3.3145014324002945e-06, + "loss": 0.4916, + "step": 4916 + }, + { + "epoch": 2.3950958103280287, + "grad_norm": 2.888758897781372, + "learning_rate": 3.3138933595382957e-06, + "loss": 0.5195, + "step": 4917 + }, + { + "epoch": 2.395582981487496, + "grad_norm": 2.626006841659546, + "learning_rate": 3.3132852328129105e-06, + "loss": 0.4586, + "step": 4918 + }, + { + "epoch": 2.3960701526469634, + "grad_norm": 3.2654454708099365, + "learning_rate": 3.312677052264384e-06, + "loss": 0.4946, + "step": 4919 + }, + { + "epoch": 2.3965573238064306, + "grad_norm": 3.052917242050171, + "learning_rate": 3.3120688179329664e-06, + "loss": 0.3913, + "step": 4920 + }, + { + "epoch": 2.3970444949658978, + "grad_norm": 2.4969592094421387, + "learning_rate": 3.311460529858909e-06, + "loss": 0.4022, + "step": 4921 + }, + { + "epoch": 2.3975316661253654, + "grad_norm": 2.843318462371826, + "learning_rate": 3.3108521880824702e-06, + "loss": 0.5188, + "step": 4922 + }, + { + "epoch": 2.3980188372848326, + "grad_norm": 2.565422773361206, + "learning_rate": 3.3102437926439086e-06, + "loss": 0.4794, + "step": 4923 + }, + { + "epoch": 2.3985060084443, + "grad_norm": 3.3541154861450195, + "learning_rate": 3.309635343583489e-06, + "loss": 0.5514, + "step": 4924 + }, + { + "epoch": 2.3989931796037673, + "grad_norm": 2.401472806930542, + "learning_rate": 3.3090268409414766e-06, + "loss": 0.4942, + "step": 4925 + }, + { + "epoch": 2.399480350763235, + "grad_norm": 2.7403600215911865, + "learning_rate": 3.3084182847581442e-06, + "loss": 0.5243, + "step": 4926 + }, + { + "epoch": 2.399967521922702, + "grad_norm": 2.8682196140289307, + "learning_rate": 3.3078096750737642e-06, + "loss": 0.4992, + "step": 4927 + }, + { + "epoch": 2.4004546930821693, + "grad_norm": 2.9158310890197754, + "learning_rate": 3.3072010119286156e-06, + "loss": 0.5735, + "step": 4928 + }, + { + "epoch": 2.400941864241637, + "grad_norm": 2.692143440246582, + "learning_rate": 3.3065922953629794e-06, + "loss": 0.4869, + "step": 4929 + }, + { + "epoch": 2.401429035401104, + "grad_norm": 2.6389544010162354, + "learning_rate": 3.30598352541714e-06, + "loss": 0.4555, + "step": 4930 + }, + { + "epoch": 2.4019162065605717, + "grad_norm": 2.9207024574279785, + "learning_rate": 3.305374702131386e-06, + "loss": 0.45, + "step": 4931 + }, + { + "epoch": 2.402403377720039, + "grad_norm": 3.0225234031677246, + "learning_rate": 3.304765825546009e-06, + "loss": 0.4608, + "step": 4932 + }, + { + "epoch": 2.4028905488795065, + "grad_norm": 3.1278679370880127, + "learning_rate": 3.304156895701305e-06, + "loss": 0.517, + "step": 4933 + }, + { + "epoch": 2.4033777200389737, + "grad_norm": 2.995404005050659, + "learning_rate": 3.303547912637572e-06, + "loss": 0.487, + "step": 4934 + }, + { + "epoch": 2.403864891198441, + "grad_norm": 2.908477544784546, + "learning_rate": 3.3029388763951135e-06, + "loss": 0.4992, + "step": 4935 + }, + { + "epoch": 2.4043520623579084, + "grad_norm": 2.580718517303467, + "learning_rate": 3.3023297870142347e-06, + "loss": 0.4764, + "step": 4936 + }, + { + "epoch": 2.4048392335173756, + "grad_norm": 2.6307313442230225, + "learning_rate": 3.301720644535245e-06, + "loss": 0.4711, + "step": 4937 + }, + { + "epoch": 2.405326404676843, + "grad_norm": 2.8245255947113037, + "learning_rate": 3.3011114489984588e-06, + "loss": 0.5026, + "step": 4938 + }, + { + "epoch": 2.4058135758363104, + "grad_norm": 2.834749460220337, + "learning_rate": 3.30050220044419e-06, + "loss": 0.5097, + "step": 4939 + }, + { + "epoch": 2.406300746995778, + "grad_norm": 2.644615888595581, + "learning_rate": 3.299892898912761e-06, + "loss": 0.4095, + "step": 4940 + }, + { + "epoch": 2.406787918155245, + "grad_norm": 2.839526891708374, + "learning_rate": 3.2992835444444936e-06, + "loss": 0.4616, + "step": 4941 + }, + { + "epoch": 2.4072750893147123, + "grad_norm": 2.352792739868164, + "learning_rate": 3.298674137079717e-06, + "loss": 0.4245, + "step": 4942 + }, + { + "epoch": 2.40776226047418, + "grad_norm": 3.0010507106781006, + "learning_rate": 3.2980646768587593e-06, + "loss": 0.4883, + "step": 4943 + }, + { + "epoch": 2.408249431633647, + "grad_norm": 3.195361614227295, + "learning_rate": 3.297455163821956e-06, + "loss": 0.5463, + "step": 4944 + }, + { + "epoch": 2.4087366027931147, + "grad_norm": 3.0292282104492188, + "learning_rate": 3.2968455980096443e-06, + "loss": 0.5751, + "step": 4945 + }, + { + "epoch": 2.409223773952582, + "grad_norm": 2.6084816455841064, + "learning_rate": 3.296235979462165e-06, + "loss": 0.4436, + "step": 4946 + }, + { + "epoch": 2.4097109451120495, + "grad_norm": 2.8306825160980225, + "learning_rate": 3.2956263082198626e-06, + "loss": 0.4523, + "step": 4947 + }, + { + "epoch": 2.4101981162715167, + "grad_norm": 2.6978793144226074, + "learning_rate": 3.2950165843230853e-06, + "loss": 0.5203, + "step": 4948 + }, + { + "epoch": 2.410685287430984, + "grad_norm": 2.6050150394439697, + "learning_rate": 3.2944068078121843e-06, + "loss": 0.5091, + "step": 4949 + }, + { + "epoch": 2.4111724585904515, + "grad_norm": 3.3191940784454346, + "learning_rate": 3.2937969787275153e-06, + "loss": 0.4971, + "step": 4950 + }, + { + "epoch": 2.4116596297499187, + "grad_norm": 2.9067647457122803, + "learning_rate": 3.2931870971094356e-06, + "loss": 0.4889, + "step": 4951 + }, + { + "epoch": 2.4121468009093863, + "grad_norm": 2.925602674484253, + "learning_rate": 3.2925771629983084e-06, + "loss": 0.5173, + "step": 4952 + }, + { + "epoch": 2.4126339720688534, + "grad_norm": 3.5112547874450684, + "learning_rate": 3.2919671764344973e-06, + "loss": 0.5962, + "step": 4953 + }, + { + "epoch": 2.413121143228321, + "grad_norm": 2.8696281909942627, + "learning_rate": 3.291357137458373e-06, + "loss": 0.434, + "step": 4954 + }, + { + "epoch": 2.4136083143877882, + "grad_norm": 3.06895112991333, + "learning_rate": 3.290747046110307e-06, + "loss": 0.5106, + "step": 4955 + }, + { + "epoch": 2.4140954855472554, + "grad_norm": 3.0758121013641357, + "learning_rate": 3.2901369024306746e-06, + "loss": 0.4407, + "step": 4956 + }, + { + "epoch": 2.414582656706723, + "grad_norm": 2.7145326137542725, + "learning_rate": 3.2895267064598564e-06, + "loss": 0.5014, + "step": 4957 + }, + { + "epoch": 2.41506982786619, + "grad_norm": 2.8255562782287598, + "learning_rate": 3.2889164582382337e-06, + "loss": 0.462, + "step": 4958 + }, + { + "epoch": 2.415556999025658, + "grad_norm": 2.810636281967163, + "learning_rate": 3.288306157806193e-06, + "loss": 0.48, + "step": 4959 + }, + { + "epoch": 2.416044170185125, + "grad_norm": 2.84902286529541, + "learning_rate": 3.2876958052041246e-06, + "loss": 0.5041, + "step": 4960 + }, + { + "epoch": 2.4165313413445926, + "grad_norm": 2.813629150390625, + "learning_rate": 3.287085400472421e-06, + "loss": 0.5308, + "step": 4961 + }, + { + "epoch": 2.4170185125040597, + "grad_norm": 2.808368682861328, + "learning_rate": 3.2864749436514794e-06, + "loss": 0.5195, + "step": 4962 + }, + { + "epoch": 2.417505683663527, + "grad_norm": 2.7973477840423584, + "learning_rate": 3.285864434781698e-06, + "loss": 0.4516, + "step": 4963 + }, + { + "epoch": 2.4179928548229945, + "grad_norm": 2.7094573974609375, + "learning_rate": 3.2852538739034834e-06, + "loss": 0.5037, + "step": 4964 + }, + { + "epoch": 2.4184800259824617, + "grad_norm": 2.5138940811157227, + "learning_rate": 3.2846432610572395e-06, + "loss": 0.4869, + "step": 4965 + }, + { + "epoch": 2.4189671971419293, + "grad_norm": 2.446028232574463, + "learning_rate": 3.2840325962833773e-06, + "loss": 0.4531, + "step": 4966 + }, + { + "epoch": 2.4194543683013965, + "grad_norm": 2.6976819038391113, + "learning_rate": 3.2834218796223118e-06, + "loss": 0.4888, + "step": 4967 + }, + { + "epoch": 2.419941539460864, + "grad_norm": 3.0220577716827393, + "learning_rate": 3.2828111111144578e-06, + "loss": 0.5586, + "step": 4968 + }, + { + "epoch": 2.4204287106203313, + "grad_norm": 2.918544292449951, + "learning_rate": 3.2822002908002387e-06, + "loss": 0.4513, + "step": 4969 + }, + { + "epoch": 2.4209158817797984, + "grad_norm": 2.8965885639190674, + "learning_rate": 3.2815894187200763e-06, + "loss": 0.4901, + "step": 4970 + }, + { + "epoch": 2.421403052939266, + "grad_norm": 3.2342095375061035, + "learning_rate": 3.2809784949143997e-06, + "loss": 0.6018, + "step": 4971 + }, + { + "epoch": 2.4218902240987332, + "grad_norm": 3.0610806941986084, + "learning_rate": 3.2803675194236384e-06, + "loss": 0.5453, + "step": 4972 + }, + { + "epoch": 2.422377395258201, + "grad_norm": 2.8412137031555176, + "learning_rate": 3.279756492288228e-06, + "loss": 0.522, + "step": 4973 + }, + { + "epoch": 2.422864566417668, + "grad_norm": 2.3237669467926025, + "learning_rate": 3.2791454135486046e-06, + "loss": 0.4426, + "step": 4974 + }, + { + "epoch": 2.4233517375771356, + "grad_norm": 2.867053985595703, + "learning_rate": 3.278534283245212e-06, + "loss": 0.4305, + "step": 4975 + }, + { + "epoch": 2.423838908736603, + "grad_norm": 2.9166598320007324, + "learning_rate": 3.277923101418491e-06, + "loss": 0.4361, + "step": 4976 + }, + { + "epoch": 2.42432607989607, + "grad_norm": 2.613862991333008, + "learning_rate": 3.2773118681088927e-06, + "loss": 0.5262, + "step": 4977 + }, + { + "epoch": 2.4248132510555376, + "grad_norm": 2.6776649951934814, + "learning_rate": 3.2767005833568677e-06, + "loss": 0.4848, + "step": 4978 + }, + { + "epoch": 2.4253004222150047, + "grad_norm": 2.4715535640716553, + "learning_rate": 3.2760892472028694e-06, + "loss": 0.5431, + "step": 4979 + }, + { + "epoch": 2.4257875933744724, + "grad_norm": 2.750175714492798, + "learning_rate": 3.2754778596873576e-06, + "loss": 0.4633, + "step": 4980 + }, + { + "epoch": 2.4262747645339395, + "grad_norm": 3.056252956390381, + "learning_rate": 3.2748664208507937e-06, + "loss": 0.5499, + "step": 4981 + }, + { + "epoch": 2.426761935693407, + "grad_norm": 2.7892367839813232, + "learning_rate": 3.2742549307336414e-06, + "loss": 0.4464, + "step": 4982 + }, + { + "epoch": 2.4272491068528743, + "grad_norm": 2.7946696281433105, + "learning_rate": 3.27364338937637e-06, + "loss": 0.5485, + "step": 4983 + }, + { + "epoch": 2.4277362780123415, + "grad_norm": 2.6454174518585205, + "learning_rate": 3.273031796819451e-06, + "loss": 0.5314, + "step": 4984 + }, + { + "epoch": 2.428223449171809, + "grad_norm": 2.655358076095581, + "learning_rate": 3.27242015310336e-06, + "loss": 0.5151, + "step": 4985 + }, + { + "epoch": 2.4287106203312763, + "grad_norm": 2.5978758335113525, + "learning_rate": 3.271808458268575e-06, + "loss": 0.4884, + "step": 4986 + }, + { + "epoch": 2.429197791490744, + "grad_norm": 2.886256456375122, + "learning_rate": 3.271196712355578e-06, + "loss": 0.4147, + "step": 4987 + }, + { + "epoch": 2.429684962650211, + "grad_norm": 2.562008857727051, + "learning_rate": 3.2705849154048547e-06, + "loss": 0.4798, + "step": 4988 + }, + { + "epoch": 2.4301721338096787, + "grad_norm": 3.0054447650909424, + "learning_rate": 3.2699730674568937e-06, + "loss": 0.5278, + "step": 4989 + }, + { + "epoch": 2.430659304969146, + "grad_norm": 2.7451443672180176, + "learning_rate": 3.2693611685521864e-06, + "loss": 0.5169, + "step": 4990 + }, + { + "epoch": 2.431146476128613, + "grad_norm": 2.792370319366455, + "learning_rate": 3.2687492187312285e-06, + "loss": 0.5295, + "step": 4991 + }, + { + "epoch": 2.4316336472880806, + "grad_norm": 2.5207419395446777, + "learning_rate": 3.268137218034519e-06, + "loss": 0.5214, + "step": 4992 + }, + { + "epoch": 2.432120818447548, + "grad_norm": 2.8401176929473877, + "learning_rate": 3.2675251665025604e-06, + "loss": 0.5307, + "step": 4993 + }, + { + "epoch": 2.4326079896070154, + "grad_norm": 3.0556178092956543, + "learning_rate": 3.2669130641758567e-06, + "loss": 0.5109, + "step": 4994 + }, + { + "epoch": 2.4330951607664826, + "grad_norm": 2.966143846511841, + "learning_rate": 3.266300911094919e-06, + "loss": 0.432, + "step": 4995 + }, + { + "epoch": 2.43358233192595, + "grad_norm": 2.978337287902832, + "learning_rate": 3.2656887073002577e-06, + "loss": 0.5243, + "step": 4996 + }, + { + "epoch": 2.4340695030854174, + "grad_norm": 2.847067356109619, + "learning_rate": 3.265076452832389e-06, + "loss": 0.5309, + "step": 4997 + }, + { + "epoch": 2.4345566742448845, + "grad_norm": 2.9029250144958496, + "learning_rate": 3.2644641477318317e-06, + "loss": 0.4865, + "step": 4998 + }, + { + "epoch": 2.435043845404352, + "grad_norm": 2.6046667098999023, + "learning_rate": 3.2638517920391095e-06, + "loss": 0.4995, + "step": 4999 + }, + { + "epoch": 2.4355310165638193, + "grad_norm": 2.843755006790161, + "learning_rate": 3.263239385794746e-06, + "loss": 0.4994, + "step": 5000 + }, + { + "epoch": 2.436018187723287, + "grad_norm": 2.5240471363067627, + "learning_rate": 3.2626269290392715e-06, + "loss": 0.4801, + "step": 5001 + }, + { + "epoch": 2.436505358882754, + "grad_norm": 2.945922374725342, + "learning_rate": 3.2620144218132164e-06, + "loss": 0.4591, + "step": 5002 + }, + { + "epoch": 2.4369925300422217, + "grad_norm": 3.1548306941986084, + "learning_rate": 3.2614018641571197e-06, + "loss": 0.4194, + "step": 5003 + }, + { + "epoch": 2.437479701201689, + "grad_norm": 2.691365957260132, + "learning_rate": 3.260789256111517e-06, + "loss": 0.4848, + "step": 5004 + }, + { + "epoch": 2.437966872361156, + "grad_norm": 2.8254435062408447, + "learning_rate": 3.260176597716953e-06, + "loss": 0.5495, + "step": 5005 + }, + { + "epoch": 2.4384540435206237, + "grad_norm": 2.611799955368042, + "learning_rate": 3.2595638890139725e-06, + "loss": 0.4634, + "step": 5006 + }, + { + "epoch": 2.438941214680091, + "grad_norm": 2.5186283588409424, + "learning_rate": 3.258951130043124e-06, + "loss": 0.5014, + "step": 5007 + }, + { + "epoch": 2.4394283858395585, + "grad_norm": 2.672924518585205, + "learning_rate": 3.258338320844961e-06, + "loss": 0.5539, + "step": 5008 + }, + { + "epoch": 2.4399155569990256, + "grad_norm": 2.8586525917053223, + "learning_rate": 3.2577254614600385e-06, + "loss": 0.4837, + "step": 5009 + }, + { + "epoch": 2.4404027281584932, + "grad_norm": 2.9030544757843018, + "learning_rate": 3.2571125519289152e-06, + "loss": 0.476, + "step": 5010 + }, + { + "epoch": 2.4408898993179604, + "grad_norm": 2.9135894775390625, + "learning_rate": 3.2564995922921542e-06, + "loss": 0.4963, + "step": 5011 + }, + { + "epoch": 2.4413770704774276, + "grad_norm": 3.0889439582824707, + "learning_rate": 3.2558865825903204e-06, + "loss": 0.5319, + "step": 5012 + }, + { + "epoch": 2.441864241636895, + "grad_norm": 2.9362707138061523, + "learning_rate": 3.255273522863983e-06, + "loss": 0.5172, + "step": 5013 + }, + { + "epoch": 2.4423514127963624, + "grad_norm": 2.689035177230835, + "learning_rate": 3.2546604131537134e-06, + "loss": 0.44, + "step": 5014 + }, + { + "epoch": 2.44283858395583, + "grad_norm": 3.1630072593688965, + "learning_rate": 3.254047253500089e-06, + "loss": 0.5987, + "step": 5015 + }, + { + "epoch": 2.443325755115297, + "grad_norm": 2.926342248916626, + "learning_rate": 3.2534340439436874e-06, + "loss": 0.5468, + "step": 5016 + }, + { + "epoch": 2.4438129262747648, + "grad_norm": 3.4448463916778564, + "learning_rate": 3.25282078452509e-06, + "loss": 0.5139, + "step": 5017 + }, + { + "epoch": 2.444300097434232, + "grad_norm": 2.6618149280548096, + "learning_rate": 3.2522074752848844e-06, + "loss": 0.5217, + "step": 5018 + }, + { + "epoch": 2.444787268593699, + "grad_norm": 2.6825783252716064, + "learning_rate": 3.251594116263658e-06, + "loss": 0.4622, + "step": 5019 + }, + { + "epoch": 2.4452744397531667, + "grad_norm": 2.9690423011779785, + "learning_rate": 3.2509807075020024e-06, + "loss": 0.5195, + "step": 5020 + }, + { + "epoch": 2.445761610912634, + "grad_norm": 2.7531275749206543, + "learning_rate": 3.2503672490405135e-06, + "loss": 0.4486, + "step": 5021 + }, + { + "epoch": 2.4462487820721015, + "grad_norm": 2.8320767879486084, + "learning_rate": 3.24975374091979e-06, + "loss": 0.4783, + "step": 5022 + }, + { + "epoch": 2.4467359532315687, + "grad_norm": 2.827908754348755, + "learning_rate": 3.2491401831804342e-06, + "loss": 0.4095, + "step": 5023 + }, + { + "epoch": 2.4472231243910363, + "grad_norm": 2.535423517227173, + "learning_rate": 3.24852657586305e-06, + "loss": 0.4254, + "step": 5024 + }, + { + "epoch": 2.4477102955505035, + "grad_norm": 2.8780059814453125, + "learning_rate": 3.2479129190082482e-06, + "loss": 0.4842, + "step": 5025 + }, + { + "epoch": 2.4481974667099706, + "grad_norm": 2.6425631046295166, + "learning_rate": 3.247299212656638e-06, + "loss": 0.4159, + "step": 5026 + }, + { + "epoch": 2.4486846378694382, + "grad_norm": 2.9608418941497803, + "learning_rate": 3.2466854568488354e-06, + "loss": 0.4471, + "step": 5027 + }, + { + "epoch": 2.4491718090289054, + "grad_norm": 2.6588032245635986, + "learning_rate": 3.246071651625459e-06, + "loss": 0.5275, + "step": 5028 + }, + { + "epoch": 2.449658980188373, + "grad_norm": 2.9220821857452393, + "learning_rate": 3.24545779702713e-06, + "loss": 0.4995, + "step": 5029 + }, + { + "epoch": 2.45014615134784, + "grad_norm": 4.557098865509033, + "learning_rate": 3.2448438930944735e-06, + "loss": 0.4573, + "step": 5030 + }, + { + "epoch": 2.450633322507308, + "grad_norm": 2.742450475692749, + "learning_rate": 3.244229939868118e-06, + "loss": 0.4802, + "step": 5031 + }, + { + "epoch": 2.451120493666775, + "grad_norm": 2.8939812183380127, + "learning_rate": 3.243615937388694e-06, + "loss": 0.4539, + "step": 5032 + }, + { + "epoch": 2.451607664826242, + "grad_norm": 2.6430609226226807, + "learning_rate": 3.243001885696836e-06, + "loss": 0.523, + "step": 5033 + }, + { + "epoch": 2.4520948359857098, + "grad_norm": 2.834260940551758, + "learning_rate": 3.242387784833183e-06, + "loss": 0.506, + "step": 5034 + }, + { + "epoch": 2.452582007145177, + "grad_norm": 2.8120439052581787, + "learning_rate": 3.2417736348383753e-06, + "loss": 0.4916, + "step": 5035 + }, + { + "epoch": 2.4530691783046445, + "grad_norm": 2.459970235824585, + "learning_rate": 3.241159435753058e-06, + "loss": 0.4634, + "step": 5036 + }, + { + "epoch": 2.4535563494641117, + "grad_norm": 2.506253242492676, + "learning_rate": 3.2405451876178773e-06, + "loss": 0.4159, + "step": 5037 + }, + { + "epoch": 2.4540435206235793, + "grad_norm": 2.4992237091064453, + "learning_rate": 3.239930890473486e-06, + "loss": 0.5124, + "step": 5038 + }, + { + "epoch": 2.4545306917830465, + "grad_norm": 2.6723155975341797, + "learning_rate": 3.239316544360537e-06, + "loss": 0.452, + "step": 5039 + }, + { + "epoch": 2.4550178629425137, + "grad_norm": 2.9664714336395264, + "learning_rate": 3.2387021493196873e-06, + "loss": 0.5028, + "step": 5040 + }, + { + "epoch": 2.4555050341019813, + "grad_norm": 3.035198211669922, + "learning_rate": 3.2380877053915987e-06, + "loss": 0.4817, + "step": 5041 + }, + { + "epoch": 2.4559922052614485, + "grad_norm": 2.710087537765503, + "learning_rate": 3.2374732126169345e-06, + "loss": 0.55, + "step": 5042 + }, + { + "epoch": 2.456479376420916, + "grad_norm": 2.9052345752716064, + "learning_rate": 3.2368586710363614e-06, + "loss": 0.4811, + "step": 5043 + }, + { + "epoch": 2.4569665475803832, + "grad_norm": 2.9190585613250732, + "learning_rate": 3.236244080690551e-06, + "loss": 0.4961, + "step": 5044 + }, + { + "epoch": 2.4574537187398504, + "grad_norm": 2.36151123046875, + "learning_rate": 3.2356294416201755e-06, + "loss": 0.4273, + "step": 5045 + }, + { + "epoch": 2.457940889899318, + "grad_norm": 2.5062100887298584, + "learning_rate": 3.235014753865912e-06, + "loss": 0.4628, + "step": 5046 + }, + { + "epoch": 2.458428061058785, + "grad_norm": 3.0769715309143066, + "learning_rate": 3.23440001746844e-06, + "loss": 0.5038, + "step": 5047 + }, + { + "epoch": 2.458915232218253, + "grad_norm": 2.6793346405029297, + "learning_rate": 3.2337852324684436e-06, + "loss": 0.4815, + "step": 5048 + }, + { + "epoch": 2.45940240337772, + "grad_norm": 2.5146470069885254, + "learning_rate": 3.233170398906609e-06, + "loss": 0.4008, + "step": 5049 + }, + { + "epoch": 2.459889574537187, + "grad_norm": 2.797318935394287, + "learning_rate": 3.2325555168236254e-06, + "loss": 0.5422, + "step": 5050 + }, + { + "epoch": 2.4603767456966548, + "grad_norm": 2.9461417198181152, + "learning_rate": 3.231940586260186e-06, + "loss": 0.4953, + "step": 5051 + }, + { + "epoch": 2.460863916856122, + "grad_norm": 3.0918140411376953, + "learning_rate": 3.231325607256987e-06, + "loss": 0.4787, + "step": 5052 + }, + { + "epoch": 2.4613510880155895, + "grad_norm": 9.27033519744873, + "learning_rate": 3.2307105798547267e-06, + "loss": 0.4957, + "step": 5053 + }, + { + "epoch": 2.4618382591750567, + "grad_norm": 2.7598958015441895, + "learning_rate": 3.2300955040941096e-06, + "loss": 0.5208, + "step": 5054 + }, + { + "epoch": 2.4623254303345243, + "grad_norm": 3.2605082988739014, + "learning_rate": 3.229480380015839e-06, + "loss": 0.4376, + "step": 5055 + }, + { + "epoch": 2.4628126014939915, + "grad_norm": 2.838015079498291, + "learning_rate": 3.2288652076606245e-06, + "loss": 0.5318, + "step": 5056 + }, + { + "epoch": 2.4632997726534587, + "grad_norm": 2.859595775604248, + "learning_rate": 3.2282499870691793e-06, + "loss": 0.4482, + "step": 5057 + }, + { + "epoch": 2.4637869438129263, + "grad_norm": 2.5671041011810303, + "learning_rate": 3.2276347182822176e-06, + "loss": 0.4218, + "step": 5058 + }, + { + "epoch": 2.4642741149723935, + "grad_norm": 2.5543174743652344, + "learning_rate": 3.227019401340457e-06, + "loss": 0.4407, + "step": 5059 + }, + { + "epoch": 2.464761286131861, + "grad_norm": 2.941824197769165, + "learning_rate": 3.2264040362846217e-06, + "loss": 0.5128, + "step": 5060 + }, + { + "epoch": 2.4652484572913282, + "grad_norm": 2.8461413383483887, + "learning_rate": 3.2257886231554335e-06, + "loss": 0.4952, + "step": 5061 + }, + { + "epoch": 2.465735628450796, + "grad_norm": 2.805567502975464, + "learning_rate": 3.2251731619936226e-06, + "loss": 0.5436, + "step": 5062 + }, + { + "epoch": 2.466222799610263, + "grad_norm": 2.767214775085449, + "learning_rate": 3.2245576528399193e-06, + "loss": 0.5383, + "step": 5063 + }, + { + "epoch": 2.46670997076973, + "grad_norm": 3.22987699508667, + "learning_rate": 3.2239420957350574e-06, + "loss": 0.6135, + "step": 5064 + }, + { + "epoch": 2.467197141929198, + "grad_norm": 5.3822245597839355, + "learning_rate": 3.223326490719776e-06, + "loss": 0.4792, + "step": 5065 + }, + { + "epoch": 2.467684313088665, + "grad_norm": 2.7305214405059814, + "learning_rate": 3.2227108378348144e-06, + "loss": 0.5007, + "step": 5066 + }, + { + "epoch": 2.4681714842481326, + "grad_norm": 2.7966041564941406, + "learning_rate": 3.222095137120917e-06, + "loss": 0.5224, + "step": 5067 + }, + { + "epoch": 2.4686586554075998, + "grad_norm": 2.7847983837127686, + "learning_rate": 3.2214793886188306e-06, + "loss": 0.5119, + "step": 5068 + }, + { + "epoch": 2.4691458265670674, + "grad_norm": 2.686145782470703, + "learning_rate": 3.2208635923693055e-06, + "loss": 0.5242, + "step": 5069 + }, + { + "epoch": 2.4696329977265346, + "grad_norm": 2.751601457595825, + "learning_rate": 3.2202477484130947e-06, + "loss": 0.491, + "step": 5070 + }, + { + "epoch": 2.4701201688860017, + "grad_norm": 3.0522360801696777, + "learning_rate": 3.219631856790955e-06, + "loss": 0.5178, + "step": 5071 + }, + { + "epoch": 2.4706073400454693, + "grad_norm": 3.243504762649536, + "learning_rate": 3.2190159175436457e-06, + "loss": 0.5177, + "step": 5072 + }, + { + "epoch": 2.4710945112049365, + "grad_norm": 3.1547083854675293, + "learning_rate": 3.2183999307119307e-06, + "loss": 0.5226, + "step": 5073 + }, + { + "epoch": 2.471581682364404, + "grad_norm": 2.796678066253662, + "learning_rate": 3.217783896336575e-06, + "loss": 0.6136, + "step": 5074 + }, + { + "epoch": 2.4720688535238713, + "grad_norm": 2.850940227508545, + "learning_rate": 3.2171678144583475e-06, + "loss": 0.4929, + "step": 5075 + }, + { + "epoch": 2.472556024683339, + "grad_norm": 2.611117124557495, + "learning_rate": 3.2165516851180207e-06, + "loss": 0.4829, + "step": 5076 + }, + { + "epoch": 2.473043195842806, + "grad_norm": 2.7043089866638184, + "learning_rate": 3.2159355083563705e-06, + "loss": 0.4973, + "step": 5077 + }, + { + "epoch": 2.4735303670022732, + "grad_norm": 2.777491569519043, + "learning_rate": 3.2153192842141744e-06, + "loss": 0.5017, + "step": 5078 + }, + { + "epoch": 2.474017538161741, + "grad_norm": 2.842360258102417, + "learning_rate": 3.214703012732216e-06, + "loss": 0.4786, + "step": 5079 + }, + { + "epoch": 2.474504709321208, + "grad_norm": 2.864575147628784, + "learning_rate": 3.214086693951278e-06, + "loss": 0.5641, + "step": 5080 + }, + { + "epoch": 2.4749918804806756, + "grad_norm": 2.92518949508667, + "learning_rate": 3.213470327912149e-06, + "loss": 0.5299, + "step": 5081 + }, + { + "epoch": 2.475479051640143, + "grad_norm": 2.7849059104919434, + "learning_rate": 3.2128539146556197e-06, + "loss": 0.3843, + "step": 5082 + }, + { + "epoch": 2.4759662227996104, + "grad_norm": 2.7648723125457764, + "learning_rate": 3.2122374542224855e-06, + "loss": 0.5458, + "step": 5083 + }, + { + "epoch": 2.4764533939590776, + "grad_norm": 3.292431116104126, + "learning_rate": 3.2116209466535425e-06, + "loss": 0.4926, + "step": 5084 + }, + { + "epoch": 2.4769405651185448, + "grad_norm": 2.8669326305389404, + "learning_rate": 3.2110043919895918e-06, + "loss": 0.5925, + "step": 5085 + }, + { + "epoch": 2.4774277362780124, + "grad_norm": 3.228024959564209, + "learning_rate": 3.2103877902714365e-06, + "loss": 0.4765, + "step": 5086 + }, + { + "epoch": 2.4779149074374796, + "grad_norm": 3.124709367752075, + "learning_rate": 3.2097711415398835e-06, + "loss": 0.4819, + "step": 5087 + }, + { + "epoch": 2.478402078596947, + "grad_norm": 3.0511529445648193, + "learning_rate": 3.209154445835742e-06, + "loss": 0.5252, + "step": 5088 + }, + { + "epoch": 2.4788892497564143, + "grad_norm": 3.210277557373047, + "learning_rate": 3.2085377031998256e-06, + "loss": 0.4512, + "step": 5089 + }, + { + "epoch": 2.479376420915882, + "grad_norm": 3.1634106636047363, + "learning_rate": 3.2079209136729505e-06, + "loss": 0.4582, + "step": 5090 + }, + { + "epoch": 2.479863592075349, + "grad_norm": 2.500369071960449, + "learning_rate": 3.2073040772959347e-06, + "loss": 0.4504, + "step": 5091 + }, + { + "epoch": 2.4803507632348163, + "grad_norm": 2.615313768386841, + "learning_rate": 3.2066871941096006e-06, + "loss": 0.4872, + "step": 5092 + }, + { + "epoch": 2.480837934394284, + "grad_norm": 3.208590507507324, + "learning_rate": 3.2060702641547748e-06, + "loss": 0.5676, + "step": 5093 + }, + { + "epoch": 2.481325105553751, + "grad_norm": 2.5238699913024902, + "learning_rate": 3.2054532874722837e-06, + "loss": 0.4719, + "step": 5094 + }, + { + "epoch": 2.4818122767132187, + "grad_norm": 2.75062894821167, + "learning_rate": 3.2048362641029603e-06, + "loss": 0.4418, + "step": 5095 + }, + { + "epoch": 2.482299447872686, + "grad_norm": 2.6510066986083984, + "learning_rate": 3.2042191940876375e-06, + "loss": 0.4828, + "step": 5096 + }, + { + "epoch": 2.4827866190321535, + "grad_norm": 2.7075748443603516, + "learning_rate": 3.2036020774671555e-06, + "loss": 0.4226, + "step": 5097 + }, + { + "epoch": 2.4832737901916206, + "grad_norm": 2.692467451095581, + "learning_rate": 3.2029849142823527e-06, + "loss": 0.5198, + "step": 5098 + }, + { + "epoch": 2.483760961351088, + "grad_norm": 2.555001735687256, + "learning_rate": 3.2023677045740742e-06, + "loss": 0.449, + "step": 5099 + }, + { + "epoch": 2.4842481325105554, + "grad_norm": 2.902346134185791, + "learning_rate": 3.201750448383166e-06, + "loss": 0.4715, + "step": 5100 + }, + { + "epoch": 2.4847353036700226, + "grad_norm": 3.0308477878570557, + "learning_rate": 3.2011331457504792e-06, + "loss": 0.492, + "step": 5101 + }, + { + "epoch": 2.48522247482949, + "grad_norm": 2.7632768154144287, + "learning_rate": 3.200515796716865e-06, + "loss": 0.4481, + "step": 5102 + }, + { + "epoch": 2.4857096459889574, + "grad_norm": 2.982710361480713, + "learning_rate": 3.1998984013231817e-06, + "loss": 0.4811, + "step": 5103 + }, + { + "epoch": 2.486196817148425, + "grad_norm": 2.6316184997558594, + "learning_rate": 3.199280959610286e-06, + "loss": 0.4544, + "step": 5104 + }, + { + "epoch": 2.486683988307892, + "grad_norm": 2.7700581550598145, + "learning_rate": 3.1986634716190423e-06, + "loss": 0.4249, + "step": 5105 + }, + { + "epoch": 2.4871711594673593, + "grad_norm": 2.6344151496887207, + "learning_rate": 3.198045937390315e-06, + "loss": 0.498, + "step": 5106 + }, + { + "epoch": 2.487658330626827, + "grad_norm": 2.702869415283203, + "learning_rate": 3.1974283569649734e-06, + "loss": 0.4987, + "step": 5107 + }, + { + "epoch": 2.488145501786294, + "grad_norm": 2.740139961242676, + "learning_rate": 3.196810730383887e-06, + "loss": 0.4505, + "step": 5108 + }, + { + "epoch": 2.4886326729457617, + "grad_norm": 3.178084373474121, + "learning_rate": 3.196193057687932e-06, + "loss": 0.4082, + "step": 5109 + }, + { + "epoch": 2.489119844105229, + "grad_norm": 2.853379249572754, + "learning_rate": 3.195575338917985e-06, + "loss": 0.5557, + "step": 5110 + }, + { + "epoch": 2.4896070152646965, + "grad_norm": 3.4665472507476807, + "learning_rate": 3.1949575741149262e-06, + "loss": 0.4904, + "step": 5111 + }, + { + "epoch": 2.4900941864241637, + "grad_norm": 2.3760054111480713, + "learning_rate": 3.194339763319641e-06, + "loss": 0.4789, + "step": 5112 + }, + { + "epoch": 2.490581357583631, + "grad_norm": 2.837928056716919, + "learning_rate": 3.1937219065730153e-06, + "loss": 0.5335, + "step": 5113 + }, + { + "epoch": 2.4910685287430985, + "grad_norm": 2.818005084991455, + "learning_rate": 3.1931040039159367e-06, + "loss": 0.4572, + "step": 5114 + }, + { + "epoch": 2.4915556999025656, + "grad_norm": 2.7131271362304688, + "learning_rate": 3.1924860553893013e-06, + "loss": 0.5444, + "step": 5115 + }, + { + "epoch": 2.4920428710620333, + "grad_norm": 2.7938737869262695, + "learning_rate": 3.191868061034002e-06, + "loss": 0.4985, + "step": 5116 + }, + { + "epoch": 2.4925300422215004, + "grad_norm": 3.079277753829956, + "learning_rate": 3.191250020890939e-06, + "loss": 0.5205, + "step": 5117 + }, + { + "epoch": 2.493017213380968, + "grad_norm": 3.1864781379699707, + "learning_rate": 3.190631935001014e-06, + "loss": 0.4819, + "step": 5118 + }, + { + "epoch": 2.493504384540435, + "grad_norm": 2.691527843475342, + "learning_rate": 3.1900138034051326e-06, + "loss": 0.5385, + "step": 5119 + }, + { + "epoch": 2.4939915556999024, + "grad_norm": 2.576409101486206, + "learning_rate": 3.1893956261442018e-06, + "loss": 0.4849, + "step": 5120 + }, + { + "epoch": 2.49447872685937, + "grad_norm": 2.627516984939575, + "learning_rate": 3.1887774032591325e-06, + "loss": 0.4234, + "step": 5121 + }, + { + "epoch": 2.494965898018837, + "grad_norm": 2.8253161907196045, + "learning_rate": 3.1881591347908387e-06, + "loss": 0.4654, + "step": 5122 + }, + { + "epoch": 2.495453069178305, + "grad_norm": 2.5345218181610107, + "learning_rate": 3.1875408207802366e-06, + "loss": 0.495, + "step": 5123 + }, + { + "epoch": 2.495940240337772, + "grad_norm": 2.849346399307251, + "learning_rate": 3.186922461268248e-06, + "loss": 0.5154, + "step": 5124 + }, + { + "epoch": 2.4964274114972396, + "grad_norm": 2.860842227935791, + "learning_rate": 3.1863040562957947e-06, + "loss": 0.4724, + "step": 5125 + }, + { + "epoch": 2.4969145826567067, + "grad_norm": 3.0677502155303955, + "learning_rate": 3.1856856059038032e-06, + "loss": 0.4742, + "step": 5126 + }, + { + "epoch": 2.497401753816174, + "grad_norm": 2.6309468746185303, + "learning_rate": 3.1850671101332016e-06, + "loss": 0.488, + "step": 5127 + }, + { + "epoch": 2.4978889249756415, + "grad_norm": 3.235412836074829, + "learning_rate": 3.1844485690249225e-06, + "loss": 0.4786, + "step": 5128 + }, + { + "epoch": 2.4983760961351087, + "grad_norm": 2.723423957824707, + "learning_rate": 3.1838299826199003e-06, + "loss": 0.4956, + "step": 5129 + }, + { + "epoch": 2.4988632672945763, + "grad_norm": 2.85909366607666, + "learning_rate": 3.1832113509590744e-06, + "loss": 0.4494, + "step": 5130 + }, + { + "epoch": 2.4993504384540435, + "grad_norm": 3.079789400100708, + "learning_rate": 3.1825926740833846e-06, + "loss": 0.5502, + "step": 5131 + }, + { + "epoch": 2.499837609613511, + "grad_norm": 2.851936101913452, + "learning_rate": 3.1819739520337756e-06, + "loss": 0.5608, + "step": 5132 + }, + { + "epoch": 2.5003247807729783, + "grad_norm": 2.972606897354126, + "learning_rate": 3.181355184851194e-06, + "loss": 0.5422, + "step": 5133 + }, + { + "epoch": 2.5008119519324454, + "grad_norm": 3.404893159866333, + "learning_rate": 3.1807363725765894e-06, + "loss": 0.5178, + "step": 5134 + }, + { + "epoch": 2.501299123091913, + "grad_norm": 3.0784647464752197, + "learning_rate": 3.1801175152509153e-06, + "loss": 0.4713, + "step": 5135 + }, + { + "epoch": 2.50178629425138, + "grad_norm": 3.1926140785217285, + "learning_rate": 3.1794986129151275e-06, + "loss": 0.5451, + "step": 5136 + }, + { + "epoch": 2.502273465410848, + "grad_norm": 3.2538864612579346, + "learning_rate": 3.178879665610184e-06, + "loss": 0.4287, + "step": 5137 + }, + { + "epoch": 2.502760636570315, + "grad_norm": 3.0587055683135986, + "learning_rate": 3.1782606733770475e-06, + "loss": 0.4542, + "step": 5138 + }, + { + "epoch": 2.5032478077297826, + "grad_norm": 2.802206039428711, + "learning_rate": 3.1776416362566833e-06, + "loss": 0.4581, + "step": 5139 + }, + { + "epoch": 2.50373497888925, + "grad_norm": 2.854069948196411, + "learning_rate": 3.1770225542900585e-06, + "loss": 0.4969, + "step": 5140 + }, + { + "epoch": 2.504222150048717, + "grad_norm": 2.9454851150512695, + "learning_rate": 3.1764034275181436e-06, + "loss": 0.4876, + "step": 5141 + }, + { + "epoch": 2.5047093212081846, + "grad_norm": 2.44802188873291, + "learning_rate": 3.175784255981914e-06, + "loss": 0.5275, + "step": 5142 + }, + { + "epoch": 2.5051964923676517, + "grad_norm": 2.78625750541687, + "learning_rate": 3.1751650397223442e-06, + "loss": 0.4495, + "step": 5143 + }, + { + "epoch": 2.5056836635271194, + "grad_norm": 2.9915926456451416, + "learning_rate": 3.1745457787804156e-06, + "loss": 0.4945, + "step": 5144 + }, + { + "epoch": 2.5061708346865865, + "grad_norm": 2.649343967437744, + "learning_rate": 3.1739264731971097e-06, + "loss": 0.545, + "step": 5145 + }, + { + "epoch": 2.506658005846054, + "grad_norm": 3.0100557804107666, + "learning_rate": 3.1733071230134126e-06, + "loss": 0.488, + "step": 5146 + }, + { + "epoch": 2.5071451770055213, + "grad_norm": 2.9779884815216064, + "learning_rate": 3.1726877282703133e-06, + "loss": 0.4798, + "step": 5147 + }, + { + "epoch": 2.5076323481649885, + "grad_norm": 3.3234355449676514, + "learning_rate": 3.172068289008803e-06, + "loss": 0.5215, + "step": 5148 + }, + { + "epoch": 2.508119519324456, + "grad_norm": 2.61251163482666, + "learning_rate": 3.1714488052698746e-06, + "loss": 0.5085, + "step": 5149 + }, + { + "epoch": 2.5086066904839233, + "grad_norm": 2.914336919784546, + "learning_rate": 3.170829277094528e-06, + "loss": 0.5257, + "step": 5150 + }, + { + "epoch": 2.509093861643391, + "grad_norm": 3.024111270904541, + "learning_rate": 3.1702097045237618e-06, + "loss": 0.5029, + "step": 5151 + }, + { + "epoch": 2.509581032802858, + "grad_norm": 2.7557077407836914, + "learning_rate": 3.16959008759858e-06, + "loss": 0.4912, + "step": 5152 + }, + { + "epoch": 2.5100682039623257, + "grad_norm": 2.897989273071289, + "learning_rate": 3.168970426359988e-06, + "loss": 0.4867, + "step": 5153 + }, + { + "epoch": 2.510555375121793, + "grad_norm": 2.9569506645202637, + "learning_rate": 3.168350720848996e-06, + "loss": 0.4935, + "step": 5154 + }, + { + "epoch": 2.51104254628126, + "grad_norm": 3.2700436115264893, + "learning_rate": 3.167730971106616e-06, + "loss": 0.5721, + "step": 5155 + }, + { + "epoch": 2.5115297174407276, + "grad_norm": 3.2677626609802246, + "learning_rate": 3.1671111771738623e-06, + "loss": 0.4701, + "step": 5156 + }, + { + "epoch": 2.512016888600195, + "grad_norm": 3.1148743629455566, + "learning_rate": 3.1664913390917524e-06, + "loss": 0.5068, + "step": 5157 + }, + { + "epoch": 2.512504059759662, + "grad_norm": 3.0282082557678223, + "learning_rate": 3.165871456901308e-06, + "loss": 0.4859, + "step": 5158 + }, + { + "epoch": 2.5129912309191296, + "grad_norm": 2.662191390991211, + "learning_rate": 3.165251530643553e-06, + "loss": 0.5316, + "step": 5159 + }, + { + "epoch": 2.513478402078597, + "grad_norm": 2.4984822273254395, + "learning_rate": 3.164631560359513e-06, + "loss": 0.5136, + "step": 5160 + }, + { + "epoch": 2.5139655732380644, + "grad_norm": 2.849091053009033, + "learning_rate": 3.164011546090218e-06, + "loss": 0.4256, + "step": 5161 + }, + { + "epoch": 2.5144527443975315, + "grad_norm": 2.992521286010742, + "learning_rate": 3.1633914878767013e-06, + "loss": 0.556, + "step": 5162 + }, + { + "epoch": 2.514939915556999, + "grad_norm": 2.5913398265838623, + "learning_rate": 3.1627713857599972e-06, + "loss": 0.4847, + "step": 5163 + }, + { + "epoch": 2.5154270867164663, + "grad_norm": 2.7744812965393066, + "learning_rate": 3.1621512397811455e-06, + "loss": 0.4597, + "step": 5164 + }, + { + "epoch": 2.5159142578759335, + "grad_norm": 2.679905414581299, + "learning_rate": 3.1615310499811856e-06, + "loss": 0.4776, + "step": 5165 + }, + { + "epoch": 2.516401429035401, + "grad_norm": 2.6170873641967773, + "learning_rate": 3.160910816401163e-06, + "loss": 0.4819, + "step": 5166 + }, + { + "epoch": 2.5168886001948687, + "grad_norm": 3.0727181434631348, + "learning_rate": 3.160290539082123e-06, + "loss": 0.4944, + "step": 5167 + }, + { + "epoch": 2.517375771354336, + "grad_norm": 3.1398251056671143, + "learning_rate": 3.159670218065118e-06, + "loss": 0.6056, + "step": 5168 + }, + { + "epoch": 2.517862942513803, + "grad_norm": 2.790919780731201, + "learning_rate": 3.1590498533911984e-06, + "loss": 0.5384, + "step": 5169 + }, + { + "epoch": 2.5183501136732707, + "grad_norm": 3.0136284828186035, + "learning_rate": 3.1584294451014213e-06, + "loss": 0.4693, + "step": 5170 + }, + { + "epoch": 2.518837284832738, + "grad_norm": 3.0335323810577393, + "learning_rate": 3.157808993236845e-06, + "loss": 0.504, + "step": 5171 + }, + { + "epoch": 2.519324455992205, + "grad_norm": 2.752972364425659, + "learning_rate": 3.15718849783853e-06, + "loss": 0.4637, + "step": 5172 + }, + { + "epoch": 2.5198116271516726, + "grad_norm": 3.3108413219451904, + "learning_rate": 3.1565679589475427e-06, + "loss": 0.491, + "step": 5173 + }, + { + "epoch": 2.5202987983111402, + "grad_norm": 2.7926785945892334, + "learning_rate": 3.155947376604948e-06, + "loss": 0.5204, + "step": 5174 + }, + { + "epoch": 2.5207859694706074, + "grad_norm": 2.8330984115600586, + "learning_rate": 3.1553267508518178e-06, + "loss": 0.4469, + "step": 5175 + }, + { + "epoch": 2.5212731406300746, + "grad_norm": 2.5737884044647217, + "learning_rate": 3.154706081729223e-06, + "loss": 0.5046, + "step": 5176 + }, + { + "epoch": 2.521760311789542, + "grad_norm": 3.1681714057922363, + "learning_rate": 3.154085369278242e-06, + "loss": 0.5224, + "step": 5177 + }, + { + "epoch": 2.5222474829490094, + "grad_norm": 3.4905242919921875, + "learning_rate": 3.1534646135399516e-06, + "loss": 0.5349, + "step": 5178 + }, + { + "epoch": 2.5227346541084765, + "grad_norm": 3.077924966812134, + "learning_rate": 3.1528438145554334e-06, + "loss": 0.5181, + "step": 5179 + }, + { + "epoch": 2.523221825267944, + "grad_norm": 3.2786309719085693, + "learning_rate": 3.1522229723657734e-06, + "loss": 0.5338, + "step": 5180 + }, + { + "epoch": 2.5237089964274118, + "grad_norm": 2.6078782081604004, + "learning_rate": 3.151602087012057e-06, + "loss": 0.4476, + "step": 5181 + }, + { + "epoch": 2.524196167586879, + "grad_norm": 2.87739634513855, + "learning_rate": 3.1509811585353757e-06, + "loss": 0.5332, + "step": 5182 + }, + { + "epoch": 2.524683338746346, + "grad_norm": 2.5705127716064453, + "learning_rate": 3.150360186976822e-06, + "loss": 0.4509, + "step": 5183 + }, + { + "epoch": 2.5251705099058137, + "grad_norm": 2.591675281524658, + "learning_rate": 3.1497391723774907e-06, + "loss": 0.5244, + "step": 5184 + }, + { + "epoch": 2.525657681065281, + "grad_norm": 2.670206069946289, + "learning_rate": 3.149118114778482e-06, + "loss": 0.458, + "step": 5185 + }, + { + "epoch": 2.526144852224748, + "grad_norm": 3.1115562915802, + "learning_rate": 3.148497014220897e-06, + "loss": 0.4975, + "step": 5186 + }, + { + "epoch": 2.5266320233842157, + "grad_norm": 2.833226442337036, + "learning_rate": 3.1478758707458408e-06, + "loss": 0.461, + "step": 5187 + }, + { + "epoch": 2.5271191945436833, + "grad_norm": 3.1854000091552734, + "learning_rate": 3.1472546843944186e-06, + "loss": 0.4723, + "step": 5188 + }, + { + "epoch": 2.5276063657031504, + "grad_norm": 2.7126047611236572, + "learning_rate": 3.146633455207742e-06, + "loss": 0.416, + "step": 5189 + }, + { + "epoch": 2.5280935368626176, + "grad_norm": 2.6347970962524414, + "learning_rate": 3.1460121832269234e-06, + "loss": 0.5111, + "step": 5190 + }, + { + "epoch": 2.5285807080220852, + "grad_norm": 3.0879480838775635, + "learning_rate": 3.1453908684930794e-06, + "loss": 0.546, + "step": 5191 + }, + { + "epoch": 2.5290678791815524, + "grad_norm": 2.811936378479004, + "learning_rate": 3.144769511047327e-06, + "loss": 0.5543, + "step": 5192 + }, + { + "epoch": 2.5295550503410196, + "grad_norm": 2.8449316024780273, + "learning_rate": 3.144148110930789e-06, + "loss": 0.4641, + "step": 5193 + }, + { + "epoch": 2.530042221500487, + "grad_norm": 2.9540493488311768, + "learning_rate": 3.143526668184588e-06, + "loss": 0.4992, + "step": 5194 + }, + { + "epoch": 2.530529392659955, + "grad_norm": 2.4975860118865967, + "learning_rate": 3.1429051828498535e-06, + "loss": 0.5104, + "step": 5195 + }, + { + "epoch": 2.531016563819422, + "grad_norm": 2.8068273067474365, + "learning_rate": 3.142283654967713e-06, + "loss": 0.5328, + "step": 5196 + }, + { + "epoch": 2.531503734978889, + "grad_norm": 2.916635036468506, + "learning_rate": 3.1416620845793e-06, + "loss": 0.5245, + "step": 5197 + }, + { + "epoch": 2.5319909061383568, + "grad_norm": 2.833665132522583, + "learning_rate": 3.14104047172575e-06, + "loss": 0.4863, + "step": 5198 + }, + { + "epoch": 2.532478077297824, + "grad_norm": 3.3948476314544678, + "learning_rate": 3.140418816448202e-06, + "loss": 0.5715, + "step": 5199 + }, + { + "epoch": 2.532965248457291, + "grad_norm": 2.577284574508667, + "learning_rate": 3.1397971187877956e-06, + "loss": 0.5378, + "step": 5200 + }, + { + "epoch": 2.5334524196167587, + "grad_norm": 2.8795359134674072, + "learning_rate": 3.139175378785676e-06, + "loss": 0.4898, + "step": 5201 + }, + { + "epoch": 2.5339395907762263, + "grad_norm": 2.628267765045166, + "learning_rate": 3.138553596482989e-06, + "loss": 0.4693, + "step": 5202 + }, + { + "epoch": 2.5344267619356935, + "grad_norm": 2.7627124786376953, + "learning_rate": 3.1379317719208847e-06, + "loss": 0.4928, + "step": 5203 + }, + { + "epoch": 2.5349139330951607, + "grad_norm": 3.2584595680236816, + "learning_rate": 3.137309905140514e-06, + "loss": 0.5126, + "step": 5204 + }, + { + "epoch": 2.5354011042546283, + "grad_norm": 3.0193634033203125, + "learning_rate": 3.136687996183034e-06, + "loss": 0.4773, + "step": 5205 + }, + { + "epoch": 2.5358882754140954, + "grad_norm": 2.865248203277588, + "learning_rate": 3.136066045089602e-06, + "loss": 0.4501, + "step": 5206 + }, + { + "epoch": 2.5363754465735626, + "grad_norm": 2.5103237628936768, + "learning_rate": 3.1354440519013785e-06, + "loss": 0.4415, + "step": 5207 + }, + { + "epoch": 2.5368626177330302, + "grad_norm": 2.623795986175537, + "learning_rate": 3.134822016659526e-06, + "loss": 0.5229, + "step": 5208 + }, + { + "epoch": 2.5373497888924974, + "grad_norm": 2.911414384841919, + "learning_rate": 3.134199939405212e-06, + "loss": 0.4602, + "step": 5209 + }, + { + "epoch": 2.537836960051965, + "grad_norm": 2.8744776248931885, + "learning_rate": 3.1335778201796052e-06, + "loss": 0.4751, + "step": 5210 + }, + { + "epoch": 2.538324131211432, + "grad_norm": 3.4484825134277344, + "learning_rate": 3.1329556590238767e-06, + "loss": 0.523, + "step": 5211 + }, + { + "epoch": 2.5388113023709, + "grad_norm": 3.108502149581909, + "learning_rate": 3.132333455979202e-06, + "loss": 0.5374, + "step": 5212 + }, + { + "epoch": 2.539298473530367, + "grad_norm": 2.9543516635894775, + "learning_rate": 3.1317112110867583e-06, + "loss": 0.4955, + "step": 5213 + }, + { + "epoch": 2.539785644689834, + "grad_norm": 2.8077876567840576, + "learning_rate": 3.131088924387725e-06, + "loss": 0.4944, + "step": 5214 + }, + { + "epoch": 2.5402728158493018, + "grad_norm": 3.1284680366516113, + "learning_rate": 3.1304665959232862e-06, + "loss": 0.4663, + "step": 5215 + }, + { + "epoch": 2.540759987008769, + "grad_norm": 3.1220526695251465, + "learning_rate": 3.1298442257346256e-06, + "loss": 0.5491, + "step": 5216 + }, + { + "epoch": 2.5412471581682365, + "grad_norm": 3.1380739212036133, + "learning_rate": 3.1292218138629337e-06, + "loss": 0.4466, + "step": 5217 + }, + { + "epoch": 2.5417343293277037, + "grad_norm": 2.898998975753784, + "learning_rate": 3.1285993603494005e-06, + "loss": 0.475, + "step": 5218 + }, + { + "epoch": 2.5422215004871713, + "grad_norm": 2.8438234329223633, + "learning_rate": 3.1279768652352204e-06, + "loss": 0.5188, + "step": 5219 + }, + { + "epoch": 2.5427086716466385, + "grad_norm": 3.307105302810669, + "learning_rate": 3.12735432856159e-06, + "loss": 0.4614, + "step": 5220 + }, + { + "epoch": 2.5431958428061057, + "grad_norm": 2.6567888259887695, + "learning_rate": 3.1267317503697077e-06, + "loss": 0.4759, + "step": 5221 + }, + { + "epoch": 2.5436830139655733, + "grad_norm": 2.7311160564422607, + "learning_rate": 3.1261091307007774e-06, + "loss": 0.484, + "step": 5222 + }, + { + "epoch": 2.5441701851250405, + "grad_norm": 2.728811740875244, + "learning_rate": 3.125486469596003e-06, + "loss": 0.4316, + "step": 5223 + }, + { + "epoch": 2.544657356284508, + "grad_norm": 2.5285720825195312, + "learning_rate": 3.124863767096592e-06, + "loss": 0.4896, + "step": 5224 + }, + { + "epoch": 2.5451445274439752, + "grad_norm": 3.817340850830078, + "learning_rate": 3.124241023243755e-06, + "loss": 0.5183, + "step": 5225 + }, + { + "epoch": 2.545631698603443, + "grad_norm": 2.9575116634368896, + "learning_rate": 3.1236182380787054e-06, + "loss": 0.5299, + "step": 5226 + }, + { + "epoch": 2.54611886976291, + "grad_norm": 2.7130961418151855, + "learning_rate": 3.1229954116426587e-06, + "loss": 0.5571, + "step": 5227 + }, + { + "epoch": 2.546606040922377, + "grad_norm": 2.824688673019409, + "learning_rate": 3.122372543976834e-06, + "loss": 0.5128, + "step": 5228 + }, + { + "epoch": 2.547093212081845, + "grad_norm": 2.6822898387908936, + "learning_rate": 3.121749635122452e-06, + "loss": 0.4415, + "step": 5229 + }, + { + "epoch": 2.547580383241312, + "grad_norm": 2.3219218254089355, + "learning_rate": 3.1211266851207367e-06, + "loss": 0.4745, + "step": 5230 + }, + { + "epoch": 2.5480675544007796, + "grad_norm": 2.5304441452026367, + "learning_rate": 3.1205036940129156e-06, + "loss": 0.4834, + "step": 5231 + }, + { + "epoch": 2.5485547255602468, + "grad_norm": 2.939913749694824, + "learning_rate": 3.1198806618402177e-06, + "loss": 0.5139, + "step": 5232 + }, + { + "epoch": 2.5490418967197144, + "grad_norm": 3.1045913696289062, + "learning_rate": 3.1192575886438753e-06, + "loss": 0.5156, + "step": 5233 + }, + { + "epoch": 2.5495290678791815, + "grad_norm": 2.6352899074554443, + "learning_rate": 3.1186344744651244e-06, + "loss": 0.4915, + "step": 5234 + }, + { + "epoch": 2.5500162390386487, + "grad_norm": 2.551046133041382, + "learning_rate": 3.1180113193452e-06, + "loss": 0.5122, + "step": 5235 + }, + { + "epoch": 2.5505034101981163, + "grad_norm": 3.1987597942352295, + "learning_rate": 3.117388123325345e-06, + "loss": 0.529, + "step": 5236 + }, + { + "epoch": 2.5509905813575835, + "grad_norm": 3.0746774673461914, + "learning_rate": 3.116764886446801e-06, + "loss": 0.4875, + "step": 5237 + }, + { + "epoch": 2.551477752517051, + "grad_norm": 2.877096652984619, + "learning_rate": 3.1161416087508145e-06, + "loss": 0.4653, + "step": 5238 + }, + { + "epoch": 2.5519649236765183, + "grad_norm": 2.8506836891174316, + "learning_rate": 3.1155182902786333e-06, + "loss": 0.4901, + "step": 5239 + }, + { + "epoch": 2.552452094835986, + "grad_norm": 3.3309545516967773, + "learning_rate": 3.11489493107151e-06, + "loss": 0.5041, + "step": 5240 + }, + { + "epoch": 2.552939265995453, + "grad_norm": 2.767930030822754, + "learning_rate": 3.114271531170696e-06, + "loss": 0.5285, + "step": 5241 + }, + { + "epoch": 2.5534264371549202, + "grad_norm": 3.289588212966919, + "learning_rate": 3.1136480906174505e-06, + "loss": 0.4894, + "step": 5242 + }, + { + "epoch": 2.553913608314388, + "grad_norm": 2.7218875885009766, + "learning_rate": 3.1130246094530312e-06, + "loss": 0.4686, + "step": 5243 + }, + { + "epoch": 2.554400779473855, + "grad_norm": 3.1633079051971436, + "learning_rate": 3.1124010877187e-06, + "loss": 0.4924, + "step": 5244 + }, + { + "epoch": 2.5548879506333226, + "grad_norm": 2.5529701709747314, + "learning_rate": 3.111777525455722e-06, + "loss": 0.4526, + "step": 5245 + }, + { + "epoch": 2.55537512179279, + "grad_norm": 2.8593013286590576, + "learning_rate": 3.111153922705365e-06, + "loss": 0.4471, + "step": 5246 + }, + { + "epoch": 2.5558622929522574, + "grad_norm": 2.913538932800293, + "learning_rate": 3.110530279508897e-06, + "loss": 0.4913, + "step": 5247 + }, + { + "epoch": 2.5563494641117246, + "grad_norm": 2.875079393386841, + "learning_rate": 3.1099065959075932e-06, + "loss": 0.4678, + "step": 5248 + }, + { + "epoch": 2.5568366352711918, + "grad_norm": 2.7915146350860596, + "learning_rate": 3.1092828719427263e-06, + "loss": 0.4443, + "step": 5249 + }, + { + "epoch": 2.5573238064306594, + "grad_norm": 2.6913251876831055, + "learning_rate": 3.1086591076555772e-06, + "loss": 0.4456, + "step": 5250 + }, + { + "epoch": 2.5578109775901265, + "grad_norm": 2.891404390335083, + "learning_rate": 3.108035303087424e-06, + "loss": 0.5307, + "step": 5251 + }, + { + "epoch": 2.558298148749594, + "grad_norm": 2.5924601554870605, + "learning_rate": 3.1074114582795513e-06, + "loss": 0.4898, + "step": 5252 + }, + { + "epoch": 2.5587853199090613, + "grad_norm": 3.1197006702423096, + "learning_rate": 3.1067875732732445e-06, + "loss": 0.4539, + "step": 5253 + }, + { + "epoch": 2.559272491068529, + "grad_norm": 2.530163288116455, + "learning_rate": 3.1061636481097935e-06, + "loss": 0.4487, + "step": 5254 + }, + { + "epoch": 2.559759662227996, + "grad_norm": 2.6088714599609375, + "learning_rate": 3.1055396828304878e-06, + "loss": 0.4687, + "step": 5255 + }, + { + "epoch": 2.5602468333874633, + "grad_norm": 2.5008299350738525, + "learning_rate": 3.104915677476623e-06, + "loss": 0.4833, + "step": 5256 + }, + { + "epoch": 2.560734004546931, + "grad_norm": 2.8580641746520996, + "learning_rate": 3.1042916320894943e-06, + "loss": 0.5163, + "step": 5257 + }, + { + "epoch": 2.561221175706398, + "grad_norm": 2.7341806888580322, + "learning_rate": 3.103667546710402e-06, + "loss": 0.4669, + "step": 5258 + }, + { + "epoch": 2.5617083468658657, + "grad_norm": 2.71345853805542, + "learning_rate": 3.1030434213806478e-06, + "loss": 0.4994, + "step": 5259 + }, + { + "epoch": 2.562195518025333, + "grad_norm": 2.8626604080200195, + "learning_rate": 3.1024192561415364e-06, + "loss": 0.4826, + "step": 5260 + }, + { + "epoch": 2.5626826891848005, + "grad_norm": 4.400035858154297, + "learning_rate": 3.1017950510343743e-06, + "loss": 0.4782, + "step": 5261 + }, + { + "epoch": 2.5631698603442676, + "grad_norm": 2.8639702796936035, + "learning_rate": 3.101170806100472e-06, + "loss": 0.5119, + "step": 5262 + }, + { + "epoch": 2.563657031503735, + "grad_norm": 3.0680131912231445, + "learning_rate": 3.100546521381141e-06, + "loss": 0.5458, + "step": 5263 + }, + { + "epoch": 2.5641442026632024, + "grad_norm": 3.3118348121643066, + "learning_rate": 3.0999221969176977e-06, + "loss": 0.4837, + "step": 5264 + }, + { + "epoch": 2.5646313738226696, + "grad_norm": 2.8780598640441895, + "learning_rate": 3.09929783275146e-06, + "loss": 0.4775, + "step": 5265 + }, + { + "epoch": 2.565118544982137, + "grad_norm": 2.5564920902252197, + "learning_rate": 3.0986734289237464e-06, + "loss": 0.5315, + "step": 5266 + }, + { + "epoch": 2.5656057161416044, + "grad_norm": 2.9328601360321045, + "learning_rate": 3.0980489854758817e-06, + "loss": 0.5473, + "step": 5267 + }, + { + "epoch": 2.566092887301072, + "grad_norm": 3.0127322673797607, + "learning_rate": 3.0974245024491904e-06, + "loss": 0.528, + "step": 5268 + }, + { + "epoch": 2.566580058460539, + "grad_norm": 3.302210807800293, + "learning_rate": 3.0967999798850014e-06, + "loss": 0.5494, + "step": 5269 + }, + { + "epoch": 2.5670672296200063, + "grad_norm": 2.6061227321624756, + "learning_rate": 3.0961754178246456e-06, + "loss": 0.4503, + "step": 5270 + }, + { + "epoch": 2.567554400779474, + "grad_norm": 2.4772727489471436, + "learning_rate": 3.0955508163094554e-06, + "loss": 0.5048, + "step": 5271 + }, + { + "epoch": 2.568041571938941, + "grad_norm": 3.705040216445923, + "learning_rate": 3.094926175380768e-06, + "loss": 0.5062, + "step": 5272 + }, + { + "epoch": 2.5685287430984087, + "grad_norm": 2.9943742752075195, + "learning_rate": 3.0943014950799216e-06, + "loss": 0.4442, + "step": 5273 + }, + { + "epoch": 2.569015914257876, + "grad_norm": 2.955310821533203, + "learning_rate": 3.093676775448258e-06, + "loss": 0.5284, + "step": 5274 + }, + { + "epoch": 2.5695030854173435, + "grad_norm": 2.6304306983947754, + "learning_rate": 3.0930520165271205e-06, + "loss": 0.5172, + "step": 5275 + }, + { + "epoch": 2.5699902565768107, + "grad_norm": 2.8500213623046875, + "learning_rate": 3.092427218357856e-06, + "loss": 0.5359, + "step": 5276 + }, + { + "epoch": 2.570477427736278, + "grad_norm": 2.797137975692749, + "learning_rate": 3.091802380981813e-06, + "loss": 0.5274, + "step": 5277 + }, + { + "epoch": 2.5709645988957455, + "grad_norm": 2.7790403366088867, + "learning_rate": 3.0911775044403435e-06, + "loss": 0.5145, + "step": 5278 + }, + { + "epoch": 2.5714517700552126, + "grad_norm": 2.8988564014434814, + "learning_rate": 3.0905525887748023e-06, + "loss": 0.4623, + "step": 5279 + }, + { + "epoch": 2.5719389412146803, + "grad_norm": 2.971463441848755, + "learning_rate": 3.089927634026545e-06, + "loss": 0.5169, + "step": 5280 + }, + { + "epoch": 2.5724261123741474, + "grad_norm": 2.9093756675720215, + "learning_rate": 3.089302640236932e-06, + "loss": 0.5064, + "step": 5281 + }, + { + "epoch": 2.572913283533615, + "grad_norm": 3.0264103412628174, + "learning_rate": 3.0886776074473252e-06, + "loss": 0.5111, + "step": 5282 + }, + { + "epoch": 2.573400454693082, + "grad_norm": 2.701308488845825, + "learning_rate": 3.0880525356990898e-06, + "loss": 0.5027, + "step": 5283 + }, + { + "epoch": 2.5738876258525494, + "grad_norm": 2.5850677490234375, + "learning_rate": 3.0874274250335914e-06, + "loss": 0.509, + "step": 5284 + }, + { + "epoch": 2.574374797012017, + "grad_norm": 3.1735422611236572, + "learning_rate": 3.0868022754922012e-06, + "loss": 0.4756, + "step": 5285 + }, + { + "epoch": 2.574861968171484, + "grad_norm": 2.867812156677246, + "learning_rate": 3.086177087116291e-06, + "loss": 0.539, + "step": 5286 + }, + { + "epoch": 2.5753491393309513, + "grad_norm": 3.357975959777832, + "learning_rate": 3.085551859947236e-06, + "loss": 0.5845, + "step": 5287 + }, + { + "epoch": 2.575836310490419, + "grad_norm": 2.7311058044433594, + "learning_rate": 3.0849265940264135e-06, + "loss": 0.489, + "step": 5288 + }, + { + "epoch": 2.5763234816498866, + "grad_norm": 2.4127731323242188, + "learning_rate": 3.0843012893952025e-06, + "loss": 0.4367, + "step": 5289 + }, + { + "epoch": 2.5768106528093537, + "grad_norm": 3.0425243377685547, + "learning_rate": 3.083675946094988e-06, + "loss": 0.5204, + "step": 5290 + }, + { + "epoch": 2.577297823968821, + "grad_norm": 2.573935031890869, + "learning_rate": 3.0830505641671526e-06, + "loss": 0.5014, + "step": 5291 + }, + { + "epoch": 2.5777849951282885, + "grad_norm": 2.787839651107788, + "learning_rate": 3.0824251436530854e-06, + "loss": 0.5436, + "step": 5292 + }, + { + "epoch": 2.5782721662877557, + "grad_norm": 2.5885937213897705, + "learning_rate": 3.0817996845941767e-06, + "loss": 0.4183, + "step": 5293 + }, + { + "epoch": 2.578759337447223, + "grad_norm": 2.659874677658081, + "learning_rate": 3.081174187031819e-06, + "loss": 0.4479, + "step": 5294 + }, + { + "epoch": 2.5792465086066905, + "grad_norm": 2.655472755432129, + "learning_rate": 3.0805486510074077e-06, + "loss": 0.5022, + "step": 5295 + }, + { + "epoch": 2.579733679766158, + "grad_norm": 2.827423572540283, + "learning_rate": 3.07992307656234e-06, + "loss": 0.481, + "step": 5296 + }, + { + "epoch": 2.5802208509256253, + "grad_norm": 2.5186960697174072, + "learning_rate": 3.0792974637380186e-06, + "loss": 0.3963, + "step": 5297 + }, + { + "epoch": 2.5807080220850924, + "grad_norm": 2.8115155696868896, + "learning_rate": 3.0786718125758436e-06, + "loss": 0.5035, + "step": 5298 + }, + { + "epoch": 2.58119519324456, + "grad_norm": 2.7746002674102783, + "learning_rate": 3.0780461231172227e-06, + "loss": 0.4692, + "step": 5299 + }, + { + "epoch": 2.581682364404027, + "grad_norm": 3.0272693634033203, + "learning_rate": 3.0774203954035624e-06, + "loss": 0.5372, + "step": 5300 + }, + { + "epoch": 2.5821695355634944, + "grad_norm": 2.68060040473938, + "learning_rate": 3.076794629476275e-06, + "loss": 0.4444, + "step": 5301 + }, + { + "epoch": 2.582656706722962, + "grad_norm": 2.82010817527771, + "learning_rate": 3.0761688253767723e-06, + "loss": 0.5035, + "step": 5302 + }, + { + "epoch": 2.5831438778824296, + "grad_norm": 2.8986918926239014, + "learning_rate": 3.0755429831464713e-06, + "loss": 0.531, + "step": 5303 + }, + { + "epoch": 2.5836310490418968, + "grad_norm": 2.9884934425354004, + "learning_rate": 3.0749171028267876e-06, + "loss": 0.4782, + "step": 5304 + }, + { + "epoch": 2.584118220201364, + "grad_norm": 2.3790180683135986, + "learning_rate": 3.074291184459145e-06, + "loss": 0.4601, + "step": 5305 + }, + { + "epoch": 2.5846053913608316, + "grad_norm": 2.7262978553771973, + "learning_rate": 3.0736652280849646e-06, + "loss": 0.4437, + "step": 5306 + }, + { + "epoch": 2.5850925625202987, + "grad_norm": 2.835228443145752, + "learning_rate": 3.0730392337456727e-06, + "loss": 0.4779, + "step": 5307 + }, + { + "epoch": 2.585579733679766, + "grad_norm": 2.772705078125, + "learning_rate": 3.0724132014826978e-06, + "loss": 0.5406, + "step": 5308 + }, + { + "epoch": 2.5860669048392335, + "grad_norm": 2.943387746810913, + "learning_rate": 3.071787131337471e-06, + "loss": 0.4888, + "step": 5309 + }, + { + "epoch": 2.586554075998701, + "grad_norm": 2.712975263595581, + "learning_rate": 3.071161023351425e-06, + "loss": 0.5047, + "step": 5310 + }, + { + "epoch": 2.5870412471581683, + "grad_norm": 2.763852834701538, + "learning_rate": 3.070534877565996e-06, + "loss": 0.4424, + "step": 5311 + }, + { + "epoch": 2.5875284183176355, + "grad_norm": 2.8733649253845215, + "learning_rate": 3.069908694022621e-06, + "loss": 0.5603, + "step": 5312 + }, + { + "epoch": 2.588015589477103, + "grad_norm": 2.8875207901000977, + "learning_rate": 3.0692824727627425e-06, + "loss": 0.5092, + "step": 5313 + }, + { + "epoch": 2.5885027606365703, + "grad_norm": 2.506865978240967, + "learning_rate": 3.068656213827802e-06, + "loss": 0.3562, + "step": 5314 + }, + { + "epoch": 2.5889899317960374, + "grad_norm": 2.669475793838501, + "learning_rate": 3.0680299172592472e-06, + "loss": 0.4878, + "step": 5315 + }, + { + "epoch": 2.589477102955505, + "grad_norm": 2.9352054595947266, + "learning_rate": 3.0674035830985244e-06, + "loss": 0.5051, + "step": 5316 + }, + { + "epoch": 2.5899642741149727, + "grad_norm": 2.7522404193878174, + "learning_rate": 3.066777211387086e-06, + "loss": 0.4911, + "step": 5317 + }, + { + "epoch": 2.59045144527444, + "grad_norm": 2.9759361743927, + "learning_rate": 3.066150802166384e-06, + "loss": 0.5176, + "step": 5318 + }, + { + "epoch": 2.590938616433907, + "grad_norm": 3.147719621658325, + "learning_rate": 3.0655243554778753e-06, + "loss": 0.5551, + "step": 5319 + }, + { + "epoch": 2.5914257875933746, + "grad_norm": 2.9402456283569336, + "learning_rate": 3.064897871363017e-06, + "loss": 0.4685, + "step": 5320 + }, + { + "epoch": 2.591912958752842, + "grad_norm": 2.811222791671753, + "learning_rate": 3.0642713498632703e-06, + "loss": 0.5075, + "step": 5321 + }, + { + "epoch": 2.592400129912309, + "grad_norm": 2.917578935623169, + "learning_rate": 3.0636447910200977e-06, + "loss": 0.445, + "step": 5322 + }, + { + "epoch": 2.5928873010717766, + "grad_norm": 2.722069025039673, + "learning_rate": 3.063018194874966e-06, + "loss": 0.6099, + "step": 5323 + }, + { + "epoch": 2.593374472231244, + "grad_norm": 3.3281466960906982, + "learning_rate": 3.062391561469342e-06, + "loss": 0.4867, + "step": 5324 + }, + { + "epoch": 2.5938616433907113, + "grad_norm": 3.1473968029022217, + "learning_rate": 3.061764890844698e-06, + "loss": 0.5118, + "step": 5325 + }, + { + "epoch": 2.5943488145501785, + "grad_norm": 3.04302978515625, + "learning_rate": 3.0611381830425045e-06, + "loss": 0.4169, + "step": 5326 + }, + { + "epoch": 2.594835985709646, + "grad_norm": 2.584317922592163, + "learning_rate": 3.060511438104239e-06, + "loss": 0.4676, + "step": 5327 + }, + { + "epoch": 2.5953231568691133, + "grad_norm": 2.9325273036956787, + "learning_rate": 3.059884656071379e-06, + "loss": 0.5116, + "step": 5328 + }, + { + "epoch": 2.5958103280285805, + "grad_norm": 2.4646427631378174, + "learning_rate": 3.059257836985404e-06, + "loss": 0.4455, + "step": 5329 + }, + { + "epoch": 2.596297499188048, + "grad_norm": 2.9683053493499756, + "learning_rate": 3.058630980887798e-06, + "loss": 0.48, + "step": 5330 + }, + { + "epoch": 2.5967846703475157, + "grad_norm": 2.9447970390319824, + "learning_rate": 3.0580040878200447e-06, + "loss": 0.5047, + "step": 5331 + }, + { + "epoch": 2.597271841506983, + "grad_norm": 3.161288261413574, + "learning_rate": 3.057377157823634e-06, + "loss": 0.4896, + "step": 5332 + }, + { + "epoch": 2.59775901266645, + "grad_norm": 2.5861260890960693, + "learning_rate": 3.056750190940054e-06, + "loss": 0.4526, + "step": 5333 + }, + { + "epoch": 2.5982461838259177, + "grad_norm": 3.0402708053588867, + "learning_rate": 3.0561231872107988e-06, + "loss": 0.5501, + "step": 5334 + }, + { + "epoch": 2.598733354985385, + "grad_norm": 2.9322338104248047, + "learning_rate": 3.0554961466773625e-06, + "loss": 0.5951, + "step": 5335 + }, + { + "epoch": 2.599220526144852, + "grad_norm": 3.2735490798950195, + "learning_rate": 3.0548690693812433e-06, + "loss": 0.5432, + "step": 5336 + }, + { + "epoch": 2.5997076973043196, + "grad_norm": 3.0205507278442383, + "learning_rate": 3.05424195536394e-06, + "loss": 0.4726, + "step": 5337 + }, + { + "epoch": 2.600194868463787, + "grad_norm": 3.0237510204315186, + "learning_rate": 3.0536148046669565e-06, + "loss": 0.4763, + "step": 5338 + }, + { + "epoch": 2.6006820396232544, + "grad_norm": 2.8136227130889893, + "learning_rate": 3.0529876173317965e-06, + "loss": 0.504, + "step": 5339 + }, + { + "epoch": 2.6011692107827216, + "grad_norm": 2.5634989738464355, + "learning_rate": 3.052360393399968e-06, + "loss": 0.4655, + "step": 5340 + }, + { + "epoch": 2.601656381942189, + "grad_norm": 2.722804069519043, + "learning_rate": 3.051733132912979e-06, + "loss": 0.4867, + "step": 5341 + }, + { + "epoch": 2.6021435531016563, + "grad_norm": 2.6877083778381348, + "learning_rate": 3.0511058359123434e-06, + "loss": 0.4853, + "step": 5342 + }, + { + "epoch": 2.6026307242611235, + "grad_norm": 2.6632351875305176, + "learning_rate": 3.0504785024395745e-06, + "loss": 0.5194, + "step": 5343 + }, + { + "epoch": 2.603117895420591, + "grad_norm": 2.619781970977783, + "learning_rate": 3.04985113253619e-06, + "loss": 0.4559, + "step": 5344 + }, + { + "epoch": 2.6036050665800583, + "grad_norm": 2.6796681880950928, + "learning_rate": 3.049223726243708e-06, + "loss": 0.5309, + "step": 5345 + }, + { + "epoch": 2.604092237739526, + "grad_norm": 2.882845878601074, + "learning_rate": 3.048596283603652e-06, + "loss": 0.4972, + "step": 5346 + }, + { + "epoch": 2.604579408898993, + "grad_norm": 2.8595123291015625, + "learning_rate": 3.047968804657544e-06, + "loss": 0.45, + "step": 5347 + }, + { + "epoch": 2.6050665800584607, + "grad_norm": 2.647850513458252, + "learning_rate": 3.047341289446912e-06, + "loss": 0.4034, + "step": 5348 + }, + { + "epoch": 2.605553751217928, + "grad_norm": 2.403765916824341, + "learning_rate": 3.0467137380132845e-06, + "loss": 0.4948, + "step": 5349 + }, + { + "epoch": 2.606040922377395, + "grad_norm": 3.0863969326019287, + "learning_rate": 3.046086150398193e-06, + "loss": 0.5897, + "step": 5350 + }, + { + "epoch": 2.6065280935368627, + "grad_norm": 3.276944160461426, + "learning_rate": 3.0454585266431703e-06, + "loss": 0.4455, + "step": 5351 + }, + { + "epoch": 2.60701526469633, + "grad_norm": 3.3180482387542725, + "learning_rate": 3.0448308667897536e-06, + "loss": 0.5768, + "step": 5352 + }, + { + "epoch": 2.6075024358557974, + "grad_norm": 2.5002360343933105, + "learning_rate": 3.0442031708794805e-06, + "loss": 0.3987, + "step": 5353 + }, + { + "epoch": 2.6079896070152646, + "grad_norm": 2.777641773223877, + "learning_rate": 3.043575438953893e-06, + "loss": 0.5066, + "step": 5354 + }, + { + "epoch": 2.6084767781747322, + "grad_norm": 2.7449350357055664, + "learning_rate": 3.042947671054533e-06, + "loss": 0.4046, + "step": 5355 + }, + { + "epoch": 2.6089639493341994, + "grad_norm": 3.3773231506347656, + "learning_rate": 3.042319867222947e-06, + "loss": 0.5694, + "step": 5356 + }, + { + "epoch": 2.6094511204936666, + "grad_norm": 2.7114765644073486, + "learning_rate": 3.0416920275006816e-06, + "loss": 0.5196, + "step": 5357 + }, + { + "epoch": 2.609938291653134, + "grad_norm": 2.934466600418091, + "learning_rate": 3.0410641519292892e-06, + "loss": 0.4792, + "step": 5358 + }, + { + "epoch": 2.6104254628126013, + "grad_norm": 2.9227702617645264, + "learning_rate": 3.0404362405503214e-06, + "loss": 0.5243, + "step": 5359 + }, + { + "epoch": 2.610912633972069, + "grad_norm": 3.4018208980560303, + "learning_rate": 3.039808293405334e-06, + "loss": 0.5455, + "step": 5360 + }, + { + "epoch": 2.611399805131536, + "grad_norm": 3.0052645206451416, + "learning_rate": 3.039180310535883e-06, + "loss": 0.4897, + "step": 5361 + }, + { + "epoch": 2.6118869762910037, + "grad_norm": 3.0171451568603516, + "learning_rate": 3.0385522919835305e-06, + "loss": 0.4789, + "step": 5362 + }, + { + "epoch": 2.612374147450471, + "grad_norm": 3.1476285457611084, + "learning_rate": 3.037924237789836e-06, + "loss": 0.5422, + "step": 5363 + }, + { + "epoch": 2.612861318609938, + "grad_norm": 3.261289596557617, + "learning_rate": 3.0372961479963668e-06, + "loss": 0.501, + "step": 5364 + }, + { + "epoch": 2.6133484897694057, + "grad_norm": 3.353356122970581, + "learning_rate": 3.0366680226446877e-06, + "loss": 0.5453, + "step": 5365 + }, + { + "epoch": 2.613835660928873, + "grad_norm": 2.9968667030334473, + "learning_rate": 3.03603986177637e-06, + "loss": 0.4844, + "step": 5366 + }, + { + "epoch": 2.6143228320883405, + "grad_norm": 3.0268008708953857, + "learning_rate": 3.0354116654329825e-06, + "loss": 0.5557, + "step": 5367 + }, + { + "epoch": 2.6148100032478077, + "grad_norm": 2.573357582092285, + "learning_rate": 3.034783433656102e-06, + "loss": 0.4621, + "step": 5368 + }, + { + "epoch": 2.6152971744072753, + "grad_norm": 2.64267897605896, + "learning_rate": 3.034155166487303e-06, + "loss": 0.4679, + "step": 5369 + }, + { + "epoch": 2.6157843455667424, + "grad_norm": 5.508352756500244, + "learning_rate": 3.033526863968166e-06, + "loss": 0.5253, + "step": 5370 + }, + { + "epoch": 2.6162715167262096, + "grad_norm": 3.0419957637786865, + "learning_rate": 3.0328985261402692e-06, + "loss": 0.57, + "step": 5371 + }, + { + "epoch": 2.6167586878856772, + "grad_norm": 3.063082218170166, + "learning_rate": 3.0322701530451987e-06, + "loss": 0.4986, + "step": 5372 + }, + { + "epoch": 2.6172458590451444, + "grad_norm": 2.361081838607788, + "learning_rate": 3.031641744724538e-06, + "loss": 0.4955, + "step": 5373 + }, + { + "epoch": 2.617733030204612, + "grad_norm": 2.533994197845459, + "learning_rate": 3.0310133012198776e-06, + "loss": 0.5059, + "step": 5374 + }, + { + "epoch": 2.618220201364079, + "grad_norm": 2.45141863822937, + "learning_rate": 3.030384822572806e-06, + "loss": 0.46, + "step": 5375 + }, + { + "epoch": 2.618707372523547, + "grad_norm": 2.7286629676818848, + "learning_rate": 3.0297563088249153e-06, + "loss": 0.4247, + "step": 5376 + }, + { + "epoch": 2.619194543683014, + "grad_norm": 2.6080222129821777, + "learning_rate": 3.0291277600178028e-06, + "loss": 0.5478, + "step": 5377 + }, + { + "epoch": 2.619681714842481, + "grad_norm": 2.7053821086883545, + "learning_rate": 3.0284991761930637e-06, + "loss": 0.466, + "step": 5378 + }, + { + "epoch": 2.6201688860019487, + "grad_norm": 3.0827248096466064, + "learning_rate": 3.0278705573922983e-06, + "loss": 0.5443, + "step": 5379 + }, + { + "epoch": 2.620656057161416, + "grad_norm": 3.0999789237976074, + "learning_rate": 3.0272419036571083e-06, + "loss": 0.5554, + "step": 5380 + }, + { + "epoch": 2.6211432283208835, + "grad_norm": 2.8496901988983154, + "learning_rate": 3.0266132150290983e-06, + "loss": 0.4752, + "step": 5381 + }, + { + "epoch": 2.6216303994803507, + "grad_norm": 2.594132900238037, + "learning_rate": 3.025984491549875e-06, + "loss": 0.4815, + "step": 5382 + }, + { + "epoch": 2.6221175706398183, + "grad_norm": 2.7355453968048096, + "learning_rate": 3.025355733261047e-06, + "loss": 0.4898, + "step": 5383 + }, + { + "epoch": 2.6226047417992855, + "grad_norm": 2.8686652183532715, + "learning_rate": 3.024726940204225e-06, + "loss": 0.4889, + "step": 5384 + }, + { + "epoch": 2.6230919129587527, + "grad_norm": 2.6371517181396484, + "learning_rate": 3.024098112421024e-06, + "loss": 0.459, + "step": 5385 + }, + { + "epoch": 2.6235790841182203, + "grad_norm": 2.696305751800537, + "learning_rate": 3.023469249953058e-06, + "loss": 0.4394, + "step": 5386 + }, + { + "epoch": 2.6240662552776874, + "grad_norm": 2.4758334159851074, + "learning_rate": 3.022840352841946e-06, + "loss": 0.4753, + "step": 5387 + }, + { + "epoch": 2.624553426437155, + "grad_norm": 2.869175434112549, + "learning_rate": 3.0222114211293075e-06, + "loss": 0.46, + "step": 5388 + }, + { + "epoch": 2.6250405975966222, + "grad_norm": 2.3717713356018066, + "learning_rate": 3.0215824548567663e-06, + "loss": 0.433, + "step": 5389 + }, + { + "epoch": 2.62552776875609, + "grad_norm": 2.6082425117492676, + "learning_rate": 3.020953454065947e-06, + "loss": 0.4894, + "step": 5390 + }, + { + "epoch": 2.626014939915557, + "grad_norm": 2.8919055461883545, + "learning_rate": 3.0203244187984763e-06, + "loss": 0.4998, + "step": 5391 + }, + { + "epoch": 2.626502111075024, + "grad_norm": 2.946739673614502, + "learning_rate": 3.019695349095984e-06, + "loss": 0.4635, + "step": 5392 + }, + { + "epoch": 2.626989282234492, + "grad_norm": 2.4540038108825684, + "learning_rate": 3.0190662450001024e-06, + "loss": 0.4815, + "step": 5393 + }, + { + "epoch": 2.627476453393959, + "grad_norm": 2.9414217472076416, + "learning_rate": 3.0184371065524644e-06, + "loss": 0.4548, + "step": 5394 + }, + { + "epoch": 2.6279636245534266, + "grad_norm": 2.930506467819214, + "learning_rate": 3.0178079337947075e-06, + "loss": 0.51, + "step": 5395 + }, + { + "epoch": 2.6284507957128938, + "grad_norm": 2.7361762523651123, + "learning_rate": 3.0171787267684687e-06, + "loss": 0.4499, + "step": 5396 + }, + { + "epoch": 2.6289379668723614, + "grad_norm": 2.796576738357544, + "learning_rate": 3.016549485515391e-06, + "loss": 0.5247, + "step": 5397 + }, + { + "epoch": 2.6294251380318285, + "grad_norm": 2.9077460765838623, + "learning_rate": 3.015920210077117e-06, + "loss": 0.4378, + "step": 5398 + }, + { + "epoch": 2.6299123091912957, + "grad_norm": 2.7660648822784424, + "learning_rate": 3.01529090049529e-06, + "loss": 0.4623, + "step": 5399 + }, + { + "epoch": 2.6303994803507633, + "grad_norm": 2.58418869972229, + "learning_rate": 3.01466155681156e-06, + "loss": 0.392, + "step": 5400 + }, + { + "epoch": 2.6308866515102305, + "grad_norm": 2.583073854446411, + "learning_rate": 3.0140321790675765e-06, + "loss": 0.4747, + "step": 5401 + }, + { + "epoch": 2.631373822669698, + "grad_norm": 2.8096659183502197, + "learning_rate": 3.0134027673049905e-06, + "loss": 0.4757, + "step": 5402 + }, + { + "epoch": 2.6318609938291653, + "grad_norm": 2.628594398498535, + "learning_rate": 3.0127733215654575e-06, + "loss": 0.4962, + "step": 5403 + }, + { + "epoch": 2.632348164988633, + "grad_norm": 2.76493763923645, + "learning_rate": 3.0121438418906336e-06, + "loss": 0.398, + "step": 5404 + }, + { + "epoch": 2.6328353361481, + "grad_norm": 2.704558849334717, + "learning_rate": 3.011514328322178e-06, + "loss": 0.4751, + "step": 5405 + }, + { + "epoch": 2.6333225073075672, + "grad_norm": 2.90537428855896, + "learning_rate": 3.010884780901752e-06, + "loss": 0.4782, + "step": 5406 + }, + { + "epoch": 2.633809678467035, + "grad_norm": 2.8039989471435547, + "learning_rate": 3.0102551996710182e-06, + "loss": 0.4809, + "step": 5407 + }, + { + "epoch": 2.634296849626502, + "grad_norm": 2.713772773742676, + "learning_rate": 3.0096255846716433e-06, + "loss": 0.4858, + "step": 5408 + }, + { + "epoch": 2.6347840207859696, + "grad_norm": 2.940979242324829, + "learning_rate": 3.0089959359452948e-06, + "loss": 0.4544, + "step": 5409 + }, + { + "epoch": 2.635271191945437, + "grad_norm": 2.486438035964966, + "learning_rate": 3.0083662535336423e-06, + "loss": 0.4836, + "step": 5410 + }, + { + "epoch": 2.6357583631049044, + "grad_norm": 2.754547357559204, + "learning_rate": 3.0077365374783584e-06, + "loss": 0.4109, + "step": 5411 + }, + { + "epoch": 2.6362455342643716, + "grad_norm": 2.69399094581604, + "learning_rate": 3.007106787821118e-06, + "loss": 0.453, + "step": 5412 + }, + { + "epoch": 2.6367327054238388, + "grad_norm": 2.738593816757202, + "learning_rate": 3.0064770046035977e-06, + "loss": 0.3992, + "step": 5413 + }, + { + "epoch": 2.6372198765833064, + "grad_norm": 2.8507843017578125, + "learning_rate": 3.005847187867476e-06, + "loss": 0.5318, + "step": 5414 + }, + { + "epoch": 2.6377070477427735, + "grad_norm": 3.222792625427246, + "learning_rate": 3.005217337654435e-06, + "loss": 0.595, + "step": 5415 + }, + { + "epoch": 2.638194218902241, + "grad_norm": 2.9316155910491943, + "learning_rate": 3.004587454006157e-06, + "loss": 0.5421, + "step": 5416 + }, + { + "epoch": 2.6386813900617083, + "grad_norm": 2.958784580230713, + "learning_rate": 3.0039575369643293e-06, + "loss": 0.4395, + "step": 5417 + }, + { + "epoch": 2.639168561221176, + "grad_norm": 2.5774729251861572, + "learning_rate": 3.003327586570638e-06, + "loss": 0.4388, + "step": 5418 + }, + { + "epoch": 2.639655732380643, + "grad_norm": 2.6741433143615723, + "learning_rate": 3.0026976028667744e-06, + "loss": 0.5138, + "step": 5419 + }, + { + "epoch": 2.6401429035401103, + "grad_norm": 2.8146495819091797, + "learning_rate": 3.00206758589443e-06, + "loss": 0.4938, + "step": 5420 + }, + { + "epoch": 2.640630074699578, + "grad_norm": 2.9104859828948975, + "learning_rate": 3.001437535695299e-06, + "loss": 0.4895, + "step": 5421 + }, + { + "epoch": 2.641117245859045, + "grad_norm": 2.8265252113342285, + "learning_rate": 3.0008074523110793e-06, + "loss": 0.4552, + "step": 5422 + }, + { + "epoch": 2.6416044170185122, + "grad_norm": 2.984790563583374, + "learning_rate": 3.0001773357834683e-06, + "loss": 0.5403, + "step": 5423 + }, + { + "epoch": 2.64209158817798, + "grad_norm": 3.164121150970459, + "learning_rate": 2.999547186154168e-06, + "loss": 0.4752, + "step": 5424 + }, + { + "epoch": 2.6425787593374475, + "grad_norm": 2.914421558380127, + "learning_rate": 2.9989170034648823e-06, + "loss": 0.4387, + "step": 5425 + }, + { + "epoch": 2.6430659304969146, + "grad_norm": 2.7886698246002197, + "learning_rate": 2.9982867877573146e-06, + "loss": 0.4949, + "step": 5426 + }, + { + "epoch": 2.643553101656382, + "grad_norm": 2.933465003967285, + "learning_rate": 2.997656539073175e-06, + "loss": 0.5208, + "step": 5427 + }, + { + "epoch": 2.6440402728158494, + "grad_norm": 3.032796621322632, + "learning_rate": 2.9970262574541703e-06, + "loss": 0.4752, + "step": 5428 + }, + { + "epoch": 2.6445274439753166, + "grad_norm": 2.7947142124176025, + "learning_rate": 2.996395942942015e-06, + "loss": 0.513, + "step": 5429 + }, + { + "epoch": 2.6450146151347838, + "grad_norm": 3.0134401321411133, + "learning_rate": 2.995765595578423e-06, + "loss": 0.443, + "step": 5430 + }, + { + "epoch": 2.6455017862942514, + "grad_norm": 2.4001832008361816, + "learning_rate": 2.9951352154051095e-06, + "loss": 0.4974, + "step": 5431 + }, + { + "epoch": 2.645988957453719, + "grad_norm": 2.9415392875671387, + "learning_rate": 2.9945048024637935e-06, + "loss": 0.4908, + "step": 5432 + }, + { + "epoch": 2.646476128613186, + "grad_norm": 2.7951180934906006, + "learning_rate": 2.9938743567961957e-06, + "loss": 0.4539, + "step": 5433 + }, + { + "epoch": 2.6469632997726533, + "grad_norm": 2.8020670413970947, + "learning_rate": 2.9932438784440394e-06, + "loss": 0.459, + "step": 5434 + }, + { + "epoch": 2.647450470932121, + "grad_norm": 3.0968737602233887, + "learning_rate": 2.992613367449049e-06, + "loss": 0.5527, + "step": 5435 + }, + { + "epoch": 2.647937642091588, + "grad_norm": 2.9317538738250732, + "learning_rate": 2.9919828238529515e-06, + "loss": 0.5216, + "step": 5436 + }, + { + "epoch": 2.6484248132510553, + "grad_norm": 2.6571171283721924, + "learning_rate": 2.991352247697476e-06, + "loss": 0.5146, + "step": 5437 + }, + { + "epoch": 2.648911984410523, + "grad_norm": 2.7868754863739014, + "learning_rate": 2.9907216390243554e-06, + "loss": 0.5277, + "step": 5438 + }, + { + "epoch": 2.6493991555699905, + "grad_norm": 3.487677574157715, + "learning_rate": 2.9900909978753224e-06, + "loss": 0.5713, + "step": 5439 + }, + { + "epoch": 2.6498863267294577, + "grad_norm": 2.9072105884552, + "learning_rate": 2.989460324292113e-06, + "loss": 0.5104, + "step": 5440 + }, + { + "epoch": 2.650373497888925, + "grad_norm": 2.8797800540924072, + "learning_rate": 2.988829618316464e-06, + "loss": 0.5201, + "step": 5441 + }, + { + "epoch": 2.6508606690483925, + "grad_norm": 2.664886951446533, + "learning_rate": 2.988198879990117e-06, + "loss": 0.4979, + "step": 5442 + }, + { + "epoch": 2.6513478402078596, + "grad_norm": 2.6267685890197754, + "learning_rate": 2.987568109354814e-06, + "loss": 0.4943, + "step": 5443 + }, + { + "epoch": 2.651835011367327, + "grad_norm": 2.6893951892852783, + "learning_rate": 2.9869373064522975e-06, + "loss": 0.5059, + "step": 5444 + }, + { + "epoch": 2.6523221825267944, + "grad_norm": 2.824282169342041, + "learning_rate": 2.9863064713243166e-06, + "loss": 0.4697, + "step": 5445 + }, + { + "epoch": 2.652809353686262, + "grad_norm": 3.090024709701538, + "learning_rate": 2.9856756040126188e-06, + "loss": 0.5145, + "step": 5446 + }, + { + "epoch": 2.653296524845729, + "grad_norm": 3.232947826385498, + "learning_rate": 2.985044704558954e-06, + "loss": 0.4425, + "step": 5447 + }, + { + "epoch": 2.6537836960051964, + "grad_norm": 2.801447629928589, + "learning_rate": 2.9844137730050763e-06, + "loss": 0.4516, + "step": 5448 + }, + { + "epoch": 2.654270867164664, + "grad_norm": 3.030407428741455, + "learning_rate": 2.9837828093927403e-06, + "loss": 0.5063, + "step": 5449 + }, + { + "epoch": 2.654758038324131, + "grad_norm": 3.3719120025634766, + "learning_rate": 2.9831518137637028e-06, + "loss": 0.5347, + "step": 5450 + }, + { + "epoch": 2.6552452094835983, + "grad_norm": 3.0916874408721924, + "learning_rate": 2.982520786159723e-06, + "loss": 0.4629, + "step": 5451 + }, + { + "epoch": 2.655732380643066, + "grad_norm": 2.538233518600464, + "learning_rate": 2.981889726622563e-06, + "loss": 0.4039, + "step": 5452 + }, + { + "epoch": 2.6562195518025336, + "grad_norm": 2.622068166732788, + "learning_rate": 2.9812586351939847e-06, + "loss": 0.5051, + "step": 5453 + }, + { + "epoch": 2.6567067229620007, + "grad_norm": 2.761587381362915, + "learning_rate": 2.9806275119157564e-06, + "loss": 0.5987, + "step": 5454 + }, + { + "epoch": 2.657193894121468, + "grad_norm": 3.170583724975586, + "learning_rate": 2.979996356829643e-06, + "loss": 0.5037, + "step": 5455 + }, + { + "epoch": 2.6576810652809355, + "grad_norm": 2.760258197784424, + "learning_rate": 2.979365169977416e-06, + "loss": 0.4313, + "step": 5456 + }, + { + "epoch": 2.6581682364404027, + "grad_norm": 2.8096373081207275, + "learning_rate": 2.9787339514008456e-06, + "loss": 0.512, + "step": 5457 + }, + { + "epoch": 2.65865540759987, + "grad_norm": 2.8970205783843994, + "learning_rate": 2.978102701141708e-06, + "loss": 0.4794, + "step": 5458 + }, + { + "epoch": 2.6591425787593375, + "grad_norm": 2.872349500656128, + "learning_rate": 2.9774714192417776e-06, + "loss": 0.4348, + "step": 5459 + }, + { + "epoch": 2.659629749918805, + "grad_norm": 3.106269598007202, + "learning_rate": 2.9768401057428336e-06, + "loss": 0.4886, + "step": 5460 + }, + { + "epoch": 2.6601169210782722, + "grad_norm": 3.1779863834381104, + "learning_rate": 2.9762087606866553e-06, + "loss": 0.5245, + "step": 5461 + }, + { + "epoch": 2.6606040922377394, + "grad_norm": 2.858505964279175, + "learning_rate": 2.975577384115026e-06, + "loss": 0.4905, + "step": 5462 + }, + { + "epoch": 2.661091263397207, + "grad_norm": 2.922330141067505, + "learning_rate": 2.974945976069729e-06, + "loss": 0.4697, + "step": 5463 + }, + { + "epoch": 2.661578434556674, + "grad_norm": 2.9100353717803955, + "learning_rate": 2.9743145365925523e-06, + "loss": 0.5726, + "step": 5464 + }, + { + "epoch": 2.6620656057161414, + "grad_norm": 2.897613048553467, + "learning_rate": 2.973683065725284e-06, + "loss": 0.4701, + "step": 5465 + }, + { + "epoch": 2.662552776875609, + "grad_norm": 2.9326422214508057, + "learning_rate": 2.9730515635097145e-06, + "loss": 0.4496, + "step": 5466 + }, + { + "epoch": 2.6630399480350766, + "grad_norm": 2.678723096847534, + "learning_rate": 2.9724200299876356e-06, + "loss": 0.4205, + "step": 5467 + }, + { + "epoch": 2.6635271191945438, + "grad_norm": 2.817328691482544, + "learning_rate": 2.971788465200844e-06, + "loss": 0.5611, + "step": 5468 + }, + { + "epoch": 2.664014290354011, + "grad_norm": 2.9420394897460938, + "learning_rate": 2.971156869191135e-06, + "loss": 0.4873, + "step": 5469 + }, + { + "epoch": 2.6645014615134786, + "grad_norm": 2.8603968620300293, + "learning_rate": 2.970525242000309e-06, + "loss": 0.4643, + "step": 5470 + }, + { + "epoch": 2.6649886326729457, + "grad_norm": 2.871752977371216, + "learning_rate": 2.9698935836701655e-06, + "loss": 0.4371, + "step": 5471 + }, + { + "epoch": 2.665475803832413, + "grad_norm": 3.5997958183288574, + "learning_rate": 2.969261894242509e-06, + "loss": 0.5255, + "step": 5472 + }, + { + "epoch": 2.6659629749918805, + "grad_norm": 2.59267258644104, + "learning_rate": 2.9686301737591435e-06, + "loss": 0.4867, + "step": 5473 + }, + { + "epoch": 2.6664501461513477, + "grad_norm": 2.932360887527466, + "learning_rate": 2.967998422261877e-06, + "loss": 0.5231, + "step": 5474 + }, + { + "epoch": 2.6669373173108153, + "grad_norm": 2.844085454940796, + "learning_rate": 2.9673666397925183e-06, + "loss": 0.4946, + "step": 5475 + }, + { + "epoch": 2.6674244884702825, + "grad_norm": 2.7712788581848145, + "learning_rate": 2.966734826392879e-06, + "loss": 0.4211, + "step": 5476 + }, + { + "epoch": 2.66791165962975, + "grad_norm": 2.726700782775879, + "learning_rate": 2.9661029821047728e-06, + "loss": 0.4848, + "step": 5477 + }, + { + "epoch": 2.6683988307892172, + "grad_norm": 2.914781093597412, + "learning_rate": 2.9654711069700143e-06, + "loss": 0.5656, + "step": 5478 + }, + { + "epoch": 2.6688860019486844, + "grad_norm": 3.1317529678344727, + "learning_rate": 2.964839201030421e-06, + "loss": 0.4606, + "step": 5479 + }, + { + "epoch": 2.669373173108152, + "grad_norm": 2.553741693496704, + "learning_rate": 2.964207264327813e-06, + "loss": 0.4493, + "step": 5480 + }, + { + "epoch": 2.669860344267619, + "grad_norm": 2.945378065109253, + "learning_rate": 2.9635752969040116e-06, + "loss": 0.5121, + "step": 5481 + }, + { + "epoch": 2.670347515427087, + "grad_norm": 2.5161168575286865, + "learning_rate": 2.9629432988008398e-06, + "loss": 0.4554, + "step": 5482 + }, + { + "epoch": 2.670834686586554, + "grad_norm": 3.0336039066314697, + "learning_rate": 2.962311270060124e-06, + "loss": 0.5195, + "step": 5483 + }, + { + "epoch": 2.6713218577460216, + "grad_norm": 3.05357027053833, + "learning_rate": 2.9616792107236915e-06, + "loss": 0.5865, + "step": 5484 + }, + { + "epoch": 2.6718090289054888, + "grad_norm": 3.0624306201934814, + "learning_rate": 2.961047120833372e-06, + "loss": 0.4995, + "step": 5485 + }, + { + "epoch": 2.672296200064956, + "grad_norm": 2.838390350341797, + "learning_rate": 2.960415000430996e-06, + "loss": 0.5162, + "step": 5486 + }, + { + "epoch": 2.6727833712244236, + "grad_norm": 3.0353455543518066, + "learning_rate": 2.9597828495583992e-06, + "loss": 0.493, + "step": 5487 + }, + { + "epoch": 2.6732705423838907, + "grad_norm": 2.784796714782715, + "learning_rate": 2.9591506682574156e-06, + "loss": 0.535, + "step": 5488 + }, + { + "epoch": 2.6737577135433583, + "grad_norm": 2.9883670806884766, + "learning_rate": 2.958518456569884e-06, + "loss": 0.5137, + "step": 5489 + }, + { + "epoch": 2.6742448847028255, + "grad_norm": 3.1001317501068115, + "learning_rate": 2.957886214537643e-06, + "loss": 0.5715, + "step": 5490 + }, + { + "epoch": 2.674732055862293, + "grad_norm": 3.0719082355499268, + "learning_rate": 2.957253942202536e-06, + "loss": 0.5221, + "step": 5491 + }, + { + "epoch": 2.6752192270217603, + "grad_norm": 2.868579864501953, + "learning_rate": 2.956621639606404e-06, + "loss": 0.4893, + "step": 5492 + }, + { + "epoch": 2.6757063981812275, + "grad_norm": 2.821106195449829, + "learning_rate": 2.9559893067910963e-06, + "loss": 0.4602, + "step": 5493 + }, + { + "epoch": 2.676193569340695, + "grad_norm": 2.9575417041778564, + "learning_rate": 2.9553569437984568e-06, + "loss": 0.4934, + "step": 5494 + }, + { + "epoch": 2.6766807405001622, + "grad_norm": 2.7731637954711914, + "learning_rate": 2.9547245506703387e-06, + "loss": 0.485, + "step": 5495 + }, + { + "epoch": 2.67716791165963, + "grad_norm": 3.027623176574707, + "learning_rate": 2.9540921274485913e-06, + "loss": 0.5399, + "step": 5496 + }, + { + "epoch": 2.677655082819097, + "grad_norm": 2.929858922958374, + "learning_rate": 2.9534596741750697e-06, + "loss": 0.5516, + "step": 5497 + }, + { + "epoch": 2.6781422539785646, + "grad_norm": 3.003227949142456, + "learning_rate": 2.952827190891629e-06, + "loss": 0.5285, + "step": 5498 + }, + { + "epoch": 2.678629425138032, + "grad_norm": 2.672056198120117, + "learning_rate": 2.9521946776401273e-06, + "loss": 0.4928, + "step": 5499 + }, + { + "epoch": 2.679116596297499, + "grad_norm": 2.83054256439209, + "learning_rate": 2.9515621344624227e-06, + "loss": 0.4308, + "step": 5500 + }, + { + "epoch": 2.6796037674569666, + "grad_norm": 2.5898377895355225, + "learning_rate": 2.9509295614003797e-06, + "loss": 0.4182, + "step": 5501 + }, + { + "epoch": 2.6800909386164338, + "grad_norm": 2.783686399459839, + "learning_rate": 2.950296958495859e-06, + "loss": 0.4985, + "step": 5502 + }, + { + "epoch": 2.6805781097759014, + "grad_norm": 2.992743730545044, + "learning_rate": 2.949664325790728e-06, + "loss": 0.5185, + "step": 5503 + }, + { + "epoch": 2.6810652809353686, + "grad_norm": 2.722222328186035, + "learning_rate": 2.949031663326854e-06, + "loss": 0.484, + "step": 5504 + }, + { + "epoch": 2.681552452094836, + "grad_norm": 2.9203131198883057, + "learning_rate": 2.9483989711461066e-06, + "loss": 0.4078, + "step": 5505 + }, + { + "epoch": 2.6820396232543033, + "grad_norm": 3.0906894207000732, + "learning_rate": 2.947766249290356e-06, + "loss": 0.477, + "step": 5506 + }, + { + "epoch": 2.6825267944137705, + "grad_norm": 3.2096214294433594, + "learning_rate": 2.947133497801478e-06, + "loss": 0.5589, + "step": 5507 + }, + { + "epoch": 2.683013965573238, + "grad_norm": 3.294255018234253, + "learning_rate": 2.946500716721346e-06, + "loss": 0.5182, + "step": 5508 + }, + { + "epoch": 2.6835011367327053, + "grad_norm": 2.8286502361297607, + "learning_rate": 2.945867906091838e-06, + "loss": 0.4965, + "step": 5509 + }, + { + "epoch": 2.683988307892173, + "grad_norm": 3.0825812816619873, + "learning_rate": 2.945235065954834e-06, + "loss": 0.5239, + "step": 5510 + }, + { + "epoch": 2.68447547905164, + "grad_norm": 2.9884772300720215, + "learning_rate": 2.9446021963522146e-06, + "loss": 0.4862, + "step": 5511 + }, + { + "epoch": 2.6849626502111077, + "grad_norm": 2.761162519454956, + "learning_rate": 2.9439692973258633e-06, + "loss": 0.462, + "step": 5512 + }, + { + "epoch": 2.685449821370575, + "grad_norm": 2.854487895965576, + "learning_rate": 2.9433363689176654e-06, + "loss": 0.5191, + "step": 5513 + }, + { + "epoch": 2.685936992530042, + "grad_norm": 2.9081828594207764, + "learning_rate": 2.9427034111695074e-06, + "loss": 0.4871, + "step": 5514 + }, + { + "epoch": 2.6864241636895096, + "grad_norm": 2.6040122509002686, + "learning_rate": 2.942070424123279e-06, + "loss": 0.5025, + "step": 5515 + }, + { + "epoch": 2.686911334848977, + "grad_norm": 2.5828208923339844, + "learning_rate": 2.9414374078208712e-06, + "loss": 0.5681, + "step": 5516 + }, + { + "epoch": 2.6873985060084444, + "grad_norm": 3.130866050720215, + "learning_rate": 2.940804362304177e-06, + "loss": 0.473, + "step": 5517 + }, + { + "epoch": 2.6878856771679116, + "grad_norm": 2.675276041030884, + "learning_rate": 2.9401712876150907e-06, + "loss": 0.4752, + "step": 5518 + }, + { + "epoch": 2.688372848327379, + "grad_norm": 2.554311513900757, + "learning_rate": 2.93953818379551e-06, + "loss": 0.5059, + "step": 5519 + }, + { + "epoch": 2.6888600194868464, + "grad_norm": 3.23972487449646, + "learning_rate": 2.938905050887333e-06, + "loss": 0.5133, + "step": 5520 + }, + { + "epoch": 2.6893471906463136, + "grad_norm": 2.9785618782043457, + "learning_rate": 2.938271888932461e-06, + "loss": 0.5344, + "step": 5521 + }, + { + "epoch": 2.689834361805781, + "grad_norm": 3.861940622329712, + "learning_rate": 2.9376386979727954e-06, + "loss": 0.4931, + "step": 5522 + }, + { + "epoch": 2.6903215329652483, + "grad_norm": 2.8949668407440186, + "learning_rate": 2.9370054780502424e-06, + "loss": 0.5038, + "step": 5523 + }, + { + "epoch": 2.690808704124716, + "grad_norm": 2.71567702293396, + "learning_rate": 2.936372229206707e-06, + "loss": 0.5238, + "step": 5524 + }, + { + "epoch": 2.691295875284183, + "grad_norm": 2.9041614532470703, + "learning_rate": 2.9357389514840984e-06, + "loss": 0.4385, + "step": 5525 + }, + { + "epoch": 2.6917830464436507, + "grad_norm": 2.7928004264831543, + "learning_rate": 2.9351056449243266e-06, + "loss": 0.456, + "step": 5526 + }, + { + "epoch": 2.692270217603118, + "grad_norm": 2.940462112426758, + "learning_rate": 2.9344723095693034e-06, + "loss": 0.4619, + "step": 5527 + }, + { + "epoch": 2.692757388762585, + "grad_norm": 2.8301727771759033, + "learning_rate": 2.9338389454609434e-06, + "loss": 0.5092, + "step": 5528 + }, + { + "epoch": 2.6932445599220527, + "grad_norm": 2.9289910793304443, + "learning_rate": 2.9332055526411624e-06, + "loss": 0.4308, + "step": 5529 + }, + { + "epoch": 2.69373173108152, + "grad_norm": 3.0621132850646973, + "learning_rate": 2.9325721311518784e-06, + "loss": 0.5147, + "step": 5530 + }, + { + "epoch": 2.6942189022409875, + "grad_norm": 2.817948579788208, + "learning_rate": 2.9319386810350107e-06, + "loss": 0.5371, + "step": 5531 + }, + { + "epoch": 2.6947060734004546, + "grad_norm": 3.438333749771118, + "learning_rate": 2.931305202332482e-06, + "loss": 0.4908, + "step": 5532 + }, + { + "epoch": 2.6951932445599223, + "grad_norm": 3.1084702014923096, + "learning_rate": 2.9306716950862145e-06, + "loss": 0.5244, + "step": 5533 + }, + { + "epoch": 2.6956804157193894, + "grad_norm": 2.875359296798706, + "learning_rate": 2.930038159338135e-06, + "loss": 0.4676, + "step": 5534 + }, + { + "epoch": 2.6961675868788566, + "grad_norm": 2.777036428451538, + "learning_rate": 2.9294045951301696e-06, + "loss": 0.4817, + "step": 5535 + }, + { + "epoch": 2.696654758038324, + "grad_norm": 2.707989454269409, + "learning_rate": 2.9287710025042483e-06, + "loss": 0.5014, + "step": 5536 + }, + { + "epoch": 2.6971419291977914, + "grad_norm": 2.6789536476135254, + "learning_rate": 2.928137381502302e-06, + "loss": 0.4541, + "step": 5537 + }, + { + "epoch": 2.697629100357259, + "grad_norm": 2.9774184226989746, + "learning_rate": 2.927503732166263e-06, + "loss": 0.5107, + "step": 5538 + }, + { + "epoch": 2.698116271516726, + "grad_norm": 3.3618907928466797, + "learning_rate": 2.9268700545380674e-06, + "loss": 0.4718, + "step": 5539 + }, + { + "epoch": 2.698603442676194, + "grad_norm": 3.136261224746704, + "learning_rate": 2.9262363486596512e-06, + "loss": 0.5384, + "step": 5540 + }, + { + "epoch": 2.699090613835661, + "grad_norm": 3.01438307762146, + "learning_rate": 2.9256026145729534e-06, + "loss": 0.4864, + "step": 5541 + }, + { + "epoch": 2.699577784995128, + "grad_norm": 3.4644429683685303, + "learning_rate": 2.924968852319914e-06, + "loss": 0.4908, + "step": 5542 + }, + { + "epoch": 2.7000649561545957, + "grad_norm": 3.022106170654297, + "learning_rate": 2.924335061942475e-06, + "loss": 0.487, + "step": 5543 + }, + { + "epoch": 2.700552127314063, + "grad_norm": 3.0707225799560547, + "learning_rate": 2.923701243482582e-06, + "loss": 0.5218, + "step": 5544 + }, + { + "epoch": 2.7010392984735305, + "grad_norm": 2.90767240524292, + "learning_rate": 2.9230673969821795e-06, + "loss": 0.5213, + "step": 5545 + }, + { + "epoch": 2.7015264696329977, + "grad_norm": 3.0223796367645264, + "learning_rate": 2.9224335224832166e-06, + "loss": 0.5315, + "step": 5546 + }, + { + "epoch": 2.7020136407924653, + "grad_norm": 2.729419231414795, + "learning_rate": 2.921799620027642e-06, + "loss": 0.5207, + "step": 5547 + }, + { + "epoch": 2.7025008119519325, + "grad_norm": 3.099358558654785, + "learning_rate": 2.921165689657408e-06, + "loss": 0.4687, + "step": 5548 + }, + { + "epoch": 2.7029879831113997, + "grad_norm": 2.67553973197937, + "learning_rate": 2.9205317314144683e-06, + "loss": 0.5154, + "step": 5549 + }, + { + "epoch": 2.7034751542708673, + "grad_norm": 2.7648279666900635, + "learning_rate": 2.9198977453407774e-06, + "loss": 0.4646, + "step": 5550 + }, + { + "epoch": 2.7039623254303344, + "grad_norm": 3.015507459640503, + "learning_rate": 2.9192637314782924e-06, + "loss": 0.4961, + "step": 5551 + }, + { + "epoch": 2.704449496589802, + "grad_norm": 3.169344425201416, + "learning_rate": 2.9186296898689735e-06, + "loss": 0.5931, + "step": 5552 + }, + { + "epoch": 2.704936667749269, + "grad_norm": 2.6827540397644043, + "learning_rate": 2.9179956205547812e-06, + "loss": 0.417, + "step": 5553 + }, + { + "epoch": 2.705423838908737, + "grad_norm": 3.0765206813812256, + "learning_rate": 2.9173615235776757e-06, + "loss": 0.499, + "step": 5554 + }, + { + "epoch": 2.705911010068204, + "grad_norm": 2.8551270961761475, + "learning_rate": 2.9167273989796254e-06, + "loss": 0.462, + "step": 5555 + }, + { + "epoch": 2.706398181227671, + "grad_norm": 2.934934377670288, + "learning_rate": 2.9160932468025936e-06, + "loss": 0.4965, + "step": 5556 + }, + { + "epoch": 2.706885352387139, + "grad_norm": 2.7747035026550293, + "learning_rate": 2.91545906708855e-06, + "loss": 0.458, + "step": 5557 + }, + { + "epoch": 2.707372523546606, + "grad_norm": 2.6194214820861816, + "learning_rate": 2.914824859879464e-06, + "loss": 0.4446, + "step": 5558 + }, + { + "epoch": 2.707859694706073, + "grad_norm": 2.915109872817993, + "learning_rate": 2.9141906252173065e-06, + "loss": 0.5301, + "step": 5559 + }, + { + "epoch": 2.7083468658655407, + "grad_norm": 2.766153335571289, + "learning_rate": 2.913556363144053e-06, + "loss": 0.4879, + "step": 5560 + }, + { + "epoch": 2.7088340370250084, + "grad_norm": 2.574444055557251, + "learning_rate": 2.9129220737016766e-06, + "loss": 0.4588, + "step": 5561 + }, + { + "epoch": 2.7093212081844755, + "grad_norm": 3.1614136695861816, + "learning_rate": 2.9122877569321573e-06, + "loss": 0.5401, + "step": 5562 + }, + { + "epoch": 2.7098083793439427, + "grad_norm": 2.966956615447998, + "learning_rate": 2.9116534128774715e-06, + "loss": 0.5272, + "step": 5563 + }, + { + "epoch": 2.7102955505034103, + "grad_norm": 2.6090424060821533, + "learning_rate": 2.9110190415796023e-06, + "loss": 0.4365, + "step": 5564 + }, + { + "epoch": 2.7107827216628775, + "grad_norm": 2.72866153717041, + "learning_rate": 2.9103846430805304e-06, + "loss": 0.5762, + "step": 5565 + }, + { + "epoch": 2.7112698928223447, + "grad_norm": 2.903946876525879, + "learning_rate": 2.9097502174222415e-06, + "loss": 0.5453, + "step": 5566 + }, + { + "epoch": 2.7117570639818123, + "grad_norm": 3.103501081466675, + "learning_rate": 2.9091157646467205e-06, + "loss": 0.4302, + "step": 5567 + }, + { + "epoch": 2.71224423514128, + "grad_norm": 2.941974639892578, + "learning_rate": 2.908481284795957e-06, + "loss": 0.4469, + "step": 5568 + }, + { + "epoch": 2.712731406300747, + "grad_norm": 2.8096134662628174, + "learning_rate": 2.9078467779119403e-06, + "loss": 0.5441, + "step": 5569 + }, + { + "epoch": 2.713218577460214, + "grad_norm": 2.9392497539520264, + "learning_rate": 2.907212244036661e-06, + "loss": 0.4931, + "step": 5570 + }, + { + "epoch": 2.713705748619682, + "grad_norm": 2.582306146621704, + "learning_rate": 2.906577683212114e-06, + "loss": 0.4554, + "step": 5571 + }, + { + "epoch": 2.714192919779149, + "grad_norm": 3.1970021724700928, + "learning_rate": 2.9059430954802937e-06, + "loss": 0.5369, + "step": 5572 + }, + { + "epoch": 2.714680090938616, + "grad_norm": 3.054299831390381, + "learning_rate": 2.9053084808831972e-06, + "loss": 0.5054, + "step": 5573 + }, + { + "epoch": 2.715167262098084, + "grad_norm": 3.2281582355499268, + "learning_rate": 2.904673839462823e-06, + "loss": 0.5112, + "step": 5574 + }, + { + "epoch": 2.7156544332575514, + "grad_norm": 3.1823935508728027, + "learning_rate": 2.904039171261172e-06, + "loss": 0.5361, + "step": 5575 + }, + { + "epoch": 2.7161416044170186, + "grad_norm": 2.774393081665039, + "learning_rate": 2.9034044763202464e-06, + "loss": 0.4533, + "step": 5576 + }, + { + "epoch": 2.7166287755764857, + "grad_norm": 3.218581199645996, + "learning_rate": 2.9027697546820497e-06, + "loss": 0.5231, + "step": 5577 + }, + { + "epoch": 2.7171159467359534, + "grad_norm": 2.6696715354919434, + "learning_rate": 2.9021350063885885e-06, + "loss": 0.4769, + "step": 5578 + }, + { + "epoch": 2.7176031178954205, + "grad_norm": 2.714855194091797, + "learning_rate": 2.9015002314818696e-06, + "loss": 0.5094, + "step": 5579 + }, + { + "epoch": 2.7180902890548877, + "grad_norm": 2.5809905529022217, + "learning_rate": 2.900865430003903e-06, + "loss": 0.4837, + "step": 5580 + }, + { + "epoch": 2.7185774602143553, + "grad_norm": 2.7227330207824707, + "learning_rate": 2.900230601996699e-06, + "loss": 0.4866, + "step": 5581 + }, + { + "epoch": 2.719064631373823, + "grad_norm": 2.685258150100708, + "learning_rate": 2.8995957475022724e-06, + "loss": 0.4844, + "step": 5582 + }, + { + "epoch": 2.71955180253329, + "grad_norm": 2.830061674118042, + "learning_rate": 2.898960866562635e-06, + "loss": 0.4827, + "step": 5583 + }, + { + "epoch": 2.7200389736927573, + "grad_norm": 2.6764614582061768, + "learning_rate": 2.8983259592198056e-06, + "loss": 0.4409, + "step": 5584 + }, + { + "epoch": 2.720526144852225, + "grad_norm": 3.0115644931793213, + "learning_rate": 2.897691025515801e-06, + "loss": 0.4579, + "step": 5585 + }, + { + "epoch": 2.721013316011692, + "grad_norm": 2.913573741912842, + "learning_rate": 2.897056065492641e-06, + "loss": 0.4453, + "step": 5586 + }, + { + "epoch": 2.721500487171159, + "grad_norm": 2.7327444553375244, + "learning_rate": 2.896421079192348e-06, + "loss": 0.5257, + "step": 5587 + }, + { + "epoch": 2.721987658330627, + "grad_norm": 2.9487216472625732, + "learning_rate": 2.8957860666569447e-06, + "loss": 0.5016, + "step": 5588 + }, + { + "epoch": 2.7224748294900945, + "grad_norm": 2.8744256496429443, + "learning_rate": 2.8951510279284567e-06, + "loss": 0.5195, + "step": 5589 + }, + { + "epoch": 2.7229620006495616, + "grad_norm": 3.0419650077819824, + "learning_rate": 2.8945159630489094e-06, + "loss": 0.5079, + "step": 5590 + }, + { + "epoch": 2.723449171809029, + "grad_norm": 3.183043956756592, + "learning_rate": 2.893880872060334e-06, + "loss": 0.4519, + "step": 5591 + }, + { + "epoch": 2.7239363429684964, + "grad_norm": 2.9783051013946533, + "learning_rate": 2.8932457550047575e-06, + "loss": 0.5368, + "step": 5592 + }, + { + "epoch": 2.7244235141279636, + "grad_norm": 2.70023512840271, + "learning_rate": 2.8926106119242147e-06, + "loss": 0.4918, + "step": 5593 + }, + { + "epoch": 2.7249106852874307, + "grad_norm": 2.8474276065826416, + "learning_rate": 2.8919754428607376e-06, + "loss": 0.4929, + "step": 5594 + }, + { + "epoch": 2.7253978564468984, + "grad_norm": 3.0333874225616455, + "learning_rate": 2.8913402478563627e-06, + "loss": 0.5357, + "step": 5595 + }, + { + "epoch": 2.725885027606366, + "grad_norm": 2.8811182975769043, + "learning_rate": 2.8907050269531263e-06, + "loss": 0.5158, + "step": 5596 + }, + { + "epoch": 2.726372198765833, + "grad_norm": 2.9367904663085938, + "learning_rate": 2.8900697801930678e-06, + "loss": 0.455, + "step": 5597 + }, + { + "epoch": 2.7268593699253003, + "grad_norm": 2.896949291229248, + "learning_rate": 2.889434507618228e-06, + "loss": 0.5215, + "step": 5598 + }, + { + "epoch": 2.727346541084768, + "grad_norm": 3.6735024452209473, + "learning_rate": 2.888799209270648e-06, + "loss": 0.4661, + "step": 5599 + }, + { + "epoch": 2.727833712244235, + "grad_norm": 2.566328287124634, + "learning_rate": 2.8881638851923725e-06, + "loss": 0.5559, + "step": 5600 + }, + { + "epoch": 2.7283208834037023, + "grad_norm": 3.2292215824127197, + "learning_rate": 2.887528535425448e-06, + "loss": 0.4955, + "step": 5601 + }, + { + "epoch": 2.72880805456317, + "grad_norm": 3.3286073207855225, + "learning_rate": 2.88689316001192e-06, + "loss": 0.4522, + "step": 5602 + }, + { + "epoch": 2.7292952257226375, + "grad_norm": 2.6195127964019775, + "learning_rate": 2.8862577589938395e-06, + "loss": 0.4384, + "step": 5603 + }, + { + "epoch": 2.7297823968821047, + "grad_norm": 2.5867154598236084, + "learning_rate": 2.885622332413256e-06, + "loss": 0.4807, + "step": 5604 + }, + { + "epoch": 2.730269568041572, + "grad_norm": 2.6313278675079346, + "learning_rate": 2.8849868803122233e-06, + "loss": 0.5098, + "step": 5605 + }, + { + "epoch": 2.7307567392010395, + "grad_norm": 2.6534619331359863, + "learning_rate": 2.8843514027327936e-06, + "loss": 0.4742, + "step": 5606 + }, + { + "epoch": 2.7312439103605066, + "grad_norm": 2.7449374198913574, + "learning_rate": 2.8837158997170246e-06, + "loss": 0.5153, + "step": 5607 + }, + { + "epoch": 2.731731081519974, + "grad_norm": 2.8165171146392822, + "learning_rate": 2.883080371306973e-06, + "loss": 0.5034, + "step": 5608 + }, + { + "epoch": 2.7322182526794414, + "grad_norm": 3.14597749710083, + "learning_rate": 2.8824448175446976e-06, + "loss": 0.5059, + "step": 5609 + }, + { + "epoch": 2.7327054238389086, + "grad_norm": 2.6872363090515137, + "learning_rate": 2.88180923847226e-06, + "loss": 0.4272, + "step": 5610 + }, + { + "epoch": 2.733192594998376, + "grad_norm": 3.1223578453063965, + "learning_rate": 2.8811736341317233e-06, + "loss": 0.4573, + "step": 5611 + }, + { + "epoch": 2.7336797661578434, + "grad_norm": 2.578890562057495, + "learning_rate": 2.8805380045651497e-06, + "loss": 0.4276, + "step": 5612 + }, + { + "epoch": 2.734166937317311, + "grad_norm": 2.7832462787628174, + "learning_rate": 2.8799023498146074e-06, + "loss": 0.449, + "step": 5613 + }, + { + "epoch": 2.734654108476778, + "grad_norm": 2.997966766357422, + "learning_rate": 2.879266669922162e-06, + "loss": 0.5316, + "step": 5614 + }, + { + "epoch": 2.7351412796362453, + "grad_norm": 3.311467409133911, + "learning_rate": 2.878630964929885e-06, + "loss": 0.4519, + "step": 5615 + }, + { + "epoch": 2.735628450795713, + "grad_norm": 2.7852602005004883, + "learning_rate": 2.877995234879845e-06, + "loss": 0.4322, + "step": 5616 + }, + { + "epoch": 2.73611562195518, + "grad_norm": 2.7830960750579834, + "learning_rate": 2.8773594798141162e-06, + "loss": 0.4802, + "step": 5617 + }, + { + "epoch": 2.7366027931146477, + "grad_norm": 2.5427610874176025, + "learning_rate": 2.8767236997747717e-06, + "loss": 0.5255, + "step": 5618 + }, + { + "epoch": 2.737089964274115, + "grad_norm": 2.8233642578125, + "learning_rate": 2.876087894803888e-06, + "loss": 0.4683, + "step": 5619 + }, + { + "epoch": 2.7375771354335825, + "grad_norm": 3.2246646881103516, + "learning_rate": 2.8754520649435426e-06, + "loss": 0.4863, + "step": 5620 + }, + { + "epoch": 2.7380643065930497, + "grad_norm": 2.7089242935180664, + "learning_rate": 2.8748162102358145e-06, + "loss": 0.5119, + "step": 5621 + }, + { + "epoch": 2.738551477752517, + "grad_norm": 2.877051830291748, + "learning_rate": 2.8741803307227846e-06, + "loss": 0.455, + "step": 5622 + }, + { + "epoch": 2.7390386489119845, + "grad_norm": 3.2709648609161377, + "learning_rate": 2.873544426446535e-06, + "loss": 0.4982, + "step": 5623 + }, + { + "epoch": 2.7395258200714516, + "grad_norm": 2.5561890602111816, + "learning_rate": 2.8729084974491497e-06, + "loss": 0.4479, + "step": 5624 + }, + { + "epoch": 2.7400129912309192, + "grad_norm": 2.7960424423217773, + "learning_rate": 2.8722725437727156e-06, + "loss": 0.4804, + "step": 5625 + }, + { + "epoch": 2.7405001623903864, + "grad_norm": 2.802668571472168, + "learning_rate": 2.871636565459318e-06, + "loss": 0.4837, + "step": 5626 + }, + { + "epoch": 2.740987333549854, + "grad_norm": 2.860725164413452, + "learning_rate": 2.8710005625510486e-06, + "loss": 0.5226, + "step": 5627 + }, + { + "epoch": 2.741474504709321, + "grad_norm": 2.825049638748169, + "learning_rate": 2.870364535089995e-06, + "loss": 0.4918, + "step": 5628 + }, + { + "epoch": 2.7419616758687884, + "grad_norm": 2.7855305671691895, + "learning_rate": 2.869728483118252e-06, + "loss": 0.4851, + "step": 5629 + }, + { + "epoch": 2.742448847028256, + "grad_norm": 3.304288864135742, + "learning_rate": 2.8690924066779118e-06, + "loss": 0.5841, + "step": 5630 + }, + { + "epoch": 2.742936018187723, + "grad_norm": 2.8587872982025146, + "learning_rate": 2.8684563058110717e-06, + "loss": 0.52, + "step": 5631 + }, + { + "epoch": 2.7434231893471908, + "grad_norm": 2.6538195610046387, + "learning_rate": 2.867820180559826e-06, + "loss": 0.4656, + "step": 5632 + }, + { + "epoch": 2.743910360506658, + "grad_norm": 2.4498367309570312, + "learning_rate": 2.867184030966276e-06, + "loss": 0.4165, + "step": 5633 + }, + { + "epoch": 2.7443975316661255, + "grad_norm": 2.9104843139648438, + "learning_rate": 2.8665478570725203e-06, + "loss": 0.5141, + "step": 5634 + }, + { + "epoch": 2.7448847028255927, + "grad_norm": 2.52763295173645, + "learning_rate": 2.8659116589206627e-06, + "loss": 0.4914, + "step": 5635 + }, + { + "epoch": 2.74537187398506, + "grad_norm": 2.9322116374969482, + "learning_rate": 2.865275436552804e-06, + "loss": 0.5658, + "step": 5636 + }, + { + "epoch": 2.7458590451445275, + "grad_norm": 3.249471426010132, + "learning_rate": 2.8646391900110522e-06, + "loss": 0.439, + "step": 5637 + }, + { + "epoch": 2.7463462163039947, + "grad_norm": 3.0287063121795654, + "learning_rate": 2.864002919337513e-06, + "loss": 0.5134, + "step": 5638 + }, + { + "epoch": 2.7468333874634623, + "grad_norm": 2.9496166706085205, + "learning_rate": 2.863366624574293e-06, + "loss": 0.4546, + "step": 5639 + }, + { + "epoch": 2.7473205586229295, + "grad_norm": 2.762770414352417, + "learning_rate": 2.8627303057635048e-06, + "loss": 0.449, + "step": 5640 + }, + { + "epoch": 2.747807729782397, + "grad_norm": 2.808349847793579, + "learning_rate": 2.862093962947258e-06, + "loss": 0.4726, + "step": 5641 + }, + { + "epoch": 2.7482949009418642, + "grad_norm": 2.924440860748291, + "learning_rate": 2.861457596167667e-06, + "loss": 0.4771, + "step": 5642 + }, + { + "epoch": 2.7487820721013314, + "grad_norm": 2.8847062587738037, + "learning_rate": 2.860821205466847e-06, + "loss": 0.4249, + "step": 5643 + }, + { + "epoch": 2.749269243260799, + "grad_norm": 3.1089870929718018, + "learning_rate": 2.860184790886912e-06, + "loss": 0.5056, + "step": 5644 + }, + { + "epoch": 2.749756414420266, + "grad_norm": 2.892305612564087, + "learning_rate": 2.859548352469981e-06, + "loss": 0.5017, + "step": 5645 + }, + { + "epoch": 2.750243585579734, + "grad_norm": 2.8636555671691895, + "learning_rate": 2.8589118902581735e-06, + "loss": 0.4916, + "step": 5646 + }, + { + "epoch": 2.750730756739201, + "grad_norm": 2.5554492473602295, + "learning_rate": 2.85827540429361e-06, + "loss": 0.4487, + "step": 5647 + }, + { + "epoch": 2.7512179278986686, + "grad_norm": 2.9835543632507324, + "learning_rate": 2.8576388946184148e-06, + "loss": 0.455, + "step": 5648 + }, + { + "epoch": 2.7517050990581358, + "grad_norm": 2.9497790336608887, + "learning_rate": 2.85700236127471e-06, + "loss": 0.5517, + "step": 5649 + }, + { + "epoch": 2.752192270217603, + "grad_norm": 3.0272634029388428, + "learning_rate": 2.8563658043046226e-06, + "loss": 0.5162, + "step": 5650 + }, + { + "epoch": 2.7526794413770705, + "grad_norm": 2.578821897506714, + "learning_rate": 2.855729223750279e-06, + "loss": 0.4804, + "step": 5651 + }, + { + "epoch": 2.7531666125365377, + "grad_norm": 2.5955920219421387, + "learning_rate": 2.855092619653809e-06, + "loss": 0.4672, + "step": 5652 + }, + { + "epoch": 2.7536537836960053, + "grad_norm": 2.686525821685791, + "learning_rate": 2.8544559920573424e-06, + "loss": 0.5709, + "step": 5653 + }, + { + "epoch": 2.7541409548554725, + "grad_norm": 2.8681585788726807, + "learning_rate": 2.8538193410030115e-06, + "loss": 0.4559, + "step": 5654 + }, + { + "epoch": 2.75462812601494, + "grad_norm": 2.940279245376587, + "learning_rate": 2.8531826665329485e-06, + "loss": 0.445, + "step": 5655 + }, + { + "epoch": 2.7551152971744073, + "grad_norm": 3.1663920879364014, + "learning_rate": 2.852545968689291e-06, + "loss": 0.4678, + "step": 5656 + }, + { + "epoch": 2.7556024683338745, + "grad_norm": 2.817692756652832, + "learning_rate": 2.851909247514173e-06, + "loss": 0.5561, + "step": 5657 + }, + { + "epoch": 2.756089639493342, + "grad_norm": 2.9446372985839844, + "learning_rate": 2.8512725030497344e-06, + "loss": 0.526, + "step": 5658 + }, + { + "epoch": 2.7565768106528092, + "grad_norm": 3.3500988483428955, + "learning_rate": 2.8506357353381135e-06, + "loss": 0.5364, + "step": 5659 + }, + { + "epoch": 2.757063981812277, + "grad_norm": 3.2111406326293945, + "learning_rate": 2.8499989444214533e-06, + "loss": 0.5026, + "step": 5660 + }, + { + "epoch": 2.757551152971744, + "grad_norm": 2.6394190788269043, + "learning_rate": 2.849362130341895e-06, + "loss": 0.5961, + "step": 5661 + }, + { + "epoch": 2.7580383241312116, + "grad_norm": 4.798309803009033, + "learning_rate": 2.8487252931415838e-06, + "loss": 0.4953, + "step": 5662 + }, + { + "epoch": 2.758525495290679, + "grad_norm": 3.234783887863159, + "learning_rate": 2.8480884328626652e-06, + "loss": 0.5204, + "step": 5663 + }, + { + "epoch": 2.759012666450146, + "grad_norm": 2.8077352046966553, + "learning_rate": 2.8474515495472855e-06, + "loss": 0.5552, + "step": 5664 + }, + { + "epoch": 2.7594998376096136, + "grad_norm": 2.8636813163757324, + "learning_rate": 2.8468146432375956e-06, + "loss": 0.486, + "step": 5665 + }, + { + "epoch": 2.7599870087690808, + "grad_norm": 2.7710819244384766, + "learning_rate": 2.846177713975745e-06, + "loss": 0.4533, + "step": 5666 + }, + { + "epoch": 2.7604741799285484, + "grad_norm": 2.9483590126037598, + "learning_rate": 2.845540761803885e-06, + "loss": 0.484, + "step": 5667 + }, + { + "epoch": 2.7609613510880155, + "grad_norm": 2.571805238723755, + "learning_rate": 2.84490378676417e-06, + "loss": 0.4636, + "step": 5668 + }, + { + "epoch": 2.761448522247483, + "grad_norm": 3.203312635421753, + "learning_rate": 2.8442667888987535e-06, + "loss": 0.5933, + "step": 5669 + }, + { + "epoch": 2.7619356934069503, + "grad_norm": 3.249272584915161, + "learning_rate": 2.8436297682497944e-06, + "loss": 0.4551, + "step": 5670 + }, + { + "epoch": 2.7624228645664175, + "grad_norm": 2.979247570037842, + "learning_rate": 2.842992724859448e-06, + "loss": 0.481, + "step": 5671 + }, + { + "epoch": 2.762910035725885, + "grad_norm": 2.765094041824341, + "learning_rate": 2.842355658769876e-06, + "loss": 0.4826, + "step": 5672 + }, + { + "epoch": 2.7633972068853523, + "grad_norm": 2.686581611633301, + "learning_rate": 2.8417185700232376e-06, + "loss": 0.4736, + "step": 5673 + }, + { + "epoch": 2.76388437804482, + "grad_norm": 2.707622528076172, + "learning_rate": 2.8410814586616963e-06, + "loss": 0.4925, + "step": 5674 + }, + { + "epoch": 2.764371549204287, + "grad_norm": 3.0891425609588623, + "learning_rate": 2.8404443247274156e-06, + "loss": 0.506, + "step": 5675 + }, + { + "epoch": 2.7648587203637547, + "grad_norm": 3.135664701461792, + "learning_rate": 2.8398071682625616e-06, + "loss": 0.5267, + "step": 5676 + }, + { + "epoch": 2.765345891523222, + "grad_norm": 3.1740128993988037, + "learning_rate": 2.8391699893093006e-06, + "loss": 0.5015, + "step": 5677 + }, + { + "epoch": 2.765833062682689, + "grad_norm": 2.998034715652466, + "learning_rate": 2.838532787909802e-06, + "loss": 0.5714, + "step": 5678 + }, + { + "epoch": 2.7663202338421566, + "grad_norm": 2.9548606872558594, + "learning_rate": 2.837895564106234e-06, + "loss": 0.5652, + "step": 5679 + }, + { + "epoch": 2.766807405001624, + "grad_norm": 2.8138296604156494, + "learning_rate": 2.8372583179407697e-06, + "loss": 0.4858, + "step": 5680 + }, + { + "epoch": 2.7672945761610914, + "grad_norm": 2.6842856407165527, + "learning_rate": 2.8366210494555806e-06, + "loss": 0.5175, + "step": 5681 + }, + { + "epoch": 2.7677817473205586, + "grad_norm": 2.7753031253814697, + "learning_rate": 2.835983758692843e-06, + "loss": 0.5182, + "step": 5682 + }, + { + "epoch": 2.768268918480026, + "grad_norm": 3.0367863178253174, + "learning_rate": 2.83534644569473e-06, + "loss": 0.4788, + "step": 5683 + }, + { + "epoch": 2.7687560896394934, + "grad_norm": 3.122129201889038, + "learning_rate": 2.8347091105034217e-06, + "loss": 0.5773, + "step": 5684 + }, + { + "epoch": 2.7692432607989605, + "grad_norm": 3.0531346797943115, + "learning_rate": 2.8340717531610955e-06, + "loss": 0.5268, + "step": 5685 + }, + { + "epoch": 2.769730431958428, + "grad_norm": 2.55731201171875, + "learning_rate": 2.8334343737099313e-06, + "loss": 0.4488, + "step": 5686 + }, + { + "epoch": 2.7702176031178953, + "grad_norm": 2.8648152351379395, + "learning_rate": 2.8327969721921116e-06, + "loss": 0.547, + "step": 5687 + }, + { + "epoch": 2.7707047742773625, + "grad_norm": 2.5641329288482666, + "learning_rate": 2.8321595486498195e-06, + "loss": 0.4806, + "step": 5688 + }, + { + "epoch": 2.77119194543683, + "grad_norm": 2.870034694671631, + "learning_rate": 2.8315221031252395e-06, + "loss": 0.4541, + "step": 5689 + }, + { + "epoch": 2.7716791165962977, + "grad_norm": 2.9745490550994873, + "learning_rate": 2.8308846356605567e-06, + "loss": 0.4928, + "step": 5690 + }, + { + "epoch": 2.772166287755765, + "grad_norm": 2.7811124324798584, + "learning_rate": 2.83024714629796e-06, + "loss": 0.4791, + "step": 5691 + }, + { + "epoch": 2.772653458915232, + "grad_norm": 3.1061925888061523, + "learning_rate": 2.8296096350796375e-06, + "loss": 0.5452, + "step": 5692 + }, + { + "epoch": 2.7731406300746997, + "grad_norm": 2.502319812774658, + "learning_rate": 2.8289721020477813e-06, + "loss": 0.4489, + "step": 5693 + }, + { + "epoch": 2.773627801234167, + "grad_norm": 3.0311856269836426, + "learning_rate": 2.8283345472445805e-06, + "loss": 0.525, + "step": 5694 + }, + { + "epoch": 2.774114972393634, + "grad_norm": 2.546363592147827, + "learning_rate": 2.827696970712231e-06, + "loss": 0.4579, + "step": 5695 + }, + { + "epoch": 2.7746021435531016, + "grad_norm": 2.644102096557617, + "learning_rate": 2.8270593724929258e-06, + "loss": 0.4519, + "step": 5696 + }, + { + "epoch": 2.7750893147125693, + "grad_norm": 2.7256674766540527, + "learning_rate": 2.826421752628862e-06, + "loss": 0.5227, + "step": 5697 + }, + { + "epoch": 2.7755764858720364, + "grad_norm": 2.8631112575531006, + "learning_rate": 2.825784111162237e-06, + "loss": 0.4969, + "step": 5698 + }, + { + "epoch": 2.7760636570315036, + "grad_norm": 3.0722243785858154, + "learning_rate": 2.82514644813525e-06, + "loss": 0.4982, + "step": 5699 + }, + { + "epoch": 2.776550828190971, + "grad_norm": 2.874429225921631, + "learning_rate": 2.8245087635901015e-06, + "loss": 0.4876, + "step": 5700 + }, + { + "epoch": 2.7770379993504384, + "grad_norm": 2.747102737426758, + "learning_rate": 2.8238710575689932e-06, + "loss": 0.5275, + "step": 5701 + }, + { + "epoch": 2.7775251705099055, + "grad_norm": 3.115628957748413, + "learning_rate": 2.823233330114128e-06, + "loss": 0.5333, + "step": 5702 + }, + { + "epoch": 2.778012341669373, + "grad_norm": 3.1309425830841064, + "learning_rate": 2.8225955812677114e-06, + "loss": 0.4853, + "step": 5703 + }, + { + "epoch": 2.778499512828841, + "grad_norm": 2.811709403991699, + "learning_rate": 2.8219578110719493e-06, + "loss": 0.4805, + "step": 5704 + }, + { + "epoch": 2.778986683988308, + "grad_norm": 2.6867434978485107, + "learning_rate": 2.8213200195690493e-06, + "loss": 0.4829, + "step": 5705 + }, + { + "epoch": 2.779473855147775, + "grad_norm": 2.7292141914367676, + "learning_rate": 2.8206822068012195e-06, + "loss": 0.4737, + "step": 5706 + }, + { + "epoch": 2.7799610263072427, + "grad_norm": 2.7655653953552246, + "learning_rate": 2.8200443728106725e-06, + "loss": 0.4412, + "step": 5707 + }, + { + "epoch": 2.78044819746671, + "grad_norm": 2.9425764083862305, + "learning_rate": 2.8194065176396177e-06, + "loss": 0.5234, + "step": 5708 + }, + { + "epoch": 2.780935368626177, + "grad_norm": 2.9249863624572754, + "learning_rate": 2.81876864133027e-06, + "loss": 0.4927, + "step": 5709 + }, + { + "epoch": 2.7814225397856447, + "grad_norm": 2.929969072341919, + "learning_rate": 2.818130743924843e-06, + "loss": 0.4723, + "step": 5710 + }, + { + "epoch": 2.7819097109451123, + "grad_norm": 2.946063280105591, + "learning_rate": 2.8174928254655533e-06, + "loss": 0.5028, + "step": 5711 + }, + { + "epoch": 2.7823968821045795, + "grad_norm": 2.774744749069214, + "learning_rate": 2.8168548859946177e-06, + "loss": 0.4894, + "step": 5712 + }, + { + "epoch": 2.7828840532640466, + "grad_norm": 3.0272974967956543, + "learning_rate": 2.816216925554255e-06, + "loss": 0.4375, + "step": 5713 + }, + { + "epoch": 2.7833712244235143, + "grad_norm": 2.497100591659546, + "learning_rate": 2.8155789441866853e-06, + "loss": 0.5018, + "step": 5714 + }, + { + "epoch": 2.7838583955829814, + "grad_norm": 2.6562180519104004, + "learning_rate": 2.8149409419341318e-06, + "loss": 0.499, + "step": 5715 + }, + { + "epoch": 2.7843455667424486, + "grad_norm": 2.594229221343994, + "learning_rate": 2.8143029188388147e-06, + "loss": 0.4769, + "step": 5716 + }, + { + "epoch": 2.784832737901916, + "grad_norm": 2.8353476524353027, + "learning_rate": 2.813664874942961e-06, + "loss": 0.4811, + "step": 5717 + }, + { + "epoch": 2.785319909061384, + "grad_norm": 2.8284575939178467, + "learning_rate": 2.813026810288794e-06, + "loss": 0.444, + "step": 5718 + }, + { + "epoch": 2.785807080220851, + "grad_norm": 2.4871015548706055, + "learning_rate": 2.812388724918542e-06, + "loss": 0.5186, + "step": 5719 + }, + { + "epoch": 2.786294251380318, + "grad_norm": 3.0301270484924316, + "learning_rate": 2.8117506188744333e-06, + "loss": 0.4755, + "step": 5720 + }, + { + "epoch": 2.786781422539786, + "grad_norm": 2.5535407066345215, + "learning_rate": 2.8111124921986983e-06, + "loss": 0.4946, + "step": 5721 + }, + { + "epoch": 2.787268593699253, + "grad_norm": 2.756887912750244, + "learning_rate": 2.8104743449335664e-06, + "loss": 0.4677, + "step": 5722 + }, + { + "epoch": 2.78775576485872, + "grad_norm": 3.0198652744293213, + "learning_rate": 2.8098361771212723e-06, + "loss": 0.5464, + "step": 5723 + }, + { + "epoch": 2.7882429360181877, + "grad_norm": 2.776294469833374, + "learning_rate": 2.809197988804048e-06, + "loss": 0.4921, + "step": 5724 + }, + { + "epoch": 2.7887301071776553, + "grad_norm": 3.243307590484619, + "learning_rate": 2.8085597800241306e-06, + "loss": 0.529, + "step": 5725 + }, + { + "epoch": 2.7892172783371225, + "grad_norm": 3.005228042602539, + "learning_rate": 2.8079215508237547e-06, + "loss": 0.5034, + "step": 5726 + }, + { + "epoch": 2.7897044494965897, + "grad_norm": 2.6787164211273193, + "learning_rate": 2.8072833012451602e-06, + "loss": 0.4857, + "step": 5727 + }, + { + "epoch": 2.7901916206560573, + "grad_norm": 2.6124701499938965, + "learning_rate": 2.806645031330584e-06, + "loss": 0.5736, + "step": 5728 + }, + { + "epoch": 2.7906787918155245, + "grad_norm": 2.684276580810547, + "learning_rate": 2.8060067411222697e-06, + "loss": 0.4193, + "step": 5729 + }, + { + "epoch": 2.7911659629749916, + "grad_norm": 2.857543468475342, + "learning_rate": 2.805368430662457e-06, + "loss": 0.491, + "step": 5730 + }, + { + "epoch": 2.7916531341344593, + "grad_norm": 2.685798168182373, + "learning_rate": 2.8047300999933914e-06, + "loss": 0.4802, + "step": 5731 + }, + { + "epoch": 2.792140305293927, + "grad_norm": 2.890941619873047, + "learning_rate": 2.8040917491573144e-06, + "loss": 0.4751, + "step": 5732 + }, + { + "epoch": 2.792627476453394, + "grad_norm": 2.8429341316223145, + "learning_rate": 2.8034533781964744e-06, + "loss": 0.4882, + "step": 5733 + }, + { + "epoch": 2.793114647612861, + "grad_norm": 2.9730427265167236, + "learning_rate": 2.802814987153118e-06, + "loss": 0.5339, + "step": 5734 + }, + { + "epoch": 2.793601818772329, + "grad_norm": 3.1012110710144043, + "learning_rate": 2.802176576069494e-06, + "loss": 0.5592, + "step": 5735 + }, + { + "epoch": 2.794088989931796, + "grad_norm": 2.732978105545044, + "learning_rate": 2.801538144987852e-06, + "loss": 0.4871, + "step": 5736 + }, + { + "epoch": 2.794576161091263, + "grad_norm": 2.6050527095794678, + "learning_rate": 2.800899693950444e-06, + "loss": 0.5284, + "step": 5737 + }, + { + "epoch": 2.795063332250731, + "grad_norm": 3.1656386852264404, + "learning_rate": 2.800261222999522e-06, + "loss": 0.5498, + "step": 5738 + }, + { + "epoch": 2.795550503410198, + "grad_norm": 2.708165168762207, + "learning_rate": 2.7996227321773405e-06, + "loss": 0.4648, + "step": 5739 + }, + { + "epoch": 2.7960376745696656, + "grad_norm": 3.0578083992004395, + "learning_rate": 2.798984221526154e-06, + "loss": 0.5739, + "step": 5740 + }, + { + "epoch": 2.7965248457291327, + "grad_norm": 3.0541141033172607, + "learning_rate": 2.798345691088221e-06, + "loss": 0.4513, + "step": 5741 + }, + { + "epoch": 2.7970120168886003, + "grad_norm": 2.858527660369873, + "learning_rate": 2.7977071409057967e-06, + "loss": 0.5021, + "step": 5742 + }, + { + "epoch": 2.7974991880480675, + "grad_norm": 2.7390856742858887, + "learning_rate": 2.7970685710211414e-06, + "loss": 0.4512, + "step": 5743 + }, + { + "epoch": 2.7979863592075347, + "grad_norm": 2.877150535583496, + "learning_rate": 2.796429981476516e-06, + "loss": 0.5113, + "step": 5744 + }, + { + "epoch": 2.7984735303670023, + "grad_norm": 2.547593116760254, + "learning_rate": 2.7957913723141823e-06, + "loss": 0.4132, + "step": 5745 + }, + { + "epoch": 2.7989607015264695, + "grad_norm": 2.9962663650512695, + "learning_rate": 2.795152743576403e-06, + "loss": 0.4698, + "step": 5746 + }, + { + "epoch": 2.799447872685937, + "grad_norm": 2.534959077835083, + "learning_rate": 2.7945140953054423e-06, + "loss": 0.4585, + "step": 5747 + }, + { + "epoch": 2.7999350438454043, + "grad_norm": 2.7768774032592773, + "learning_rate": 2.7938754275435664e-06, + "loss": 0.4963, + "step": 5748 + }, + { + "epoch": 2.800422215004872, + "grad_norm": 2.9071786403656006, + "learning_rate": 2.7932367403330424e-06, + "loss": 0.4338, + "step": 5749 + }, + { + "epoch": 2.800909386164339, + "grad_norm": 2.8478736877441406, + "learning_rate": 2.792598033716138e-06, + "loss": 0.46, + "step": 5750 + }, + { + "epoch": 2.801396557323806, + "grad_norm": 2.5366663932800293, + "learning_rate": 2.7919593077351225e-06, + "loss": 0.4654, + "step": 5751 + }, + { + "epoch": 2.801883728483274, + "grad_norm": 2.8971614837646484, + "learning_rate": 2.791320562432268e-06, + "loss": 0.498, + "step": 5752 + }, + { + "epoch": 2.802370899642741, + "grad_norm": 2.7428059577941895, + "learning_rate": 2.7906817978498457e-06, + "loss": 0.4879, + "step": 5753 + }, + { + "epoch": 2.8028580708022086, + "grad_norm": 2.8151023387908936, + "learning_rate": 2.790043014030128e-06, + "loss": 0.4968, + "step": 5754 + }, + { + "epoch": 2.803345241961676, + "grad_norm": 2.6339828968048096, + "learning_rate": 2.7894042110153923e-06, + "loss": 0.5219, + "step": 5755 + }, + { + "epoch": 2.8038324131211434, + "grad_norm": 2.620932102203369, + "learning_rate": 2.7887653888479123e-06, + "loss": 0.5197, + "step": 5756 + }, + { + "epoch": 2.8043195842806106, + "grad_norm": 2.648822069168091, + "learning_rate": 2.788126547569965e-06, + "loss": 0.5307, + "step": 5757 + }, + { + "epoch": 2.8048067554400777, + "grad_norm": 2.8703248500823975, + "learning_rate": 2.7874876872238305e-06, + "loss": 0.4796, + "step": 5758 + }, + { + "epoch": 2.8052939265995454, + "grad_norm": 2.8842408657073975, + "learning_rate": 2.7868488078517868e-06, + "loss": 0.4489, + "step": 5759 + }, + { + "epoch": 2.8057810977590125, + "grad_norm": 3.0564723014831543, + "learning_rate": 2.7862099094961165e-06, + "loss": 0.5803, + "step": 5760 + }, + { + "epoch": 2.80626826891848, + "grad_norm": 2.781665086746216, + "learning_rate": 2.7855709921991005e-06, + "loss": 0.5024, + "step": 5761 + }, + { + "epoch": 2.8067554400779473, + "grad_norm": 2.756169080734253, + "learning_rate": 2.784932056003023e-06, + "loss": 0.4253, + "step": 5762 + }, + { + "epoch": 2.807242611237415, + "grad_norm": 2.9008400440216064, + "learning_rate": 2.7842931009501682e-06, + "loss": 0.47, + "step": 5763 + }, + { + "epoch": 2.807729782396882, + "grad_norm": 2.7834224700927734, + "learning_rate": 2.783654127082823e-06, + "loss": 0.4535, + "step": 5764 + }, + { + "epoch": 2.8082169535563493, + "grad_norm": 2.7699568271636963, + "learning_rate": 2.783015134443274e-06, + "loss": 0.4266, + "step": 5765 + }, + { + "epoch": 2.808704124715817, + "grad_norm": 2.8436570167541504, + "learning_rate": 2.7823761230738093e-06, + "loss": 0.5008, + "step": 5766 + }, + { + "epoch": 2.809191295875284, + "grad_norm": 2.930739641189575, + "learning_rate": 2.7817370930167198e-06, + "loss": 0.4675, + "step": 5767 + }, + { + "epoch": 2.8096784670347517, + "grad_norm": 2.8159987926483154, + "learning_rate": 2.7810980443142954e-06, + "loss": 0.5256, + "step": 5768 + }, + { + "epoch": 2.810165638194219, + "grad_norm": 2.714315891265869, + "learning_rate": 2.7804589770088283e-06, + "loss": 0.5128, + "step": 5769 + }, + { + "epoch": 2.8106528093536864, + "grad_norm": 2.904405117034912, + "learning_rate": 2.7798198911426127e-06, + "loss": 0.4683, + "step": 5770 + }, + { + "epoch": 2.8111399805131536, + "grad_norm": 2.743563652038574, + "learning_rate": 2.7791807867579424e-06, + "loss": 0.527, + "step": 5771 + }, + { + "epoch": 2.811627151672621, + "grad_norm": 2.9215152263641357, + "learning_rate": 2.7785416638971136e-06, + "loss": 0.4231, + "step": 5772 + }, + { + "epoch": 2.8121143228320884, + "grad_norm": 2.5700366497039795, + "learning_rate": 2.7779025226024235e-06, + "loss": 0.4992, + "step": 5773 + }, + { + "epoch": 2.8126014939915556, + "grad_norm": 2.883209228515625, + "learning_rate": 2.7772633629161707e-06, + "loss": 0.4834, + "step": 5774 + }, + { + "epoch": 2.813088665151023, + "grad_norm": 3.120842933654785, + "learning_rate": 2.776624184880654e-06, + "loss": 0.4609, + "step": 5775 + }, + { + "epoch": 2.8135758363104904, + "grad_norm": 2.6617798805236816, + "learning_rate": 2.775984988538175e-06, + "loss": 0.4599, + "step": 5776 + }, + { + "epoch": 2.814063007469958, + "grad_norm": 2.55338454246521, + "learning_rate": 2.7753457739310347e-06, + "loss": 0.4846, + "step": 5777 + }, + { + "epoch": 2.814550178629425, + "grad_norm": 3.298259973526001, + "learning_rate": 2.7747065411015366e-06, + "loss": 0.4722, + "step": 5778 + }, + { + "epoch": 2.8150373497888923, + "grad_norm": 3.049487829208374, + "learning_rate": 2.774067290091985e-06, + "loss": 0.4635, + "step": 5779 + }, + { + "epoch": 2.81552452094836, + "grad_norm": 2.8553948402404785, + "learning_rate": 2.773428020944687e-06, + "loss": 0.4514, + "step": 5780 + }, + { + "epoch": 2.816011692107827, + "grad_norm": 3.1912879943847656, + "learning_rate": 2.7727887337019464e-06, + "loss": 0.5342, + "step": 5781 + }, + { + "epoch": 2.8164988632672947, + "grad_norm": 3.035282611846924, + "learning_rate": 2.7721494284060734e-06, + "loss": 0.4984, + "step": 5782 + }, + { + "epoch": 2.816986034426762, + "grad_norm": 2.6862828731536865, + "learning_rate": 2.7715101050993766e-06, + "loss": 0.4773, + "step": 5783 + }, + { + "epoch": 2.8174732055862295, + "grad_norm": 3.051903009414673, + "learning_rate": 2.770870763824167e-06, + "loss": 0.5043, + "step": 5784 + }, + { + "epoch": 2.8179603767456967, + "grad_norm": 2.969916343688965, + "learning_rate": 2.770231404622754e-06, + "loss": 0.4826, + "step": 5785 + }, + { + "epoch": 2.818447547905164, + "grad_norm": 2.639265537261963, + "learning_rate": 2.7695920275374527e-06, + "loss": 0.483, + "step": 5786 + }, + { + "epoch": 2.8189347190646314, + "grad_norm": 2.924400806427002, + "learning_rate": 2.768952632610576e-06, + "loss": 0.5108, + "step": 5787 + }, + { + "epoch": 2.8194218902240986, + "grad_norm": 2.9197232723236084, + "learning_rate": 2.7683132198844392e-06, + "loss": 0.4865, + "step": 5788 + }, + { + "epoch": 2.8199090613835662, + "grad_norm": 2.646820545196533, + "learning_rate": 2.7676737894013584e-06, + "loss": 0.5114, + "step": 5789 + }, + { + "epoch": 2.8203962325430334, + "grad_norm": 2.4782779216766357, + "learning_rate": 2.7670343412036516e-06, + "loss": 0.4911, + "step": 5790 + }, + { + "epoch": 2.820883403702501, + "grad_norm": 3.0154242515563965, + "learning_rate": 2.766394875333636e-06, + "loss": 0.3963, + "step": 5791 + }, + { + "epoch": 2.821370574861968, + "grad_norm": 2.9539794921875, + "learning_rate": 2.7657553918336332e-06, + "loss": 0.4882, + "step": 5792 + }, + { + "epoch": 2.8218577460214354, + "grad_norm": 2.711603879928589, + "learning_rate": 2.7651158907459635e-06, + "loss": 0.4845, + "step": 5793 + }, + { + "epoch": 2.822344917180903, + "grad_norm": 2.976743459701538, + "learning_rate": 2.7644763721129483e-06, + "loss": 0.5407, + "step": 5794 + }, + { + "epoch": 2.82283208834037, + "grad_norm": 2.6884849071502686, + "learning_rate": 2.7638368359769115e-06, + "loss": 0.5015, + "step": 5795 + }, + { + "epoch": 2.8233192594998378, + "grad_norm": 2.9995803833007812, + "learning_rate": 2.7631972823801774e-06, + "loss": 0.5321, + "step": 5796 + }, + { + "epoch": 2.823806430659305, + "grad_norm": 2.631585121154785, + "learning_rate": 2.7625577113650712e-06, + "loss": 0.4684, + "step": 5797 + }, + { + "epoch": 2.8242936018187725, + "grad_norm": 3.09185528755188, + "learning_rate": 2.761918122973921e-06, + "loss": 0.5289, + "step": 5798 + }, + { + "epoch": 2.8247807729782397, + "grad_norm": 2.6550281047821045, + "learning_rate": 2.761278517249054e-06, + "loss": 0.5325, + "step": 5799 + }, + { + "epoch": 2.825267944137707, + "grad_norm": 2.696094512939453, + "learning_rate": 2.7606388942327983e-06, + "loss": 0.5027, + "step": 5800 + }, + { + "epoch": 2.8257551152971745, + "grad_norm": 2.360450267791748, + "learning_rate": 2.7599992539674852e-06, + "loss": 0.4146, + "step": 5801 + }, + { + "epoch": 2.8262422864566417, + "grad_norm": 2.824608325958252, + "learning_rate": 2.7593595964954454e-06, + "loss": 0.5165, + "step": 5802 + }, + { + "epoch": 2.8267294576161093, + "grad_norm": 2.5868332386016846, + "learning_rate": 2.7587199218590122e-06, + "loss": 0.4426, + "step": 5803 + }, + { + "epoch": 2.8272166287755764, + "grad_norm": 2.8230576515197754, + "learning_rate": 2.758080230100518e-06, + "loss": 0.4964, + "step": 5804 + }, + { + "epoch": 2.827703799935044, + "grad_norm": 2.970489025115967, + "learning_rate": 2.7574405212622986e-06, + "loss": 0.5275, + "step": 5805 + }, + { + "epoch": 2.8281909710945112, + "grad_norm": 2.631115436553955, + "learning_rate": 2.756800795386689e-06, + "loss": 0.5168, + "step": 5806 + }, + { + "epoch": 2.8286781422539784, + "grad_norm": 3.276128053665161, + "learning_rate": 2.756161052516027e-06, + "loss": 0.4922, + "step": 5807 + }, + { + "epoch": 2.829165313413446, + "grad_norm": 3.0886502265930176, + "learning_rate": 2.7555212926926505e-06, + "loss": 0.5242, + "step": 5808 + }, + { + "epoch": 2.829652484572913, + "grad_norm": 3.106175661087036, + "learning_rate": 2.754881515958899e-06, + "loss": 0.5304, + "step": 5809 + }, + { + "epoch": 2.830139655732381, + "grad_norm": 2.6054186820983887, + "learning_rate": 2.754241722357112e-06, + "loss": 0.4617, + "step": 5810 + }, + { + "epoch": 2.830626826891848, + "grad_norm": 2.863914728164673, + "learning_rate": 2.7536019119296314e-06, + "loss": 0.4992, + "step": 5811 + }, + { + "epoch": 2.8311139980513156, + "grad_norm": 3.014571189880371, + "learning_rate": 2.7529620847188e-06, + "loss": 0.4916, + "step": 5812 + }, + { + "epoch": 2.8316011692107828, + "grad_norm": 2.9761648178100586, + "learning_rate": 2.7523222407669613e-06, + "loss": 0.4086, + "step": 5813 + }, + { + "epoch": 2.83208834037025, + "grad_norm": 2.991757869720459, + "learning_rate": 2.75168238011646e-06, + "loss": 0.4479, + "step": 5814 + }, + { + "epoch": 2.8325755115297175, + "grad_norm": 2.466193437576294, + "learning_rate": 2.7510425028096426e-06, + "loss": 0.443, + "step": 5815 + }, + { + "epoch": 2.8330626826891847, + "grad_norm": 2.8530123233795166, + "learning_rate": 2.7504026088888557e-06, + "loss": 0.5527, + "step": 5816 + }, + { + "epoch": 2.8335498538486523, + "grad_norm": 2.940305471420288, + "learning_rate": 2.7497626983964478e-06, + "loss": 0.5091, + "step": 5817 + }, + { + "epoch": 2.8340370250081195, + "grad_norm": 3.176567554473877, + "learning_rate": 2.7491227713747674e-06, + "loss": 0.4304, + "step": 5818 + }, + { + "epoch": 2.834524196167587, + "grad_norm": 3.0140888690948486, + "learning_rate": 2.748482827866165e-06, + "loss": 0.5116, + "step": 5819 + }, + { + "epoch": 2.8350113673270543, + "grad_norm": 2.8891003131866455, + "learning_rate": 2.7478428679129926e-06, + "loss": 0.4645, + "step": 5820 + }, + { + "epoch": 2.8354985384865214, + "grad_norm": 3.1955342292785645, + "learning_rate": 2.7472028915576028e-06, + "loss": 0.5283, + "step": 5821 + }, + { + "epoch": 2.835985709645989, + "grad_norm": 2.658430337905884, + "learning_rate": 2.7465628988423476e-06, + "loss": 0.4481, + "step": 5822 + }, + { + "epoch": 2.8364728808054562, + "grad_norm": 2.695270299911499, + "learning_rate": 2.745922889809583e-06, + "loss": 0.5018, + "step": 5823 + }, + { + "epoch": 2.8369600519649234, + "grad_norm": 2.8371384143829346, + "learning_rate": 2.7452828645016645e-06, + "loss": 0.4512, + "step": 5824 + }, + { + "epoch": 2.837447223124391, + "grad_norm": 3.013974905014038, + "learning_rate": 2.7446428229609488e-06, + "loss": 0.5297, + "step": 5825 + }, + { + "epoch": 2.8379343942838586, + "grad_norm": 2.844268321990967, + "learning_rate": 2.744002765229794e-06, + "loss": 0.4633, + "step": 5826 + }, + { + "epoch": 2.838421565443326, + "grad_norm": 2.6731204986572266, + "learning_rate": 2.743362691350559e-06, + "loss": 0.4569, + "step": 5827 + }, + { + "epoch": 2.838908736602793, + "grad_norm": 2.6661133766174316, + "learning_rate": 2.7427226013656033e-06, + "loss": 0.4693, + "step": 5828 + }, + { + "epoch": 2.8393959077622606, + "grad_norm": 3.1611313819885254, + "learning_rate": 2.7420824953172893e-06, + "loss": 0.5286, + "step": 5829 + }, + { + "epoch": 2.8398830789217278, + "grad_norm": 2.6077792644500732, + "learning_rate": 2.7414423732479774e-06, + "loss": 0.3517, + "step": 5830 + }, + { + "epoch": 2.840370250081195, + "grad_norm": 3.0043296813964844, + "learning_rate": 2.7408022352000317e-06, + "loss": 0.467, + "step": 5831 + }, + { + "epoch": 2.8408574212406625, + "grad_norm": 2.6961848735809326, + "learning_rate": 2.740162081215817e-06, + "loss": 0.4368, + "step": 5832 + }, + { + "epoch": 2.84134459240013, + "grad_norm": 2.6419460773468018, + "learning_rate": 2.7395219113376982e-06, + "loss": 0.43, + "step": 5833 + }, + { + "epoch": 2.8418317635595973, + "grad_norm": 2.932844877243042, + "learning_rate": 2.738881725608041e-06, + "loss": 0.4768, + "step": 5834 + }, + { + "epoch": 2.8423189347190645, + "grad_norm": 3.2787554264068604, + "learning_rate": 2.7382415240692146e-06, + "loss": 0.4852, + "step": 5835 + }, + { + "epoch": 2.842806105878532, + "grad_norm": 2.400216817855835, + "learning_rate": 2.7376013067635847e-06, + "loss": 0.4048, + "step": 5836 + }, + { + "epoch": 2.8432932770379993, + "grad_norm": 2.726271152496338, + "learning_rate": 2.736961073733524e-06, + "loss": 0.4798, + "step": 5837 + }, + { + "epoch": 2.8437804481974664, + "grad_norm": 2.933094024658203, + "learning_rate": 2.7363208250214003e-06, + "loss": 0.4672, + "step": 5838 + }, + { + "epoch": 2.844267619356934, + "grad_norm": 2.778644561767578, + "learning_rate": 2.7356805606695873e-06, + "loss": 0.4314, + "step": 5839 + }, + { + "epoch": 2.8447547905164017, + "grad_norm": 3.1088311672210693, + "learning_rate": 2.7350402807204567e-06, + "loss": 0.4429, + "step": 5840 + }, + { + "epoch": 2.845241961675869, + "grad_norm": 2.822943925857544, + "learning_rate": 2.7343999852163816e-06, + "loss": 0.5491, + "step": 5841 + }, + { + "epoch": 2.845729132835336, + "grad_norm": 2.7562496662139893, + "learning_rate": 2.733759674199738e-06, + "loss": 0.5424, + "step": 5842 + }, + { + "epoch": 2.8462163039948036, + "grad_norm": 2.930805206298828, + "learning_rate": 2.7331193477129003e-06, + "loss": 0.4592, + "step": 5843 + }, + { + "epoch": 2.846703475154271, + "grad_norm": 2.948103666305542, + "learning_rate": 2.7324790057982465e-06, + "loss": 0.5228, + "step": 5844 + }, + { + "epoch": 2.847190646313738, + "grad_norm": 2.8947625160217285, + "learning_rate": 2.731838648498153e-06, + "loss": 0.5171, + "step": 5845 + }, + { + "epoch": 2.8476778174732056, + "grad_norm": 3.22948956489563, + "learning_rate": 2.7311982758549997e-06, + "loss": 0.5401, + "step": 5846 + }, + { + "epoch": 2.848164988632673, + "grad_norm": 2.7648141384124756, + "learning_rate": 2.730557887911166e-06, + "loss": 0.466, + "step": 5847 + }, + { + "epoch": 2.8486521597921404, + "grad_norm": 3.0782389640808105, + "learning_rate": 2.7299174847090333e-06, + "loss": 0.5102, + "step": 5848 + }, + { + "epoch": 2.8491393309516075, + "grad_norm": 2.9784202575683594, + "learning_rate": 2.729277066290982e-06, + "loss": 0.4908, + "step": 5849 + }, + { + "epoch": 2.849626502111075, + "grad_norm": 2.5453319549560547, + "learning_rate": 2.7286366326993963e-06, + "loss": 0.4627, + "step": 5850 + }, + { + "epoch": 2.8501136732705423, + "grad_norm": 2.882017135620117, + "learning_rate": 2.727996183976659e-06, + "loss": 0.4917, + "step": 5851 + }, + { + "epoch": 2.8506008444300095, + "grad_norm": 3.118303060531616, + "learning_rate": 2.7273557201651564e-06, + "loss": 0.5319, + "step": 5852 + }, + { + "epoch": 2.851088015589477, + "grad_norm": 2.815511703491211, + "learning_rate": 2.7267152413072724e-06, + "loss": 0.5089, + "step": 5853 + }, + { + "epoch": 2.8515751867489447, + "grad_norm": 2.9138357639312744, + "learning_rate": 2.7260747474453954e-06, + "loss": 0.4046, + "step": 5854 + }, + { + "epoch": 2.852062357908412, + "grad_norm": 2.8010716438293457, + "learning_rate": 2.7254342386219124e-06, + "loss": 0.4799, + "step": 5855 + }, + { + "epoch": 2.852549529067879, + "grad_norm": 2.836585760116577, + "learning_rate": 2.7247937148792126e-06, + "loss": 0.584, + "step": 5856 + }, + { + "epoch": 2.8530367002273467, + "grad_norm": 2.991833448410034, + "learning_rate": 2.7241531762596856e-06, + "loss": 0.5102, + "step": 5857 + }, + { + "epoch": 2.853523871386814, + "grad_norm": 2.6411216259002686, + "learning_rate": 2.723512622805723e-06, + "loss": 0.4748, + "step": 5858 + }, + { + "epoch": 2.854011042546281, + "grad_norm": 2.6444640159606934, + "learning_rate": 2.722872054559715e-06, + "loss": 0.4973, + "step": 5859 + }, + { + "epoch": 2.8544982137057486, + "grad_norm": 2.5162127017974854, + "learning_rate": 2.722231471564055e-06, + "loss": 0.4368, + "step": 5860 + }, + { + "epoch": 2.8549853848652162, + "grad_norm": 3.1701719760894775, + "learning_rate": 2.7215908738611375e-06, + "loss": 0.5862, + "step": 5861 + }, + { + "epoch": 2.8554725560246834, + "grad_norm": 2.511629581451416, + "learning_rate": 2.720950261493356e-06, + "loss": 0.4481, + "step": 5862 + }, + { + "epoch": 2.8559597271841506, + "grad_norm": 2.6414284706115723, + "learning_rate": 2.7203096345031075e-06, + "loss": 0.4528, + "step": 5863 + }, + { + "epoch": 2.856446898343618, + "grad_norm": 2.976163387298584, + "learning_rate": 2.719668992932788e-06, + "loss": 0.4893, + "step": 5864 + }, + { + "epoch": 2.8569340695030854, + "grad_norm": 3.070915699005127, + "learning_rate": 2.7190283368247945e-06, + "loss": 0.4811, + "step": 5865 + }, + { + "epoch": 2.8574212406625525, + "grad_norm": 2.768596649169922, + "learning_rate": 2.7183876662215263e-06, + "loss": 0.4834, + "step": 5866 + }, + { + "epoch": 2.85790841182202, + "grad_norm": 3.1541998386383057, + "learning_rate": 2.7177469811653822e-06, + "loss": 0.5139, + "step": 5867 + }, + { + "epoch": 2.8583955829814878, + "grad_norm": 2.86372709274292, + "learning_rate": 2.717106281698764e-06, + "loss": 0.4199, + "step": 5868 + }, + { + "epoch": 2.858882754140955, + "grad_norm": 2.9883341789245605, + "learning_rate": 2.7164655678640717e-06, + "loss": 0.5433, + "step": 5869 + }, + { + "epoch": 2.859369925300422, + "grad_norm": 2.5180280208587646, + "learning_rate": 2.7158248397037085e-06, + "loss": 0.5105, + "step": 5870 + }, + { + "epoch": 2.8598570964598897, + "grad_norm": 2.9265918731689453, + "learning_rate": 2.7151840972600773e-06, + "loss": 0.4788, + "step": 5871 + }, + { + "epoch": 2.860344267619357, + "grad_norm": 2.698673963546753, + "learning_rate": 2.7145433405755827e-06, + "loss": 0.425, + "step": 5872 + }, + { + "epoch": 2.860831438778824, + "grad_norm": 2.810870409011841, + "learning_rate": 2.7139025696926297e-06, + "loss": 0.53, + "step": 5873 + }, + { + "epoch": 2.8613186099382917, + "grad_norm": 2.889566421508789, + "learning_rate": 2.7132617846536246e-06, + "loss": 0.5382, + "step": 5874 + }, + { + "epoch": 2.861805781097759, + "grad_norm": 2.6041383743286133, + "learning_rate": 2.7126209855009745e-06, + "loss": 0.4827, + "step": 5875 + }, + { + "epoch": 2.8622929522572265, + "grad_norm": 2.8502867221832275, + "learning_rate": 2.711980172277087e-06, + "loss": 0.5145, + "step": 5876 + }, + { + "epoch": 2.8627801234166936, + "grad_norm": 2.7990612983703613, + "learning_rate": 2.7113393450243718e-06, + "loss": 0.4953, + "step": 5877 + }, + { + "epoch": 2.8632672945761612, + "grad_norm": 2.942610025405884, + "learning_rate": 2.710698503785239e-06, + "loss": 0.4444, + "step": 5878 + }, + { + "epoch": 2.8637544657356284, + "grad_norm": 2.9519591331481934, + "learning_rate": 2.710057648602098e-06, + "loss": 0.4634, + "step": 5879 + }, + { + "epoch": 2.8642416368950956, + "grad_norm": 3.1067111492156982, + "learning_rate": 2.7094167795173616e-06, + "loss": 0.4883, + "step": 5880 + }, + { + "epoch": 2.864728808054563, + "grad_norm": 2.819638729095459, + "learning_rate": 2.7087758965734418e-06, + "loss": 0.4991, + "step": 5881 + }, + { + "epoch": 2.8652159792140304, + "grad_norm": 3.0886993408203125, + "learning_rate": 2.708134999812754e-06, + "loss": 0.4995, + "step": 5882 + }, + { + "epoch": 2.865703150373498, + "grad_norm": 3.2610018253326416, + "learning_rate": 2.7074940892777095e-06, + "loss": 0.5451, + "step": 5883 + }, + { + "epoch": 2.866190321532965, + "grad_norm": 3.180185556411743, + "learning_rate": 2.7068531650107265e-06, + "loss": 0.5336, + "step": 5884 + }, + { + "epoch": 2.8666774926924328, + "grad_norm": 2.990530252456665, + "learning_rate": 2.7062122270542207e-06, + "loss": 0.4307, + "step": 5885 + }, + { + "epoch": 2.8671646638519, + "grad_norm": 3.0419697761535645, + "learning_rate": 2.7055712754506085e-06, + "loss": 0.487, + "step": 5886 + }, + { + "epoch": 2.867651835011367, + "grad_norm": 2.9414260387420654, + "learning_rate": 2.704930310242308e-06, + "loss": 0.5055, + "step": 5887 + }, + { + "epoch": 2.8681390061708347, + "grad_norm": 2.417677164077759, + "learning_rate": 2.7042893314717394e-06, + "loss": 0.4802, + "step": 5888 + }, + { + "epoch": 2.868626177330302, + "grad_norm": 2.437692165374756, + "learning_rate": 2.7036483391813213e-06, + "loss": 0.4544, + "step": 5889 + }, + { + "epoch": 2.8691133484897695, + "grad_norm": 3.03930926322937, + "learning_rate": 2.7030073334134753e-06, + "loss": 0.5701, + "step": 5890 + }, + { + "epoch": 2.8696005196492367, + "grad_norm": 2.7958788871765137, + "learning_rate": 2.7023663142106225e-06, + "loss": 0.4816, + "step": 5891 + }, + { + "epoch": 2.8700876908087043, + "grad_norm": 2.814568042755127, + "learning_rate": 2.701725281615186e-06, + "loss": 0.4793, + "step": 5892 + }, + { + "epoch": 2.8705748619681715, + "grad_norm": 2.9441144466400146, + "learning_rate": 2.7010842356695892e-06, + "loss": 0.5738, + "step": 5893 + }, + { + "epoch": 2.8710620331276386, + "grad_norm": 2.96110200881958, + "learning_rate": 2.700443176416257e-06, + "loss": 0.4719, + "step": 5894 + }, + { + "epoch": 2.8715492042871062, + "grad_norm": 2.9478304386138916, + "learning_rate": 2.699802103897613e-06, + "loss": 0.438, + "step": 5895 + }, + { + "epoch": 2.8720363754465734, + "grad_norm": 2.4948909282684326, + "learning_rate": 2.699161018156085e-06, + "loss": 0.4476, + "step": 5896 + }, + { + "epoch": 2.872523546606041, + "grad_norm": 2.8957951068878174, + "learning_rate": 2.6985199192341e-06, + "loss": 0.4977, + "step": 5897 + }, + { + "epoch": 2.873010717765508, + "grad_norm": 3.113018035888672, + "learning_rate": 2.6978788071740843e-06, + "loss": 0.4621, + "step": 5898 + }, + { + "epoch": 2.873497888924976, + "grad_norm": 2.5839107036590576, + "learning_rate": 2.697237682018468e-06, + "loss": 0.5436, + "step": 5899 + }, + { + "epoch": 2.873985060084443, + "grad_norm": 2.667649745941162, + "learning_rate": 2.6965965438096796e-06, + "loss": 0.5796, + "step": 5900 + }, + { + "epoch": 2.87447223124391, + "grad_norm": 3.10785174369812, + "learning_rate": 2.6959553925901512e-06, + "loss": 0.5302, + "step": 5901 + }, + { + "epoch": 2.8749594024033778, + "grad_norm": 3.4447224140167236, + "learning_rate": 2.695314228402312e-06, + "loss": 0.4467, + "step": 5902 + }, + { + "epoch": 2.875446573562845, + "grad_norm": 2.8946871757507324, + "learning_rate": 2.6946730512885964e-06, + "loss": 0.497, + "step": 5903 + }, + { + "epoch": 2.8759337447223126, + "grad_norm": 3.077167272567749, + "learning_rate": 2.6940318612914355e-06, + "loss": 0.443, + "step": 5904 + }, + { + "epoch": 2.8764209158817797, + "grad_norm": 2.6819300651550293, + "learning_rate": 2.6933906584532652e-06, + "loss": 0.4919, + "step": 5905 + }, + { + "epoch": 2.8769080870412473, + "grad_norm": 2.6213598251342773, + "learning_rate": 2.692749442816518e-06, + "loss": 0.4266, + "step": 5906 + }, + { + "epoch": 2.8773952582007145, + "grad_norm": 2.812575101852417, + "learning_rate": 2.6921082144236316e-06, + "loss": 0.4485, + "step": 5907 + }, + { + "epoch": 2.8778824293601817, + "grad_norm": 2.4241604804992676, + "learning_rate": 2.6914669733170414e-06, + "loss": 0.4587, + "step": 5908 + }, + { + "epoch": 2.8783696005196493, + "grad_norm": 2.8189961910247803, + "learning_rate": 2.690825719539184e-06, + "loss": 0.4969, + "step": 5909 + }, + { + "epoch": 2.8788567716791165, + "grad_norm": 2.880168914794922, + "learning_rate": 2.6901844531324987e-06, + "loss": 0.4966, + "step": 5910 + }, + { + "epoch": 2.879343942838584, + "grad_norm": 2.972693920135498, + "learning_rate": 2.689543174139424e-06, + "loss": 0.545, + "step": 5911 + }, + { + "epoch": 2.8798311139980513, + "grad_norm": 2.374971866607666, + "learning_rate": 2.6889018826023993e-06, + "loss": 0.4514, + "step": 5912 + }, + { + "epoch": 2.880318285157519, + "grad_norm": 2.918083429336548, + "learning_rate": 2.6882605785638665e-06, + "loss": 0.4303, + "step": 5913 + }, + { + "epoch": 2.880805456316986, + "grad_norm": 2.9631507396698, + "learning_rate": 2.687619262066265e-06, + "loss": 0.514, + "step": 5914 + }, + { + "epoch": 2.881292627476453, + "grad_norm": 3.105273962020874, + "learning_rate": 2.686977933152039e-06, + "loss": 0.4597, + "step": 5915 + }, + { + "epoch": 2.881779798635921, + "grad_norm": 2.619889497756958, + "learning_rate": 2.6863365918636304e-06, + "loss": 0.5114, + "step": 5916 + }, + { + "epoch": 2.882266969795388, + "grad_norm": 2.968773365020752, + "learning_rate": 2.6856952382434837e-06, + "loss": 0.5757, + "step": 5917 + }, + { + "epoch": 2.8827541409548556, + "grad_norm": 3.135951042175293, + "learning_rate": 2.685053872334043e-06, + "loss": 0.416, + "step": 5918 + }, + { + "epoch": 2.8832413121143228, + "grad_norm": 2.748114585876465, + "learning_rate": 2.684412494177755e-06, + "loss": 0.5348, + "step": 5919 + }, + { + "epoch": 2.8837284832737904, + "grad_norm": 2.8958139419555664, + "learning_rate": 2.683771103817065e-06, + "loss": 0.498, + "step": 5920 + }, + { + "epoch": 2.8842156544332576, + "grad_norm": 2.5381009578704834, + "learning_rate": 2.68312970129442e-06, + "loss": 0.4851, + "step": 5921 + }, + { + "epoch": 2.8847028255927247, + "grad_norm": 2.5433502197265625, + "learning_rate": 2.682488286652269e-06, + "loss": 0.4769, + "step": 5922 + }, + { + "epoch": 2.8851899967521923, + "grad_norm": 2.6525495052337646, + "learning_rate": 2.6818468599330603e-06, + "loss": 0.4356, + "step": 5923 + }, + { + "epoch": 2.8856771679116595, + "grad_norm": 2.8024260997772217, + "learning_rate": 2.6812054211792427e-06, + "loss": 0.4868, + "step": 5924 + }, + { + "epoch": 2.886164339071127, + "grad_norm": 2.807119607925415, + "learning_rate": 2.6805639704332677e-06, + "loss": 0.5374, + "step": 5925 + }, + { + "epoch": 2.8866515102305943, + "grad_norm": 2.800489902496338, + "learning_rate": 2.6799225077375857e-06, + "loss": 0.4942, + "step": 5926 + }, + { + "epoch": 2.887138681390062, + "grad_norm": 3.0433077812194824, + "learning_rate": 2.679281033134649e-06, + "loss": 0.4977, + "step": 5927 + }, + { + "epoch": 2.887625852549529, + "grad_norm": 2.6894657611846924, + "learning_rate": 2.6786395466669107e-06, + "loss": 0.5556, + "step": 5928 + }, + { + "epoch": 2.8881130237089963, + "grad_norm": 2.736332416534424, + "learning_rate": 2.677998048376823e-06, + "loss": 0.4525, + "step": 5929 + }, + { + "epoch": 2.888600194868464, + "grad_norm": 2.637491464614868, + "learning_rate": 2.6773565383068416e-06, + "loss": 0.4839, + "step": 5930 + }, + { + "epoch": 2.889087366027931, + "grad_norm": 3.024782419204712, + "learning_rate": 2.676715016499421e-06, + "loss": 0.4636, + "step": 5931 + }, + { + "epoch": 2.8895745371873987, + "grad_norm": 2.9504520893096924, + "learning_rate": 2.6760734829970165e-06, + "loss": 0.5454, + "step": 5932 + }, + { + "epoch": 2.890061708346866, + "grad_norm": 2.740323305130005, + "learning_rate": 2.675431937842086e-06, + "loss": 0.4431, + "step": 5933 + }, + { + "epoch": 2.8905488795063334, + "grad_norm": 2.902831554412842, + "learning_rate": 2.6747903810770854e-06, + "loss": 0.4604, + "step": 5934 + }, + { + "epoch": 2.8910360506658006, + "grad_norm": 2.7787420749664307, + "learning_rate": 2.674148812744474e-06, + "loss": 0.5156, + "step": 5935 + }, + { + "epoch": 2.8915232218252678, + "grad_norm": 2.491698741912842, + "learning_rate": 2.673507232886711e-06, + "loss": 0.4415, + "step": 5936 + }, + { + "epoch": 2.8920103929847354, + "grad_norm": 2.4873247146606445, + "learning_rate": 2.672865641546255e-06, + "loss": 0.5182, + "step": 5937 + }, + { + "epoch": 2.8924975641442026, + "grad_norm": 2.8292148113250732, + "learning_rate": 2.6722240387655663e-06, + "loss": 0.5389, + "step": 5938 + }, + { + "epoch": 2.89298473530367, + "grad_norm": 2.857340097427368, + "learning_rate": 2.6715824245871075e-06, + "loss": 0.4458, + "step": 5939 + }, + { + "epoch": 2.8934719064631373, + "grad_norm": 2.7810206413269043, + "learning_rate": 2.6709407990533398e-06, + "loss": 0.5268, + "step": 5940 + }, + { + "epoch": 2.893959077622605, + "grad_norm": 2.6616597175598145, + "learning_rate": 2.670299162206726e-06, + "loss": 0.5185, + "step": 5941 + }, + { + "epoch": 2.894446248782072, + "grad_norm": 3.27947998046875, + "learning_rate": 2.669657514089729e-06, + "loss": 0.5024, + "step": 5942 + }, + { + "epoch": 2.8949334199415393, + "grad_norm": 2.7259087562561035, + "learning_rate": 2.669015854744814e-06, + "loss": 0.4664, + "step": 5943 + }, + { + "epoch": 2.895420591101007, + "grad_norm": 2.5170607566833496, + "learning_rate": 2.668374184214445e-06, + "loss": 0.4522, + "step": 5944 + }, + { + "epoch": 2.895907762260474, + "grad_norm": 2.934147834777832, + "learning_rate": 2.6677325025410887e-06, + "loss": 0.4676, + "step": 5945 + }, + { + "epoch": 2.8963949334199417, + "grad_norm": 2.956655740737915, + "learning_rate": 2.6670908097672105e-06, + "loss": 0.527, + "step": 5946 + }, + { + "epoch": 2.896882104579409, + "grad_norm": 2.666348934173584, + "learning_rate": 2.6664491059352787e-06, + "loss": 0.5403, + "step": 5947 + }, + { + "epoch": 2.8973692757388765, + "grad_norm": 2.719600200653076, + "learning_rate": 2.66580739108776e-06, + "loss": 0.4733, + "step": 5948 + }, + { + "epoch": 2.8978564468983437, + "grad_norm": 3.068610906600952, + "learning_rate": 2.665165665267124e-06, + "loss": 0.546, + "step": 5949 + }, + { + "epoch": 2.898343618057811, + "grad_norm": 3.124532461166382, + "learning_rate": 2.6645239285158397e-06, + "loss": 0.5315, + "step": 5950 + }, + { + "epoch": 2.8988307892172784, + "grad_norm": 2.7142016887664795, + "learning_rate": 2.6638821808763765e-06, + "loss": 0.4625, + "step": 5951 + }, + { + "epoch": 2.8993179603767456, + "grad_norm": 3.0316920280456543, + "learning_rate": 2.663240422391206e-06, + "loss": 0.4921, + "step": 5952 + }, + { + "epoch": 2.899805131536213, + "grad_norm": 2.7033631801605225, + "learning_rate": 2.6625986531028004e-06, + "loss": 0.4787, + "step": 5953 + }, + { + "epoch": 2.9002923026956804, + "grad_norm": 2.874145030975342, + "learning_rate": 2.6619568730536306e-06, + "loss": 0.5074, + "step": 5954 + }, + { + "epoch": 2.900779473855148, + "grad_norm": 3.2608180046081543, + "learning_rate": 2.6613150822861698e-06, + "loss": 0.5429, + "step": 5955 + }, + { + "epoch": 2.901266645014615, + "grad_norm": 2.7943124771118164, + "learning_rate": 2.660673280842892e-06, + "loss": 0.4993, + "step": 5956 + }, + { + "epoch": 2.9017538161740823, + "grad_norm": 2.4371144771575928, + "learning_rate": 2.660031468766271e-06, + "loss": 0.4365, + "step": 5957 + }, + { + "epoch": 2.90224098733355, + "grad_norm": 2.4903035163879395, + "learning_rate": 2.6593896460987833e-06, + "loss": 0.4683, + "step": 5958 + }, + { + "epoch": 2.902728158493017, + "grad_norm": 2.857966899871826, + "learning_rate": 2.6587478128829025e-06, + "loss": 0.4536, + "step": 5959 + }, + { + "epoch": 2.9032153296524843, + "grad_norm": 2.7333741188049316, + "learning_rate": 2.6581059691611073e-06, + "loss": 0.4899, + "step": 5960 + }, + { + "epoch": 2.903702500811952, + "grad_norm": 2.9920339584350586, + "learning_rate": 2.657464114975873e-06, + "loss": 0.4995, + "step": 5961 + }, + { + "epoch": 2.9041896719714195, + "grad_norm": 2.9514803886413574, + "learning_rate": 2.6568222503696784e-06, + "loss": 0.4732, + "step": 5962 + }, + { + "epoch": 2.9046768431308867, + "grad_norm": 2.6668288707733154, + "learning_rate": 2.656180375385002e-06, + "loss": 0.4473, + "step": 5963 + }, + { + "epoch": 2.905164014290354, + "grad_norm": 2.726719379425049, + "learning_rate": 2.6555384900643227e-06, + "loss": 0.4973, + "step": 5964 + }, + { + "epoch": 2.9056511854498215, + "grad_norm": 2.7874345779418945, + "learning_rate": 2.6548965944501204e-06, + "loss": 0.5409, + "step": 5965 + }, + { + "epoch": 2.9061383566092887, + "grad_norm": 2.701493740081787, + "learning_rate": 2.6542546885848768e-06, + "loss": 0.4835, + "step": 5966 + }, + { + "epoch": 2.906625527768756, + "grad_norm": 2.6163711547851562, + "learning_rate": 2.6536127725110718e-06, + "loss": 0.5546, + "step": 5967 + }, + { + "epoch": 2.9071126989282234, + "grad_norm": 2.8983895778656006, + "learning_rate": 2.652970846271188e-06, + "loss": 0.4922, + "step": 5968 + }, + { + "epoch": 2.907599870087691, + "grad_norm": 2.7048380374908447, + "learning_rate": 2.652328909907707e-06, + "loss": 0.4748, + "step": 5969 + }, + { + "epoch": 2.908087041247158, + "grad_norm": 2.8203299045562744, + "learning_rate": 2.6516869634631138e-06, + "loss": 0.5368, + "step": 5970 + }, + { + "epoch": 2.9085742124066254, + "grad_norm": 2.8102540969848633, + "learning_rate": 2.651045006979891e-06, + "loss": 0.4897, + "step": 5971 + }, + { + "epoch": 2.909061383566093, + "grad_norm": 2.8410120010375977, + "learning_rate": 2.650403040500524e-06, + "loss": 0.4734, + "step": 5972 + }, + { + "epoch": 2.90954855472556, + "grad_norm": 2.8647966384887695, + "learning_rate": 2.649761064067498e-06, + "loss": 0.4786, + "step": 5973 + }, + { + "epoch": 2.9100357258850273, + "grad_norm": 2.74694561958313, + "learning_rate": 2.649119077723298e-06, + "loss": 0.4902, + "step": 5974 + }, + { + "epoch": 2.910522897044495, + "grad_norm": 2.692505121231079, + "learning_rate": 2.6484770815104122e-06, + "loss": 0.5023, + "step": 5975 + }, + { + "epoch": 2.9110100682039626, + "grad_norm": 2.8024423122406006, + "learning_rate": 2.6478350754713266e-06, + "loss": 0.5015, + "step": 5976 + }, + { + "epoch": 2.9114972393634297, + "grad_norm": 3.018706798553467, + "learning_rate": 2.647193059648529e-06, + "loss": 0.4777, + "step": 5977 + }, + { + "epoch": 2.911984410522897, + "grad_norm": 2.7538843154907227, + "learning_rate": 2.646551034084509e-06, + "loss": 0.4502, + "step": 5978 + }, + { + "epoch": 2.9124715816823645, + "grad_norm": 3.015822649002075, + "learning_rate": 2.645908998821755e-06, + "loss": 0.5003, + "step": 5979 + }, + { + "epoch": 2.9129587528418317, + "grad_norm": 2.9684371948242188, + "learning_rate": 2.6452669539027577e-06, + "loss": 0.4734, + "step": 5980 + }, + { + "epoch": 2.913445924001299, + "grad_norm": 2.828437566757202, + "learning_rate": 2.644624899370006e-06, + "loss": 0.5025, + "step": 5981 + }, + { + "epoch": 2.9139330951607665, + "grad_norm": 2.7816827297210693, + "learning_rate": 2.6439828352659926e-06, + "loss": 0.5311, + "step": 5982 + }, + { + "epoch": 2.914420266320234, + "grad_norm": 2.8728272914886475, + "learning_rate": 2.6433407616332072e-06, + "loss": 0.4193, + "step": 5983 + }, + { + "epoch": 2.9149074374797013, + "grad_norm": 2.5599570274353027, + "learning_rate": 2.642698678514145e-06, + "loss": 0.4686, + "step": 5984 + }, + { + "epoch": 2.9153946086391684, + "grad_norm": 3.1250388622283936, + "learning_rate": 2.642056585951297e-06, + "loss": 0.5881, + "step": 5985 + }, + { + "epoch": 2.915881779798636, + "grad_norm": 2.9134202003479004, + "learning_rate": 2.6414144839871576e-06, + "loss": 0.474, + "step": 5986 + }, + { + "epoch": 2.9163689509581032, + "grad_norm": 3.0394890308380127, + "learning_rate": 2.6407723726642203e-06, + "loss": 0.4891, + "step": 5987 + }, + { + "epoch": 2.9168561221175704, + "grad_norm": 2.850053548812866, + "learning_rate": 2.640130252024981e-06, + "loss": 0.541, + "step": 5988 + }, + { + "epoch": 2.917343293277038, + "grad_norm": 2.807858943939209, + "learning_rate": 2.6394881221119343e-06, + "loss": 0.4727, + "step": 5989 + }, + { + "epoch": 2.9178304644365056, + "grad_norm": 2.5995163917541504, + "learning_rate": 2.6388459829675774e-06, + "loss": 0.4339, + "step": 5990 + }, + { + "epoch": 2.918317635595973, + "grad_norm": 2.9578962326049805, + "learning_rate": 2.6382038346344054e-06, + "loss": 0.5561, + "step": 5991 + }, + { + "epoch": 2.91880480675544, + "grad_norm": 3.089456796646118, + "learning_rate": 2.6375616771549166e-06, + "loss": 0.4858, + "step": 5992 + }, + { + "epoch": 2.9192919779149076, + "grad_norm": 3.356647491455078, + "learning_rate": 2.6369195105716087e-06, + "loss": 0.4575, + "step": 5993 + }, + { + "epoch": 2.9197791490743747, + "grad_norm": 2.8886587619781494, + "learning_rate": 2.6362773349269816e-06, + "loss": 0.5076, + "step": 5994 + }, + { + "epoch": 2.920266320233842, + "grad_norm": 2.950049877166748, + "learning_rate": 2.6356351502635323e-06, + "loss": 0.4675, + "step": 5995 + }, + { + "epoch": 2.9207534913933095, + "grad_norm": 2.7266666889190674, + "learning_rate": 2.6349929566237615e-06, + "loss": 0.4373, + "step": 5996 + }, + { + "epoch": 2.921240662552777, + "grad_norm": 2.8411896228790283, + "learning_rate": 2.634350754050169e-06, + "loss": 0.4971, + "step": 5997 + }, + { + "epoch": 2.9217278337122443, + "grad_norm": 2.7614169120788574, + "learning_rate": 2.6337085425852567e-06, + "loss": 0.444, + "step": 5998 + }, + { + "epoch": 2.9222150048717115, + "grad_norm": 2.5766775608062744, + "learning_rate": 2.6330663222715247e-06, + "loss": 0.4745, + "step": 5999 + }, + { + "epoch": 2.922702176031179, + "grad_norm": 2.890504837036133, + "learning_rate": 2.6324240931514766e-06, + "loss": 0.4612, + "step": 6000 + }, + { + "epoch": 2.9231893471906463, + "grad_norm": 2.8785767555236816, + "learning_rate": 2.6317818552676144e-06, + "loss": 0.4473, + "step": 6001 + }, + { + "epoch": 2.9236765183501134, + "grad_norm": 2.691241502761841, + "learning_rate": 2.631139608662441e-06, + "loss": 0.4974, + "step": 6002 + }, + { + "epoch": 2.924163689509581, + "grad_norm": 2.696063280105591, + "learning_rate": 2.6304973533784608e-06, + "loss": 0.4693, + "step": 6003 + }, + { + "epoch": 2.9246508606690487, + "grad_norm": 2.637894868850708, + "learning_rate": 2.629855089458177e-06, + "loss": 0.4879, + "step": 6004 + }, + { + "epoch": 2.925138031828516, + "grad_norm": 2.9307239055633545, + "learning_rate": 2.6292128169440966e-06, + "loss": 0.5894, + "step": 6005 + }, + { + "epoch": 2.925625202987983, + "grad_norm": 3.045340061187744, + "learning_rate": 2.6285705358787235e-06, + "loss": 0.4608, + "step": 6006 + }, + { + "epoch": 2.9261123741474506, + "grad_norm": 2.9703874588012695, + "learning_rate": 2.6279282463045643e-06, + "loss": 0.5047, + "step": 6007 + }, + { + "epoch": 2.926599545306918, + "grad_norm": 3.0131912231445312, + "learning_rate": 2.627285948264125e-06, + "loss": 0.4239, + "step": 6008 + }, + { + "epoch": 2.927086716466385, + "grad_norm": 2.8327760696411133, + "learning_rate": 2.626643641799915e-06, + "loss": 0.4567, + "step": 6009 + }, + { + "epoch": 2.9275738876258526, + "grad_norm": 2.9618465900421143, + "learning_rate": 2.626001326954439e-06, + "loss": 0.4911, + "step": 6010 + }, + { + "epoch": 2.9280610587853197, + "grad_norm": 2.7885448932647705, + "learning_rate": 2.6253590037702083e-06, + "loss": 0.5621, + "step": 6011 + }, + { + "epoch": 2.9285482299447874, + "grad_norm": 2.5747172832489014, + "learning_rate": 2.6247166722897293e-06, + "loss": 0.4779, + "step": 6012 + }, + { + "epoch": 2.9290354011042545, + "grad_norm": 3.438037395477295, + "learning_rate": 2.624074332555513e-06, + "loss": 0.5339, + "step": 6013 + }, + { + "epoch": 2.929522572263722, + "grad_norm": 2.5452451705932617, + "learning_rate": 2.623431984610069e-06, + "loss": 0.4603, + "step": 6014 + }, + { + "epoch": 2.9300097434231893, + "grad_norm": 2.9940297603607178, + "learning_rate": 2.6227896284959075e-06, + "loss": 0.5006, + "step": 6015 + }, + { + "epoch": 2.9304969145826565, + "grad_norm": 3.246967315673828, + "learning_rate": 2.6221472642555403e-06, + "loss": 0.5089, + "step": 6016 + }, + { + "epoch": 2.930984085742124, + "grad_norm": 3.2931315898895264, + "learning_rate": 2.6215048919314778e-06, + "loss": 0.5018, + "step": 6017 + }, + { + "epoch": 2.9314712569015913, + "grad_norm": 3.083209276199341, + "learning_rate": 2.620862511566234e-06, + "loss": 0.5205, + "step": 6018 + }, + { + "epoch": 2.931958428061059, + "grad_norm": 2.6471426486968994, + "learning_rate": 2.6202201232023195e-06, + "loss": 0.4848, + "step": 6019 + }, + { + "epoch": 2.932445599220526, + "grad_norm": 3.415654420852661, + "learning_rate": 2.6195777268822485e-06, + "loss": 0.4842, + "step": 6020 + }, + { + "epoch": 2.9329327703799937, + "grad_norm": 2.9726762771606445, + "learning_rate": 2.6189353226485355e-06, + "loss": 0.4806, + "step": 6021 + }, + { + "epoch": 2.933419941539461, + "grad_norm": 2.698988199234009, + "learning_rate": 2.6182929105436926e-06, + "loss": 0.4804, + "step": 6022 + }, + { + "epoch": 2.933907112698928, + "grad_norm": 2.9268109798431396, + "learning_rate": 2.617650490610237e-06, + "loss": 0.5387, + "step": 6023 + }, + { + "epoch": 2.9343942838583956, + "grad_norm": 3.137442111968994, + "learning_rate": 2.6170080628906826e-06, + "loss": 0.5197, + "step": 6024 + }, + { + "epoch": 2.934881455017863, + "grad_norm": 2.790419101715088, + "learning_rate": 2.616365627427545e-06, + "loss": 0.5208, + "step": 6025 + }, + { + "epoch": 2.9353686261773304, + "grad_norm": 3.005722761154175, + "learning_rate": 2.6157231842633413e-06, + "loss": 0.508, + "step": 6026 + }, + { + "epoch": 2.9358557973367976, + "grad_norm": 2.6904873847961426, + "learning_rate": 2.615080733440588e-06, + "loss": 0.4343, + "step": 6027 + }, + { + "epoch": 2.936342968496265, + "grad_norm": 2.83406138420105, + "learning_rate": 2.614438275001803e-06, + "loss": 0.4852, + "step": 6028 + }, + { + "epoch": 2.9368301396557324, + "grad_norm": 2.7759687900543213, + "learning_rate": 2.6137958089895034e-06, + "loss": 0.5147, + "step": 6029 + }, + { + "epoch": 2.9373173108151995, + "grad_norm": 2.767380952835083, + "learning_rate": 2.613153335446207e-06, + "loss": 0.4973, + "step": 6030 + }, + { + "epoch": 2.937804481974667, + "grad_norm": 2.90729022026062, + "learning_rate": 2.612510854414434e-06, + "loss": 0.517, + "step": 6031 + }, + { + "epoch": 2.9382916531341343, + "grad_norm": 2.6097981929779053, + "learning_rate": 2.6118683659367027e-06, + "loss": 0.4723, + "step": 6032 + }, + { + "epoch": 2.938778824293602, + "grad_norm": 4.28902006149292, + "learning_rate": 2.611225870055534e-06, + "loss": 0.4837, + "step": 6033 + }, + { + "epoch": 2.939265995453069, + "grad_norm": 2.9348642826080322, + "learning_rate": 2.6105833668134473e-06, + "loss": 0.4848, + "step": 6034 + }, + { + "epoch": 2.9397531666125367, + "grad_norm": 2.895944595336914, + "learning_rate": 2.6099408562529636e-06, + "loss": 0.5017, + "step": 6035 + }, + { + "epoch": 2.940240337772004, + "grad_norm": 2.7706334590911865, + "learning_rate": 2.609298338416604e-06, + "loss": 0.4916, + "step": 6036 + }, + { + "epoch": 2.940727508931471, + "grad_norm": 2.879408836364746, + "learning_rate": 2.6086558133468916e-06, + "loss": 0.4588, + "step": 6037 + }, + { + "epoch": 2.9412146800909387, + "grad_norm": 2.7808501720428467, + "learning_rate": 2.608013281086346e-06, + "loss": 0.5569, + "step": 6038 + }, + { + "epoch": 2.941701851250406, + "grad_norm": 2.7312064170837402, + "learning_rate": 2.607370741677493e-06, + "loss": 0.4338, + "step": 6039 + }, + { + "epoch": 2.9421890224098735, + "grad_norm": 2.981597900390625, + "learning_rate": 2.6067281951628542e-06, + "loss": 0.5081, + "step": 6040 + }, + { + "epoch": 2.9426761935693406, + "grad_norm": 2.9403669834136963, + "learning_rate": 2.6060856415849536e-06, + "loss": 0.5151, + "step": 6041 + }, + { + "epoch": 2.9431633647288082, + "grad_norm": 2.9179468154907227, + "learning_rate": 2.6054430809863136e-06, + "loss": 0.451, + "step": 6042 + }, + { + "epoch": 2.9436505358882754, + "grad_norm": 2.8366689682006836, + "learning_rate": 2.6048005134094617e-06, + "loss": 0.4508, + "step": 6043 + }, + { + "epoch": 2.9441377070477426, + "grad_norm": 3.04789137840271, + "learning_rate": 2.6041579388969207e-06, + "loss": 0.5456, + "step": 6044 + }, + { + "epoch": 2.94462487820721, + "grad_norm": 3.0248537063598633, + "learning_rate": 2.6035153574912175e-06, + "loss": 0.5498, + "step": 6045 + }, + { + "epoch": 2.9451120493666774, + "grad_norm": 3.2527506351470947, + "learning_rate": 2.602872769234877e-06, + "loss": 0.5029, + "step": 6046 + }, + { + "epoch": 2.945599220526145, + "grad_norm": 2.857834815979004, + "learning_rate": 2.602230174170427e-06, + "loss": 0.5255, + "step": 6047 + }, + { + "epoch": 2.946086391685612, + "grad_norm": 3.0759265422821045, + "learning_rate": 2.601587572340393e-06, + "loss": 0.5054, + "step": 6048 + }, + { + "epoch": 2.9465735628450798, + "grad_norm": 2.897141933441162, + "learning_rate": 2.600944963787303e-06, + "loss": 0.4868, + "step": 6049 + }, + { + "epoch": 2.947060734004547, + "grad_norm": 2.7798871994018555, + "learning_rate": 2.600302348553685e-06, + "loss": 0.4245, + "step": 6050 + }, + { + "epoch": 2.947547905164014, + "grad_norm": 2.5060505867004395, + "learning_rate": 2.5996597266820666e-06, + "loss": 0.4533, + "step": 6051 + }, + { + "epoch": 2.9480350763234817, + "grad_norm": 3.1764378547668457, + "learning_rate": 2.599017098214978e-06, + "loss": 0.545, + "step": 6052 + }, + { + "epoch": 2.948522247482949, + "grad_norm": 3.061155080795288, + "learning_rate": 2.598374463194945e-06, + "loss": 0.5221, + "step": 6053 + }, + { + "epoch": 2.9490094186424165, + "grad_norm": 2.793639659881592, + "learning_rate": 2.5977318216645e-06, + "loss": 0.4261, + "step": 6054 + }, + { + "epoch": 2.9494965898018837, + "grad_norm": 3.3247363567352295, + "learning_rate": 2.5970891736661724e-06, + "loss": 0.5179, + "step": 6055 + }, + { + "epoch": 2.9499837609613513, + "grad_norm": 3.2068138122558594, + "learning_rate": 2.5964465192424925e-06, + "loss": 0.5805, + "step": 6056 + }, + { + "epoch": 2.9504709321208185, + "grad_norm": 3.198651075363159, + "learning_rate": 2.5958038584359907e-06, + "loss": 0.6101, + "step": 6057 + }, + { + "epoch": 2.9509581032802856, + "grad_norm": 2.8803951740264893, + "learning_rate": 2.5951611912891983e-06, + "loss": 0.4961, + "step": 6058 + }, + { + "epoch": 2.9514452744397532, + "grad_norm": 2.832725763320923, + "learning_rate": 2.594518517844647e-06, + "loss": 0.4804, + "step": 6059 + }, + { + "epoch": 2.9519324455992204, + "grad_norm": 2.8329520225524902, + "learning_rate": 2.59387583814487e-06, + "loss": 0.5596, + "step": 6060 + }, + { + "epoch": 2.952419616758688, + "grad_norm": 3.1214230060577393, + "learning_rate": 2.5932331522323975e-06, + "loss": 0.4337, + "step": 6061 + }, + { + "epoch": 2.952906787918155, + "grad_norm": 2.665614604949951, + "learning_rate": 2.5925904601497644e-06, + "loss": 0.4248, + "step": 6062 + }, + { + "epoch": 2.953393959077623, + "grad_norm": 3.1834161281585693, + "learning_rate": 2.591947761939503e-06, + "loss": 0.52, + "step": 6063 + }, + { + "epoch": 2.95388113023709, + "grad_norm": 2.671201229095459, + "learning_rate": 2.591305057644148e-06, + "loss": 0.4736, + "step": 6064 + }, + { + "epoch": 2.954368301396557, + "grad_norm": 2.8191399574279785, + "learning_rate": 2.590662347306232e-06, + "loss": 0.5324, + "step": 6065 + }, + { + "epoch": 2.9548554725560248, + "grad_norm": 3.3157806396484375, + "learning_rate": 2.590019630968291e-06, + "loss": 0.5257, + "step": 6066 + }, + { + "epoch": 2.955342643715492, + "grad_norm": 2.906522750854492, + "learning_rate": 2.589376908672858e-06, + "loss": 0.5336, + "step": 6067 + }, + { + "epoch": 2.9558298148749595, + "grad_norm": 3.240656614303589, + "learning_rate": 2.58873418046247e-06, + "loss": 0.4948, + "step": 6068 + }, + { + "epoch": 2.9563169860344267, + "grad_norm": 2.7134289741516113, + "learning_rate": 2.5880914463796625e-06, + "loss": 0.5015, + "step": 6069 + }, + { + "epoch": 2.9568041571938943, + "grad_norm": 3.029818296432495, + "learning_rate": 2.5874487064669712e-06, + "loss": 0.5535, + "step": 6070 + }, + { + "epoch": 2.9572913283533615, + "grad_norm": 2.8653106689453125, + "learning_rate": 2.586805960766932e-06, + "loss": 0.5087, + "step": 6071 + }, + { + "epoch": 2.9577784995128287, + "grad_norm": 2.51947021484375, + "learning_rate": 2.5861632093220836e-06, + "loss": 0.4471, + "step": 6072 + }, + { + "epoch": 2.9582656706722963, + "grad_norm": 3.003831624984741, + "learning_rate": 2.585520452174961e-06, + "loss": 0.5294, + "step": 6073 + }, + { + "epoch": 2.9587528418317635, + "grad_norm": 2.848367929458618, + "learning_rate": 2.5848776893681034e-06, + "loss": 0.4881, + "step": 6074 + }, + { + "epoch": 2.959240012991231, + "grad_norm": 2.7045304775238037, + "learning_rate": 2.5842349209440477e-06, + "loss": 0.4702, + "step": 6075 + }, + { + "epoch": 2.9597271841506982, + "grad_norm": 2.6596686840057373, + "learning_rate": 2.5835921469453335e-06, + "loss": 0.4559, + "step": 6076 + }, + { + "epoch": 2.960214355310166, + "grad_norm": 2.7442972660064697, + "learning_rate": 2.582949367414498e-06, + "loss": 0.434, + "step": 6077 + }, + { + "epoch": 2.960701526469633, + "grad_norm": 2.9896671772003174, + "learning_rate": 2.582306582394082e-06, + "loss": 0.4901, + "step": 6078 + }, + { + "epoch": 2.9611886976291, + "grad_norm": 3.166238307952881, + "learning_rate": 2.581663791926623e-06, + "loss": 0.4414, + "step": 6079 + }, + { + "epoch": 2.961675868788568, + "grad_norm": 2.8067052364349365, + "learning_rate": 2.5810209960546627e-06, + "loss": 0.482, + "step": 6080 + }, + { + "epoch": 2.962163039948035, + "grad_norm": 2.851674795150757, + "learning_rate": 2.58037819482074e-06, + "loss": 0.479, + "step": 6081 + }, + { + "epoch": 2.9626502111075026, + "grad_norm": 2.7834155559539795, + "learning_rate": 2.5797353882673965e-06, + "loss": 0.5372, + "step": 6082 + }, + { + "epoch": 2.9631373822669698, + "grad_norm": 2.834329128265381, + "learning_rate": 2.579092576437172e-06, + "loss": 0.519, + "step": 6083 + }, + { + "epoch": 2.9636245534264374, + "grad_norm": 3.2540650367736816, + "learning_rate": 2.5784497593726076e-06, + "loss": 0.6076, + "step": 6084 + }, + { + "epoch": 2.9641117245859046, + "grad_norm": 3.0053906440734863, + "learning_rate": 2.577806937116246e-06, + "loss": 0.4849, + "step": 6085 + }, + { + "epoch": 2.9645988957453717, + "grad_norm": 2.792588710784912, + "learning_rate": 2.577164109710629e-06, + "loss": 0.5116, + "step": 6086 + }, + { + "epoch": 2.9650860669048393, + "grad_norm": 2.6607725620269775, + "learning_rate": 2.576521277198298e-06, + "loss": 0.4482, + "step": 6087 + }, + { + "epoch": 2.9655732380643065, + "grad_norm": 2.8903520107269287, + "learning_rate": 2.5758784396217955e-06, + "loss": 0.4247, + "step": 6088 + }, + { + "epoch": 2.9660604092237737, + "grad_norm": 2.9547793865203857, + "learning_rate": 2.5752355970236655e-06, + "loss": 0.441, + "step": 6089 + }, + { + "epoch": 2.9665475803832413, + "grad_norm": 2.8347654342651367, + "learning_rate": 2.5745927494464505e-06, + "loss": 0.511, + "step": 6090 + }, + { + "epoch": 2.967034751542709, + "grad_norm": 2.6645543575286865, + "learning_rate": 2.5739498969326944e-06, + "loss": 0.4452, + "step": 6091 + }, + { + "epoch": 2.967521922702176, + "grad_norm": 2.648183584213257, + "learning_rate": 2.573307039524941e-06, + "loss": 0.4834, + "step": 6092 + }, + { + "epoch": 2.9680090938616432, + "grad_norm": 2.902214527130127, + "learning_rate": 2.572664177265734e-06, + "loss": 0.5149, + "step": 6093 + }, + { + "epoch": 2.968496265021111, + "grad_norm": 2.989872694015503, + "learning_rate": 2.5720213101976193e-06, + "loss": 0.4493, + "step": 6094 + }, + { + "epoch": 2.968983436180578, + "grad_norm": 2.92305326461792, + "learning_rate": 2.57137843836314e-06, + "loss": 0.44, + "step": 6095 + }, + { + "epoch": 2.969470607340045, + "grad_norm": 2.8509280681610107, + "learning_rate": 2.5707355618048434e-06, + "loss": 0.5228, + "step": 6096 + }, + { + "epoch": 2.969957778499513, + "grad_norm": 2.723409652709961, + "learning_rate": 2.5700926805652732e-06, + "loss": 0.4989, + "step": 6097 + }, + { + "epoch": 2.9704449496589804, + "grad_norm": 2.8359384536743164, + "learning_rate": 2.5694497946869762e-06, + "loss": 0.5128, + "step": 6098 + }, + { + "epoch": 2.9709321208184476, + "grad_norm": 2.9316751956939697, + "learning_rate": 2.568806904212498e-06, + "loss": 0.4833, + "step": 6099 + }, + { + "epoch": 2.9714192919779148, + "grad_norm": 2.762528896331787, + "learning_rate": 2.5681640091843855e-06, + "loss": 0.5101, + "step": 6100 + }, + { + "epoch": 2.9719064631373824, + "grad_norm": 2.6823549270629883, + "learning_rate": 2.567521109645185e-06, + "loss": 0.4232, + "step": 6101 + }, + { + "epoch": 2.9723936342968496, + "grad_norm": 2.905256748199463, + "learning_rate": 2.5668782056374448e-06, + "loss": 0.4932, + "step": 6102 + }, + { + "epoch": 2.9728808054563167, + "grad_norm": 2.63826584815979, + "learning_rate": 2.5662352972037103e-06, + "loss": 0.4199, + "step": 6103 + }, + { + "epoch": 2.9733679766157843, + "grad_norm": 3.032552719116211, + "learning_rate": 2.56559238438653e-06, + "loss": 0.5192, + "step": 6104 + }, + { + "epoch": 2.973855147775252, + "grad_norm": 2.929217576980591, + "learning_rate": 2.5649494672284517e-06, + "loss": 0.4119, + "step": 6105 + }, + { + "epoch": 2.974342318934719, + "grad_norm": 2.7479248046875, + "learning_rate": 2.564306545772024e-06, + "loss": 0.5034, + "step": 6106 + }, + { + "epoch": 2.9748294900941863, + "grad_norm": 2.8219754695892334, + "learning_rate": 2.5636636200597953e-06, + "loss": 0.5005, + "step": 6107 + }, + { + "epoch": 2.975316661253654, + "grad_norm": 2.5443031787872314, + "learning_rate": 2.563020690134314e-06, + "loss": 0.4805, + "step": 6108 + }, + { + "epoch": 2.975803832413121, + "grad_norm": 2.952747106552124, + "learning_rate": 2.5623777560381296e-06, + "loss": 0.5067, + "step": 6109 + }, + { + "epoch": 2.9762910035725882, + "grad_norm": 2.9992246627807617, + "learning_rate": 2.5617348178137902e-06, + "loss": 0.487, + "step": 6110 + }, + { + "epoch": 2.976778174732056, + "grad_norm": 2.848557710647583, + "learning_rate": 2.561091875503847e-06, + "loss": 0.5024, + "step": 6111 + }, + { + "epoch": 2.9772653458915235, + "grad_norm": 2.9704127311706543, + "learning_rate": 2.560448929150848e-06, + "loss": 0.4828, + "step": 6112 + }, + { + "epoch": 2.9777525170509906, + "grad_norm": 2.9244139194488525, + "learning_rate": 2.5598059787973463e-06, + "loss": 0.5613, + "step": 6113 + }, + { + "epoch": 2.978239688210458, + "grad_norm": 2.6720705032348633, + "learning_rate": 2.5591630244858896e-06, + "loss": 0.4893, + "step": 6114 + }, + { + "epoch": 2.9787268593699254, + "grad_norm": 2.726355791091919, + "learning_rate": 2.5585200662590293e-06, + "loss": 0.4499, + "step": 6115 + }, + { + "epoch": 2.9792140305293926, + "grad_norm": 2.5421102046966553, + "learning_rate": 2.5578771041593164e-06, + "loss": 0.4456, + "step": 6116 + }, + { + "epoch": 2.9797012016888598, + "grad_norm": 2.8840298652648926, + "learning_rate": 2.5572341382293026e-06, + "loss": 0.4584, + "step": 6117 + }, + { + "epoch": 2.9801883728483274, + "grad_norm": 2.9158778190612793, + "learning_rate": 2.5565911685115384e-06, + "loss": 0.5458, + "step": 6118 + }, + { + "epoch": 2.980675544007795, + "grad_norm": 2.765469789505005, + "learning_rate": 2.5559481950485765e-06, + "loss": 0.4236, + "step": 6119 + }, + { + "epoch": 2.981162715167262, + "grad_norm": 2.646160125732422, + "learning_rate": 2.5553052178829676e-06, + "loss": 0.5053, + "step": 6120 + }, + { + "epoch": 2.9816498863267293, + "grad_norm": 2.856053590774536, + "learning_rate": 2.554662237057265e-06, + "loss": 0.5253, + "step": 6121 + }, + { + "epoch": 2.982137057486197, + "grad_norm": 3.1322176456451416, + "learning_rate": 2.55401925261402e-06, + "loss": 0.4701, + "step": 6122 + }, + { + "epoch": 2.982624228645664, + "grad_norm": 2.7608301639556885, + "learning_rate": 2.5533762645957867e-06, + "loss": 0.4955, + "step": 6123 + }, + { + "epoch": 2.9831113998051313, + "grad_norm": 2.92608380317688, + "learning_rate": 2.5527332730451164e-06, + "loss": 0.4569, + "step": 6124 + }, + { + "epoch": 2.983598570964599, + "grad_norm": 2.9254062175750732, + "learning_rate": 2.552090278004564e-06, + "loss": 0.5507, + "step": 6125 + }, + { + "epoch": 2.9840857421240665, + "grad_norm": 2.8164725303649902, + "learning_rate": 2.551447279516681e-06, + "loss": 0.5461, + "step": 6126 + }, + { + "epoch": 2.9845729132835337, + "grad_norm": 2.8886330127716064, + "learning_rate": 2.5508042776240222e-06, + "loss": 0.5645, + "step": 6127 + }, + { + "epoch": 2.985060084443001, + "grad_norm": 2.7067768573760986, + "learning_rate": 2.5501612723691415e-06, + "loss": 0.5045, + "step": 6128 + }, + { + "epoch": 2.9855472556024685, + "grad_norm": 2.6719985008239746, + "learning_rate": 2.5495182637945916e-06, + "loss": 0.4435, + "step": 6129 + }, + { + "epoch": 2.9860344267619356, + "grad_norm": 3.0731797218322754, + "learning_rate": 2.5488752519429285e-06, + "loss": 0.5554, + "step": 6130 + }, + { + "epoch": 2.986521597921403, + "grad_norm": 3.155647039413452, + "learning_rate": 2.5482322368567053e-06, + "loss": 0.5541, + "step": 6131 + }, + { + "epoch": 2.9870087690808704, + "grad_norm": 2.82392954826355, + "learning_rate": 2.547589218578477e-06, + "loss": 0.5126, + "step": 6132 + }, + { + "epoch": 2.987495940240338, + "grad_norm": 2.7452313899993896, + "learning_rate": 2.5469461971507984e-06, + "loss": 0.5034, + "step": 6133 + }, + { + "epoch": 2.987983111399805, + "grad_norm": 2.9227287769317627, + "learning_rate": 2.546303172616225e-06, + "loss": 0.5773, + "step": 6134 + }, + { + "epoch": 2.9884702825592724, + "grad_norm": 2.7584855556488037, + "learning_rate": 2.5456601450173123e-06, + "loss": 0.5166, + "step": 6135 + }, + { + "epoch": 2.98895745371874, + "grad_norm": 2.908677101135254, + "learning_rate": 2.545017114396615e-06, + "loss": 0.5298, + "step": 6136 + }, + { + "epoch": 2.989444624878207, + "grad_norm": 2.9473822116851807, + "learning_rate": 2.54437408079669e-06, + "loss": 0.5287, + "step": 6137 + }, + { + "epoch": 2.9899317960376743, + "grad_norm": 2.796091079711914, + "learning_rate": 2.5437310442600915e-06, + "loss": 0.5252, + "step": 6138 + }, + { + "epoch": 2.990418967197142, + "grad_norm": 3.1033668518066406, + "learning_rate": 2.5430880048293778e-06, + "loss": 0.5078, + "step": 6139 + }, + { + "epoch": 2.990906138356609, + "grad_norm": 2.839426279067993, + "learning_rate": 2.5424449625471027e-06, + "loss": 0.5171, + "step": 6140 + }, + { + "epoch": 2.9913933095160767, + "grad_norm": 2.5491526126861572, + "learning_rate": 2.541801917455825e-06, + "loss": 0.4747, + "step": 6141 + }, + { + "epoch": 2.991880480675544, + "grad_norm": 2.6191329956054688, + "learning_rate": 2.5411588695980994e-06, + "loss": 0.4257, + "step": 6142 + }, + { + "epoch": 2.9923676518350115, + "grad_norm": 2.5401318073272705, + "learning_rate": 2.540515819016484e-06, + "loss": 0.4743, + "step": 6143 + }, + { + "epoch": 2.9928548229944787, + "grad_norm": 2.750786781311035, + "learning_rate": 2.5398727657535356e-06, + "loss": 0.4904, + "step": 6144 + }, + { + "epoch": 2.993341994153946, + "grad_norm": 3.012859582901001, + "learning_rate": 2.539229709851811e-06, + "loss": 0.494, + "step": 6145 + }, + { + "epoch": 2.9938291653134135, + "grad_norm": 2.7759833335876465, + "learning_rate": 2.5385866513538676e-06, + "loss": 0.4729, + "step": 6146 + }, + { + "epoch": 2.9943163364728806, + "grad_norm": 2.7043814659118652, + "learning_rate": 2.5379435903022644e-06, + "loss": 0.469, + "step": 6147 + }, + { + "epoch": 2.9948035076323483, + "grad_norm": 2.6769254207611084, + "learning_rate": 2.5373005267395566e-06, + "loss": 0.4729, + "step": 6148 + }, + { + "epoch": 2.9952906787918154, + "grad_norm": 2.6290900707244873, + "learning_rate": 2.536657460708304e-06, + "loss": 0.4361, + "step": 6149 + }, + { + "epoch": 2.995777849951283, + "grad_norm": 2.956850528717041, + "learning_rate": 2.5360143922510644e-06, + "loss": 0.474, + "step": 6150 + }, + { + "epoch": 2.99626502111075, + "grad_norm": 3.077425003051758, + "learning_rate": 2.5353713214103955e-06, + "loss": 0.5263, + "step": 6151 + }, + { + "epoch": 2.9967521922702174, + "grad_norm": 3.0447752475738525, + "learning_rate": 2.5347282482288556e-06, + "loss": 0.4578, + "step": 6152 + }, + { + "epoch": 2.997239363429685, + "grad_norm": 2.9553635120391846, + "learning_rate": 2.5340851727490033e-06, + "loss": 0.4528, + "step": 6153 + }, + { + "epoch": 2.997726534589152, + "grad_norm": 2.5116209983825684, + "learning_rate": 2.5334420950133976e-06, + "loss": 0.4766, + "step": 6154 + }, + { + "epoch": 2.99821370574862, + "grad_norm": 2.8877720832824707, + "learning_rate": 2.5327990150645977e-06, + "loss": 0.5106, + "step": 6155 + }, + { + "epoch": 2.998700876908087, + "grad_norm": 2.82554292678833, + "learning_rate": 2.5321559329451616e-06, + "loss": 0.5002, + "step": 6156 + }, + { + "epoch": 2.9991880480675546, + "grad_norm": 2.5810718536376953, + "learning_rate": 2.531512848697648e-06, + "loss": 0.4202, + "step": 6157 + }, + { + "epoch": 2.9996752192270217, + "grad_norm": 2.638089179992676, + "learning_rate": 2.5308697623646184e-06, + "loss": 0.4239, + "step": 6158 + }, + { + "epoch": 3.0, + "grad_norm": 3.5627388954162598, + "learning_rate": 2.5302266739886298e-06, + "loss": 0.4752, + "step": 6159 + }, + { + "epoch": 3.000487171159467, + "grad_norm": 2.522366762161255, + "learning_rate": 2.5295835836122436e-06, + "loss": 0.4684, + "step": 6160 + }, + { + "epoch": 3.000974342318935, + "grad_norm": 2.6099908351898193, + "learning_rate": 2.528940491278018e-06, + "loss": 0.5015, + "step": 6161 + }, + { + "epoch": 3.001461513478402, + "grad_norm": 2.69919753074646, + "learning_rate": 2.5282973970285134e-06, + "loss": 0.4449, + "step": 6162 + }, + { + "epoch": 3.0019486846378696, + "grad_norm": 2.3985438346862793, + "learning_rate": 2.52765430090629e-06, + "loss": 0.4652, + "step": 6163 + }, + { + "epoch": 3.0024358557973367, + "grad_norm": 2.5553743839263916, + "learning_rate": 2.527011202953907e-06, + "loss": 0.4844, + "step": 6164 + }, + { + "epoch": 3.0029230269568044, + "grad_norm": 3.1230945587158203, + "learning_rate": 2.5263681032139255e-06, + "loss": 0.4099, + "step": 6165 + }, + { + "epoch": 3.0034101981162715, + "grad_norm": 2.5754284858703613, + "learning_rate": 2.5257250017289055e-06, + "loss": 0.4337, + "step": 6166 + }, + { + "epoch": 3.0038973692757387, + "grad_norm": 3.007718086242676, + "learning_rate": 2.5250818985414065e-06, + "loss": 0.4029, + "step": 6167 + }, + { + "epoch": 3.0043845404352063, + "grad_norm": 2.906951427459717, + "learning_rate": 2.5244387936939914e-06, + "loss": 0.4377, + "step": 6168 + }, + { + "epoch": 3.0048717115946735, + "grad_norm": 2.939276695251465, + "learning_rate": 2.5237956872292173e-06, + "loss": 0.4547, + "step": 6169 + }, + { + "epoch": 3.005358882754141, + "grad_norm": 2.5988640785217285, + "learning_rate": 2.5231525791896484e-06, + "loss": 0.4162, + "step": 6170 + }, + { + "epoch": 3.0058460539136083, + "grad_norm": 2.817826747894287, + "learning_rate": 2.5225094696178427e-06, + "loss": 0.4819, + "step": 6171 + }, + { + "epoch": 3.006333225073076, + "grad_norm": 2.9584267139434814, + "learning_rate": 2.5218663585563635e-06, + "loss": 0.4634, + "step": 6172 + }, + { + "epoch": 3.006820396232543, + "grad_norm": 3.0697553157806396, + "learning_rate": 2.5212232460477705e-06, + "loss": 0.4097, + "step": 6173 + }, + { + "epoch": 3.00730756739201, + "grad_norm": 2.572662591934204, + "learning_rate": 2.5205801321346244e-06, + "loss": 0.4314, + "step": 6174 + }, + { + "epoch": 3.007794738551478, + "grad_norm": 3.0218536853790283, + "learning_rate": 2.519937016859488e-06, + "loss": 0.4615, + "step": 6175 + }, + { + "epoch": 3.008281909710945, + "grad_norm": 2.5193262100219727, + "learning_rate": 2.5192939002649215e-06, + "loss": 0.3945, + "step": 6176 + }, + { + "epoch": 3.0087690808704126, + "grad_norm": 2.9987151622772217, + "learning_rate": 2.5186507823934854e-06, + "loss": 0.4497, + "step": 6177 + }, + { + "epoch": 3.00925625202988, + "grad_norm": 3.0466275215148926, + "learning_rate": 2.5180076632877433e-06, + "loss": 0.5161, + "step": 6178 + }, + { + "epoch": 3.0097434231893474, + "grad_norm": 2.5681145191192627, + "learning_rate": 2.517364542990255e-06, + "loss": 0.4039, + "step": 6179 + }, + { + "epoch": 3.0102305943488146, + "grad_norm": 2.5705134868621826, + "learning_rate": 2.5167214215435835e-06, + "loss": 0.4089, + "step": 6180 + }, + { + "epoch": 3.0107177655082817, + "grad_norm": 3.1144516468048096, + "learning_rate": 2.5160782989902893e-06, + "loss": 0.4458, + "step": 6181 + }, + { + "epoch": 3.0112049366677494, + "grad_norm": 2.8334593772888184, + "learning_rate": 2.5154351753729354e-06, + "loss": 0.3665, + "step": 6182 + }, + { + "epoch": 3.0116921078272165, + "grad_norm": 2.7153162956237793, + "learning_rate": 2.514792050734083e-06, + "loss": 0.434, + "step": 6183 + }, + { + "epoch": 3.012179278986684, + "grad_norm": 3.029035806655884, + "learning_rate": 2.5141489251162935e-06, + "loss": 0.4602, + "step": 6184 + }, + { + "epoch": 3.0126664501461513, + "grad_norm": 3.3281776905059814, + "learning_rate": 2.51350579856213e-06, + "loss": 0.475, + "step": 6185 + }, + { + "epoch": 3.013153621305619, + "grad_norm": 2.910982847213745, + "learning_rate": 2.5128626711141537e-06, + "loss": 0.433, + "step": 6186 + }, + { + "epoch": 3.013640792465086, + "grad_norm": 3.10064959526062, + "learning_rate": 2.5122195428149264e-06, + "loss": 0.4851, + "step": 6187 + }, + { + "epoch": 3.0141279636245533, + "grad_norm": 3.3921992778778076, + "learning_rate": 2.511576413707012e-06, + "loss": 0.4011, + "step": 6188 + }, + { + "epoch": 3.014615134784021, + "grad_norm": 3.0752456188201904, + "learning_rate": 2.510933283832971e-06, + "loss": 0.5124, + "step": 6189 + }, + { + "epoch": 3.015102305943488, + "grad_norm": 2.950228214263916, + "learning_rate": 2.510290153235367e-06, + "loss": 0.4063, + "step": 6190 + }, + { + "epoch": 3.0155894771029557, + "grad_norm": 2.791623830795288, + "learning_rate": 2.5096470219567604e-06, + "loss": 0.413, + "step": 6191 + }, + { + "epoch": 3.016076648262423, + "grad_norm": 3.0106253623962402, + "learning_rate": 2.509003890039716e-06, + "loss": 0.4281, + "step": 6192 + }, + { + "epoch": 3.01656381942189, + "grad_norm": 2.8875772953033447, + "learning_rate": 2.508360757526794e-06, + "loss": 0.4879, + "step": 6193 + }, + { + "epoch": 3.0170509905813576, + "grad_norm": 2.7952349185943604, + "learning_rate": 2.507717624460559e-06, + "loss": 0.4525, + "step": 6194 + }, + { + "epoch": 3.017538161740825, + "grad_norm": 2.636521100997925, + "learning_rate": 2.5070744908835715e-06, + "loss": 0.4089, + "step": 6195 + }, + { + "epoch": 3.0180253329002924, + "grad_norm": 2.9749677181243896, + "learning_rate": 2.5064313568383954e-06, + "loss": 0.4802, + "step": 6196 + }, + { + "epoch": 3.0185125040597596, + "grad_norm": 3.1202685832977295, + "learning_rate": 2.505788222367592e-06, + "loss": 0.4792, + "step": 6197 + }, + { + "epoch": 3.018999675219227, + "grad_norm": 2.520578384399414, + "learning_rate": 2.505145087513725e-06, + "loss": 0.3761, + "step": 6198 + }, + { + "epoch": 3.0194868463786944, + "grad_norm": 2.6747803688049316, + "learning_rate": 2.5045019523193566e-06, + "loss": 0.3927, + "step": 6199 + }, + { + "epoch": 3.0199740175381615, + "grad_norm": 2.9111595153808594, + "learning_rate": 2.5038588168270493e-06, + "loss": 0.4019, + "step": 6200 + }, + { + "epoch": 3.020461188697629, + "grad_norm": 2.629267692565918, + "learning_rate": 2.5032156810793658e-06, + "loss": 0.4037, + "step": 6201 + }, + { + "epoch": 3.0209483598570963, + "grad_norm": 2.9411346912384033, + "learning_rate": 2.5025725451188694e-06, + "loss": 0.4251, + "step": 6202 + }, + { + "epoch": 3.021435531016564, + "grad_norm": 3.2232935428619385, + "learning_rate": 2.501929408988121e-06, + "loss": 0.4674, + "step": 6203 + }, + { + "epoch": 3.021922702176031, + "grad_norm": 2.679334878921509, + "learning_rate": 2.5012862727296857e-06, + "loss": 0.479, + "step": 6204 + }, + { + "epoch": 3.0224098733354987, + "grad_norm": 2.7718446254730225, + "learning_rate": 2.500643136386124e-06, + "loss": 0.4627, + "step": 6205 + }, + { + "epoch": 3.022897044494966, + "grad_norm": 2.693885564804077, + "learning_rate": 2.5e-06, + "loss": 0.4405, + "step": 6206 + }, + { + "epoch": 3.023384215654433, + "grad_norm": 3.2388927936553955, + "learning_rate": 2.499356863613876e-06, + "loss": 0.5083, + "step": 6207 + }, + { + "epoch": 3.0238713868139007, + "grad_norm": 2.9337379932403564, + "learning_rate": 2.4987137272703156e-06, + "loss": 0.4711, + "step": 6208 + }, + { + "epoch": 3.024358557973368, + "grad_norm": 2.8364765644073486, + "learning_rate": 2.4980705910118796e-06, + "loss": 0.4426, + "step": 6209 + }, + { + "epoch": 3.0248457291328354, + "grad_norm": 2.9277167320251465, + "learning_rate": 2.497427454881132e-06, + "loss": 0.4335, + "step": 6210 + }, + { + "epoch": 3.0253329002923026, + "grad_norm": 2.526780605316162, + "learning_rate": 2.4967843189206346e-06, + "loss": 0.3827, + "step": 6211 + }, + { + "epoch": 3.0258200714517702, + "grad_norm": 2.8348071575164795, + "learning_rate": 2.496141183172952e-06, + "loss": 0.449, + "step": 6212 + }, + { + "epoch": 3.0263072426112374, + "grad_norm": 2.8852407932281494, + "learning_rate": 2.4954980476806442e-06, + "loss": 0.4054, + "step": 6213 + }, + { + "epoch": 3.0267944137707046, + "grad_norm": 2.6926167011260986, + "learning_rate": 2.4948549124862752e-06, + "loss": 0.4185, + "step": 6214 + }, + { + "epoch": 3.027281584930172, + "grad_norm": 2.549823522567749, + "learning_rate": 2.494211777632409e-06, + "loss": 0.3903, + "step": 6215 + }, + { + "epoch": 3.0277687560896394, + "grad_norm": 2.8965914249420166, + "learning_rate": 2.493568643161606e-06, + "loss": 0.438, + "step": 6216 + }, + { + "epoch": 3.028255927249107, + "grad_norm": 2.847449541091919, + "learning_rate": 2.492925509116429e-06, + "loss": 0.4135, + "step": 6217 + }, + { + "epoch": 3.028743098408574, + "grad_norm": 2.735595941543579, + "learning_rate": 2.492282375539442e-06, + "loss": 0.452, + "step": 6218 + }, + { + "epoch": 3.0292302695680418, + "grad_norm": 2.5225491523742676, + "learning_rate": 2.4916392424732062e-06, + "loss": 0.3785, + "step": 6219 + }, + { + "epoch": 3.029717440727509, + "grad_norm": 2.917470693588257, + "learning_rate": 2.4909961099602848e-06, + "loss": 0.4369, + "step": 6220 + }, + { + "epoch": 3.030204611886976, + "grad_norm": 2.716815948486328, + "learning_rate": 2.49035297804324e-06, + "loss": 0.3778, + "step": 6221 + }, + { + "epoch": 3.0306917830464437, + "grad_norm": 2.9304966926574707, + "learning_rate": 2.489709846764634e-06, + "loss": 0.4169, + "step": 6222 + }, + { + "epoch": 3.031178954205911, + "grad_norm": 2.9452662467956543, + "learning_rate": 2.489066716167029e-06, + "loss": 0.4935, + "step": 6223 + }, + { + "epoch": 3.0316661253653785, + "grad_norm": 3.13606858253479, + "learning_rate": 2.488423586292989e-06, + "loss": 0.3961, + "step": 6224 + }, + { + "epoch": 3.0321532965248457, + "grad_norm": 2.8212881088256836, + "learning_rate": 2.487780457185074e-06, + "loss": 0.4466, + "step": 6225 + }, + { + "epoch": 3.0326404676843133, + "grad_norm": 2.862422227859497, + "learning_rate": 2.487137328885847e-06, + "loss": 0.3805, + "step": 6226 + }, + { + "epoch": 3.0331276388437804, + "grad_norm": 2.9058144092559814, + "learning_rate": 2.4864942014378706e-06, + "loss": 0.4895, + "step": 6227 + }, + { + "epoch": 3.0336148100032476, + "grad_norm": 2.9780056476593018, + "learning_rate": 2.4858510748837073e-06, + "loss": 0.4448, + "step": 6228 + }, + { + "epoch": 3.0341019811627152, + "grad_norm": 2.7434751987457275, + "learning_rate": 2.4852079492659182e-06, + "loss": 0.377, + "step": 6229 + }, + { + "epoch": 3.0345891523221824, + "grad_norm": 2.8775007724761963, + "learning_rate": 2.484564824627065e-06, + "loss": 0.4514, + "step": 6230 + }, + { + "epoch": 3.03507632348165, + "grad_norm": 2.936764717102051, + "learning_rate": 2.4839217010097107e-06, + "loss": 0.4444, + "step": 6231 + }, + { + "epoch": 3.035563494641117, + "grad_norm": 2.663994312286377, + "learning_rate": 2.4832785784564174e-06, + "loss": 0.4217, + "step": 6232 + }, + { + "epoch": 3.036050665800585, + "grad_norm": 2.5402772426605225, + "learning_rate": 2.4826354570097455e-06, + "loss": 0.396, + "step": 6233 + }, + { + "epoch": 3.036537836960052, + "grad_norm": 2.9519052505493164, + "learning_rate": 2.4819923367122576e-06, + "loss": 0.4419, + "step": 6234 + }, + { + "epoch": 3.037025008119519, + "grad_norm": 2.887995719909668, + "learning_rate": 2.4813492176065145e-06, + "loss": 0.4229, + "step": 6235 + }, + { + "epoch": 3.0375121792789868, + "grad_norm": 2.9424405097961426, + "learning_rate": 2.48070609973508e-06, + "loss": 0.4755, + "step": 6236 + }, + { + "epoch": 3.037999350438454, + "grad_norm": 2.8962950706481934, + "learning_rate": 2.480062983140513e-06, + "loss": 0.4034, + "step": 6237 + }, + { + "epoch": 3.0384865215979215, + "grad_norm": 2.7079358100891113, + "learning_rate": 2.4794198678653764e-06, + "loss": 0.4234, + "step": 6238 + }, + { + "epoch": 3.0389736927573887, + "grad_norm": 3.076882839202881, + "learning_rate": 2.4787767539522304e-06, + "loss": 0.4997, + "step": 6239 + }, + { + "epoch": 3.0394608639168563, + "grad_norm": 2.9005956649780273, + "learning_rate": 2.4781336414436373e-06, + "loss": 0.4335, + "step": 6240 + }, + { + "epoch": 3.0399480350763235, + "grad_norm": 2.9130072593688965, + "learning_rate": 2.4774905303821577e-06, + "loss": 0.3769, + "step": 6241 + }, + { + "epoch": 3.0404352062357907, + "grad_norm": 2.6438937187194824, + "learning_rate": 2.4768474208103524e-06, + "loss": 0.3921, + "step": 6242 + }, + { + "epoch": 3.0409223773952583, + "grad_norm": 2.638615846633911, + "learning_rate": 2.4762043127707827e-06, + "loss": 0.4526, + "step": 6243 + }, + { + "epoch": 3.0414095485547255, + "grad_norm": 2.997511148452759, + "learning_rate": 2.4755612063060103e-06, + "loss": 0.3739, + "step": 6244 + }, + { + "epoch": 3.041896719714193, + "grad_norm": 2.9397170543670654, + "learning_rate": 2.474918101458594e-06, + "loss": 0.4248, + "step": 6245 + }, + { + "epoch": 3.0423838908736602, + "grad_norm": 2.9662137031555176, + "learning_rate": 2.474274998271095e-06, + "loss": 0.404, + "step": 6246 + }, + { + "epoch": 3.042871062033128, + "grad_norm": 2.941689968109131, + "learning_rate": 2.473631896786075e-06, + "loss": 0.4101, + "step": 6247 + }, + { + "epoch": 3.043358233192595, + "grad_norm": 2.7787864208221436, + "learning_rate": 2.472988797046094e-06, + "loss": 0.398, + "step": 6248 + }, + { + "epoch": 3.043845404352062, + "grad_norm": 2.7027018070220947, + "learning_rate": 2.472345699093711e-06, + "loss": 0.4338, + "step": 6249 + }, + { + "epoch": 3.04433257551153, + "grad_norm": 3.1040525436401367, + "learning_rate": 2.471702602971487e-06, + "loss": 0.4227, + "step": 6250 + }, + { + "epoch": 3.044819746670997, + "grad_norm": 2.953782320022583, + "learning_rate": 2.4710595087219825e-06, + "loss": 0.4822, + "step": 6251 + }, + { + "epoch": 3.0453069178304646, + "grad_norm": 2.782864809036255, + "learning_rate": 2.4704164163877576e-06, + "loss": 0.4085, + "step": 6252 + }, + { + "epoch": 3.0457940889899318, + "grad_norm": 3.3429980278015137, + "learning_rate": 2.4697733260113706e-06, + "loss": 0.4668, + "step": 6253 + }, + { + "epoch": 3.0462812601493994, + "grad_norm": 3.0785434246063232, + "learning_rate": 2.4691302376353824e-06, + "loss": 0.4733, + "step": 6254 + }, + { + "epoch": 3.0467684313088665, + "grad_norm": 3.1933822631835938, + "learning_rate": 2.4684871513023516e-06, + "loss": 0.4895, + "step": 6255 + }, + { + "epoch": 3.0472556024683337, + "grad_norm": 3.3041627407073975, + "learning_rate": 2.4678440670548396e-06, + "loss": 0.4772, + "step": 6256 + }, + { + "epoch": 3.0477427736278013, + "grad_norm": 3.220651865005493, + "learning_rate": 2.4672009849354035e-06, + "loss": 0.4755, + "step": 6257 + }, + { + "epoch": 3.0482299447872685, + "grad_norm": 3.09544038772583, + "learning_rate": 2.4665579049866024e-06, + "loss": 0.4737, + "step": 6258 + }, + { + "epoch": 3.048717115946736, + "grad_norm": 3.1858251094818115, + "learning_rate": 2.465914827250998e-06, + "loss": 0.427, + "step": 6259 + }, + { + "epoch": 3.0492042871062033, + "grad_norm": 2.42977237701416, + "learning_rate": 2.465271751771145e-06, + "loss": 0.3751, + "step": 6260 + }, + { + "epoch": 3.049691458265671, + "grad_norm": 2.8084092140197754, + "learning_rate": 2.464628678589605e-06, + "loss": 0.4461, + "step": 6261 + }, + { + "epoch": 3.050178629425138, + "grad_norm": 3.3125905990600586, + "learning_rate": 2.4639856077489364e-06, + "loss": 0.4321, + "step": 6262 + }, + { + "epoch": 3.0506658005846052, + "grad_norm": 3.3144967555999756, + "learning_rate": 2.4633425392916966e-06, + "loss": 0.3955, + "step": 6263 + }, + { + "epoch": 3.051152971744073, + "grad_norm": 2.8186709880828857, + "learning_rate": 2.4626994732604443e-06, + "loss": 0.4252, + "step": 6264 + }, + { + "epoch": 3.05164014290354, + "grad_norm": 2.7992634773254395, + "learning_rate": 2.462056409697737e-06, + "loss": 0.4293, + "step": 6265 + }, + { + "epoch": 3.0521273140630076, + "grad_norm": 2.9284369945526123, + "learning_rate": 2.4614133486461324e-06, + "loss": 0.4334, + "step": 6266 + }, + { + "epoch": 3.052614485222475, + "grad_norm": 3.018265962600708, + "learning_rate": 2.4607702901481903e-06, + "loss": 0.4395, + "step": 6267 + }, + { + "epoch": 3.053101656381942, + "grad_norm": 3.008685350418091, + "learning_rate": 2.4601272342464657e-06, + "loss": 0.4749, + "step": 6268 + }, + { + "epoch": 3.0535888275414096, + "grad_norm": 2.55574369430542, + "learning_rate": 2.4594841809835163e-06, + "loss": 0.4085, + "step": 6269 + }, + { + "epoch": 3.0540759987008768, + "grad_norm": 3.0475966930389404, + "learning_rate": 2.4588411304019006e-06, + "loss": 0.4197, + "step": 6270 + }, + { + "epoch": 3.0545631698603444, + "grad_norm": 2.649879217147827, + "learning_rate": 2.4581980825441764e-06, + "loss": 0.4245, + "step": 6271 + }, + { + "epoch": 3.0550503410198115, + "grad_norm": 2.9600448608398438, + "learning_rate": 2.4575550374528978e-06, + "loss": 0.4027, + "step": 6272 + }, + { + "epoch": 3.055537512179279, + "grad_norm": 2.976940870285034, + "learning_rate": 2.456911995170623e-06, + "loss": 0.496, + "step": 6273 + }, + { + "epoch": 3.0560246833387463, + "grad_norm": 2.9722747802734375, + "learning_rate": 2.4562689557399085e-06, + "loss": 0.4521, + "step": 6274 + }, + { + "epoch": 3.0565118544982135, + "grad_norm": 3.215508222579956, + "learning_rate": 2.455625919203311e-06, + "loss": 0.3757, + "step": 6275 + }, + { + "epoch": 3.056999025657681, + "grad_norm": 2.863736391067505, + "learning_rate": 2.4549828856033855e-06, + "loss": 0.4378, + "step": 6276 + }, + { + "epoch": 3.0574861968171483, + "grad_norm": 3.0192081928253174, + "learning_rate": 2.454339854982688e-06, + "loss": 0.4294, + "step": 6277 + }, + { + "epoch": 3.057973367976616, + "grad_norm": 2.658202886581421, + "learning_rate": 2.453696827383775e-06, + "loss": 0.3886, + "step": 6278 + }, + { + "epoch": 3.058460539136083, + "grad_norm": 2.9935131072998047, + "learning_rate": 2.453053802849203e-06, + "loss": 0.4756, + "step": 6279 + }, + { + "epoch": 3.0589477102955507, + "grad_norm": 2.5292067527770996, + "learning_rate": 2.452410781421524e-06, + "loss": 0.3819, + "step": 6280 + }, + { + "epoch": 3.059434881455018, + "grad_norm": 3.1333227157592773, + "learning_rate": 2.4517677631432956e-06, + "loss": 0.3867, + "step": 6281 + }, + { + "epoch": 3.059922052614485, + "grad_norm": 3.1795568466186523, + "learning_rate": 2.4511247480570724e-06, + "loss": 0.3812, + "step": 6282 + }, + { + "epoch": 3.0604092237739526, + "grad_norm": 3.2791695594787598, + "learning_rate": 2.450481736205409e-06, + "loss": 0.5017, + "step": 6283 + }, + { + "epoch": 3.06089639493342, + "grad_norm": 2.5865516662597656, + "learning_rate": 2.4498387276308594e-06, + "loss": 0.417, + "step": 6284 + }, + { + "epoch": 3.0613835660928874, + "grad_norm": 3.732316017150879, + "learning_rate": 2.449195722375978e-06, + "loss": 0.5021, + "step": 6285 + }, + { + "epoch": 3.0618707372523546, + "grad_norm": 2.9873392581939697, + "learning_rate": 2.4485527204833193e-06, + "loss": 0.3981, + "step": 6286 + }, + { + "epoch": 3.062357908411822, + "grad_norm": 2.6503403186798096, + "learning_rate": 2.4479097219954373e-06, + "loss": 0.3819, + "step": 6287 + }, + { + "epoch": 3.0628450795712894, + "grad_norm": 3.064992666244507, + "learning_rate": 2.447266726954884e-06, + "loss": 0.3959, + "step": 6288 + }, + { + "epoch": 3.0633322507307565, + "grad_norm": 3.004530906677246, + "learning_rate": 2.4466237354042137e-06, + "loss": 0.3854, + "step": 6289 + }, + { + "epoch": 3.063819421890224, + "grad_norm": 3.399733543395996, + "learning_rate": 2.44598074738598e-06, + "loss": 0.4498, + "step": 6290 + }, + { + "epoch": 3.0643065930496913, + "grad_norm": 2.7867703437805176, + "learning_rate": 2.4453377629427363e-06, + "loss": 0.3764, + "step": 6291 + }, + { + "epoch": 3.064793764209159, + "grad_norm": 3.211421012878418, + "learning_rate": 2.4446947821170333e-06, + "loss": 0.4637, + "step": 6292 + }, + { + "epoch": 3.065280935368626, + "grad_norm": 2.739664316177368, + "learning_rate": 2.4440518049514244e-06, + "loss": 0.4037, + "step": 6293 + }, + { + "epoch": 3.0657681065280937, + "grad_norm": 2.879425525665283, + "learning_rate": 2.443408831488462e-06, + "loss": 0.402, + "step": 6294 + }, + { + "epoch": 3.066255277687561, + "grad_norm": 3.0856258869171143, + "learning_rate": 2.4427658617706986e-06, + "loss": 0.4922, + "step": 6295 + }, + { + "epoch": 3.066742448847028, + "grad_norm": 2.547354221343994, + "learning_rate": 2.442122895840684e-06, + "loss": 0.3593, + "step": 6296 + }, + { + "epoch": 3.0672296200064957, + "grad_norm": 2.814624071121216, + "learning_rate": 2.4414799337409715e-06, + "loss": 0.4181, + "step": 6297 + }, + { + "epoch": 3.067716791165963, + "grad_norm": 2.885528802871704, + "learning_rate": 2.440836975514111e-06, + "loss": 0.4423, + "step": 6298 + }, + { + "epoch": 3.0682039623254305, + "grad_norm": 2.816465377807617, + "learning_rate": 2.440194021202655e-06, + "loss": 0.4189, + "step": 6299 + }, + { + "epoch": 3.0686911334848976, + "grad_norm": 3.1811749935150146, + "learning_rate": 2.4395510708491523e-06, + "loss": 0.4248, + "step": 6300 + }, + { + "epoch": 3.0691783046443653, + "grad_norm": 2.5418789386749268, + "learning_rate": 2.4389081244961537e-06, + "loss": 0.4307, + "step": 6301 + }, + { + "epoch": 3.0696654758038324, + "grad_norm": 2.8237767219543457, + "learning_rate": 2.4382651821862106e-06, + "loss": 0.41, + "step": 6302 + }, + { + "epoch": 3.0701526469632996, + "grad_norm": 3.0020081996917725, + "learning_rate": 2.437622243961872e-06, + "loss": 0.3758, + "step": 6303 + }, + { + "epoch": 3.070639818122767, + "grad_norm": 2.9348766803741455, + "learning_rate": 2.4369793098656867e-06, + "loss": 0.4138, + "step": 6304 + }, + { + "epoch": 3.0711269892822344, + "grad_norm": 3.149221897125244, + "learning_rate": 2.4363363799402055e-06, + "loss": 0.389, + "step": 6305 + }, + { + "epoch": 3.071614160441702, + "grad_norm": 2.8101205825805664, + "learning_rate": 2.435693454227976e-06, + "loss": 0.4281, + "step": 6306 + }, + { + "epoch": 3.072101331601169, + "grad_norm": 3.29691481590271, + "learning_rate": 2.4350505327715487e-06, + "loss": 0.4228, + "step": 6307 + }, + { + "epoch": 3.0725885027606368, + "grad_norm": 2.6513195037841797, + "learning_rate": 2.434407615613471e-06, + "loss": 0.4212, + "step": 6308 + }, + { + "epoch": 3.073075673920104, + "grad_norm": 2.8037428855895996, + "learning_rate": 2.43376470279629e-06, + "loss": 0.4632, + "step": 6309 + }, + { + "epoch": 3.073562845079571, + "grad_norm": 3.084099531173706, + "learning_rate": 2.433121794362556e-06, + "loss": 0.4086, + "step": 6310 + }, + { + "epoch": 3.0740500162390387, + "grad_norm": 3.274348258972168, + "learning_rate": 2.432478890354815e-06, + "loss": 0.4955, + "step": 6311 + }, + { + "epoch": 3.074537187398506, + "grad_norm": 2.958631753921509, + "learning_rate": 2.431835990815615e-06, + "loss": 0.4065, + "step": 6312 + }, + { + "epoch": 3.0750243585579735, + "grad_norm": 2.7410166263580322, + "learning_rate": 2.431193095787502e-06, + "loss": 0.3474, + "step": 6313 + }, + { + "epoch": 3.0755115297174407, + "grad_norm": 2.991947889328003, + "learning_rate": 2.4305502053130246e-06, + "loss": 0.4027, + "step": 6314 + }, + { + "epoch": 3.0759987008769083, + "grad_norm": 2.722734212875366, + "learning_rate": 2.4299073194347276e-06, + "loss": 0.4484, + "step": 6315 + }, + { + "epoch": 3.0764858720363755, + "grad_norm": 2.8242263793945312, + "learning_rate": 2.4292644381951574e-06, + "loss": 0.4247, + "step": 6316 + }, + { + "epoch": 3.0769730431958426, + "grad_norm": 2.849710464477539, + "learning_rate": 2.42862156163686e-06, + "loss": 0.4608, + "step": 6317 + }, + { + "epoch": 3.0774602143553103, + "grad_norm": 2.9734718799591064, + "learning_rate": 2.427978689802382e-06, + "loss": 0.4919, + "step": 6318 + }, + { + "epoch": 3.0779473855147774, + "grad_norm": 2.7178714275360107, + "learning_rate": 2.4273358227342668e-06, + "loss": 0.4011, + "step": 6319 + }, + { + "epoch": 3.078434556674245, + "grad_norm": 2.7906203269958496, + "learning_rate": 2.42669296047506e-06, + "loss": 0.4447, + "step": 6320 + }, + { + "epoch": 3.078921727833712, + "grad_norm": 2.9076356887817383, + "learning_rate": 2.426050103067306e-06, + "loss": 0.4602, + "step": 6321 + }, + { + "epoch": 3.0794088989931794, + "grad_norm": 2.920454263687134, + "learning_rate": 2.425407250553551e-06, + "loss": 0.4829, + "step": 6322 + }, + { + "epoch": 3.079896070152647, + "grad_norm": 2.6117300987243652, + "learning_rate": 2.4247644029763354e-06, + "loss": 0.4053, + "step": 6323 + }, + { + "epoch": 3.080383241312114, + "grad_norm": 2.92836856842041, + "learning_rate": 2.424121560378205e-06, + "loss": 0.4163, + "step": 6324 + }, + { + "epoch": 3.0808704124715818, + "grad_norm": 2.648327350616455, + "learning_rate": 2.423478722801703e-06, + "loss": 0.4468, + "step": 6325 + }, + { + "epoch": 3.081357583631049, + "grad_norm": 2.8104588985443115, + "learning_rate": 2.422835890289372e-06, + "loss": 0.4235, + "step": 6326 + }, + { + "epoch": 3.0818447547905166, + "grad_norm": 2.5254592895507812, + "learning_rate": 2.4221930628837547e-06, + "loss": 0.4014, + "step": 6327 + }, + { + "epoch": 3.0823319259499837, + "grad_norm": 2.688354015350342, + "learning_rate": 2.4215502406273932e-06, + "loss": 0.379, + "step": 6328 + }, + { + "epoch": 3.082819097109451, + "grad_norm": 3.040249824523926, + "learning_rate": 2.4209074235628287e-06, + "loss": 0.4589, + "step": 6329 + }, + { + "epoch": 3.0833062682689185, + "grad_norm": 3.0802531242370605, + "learning_rate": 2.4202646117326047e-06, + "loss": 0.4262, + "step": 6330 + }, + { + "epoch": 3.0837934394283857, + "grad_norm": 2.7871150970458984, + "learning_rate": 2.419621805179261e-06, + "loss": 0.4386, + "step": 6331 + }, + { + "epoch": 3.0842806105878533, + "grad_norm": 2.7104644775390625, + "learning_rate": 2.4189790039453377e-06, + "loss": 0.4009, + "step": 6332 + }, + { + "epoch": 3.0847677817473205, + "grad_norm": 2.921281099319458, + "learning_rate": 2.418336208073377e-06, + "loss": 0.3667, + "step": 6333 + }, + { + "epoch": 3.085254952906788, + "grad_norm": 2.8950068950653076, + "learning_rate": 2.4176934176059194e-06, + "loss": 0.4084, + "step": 6334 + }, + { + "epoch": 3.0857421240662553, + "grad_norm": 3.3960916996002197, + "learning_rate": 2.4170506325855024e-06, + "loss": 0.4237, + "step": 6335 + }, + { + "epoch": 3.0862292952257224, + "grad_norm": 2.579437732696533, + "learning_rate": 2.4164078530546674e-06, + "loss": 0.389, + "step": 6336 + }, + { + "epoch": 3.08671646638519, + "grad_norm": 3.208162307739258, + "learning_rate": 2.4157650790559523e-06, + "loss": 0.4609, + "step": 6337 + }, + { + "epoch": 3.087203637544657, + "grad_norm": 2.9986488819122314, + "learning_rate": 2.415122310631898e-06, + "loss": 0.4346, + "step": 6338 + }, + { + "epoch": 3.087690808704125, + "grad_norm": 2.7129085063934326, + "learning_rate": 2.41447954782504e-06, + "loss": 0.4054, + "step": 6339 + }, + { + "epoch": 3.088177979863592, + "grad_norm": 2.839789628982544, + "learning_rate": 2.4138367906779172e-06, + "loss": 0.4222, + "step": 6340 + }, + { + "epoch": 3.0886651510230596, + "grad_norm": 2.9176788330078125, + "learning_rate": 2.413194039233068e-06, + "loss": 0.4373, + "step": 6341 + }, + { + "epoch": 3.0891523221825268, + "grad_norm": 3.003857135772705, + "learning_rate": 2.4125512935330304e-06, + "loss": 0.4261, + "step": 6342 + }, + { + "epoch": 3.089639493341994, + "grad_norm": 3.4796695709228516, + "learning_rate": 2.4119085536203383e-06, + "loss": 0.4616, + "step": 6343 + }, + { + "epoch": 3.0901266645014616, + "grad_norm": 3.0403997898101807, + "learning_rate": 2.4112658195375304e-06, + "loss": 0.421, + "step": 6344 + }, + { + "epoch": 3.0906138356609287, + "grad_norm": 3.2307119369506836, + "learning_rate": 2.4106230913271426e-06, + "loss": 0.4519, + "step": 6345 + }, + { + "epoch": 3.0911010068203963, + "grad_norm": 3.3156230449676514, + "learning_rate": 2.409980369031711e-06, + "loss": 0.4792, + "step": 6346 + }, + { + "epoch": 3.0915881779798635, + "grad_norm": 2.7295517921447754, + "learning_rate": 2.409337652693769e-06, + "loss": 0.3942, + "step": 6347 + }, + { + "epoch": 3.092075349139331, + "grad_norm": 3.2851719856262207, + "learning_rate": 2.408694942355853e-06, + "loss": 0.3923, + "step": 6348 + }, + { + "epoch": 3.0925625202987983, + "grad_norm": 3.2898569107055664, + "learning_rate": 2.4080522380604972e-06, + "loss": 0.4596, + "step": 6349 + }, + { + "epoch": 3.0930496914582655, + "grad_norm": 2.6574935913085938, + "learning_rate": 2.4074095398502364e-06, + "loss": 0.4055, + "step": 6350 + }, + { + "epoch": 3.093536862617733, + "grad_norm": 3.5035288333892822, + "learning_rate": 2.4067668477676034e-06, + "loss": 0.4432, + "step": 6351 + }, + { + "epoch": 3.0940240337772003, + "grad_norm": 2.7650086879730225, + "learning_rate": 2.406124161855131e-06, + "loss": 0.4342, + "step": 6352 + }, + { + "epoch": 3.094511204936668, + "grad_norm": 3.1407265663146973, + "learning_rate": 2.405481482155353e-06, + "loss": 0.4457, + "step": 6353 + }, + { + "epoch": 3.094998376096135, + "grad_norm": 2.9623918533325195, + "learning_rate": 2.4048388087108025e-06, + "loss": 0.4299, + "step": 6354 + }, + { + "epoch": 3.0954855472556027, + "grad_norm": 2.8543412685394287, + "learning_rate": 2.40419614156401e-06, + "loss": 0.4041, + "step": 6355 + }, + { + "epoch": 3.09597271841507, + "grad_norm": 2.9520721435546875, + "learning_rate": 2.4035534807575083e-06, + "loss": 0.4328, + "step": 6356 + }, + { + "epoch": 3.096459889574537, + "grad_norm": 2.9071664810180664, + "learning_rate": 2.4029108263338275e-06, + "loss": 0.4504, + "step": 6357 + }, + { + "epoch": 3.0969470607340046, + "grad_norm": 3.18149995803833, + "learning_rate": 2.402268178335501e-06, + "loss": 0.4906, + "step": 6358 + }, + { + "epoch": 3.097434231893472, + "grad_norm": 2.7301859855651855, + "learning_rate": 2.4016255368050556e-06, + "loss": 0.398, + "step": 6359 + }, + { + "epoch": 3.0979214030529394, + "grad_norm": 2.570680856704712, + "learning_rate": 2.4009829017850235e-06, + "loss": 0.3873, + "step": 6360 + }, + { + "epoch": 3.0984085742124066, + "grad_norm": 3.023310422897339, + "learning_rate": 2.4003402733179334e-06, + "loss": 0.5033, + "step": 6361 + }, + { + "epoch": 3.098895745371874, + "grad_norm": 2.692723035812378, + "learning_rate": 2.399697651446316e-06, + "loss": 0.434, + "step": 6362 + }, + { + "epoch": 3.0993829165313413, + "grad_norm": 2.5954439640045166, + "learning_rate": 2.3990550362126976e-06, + "loss": 0.4321, + "step": 6363 + }, + { + "epoch": 3.0998700876908085, + "grad_norm": 2.616060495376587, + "learning_rate": 2.3984124276596075e-06, + "loss": 0.367, + "step": 6364 + }, + { + "epoch": 3.100357258850276, + "grad_norm": 2.659353733062744, + "learning_rate": 2.397769825829574e-06, + "loss": 0.4596, + "step": 6365 + }, + { + "epoch": 3.1008444300097433, + "grad_norm": 2.762568950653076, + "learning_rate": 2.3971272307651238e-06, + "loss": 0.4643, + "step": 6366 + }, + { + "epoch": 3.101331601169211, + "grad_norm": 3.161146640777588, + "learning_rate": 2.3964846425087833e-06, + "loss": 0.5041, + "step": 6367 + }, + { + "epoch": 3.101818772328678, + "grad_norm": 2.7753281593322754, + "learning_rate": 2.3958420611030797e-06, + "loss": 0.384, + "step": 6368 + }, + { + "epoch": 3.1023059434881457, + "grad_norm": 3.0383639335632324, + "learning_rate": 2.39519948659054e-06, + "loss": 0.4719, + "step": 6369 + }, + { + "epoch": 3.102793114647613, + "grad_norm": 2.974001884460449, + "learning_rate": 2.394556919013687e-06, + "loss": 0.4442, + "step": 6370 + }, + { + "epoch": 3.10328028580708, + "grad_norm": 2.979387044906616, + "learning_rate": 2.3939143584150477e-06, + "loss": 0.4645, + "step": 6371 + }, + { + "epoch": 3.1037674569665477, + "grad_norm": 3.367691993713379, + "learning_rate": 2.3932718048371466e-06, + "loss": 0.5096, + "step": 6372 + }, + { + "epoch": 3.104254628126015, + "grad_norm": 2.944662570953369, + "learning_rate": 2.3926292583225076e-06, + "loss": 0.4246, + "step": 6373 + }, + { + "epoch": 3.1047417992854824, + "grad_norm": 4.3255391120910645, + "learning_rate": 2.3919867189136542e-06, + "loss": 0.4231, + "step": 6374 + }, + { + "epoch": 3.1052289704449496, + "grad_norm": 3.0486385822296143, + "learning_rate": 2.3913441866531093e-06, + "loss": 0.4458, + "step": 6375 + }, + { + "epoch": 3.1057161416044172, + "grad_norm": 3.388056993484497, + "learning_rate": 2.390701661583396e-06, + "loss": 0.4652, + "step": 6376 + }, + { + "epoch": 3.1062033127638844, + "grad_norm": 2.8679208755493164, + "learning_rate": 2.3900591437470377e-06, + "loss": 0.3945, + "step": 6377 + }, + { + "epoch": 3.1066904839233516, + "grad_norm": 2.9624321460723877, + "learning_rate": 2.3894166331865535e-06, + "loss": 0.4529, + "step": 6378 + }, + { + "epoch": 3.107177655082819, + "grad_norm": 2.8569698333740234, + "learning_rate": 2.3887741299444667e-06, + "loss": 0.3647, + "step": 6379 + }, + { + "epoch": 3.1076648262422863, + "grad_norm": 3.1908977031707764, + "learning_rate": 2.3881316340632973e-06, + "loss": 0.4638, + "step": 6380 + }, + { + "epoch": 3.108151997401754, + "grad_norm": 2.851241111755371, + "learning_rate": 2.3874891455855673e-06, + "loss": 0.4139, + "step": 6381 + }, + { + "epoch": 3.108639168561221, + "grad_norm": 3.03897762298584, + "learning_rate": 2.386846664553794e-06, + "loss": 0.4226, + "step": 6382 + }, + { + "epoch": 3.1091263397206887, + "grad_norm": 3.239306926727295, + "learning_rate": 2.386204191010498e-06, + "loss": 0.4535, + "step": 6383 + }, + { + "epoch": 3.109613510880156, + "grad_norm": 2.666149377822876, + "learning_rate": 2.3855617249981976e-06, + "loss": 0.3801, + "step": 6384 + }, + { + "epoch": 3.110100682039623, + "grad_norm": 2.9478163719177246, + "learning_rate": 2.3849192665594128e-06, + "loss": 0.4733, + "step": 6385 + }, + { + "epoch": 3.1105878531990907, + "grad_norm": 2.759777784347534, + "learning_rate": 2.3842768157366595e-06, + "loss": 0.372, + "step": 6386 + }, + { + "epoch": 3.111075024358558, + "grad_norm": 2.712778329849243, + "learning_rate": 2.3836343725724555e-06, + "loss": 0.441, + "step": 6387 + }, + { + "epoch": 3.1115621955180255, + "grad_norm": 2.7941765785217285, + "learning_rate": 2.382991937109318e-06, + "loss": 0.3704, + "step": 6388 + }, + { + "epoch": 3.1120493666774927, + "grad_norm": 3.3385088443756104, + "learning_rate": 2.3823495093897643e-06, + "loss": 0.4209, + "step": 6389 + }, + { + "epoch": 3.1125365378369603, + "grad_norm": 3.0453195571899414, + "learning_rate": 2.381707089456308e-06, + "loss": 0.4236, + "step": 6390 + }, + { + "epoch": 3.1130237089964274, + "grad_norm": 3.205662965774536, + "learning_rate": 2.3810646773514653e-06, + "loss": 0.4372, + "step": 6391 + }, + { + "epoch": 3.1135108801558946, + "grad_norm": 2.6793105602264404, + "learning_rate": 2.380422273117752e-06, + "loss": 0.3989, + "step": 6392 + }, + { + "epoch": 3.1139980513153622, + "grad_norm": 2.7323365211486816, + "learning_rate": 2.3797798767976813e-06, + "loss": 0.4228, + "step": 6393 + }, + { + "epoch": 3.1144852224748294, + "grad_norm": 2.780149459838867, + "learning_rate": 2.3791374884337667e-06, + "loss": 0.4491, + "step": 6394 + }, + { + "epoch": 3.114972393634297, + "grad_norm": 2.9850192070007324, + "learning_rate": 2.3784951080685226e-06, + "loss": 0.4507, + "step": 6395 + }, + { + "epoch": 3.115459564793764, + "grad_norm": 3.2376632690429688, + "learning_rate": 2.37785273574446e-06, + "loss": 0.4269, + "step": 6396 + }, + { + "epoch": 3.115946735953232, + "grad_norm": 3.041234254837036, + "learning_rate": 2.377210371504093e-06, + "loss": 0.4143, + "step": 6397 + }, + { + "epoch": 3.116433907112699, + "grad_norm": 2.9712934494018555, + "learning_rate": 2.376568015389932e-06, + "loss": 0.4578, + "step": 6398 + }, + { + "epoch": 3.116921078272166, + "grad_norm": 3.766894817352295, + "learning_rate": 2.3759256674444873e-06, + "loss": 0.4529, + "step": 6399 + }, + { + "epoch": 3.1174082494316337, + "grad_norm": 2.6923458576202393, + "learning_rate": 2.375283327710271e-06, + "loss": 0.4166, + "step": 6400 + }, + { + "epoch": 3.117895420591101, + "grad_norm": 2.9219865798950195, + "learning_rate": 2.3746409962297934e-06, + "loss": 0.4297, + "step": 6401 + }, + { + "epoch": 3.1183825917505685, + "grad_norm": 3.0985822677612305, + "learning_rate": 2.3739986730455617e-06, + "loss": 0.5016, + "step": 6402 + }, + { + "epoch": 3.1188697629100357, + "grad_norm": 2.9376957416534424, + "learning_rate": 2.373356358200086e-06, + "loss": 0.4065, + "step": 6403 + }, + { + "epoch": 3.119356934069503, + "grad_norm": 2.820084810256958, + "learning_rate": 2.3727140517358747e-06, + "loss": 0.4391, + "step": 6404 + }, + { + "epoch": 3.1198441052289705, + "grad_norm": 2.9548752307891846, + "learning_rate": 2.372071753695437e-06, + "loss": 0.4034, + "step": 6405 + }, + { + "epoch": 3.1203312763884377, + "grad_norm": 2.903090476989746, + "learning_rate": 2.3714294641212778e-06, + "loss": 0.4489, + "step": 6406 + }, + { + "epoch": 3.1208184475479053, + "grad_norm": 2.5088720321655273, + "learning_rate": 2.370787183055904e-06, + "loss": 0.3931, + "step": 6407 + }, + { + "epoch": 3.1213056187073724, + "grad_norm": 3.3416080474853516, + "learning_rate": 2.3701449105418233e-06, + "loss": 0.3477, + "step": 6408 + }, + { + "epoch": 3.12179278986684, + "grad_norm": 2.815720558166504, + "learning_rate": 2.3695026466215405e-06, + "loss": 0.4055, + "step": 6409 + }, + { + "epoch": 3.1222799610263072, + "grad_norm": 3.0532524585723877, + "learning_rate": 2.36886039133756e-06, + "loss": 0.4687, + "step": 6410 + }, + { + "epoch": 3.1227671321857744, + "grad_norm": 2.804419755935669, + "learning_rate": 2.368218144732386e-06, + "loss": 0.4385, + "step": 6411 + }, + { + "epoch": 3.123254303345242, + "grad_norm": 2.6920220851898193, + "learning_rate": 2.3675759068485238e-06, + "loss": 0.4352, + "step": 6412 + }, + { + "epoch": 3.123741474504709, + "grad_norm": 2.827303409576416, + "learning_rate": 2.3669336777284757e-06, + "loss": 0.4023, + "step": 6413 + }, + { + "epoch": 3.124228645664177, + "grad_norm": 2.7203948497772217, + "learning_rate": 2.3662914574147437e-06, + "loss": 0.4285, + "step": 6414 + }, + { + "epoch": 3.124715816823644, + "grad_norm": 2.8293209075927734, + "learning_rate": 2.3656492459498315e-06, + "loss": 0.4376, + "step": 6415 + }, + { + "epoch": 3.1252029879831116, + "grad_norm": 2.8547167778015137, + "learning_rate": 2.3650070433762394e-06, + "loss": 0.4363, + "step": 6416 + }, + { + "epoch": 3.1256901591425788, + "grad_norm": 3.014270305633545, + "learning_rate": 2.3643648497364685e-06, + "loss": 0.3789, + "step": 6417 + }, + { + "epoch": 3.126177330302046, + "grad_norm": 2.6962106227874756, + "learning_rate": 2.363722665073019e-06, + "loss": 0.4145, + "step": 6418 + }, + { + "epoch": 3.1266645014615135, + "grad_norm": 2.8428688049316406, + "learning_rate": 2.363080489428391e-06, + "loss": 0.3956, + "step": 6419 + }, + { + "epoch": 3.1271516726209807, + "grad_norm": 3.0398361682891846, + "learning_rate": 2.362438322845084e-06, + "loss": 0.4191, + "step": 6420 + }, + { + "epoch": 3.1276388437804483, + "grad_norm": 3.265127182006836, + "learning_rate": 2.3617961653655955e-06, + "loss": 0.4609, + "step": 6421 + }, + { + "epoch": 3.1281260149399155, + "grad_norm": 3.012047052383423, + "learning_rate": 2.3611540170324235e-06, + "loss": 0.3809, + "step": 6422 + }, + { + "epoch": 3.128613186099383, + "grad_norm": 3.0791215896606445, + "learning_rate": 2.360511877888066e-06, + "loss": 0.4341, + "step": 6423 + }, + { + "epoch": 3.1291003572588503, + "grad_norm": 2.911558151245117, + "learning_rate": 2.35986974797502e-06, + "loss": 0.3891, + "step": 6424 + }, + { + "epoch": 3.1295875284183174, + "grad_norm": 3.317901372909546, + "learning_rate": 2.35922762733578e-06, + "loss": 0.4477, + "step": 6425 + }, + { + "epoch": 3.130074699577785, + "grad_norm": 3.1349852085113525, + "learning_rate": 2.3585855160128433e-06, + "loss": 0.4543, + "step": 6426 + }, + { + "epoch": 3.1305618707372522, + "grad_norm": 2.9666359424591064, + "learning_rate": 2.3579434140487036e-06, + "loss": 0.4272, + "step": 6427 + }, + { + "epoch": 3.13104904189672, + "grad_norm": 2.6903271675109863, + "learning_rate": 2.3573013214858563e-06, + "loss": 0.3646, + "step": 6428 + }, + { + "epoch": 3.131536213056187, + "grad_norm": 2.7028281688690186, + "learning_rate": 2.356659238366793e-06, + "loss": 0.4131, + "step": 6429 + }, + { + "epoch": 3.1320233842156546, + "grad_norm": 3.139085292816162, + "learning_rate": 2.3560171647340087e-06, + "loss": 0.4134, + "step": 6430 + }, + { + "epoch": 3.132510555375122, + "grad_norm": 3.29486346244812, + "learning_rate": 2.3553751006299945e-06, + "loss": 0.4344, + "step": 6431 + }, + { + "epoch": 3.132997726534589, + "grad_norm": 2.787954330444336, + "learning_rate": 2.354733046097244e-06, + "loss": 0.4243, + "step": 6432 + }, + { + "epoch": 3.1334848976940566, + "grad_norm": 3.022313356399536, + "learning_rate": 2.3540910011782457e-06, + "loss": 0.3712, + "step": 6433 + }, + { + "epoch": 3.1339720688535238, + "grad_norm": 3.037585973739624, + "learning_rate": 2.3534489659154913e-06, + "loss": 0.4193, + "step": 6434 + }, + { + "epoch": 3.1344592400129914, + "grad_norm": 2.9410438537597656, + "learning_rate": 2.3528069403514716e-06, + "loss": 0.5024, + "step": 6435 + }, + { + "epoch": 3.1349464111724585, + "grad_norm": 3.3817570209503174, + "learning_rate": 2.3521649245286747e-06, + "loss": 0.455, + "step": 6436 + }, + { + "epoch": 3.135433582331926, + "grad_norm": 3.0168449878692627, + "learning_rate": 2.3515229184895886e-06, + "loss": 0.4297, + "step": 6437 + }, + { + "epoch": 3.1359207534913933, + "grad_norm": 3.050676107406616, + "learning_rate": 2.3508809222767026e-06, + "loss": 0.4055, + "step": 6438 + }, + { + "epoch": 3.1364079246508605, + "grad_norm": 2.932176113128662, + "learning_rate": 2.3502389359325027e-06, + "loss": 0.4538, + "step": 6439 + }, + { + "epoch": 3.136895095810328, + "grad_norm": 2.6907050609588623, + "learning_rate": 2.3495969594994767e-06, + "loss": 0.3653, + "step": 6440 + }, + { + "epoch": 3.1373822669697953, + "grad_norm": 2.9298508167266846, + "learning_rate": 2.34895499302011e-06, + "loss": 0.4621, + "step": 6441 + }, + { + "epoch": 3.137869438129263, + "grad_norm": 3.422785520553589, + "learning_rate": 2.348313036536887e-06, + "loss": 0.4128, + "step": 6442 + }, + { + "epoch": 3.13835660928873, + "grad_norm": 3.0341978073120117, + "learning_rate": 2.3476710900922935e-06, + "loss": 0.4707, + "step": 6443 + }, + { + "epoch": 3.1388437804481977, + "grad_norm": 2.661345958709717, + "learning_rate": 2.3470291537288137e-06, + "loss": 0.344, + "step": 6444 + }, + { + "epoch": 3.139330951607665, + "grad_norm": 2.819606304168701, + "learning_rate": 2.3463872274889295e-06, + "loss": 0.426, + "step": 6445 + }, + { + "epoch": 3.139818122767132, + "grad_norm": 3.0431718826293945, + "learning_rate": 2.3457453114151236e-06, + "loss": 0.4368, + "step": 6446 + }, + { + "epoch": 3.1403052939265996, + "grad_norm": 2.7915444374084473, + "learning_rate": 2.3451034055498796e-06, + "loss": 0.4264, + "step": 6447 + }, + { + "epoch": 3.140792465086067, + "grad_norm": 3.191951036453247, + "learning_rate": 2.3444615099356785e-06, + "loss": 0.5111, + "step": 6448 + }, + { + "epoch": 3.1412796362455344, + "grad_norm": 2.852405071258545, + "learning_rate": 2.343819624614999e-06, + "loss": 0.4309, + "step": 6449 + }, + { + "epoch": 3.1417668074050016, + "grad_norm": 3.26533579826355, + "learning_rate": 2.343177749630322e-06, + "loss": 0.4399, + "step": 6450 + }, + { + "epoch": 3.1422539785644688, + "grad_norm": 2.9827165603637695, + "learning_rate": 2.3425358850241274e-06, + "loss": 0.4351, + "step": 6451 + }, + { + "epoch": 3.1427411497239364, + "grad_norm": 2.9166476726531982, + "learning_rate": 2.341894030838894e-06, + "loss": 0.4454, + "step": 6452 + }, + { + "epoch": 3.1432283208834035, + "grad_norm": 3.0907366275787354, + "learning_rate": 2.3412521871170983e-06, + "loss": 0.406, + "step": 6453 + }, + { + "epoch": 3.143715492042871, + "grad_norm": 2.8057446479797363, + "learning_rate": 2.3406103539012175e-06, + "loss": 0.4792, + "step": 6454 + }, + { + "epoch": 3.1442026632023383, + "grad_norm": 2.8503239154815674, + "learning_rate": 2.339968531233729e-06, + "loss": 0.4235, + "step": 6455 + }, + { + "epoch": 3.144689834361806, + "grad_norm": 2.7450783252716064, + "learning_rate": 2.339326719157109e-06, + "loss": 0.3753, + "step": 6456 + }, + { + "epoch": 3.145177005521273, + "grad_norm": 3.1386635303497314, + "learning_rate": 2.338684917713831e-06, + "loss": 0.3961, + "step": 6457 + }, + { + "epoch": 3.1456641766807403, + "grad_norm": 3.112344264984131, + "learning_rate": 2.33804312694637e-06, + "loss": 0.4479, + "step": 6458 + }, + { + "epoch": 3.146151347840208, + "grad_norm": 3.228515148162842, + "learning_rate": 2.3374013468972005e-06, + "loss": 0.4378, + "step": 6459 + }, + { + "epoch": 3.146638518999675, + "grad_norm": 3.077791452407837, + "learning_rate": 2.3367595776087944e-06, + "loss": 0.4561, + "step": 6460 + }, + { + "epoch": 3.1471256901591427, + "grad_norm": 3.1386122703552246, + "learning_rate": 2.3361178191236244e-06, + "loss": 0.4836, + "step": 6461 + }, + { + "epoch": 3.14761286131861, + "grad_norm": 2.9100942611694336, + "learning_rate": 2.335476071484161e-06, + "loss": 0.4318, + "step": 6462 + }, + { + "epoch": 3.1481000324780775, + "grad_norm": 3.230483293533325, + "learning_rate": 2.3348343347328765e-06, + "loss": 0.4723, + "step": 6463 + }, + { + "epoch": 3.1485872036375446, + "grad_norm": 3.125913381576538, + "learning_rate": 2.334192608912241e-06, + "loss": 0.4128, + "step": 6464 + }, + { + "epoch": 3.149074374797012, + "grad_norm": 3.038367748260498, + "learning_rate": 2.333550894064722e-06, + "loss": 0.4682, + "step": 6465 + }, + { + "epoch": 3.1495615459564794, + "grad_norm": 3.098174571990967, + "learning_rate": 2.33290919023279e-06, + "loss": 0.3939, + "step": 6466 + }, + { + "epoch": 3.1500487171159466, + "grad_norm": 2.649027109146118, + "learning_rate": 2.3322674974589125e-06, + "loss": 0.4607, + "step": 6467 + }, + { + "epoch": 3.150535888275414, + "grad_norm": 2.6060338020324707, + "learning_rate": 2.3316258157855557e-06, + "loss": 0.4416, + "step": 6468 + }, + { + "epoch": 3.1510230594348814, + "grad_norm": 3.1972293853759766, + "learning_rate": 2.3309841452551867e-06, + "loss": 0.4698, + "step": 6469 + }, + { + "epoch": 3.151510230594349, + "grad_norm": 3.425679922103882, + "learning_rate": 2.3303424859102714e-06, + "loss": 0.4192, + "step": 6470 + }, + { + "epoch": 3.151997401753816, + "grad_norm": 2.8148539066314697, + "learning_rate": 2.3297008377932754e-06, + "loss": 0.3695, + "step": 6471 + }, + { + "epoch": 3.1524845729132833, + "grad_norm": 2.7040581703186035, + "learning_rate": 2.329059200946661e-06, + "loss": 0.4698, + "step": 6472 + }, + { + "epoch": 3.152971744072751, + "grad_norm": 2.786386489868164, + "learning_rate": 2.328417575412893e-06, + "loss": 0.3831, + "step": 6473 + }, + { + "epoch": 3.153458915232218, + "grad_norm": 3.4130892753601074, + "learning_rate": 2.3277759612344336e-06, + "loss": 0.521, + "step": 6474 + }, + { + "epoch": 3.1539460863916857, + "grad_norm": 3.2028722763061523, + "learning_rate": 2.3271343584537463e-06, + "loss": 0.41, + "step": 6475 + }, + { + "epoch": 3.154433257551153, + "grad_norm": 3.322582960128784, + "learning_rate": 2.32649276711329e-06, + "loss": 0.4264, + "step": 6476 + }, + { + "epoch": 3.1549204287106205, + "grad_norm": 2.954360246658325, + "learning_rate": 2.3258511872555262e-06, + "loss": 0.4446, + "step": 6477 + }, + { + "epoch": 3.1554075998700877, + "grad_norm": 3.1235735416412354, + "learning_rate": 2.325209618922915e-06, + "loss": 0.4007, + "step": 6478 + }, + { + "epoch": 3.155894771029555, + "grad_norm": 2.9611527919769287, + "learning_rate": 2.3245680621579154e-06, + "loss": 0.3815, + "step": 6479 + }, + { + "epoch": 3.1563819421890225, + "grad_norm": 2.924724578857422, + "learning_rate": 2.3239265170029844e-06, + "loss": 0.4234, + "step": 6480 + }, + { + "epoch": 3.1568691133484896, + "grad_norm": 2.682651996612549, + "learning_rate": 2.3232849835005796e-06, + "loss": 0.3926, + "step": 6481 + }, + { + "epoch": 3.1573562845079572, + "grad_norm": 2.9162049293518066, + "learning_rate": 2.3226434616931593e-06, + "loss": 0.3805, + "step": 6482 + }, + { + "epoch": 3.1578434556674244, + "grad_norm": 3.0849757194519043, + "learning_rate": 2.3220019516231778e-06, + "loss": 0.4174, + "step": 6483 + }, + { + "epoch": 3.158330626826892, + "grad_norm": 2.7008185386657715, + "learning_rate": 2.32136045333309e-06, + "loss": 0.3672, + "step": 6484 + }, + { + "epoch": 3.158817797986359, + "grad_norm": 2.9157214164733887, + "learning_rate": 2.3207189668653514e-06, + "loss": 0.4295, + "step": 6485 + }, + { + "epoch": 3.1593049691458264, + "grad_norm": 3.3892645835876465, + "learning_rate": 2.3200774922624147e-06, + "loss": 0.4182, + "step": 6486 + }, + { + "epoch": 3.159792140305294, + "grad_norm": 2.944035768508911, + "learning_rate": 2.3194360295667336e-06, + "loss": 0.4155, + "step": 6487 + }, + { + "epoch": 3.160279311464761, + "grad_norm": 3.0591013431549072, + "learning_rate": 2.318794578820758e-06, + "loss": 0.4511, + "step": 6488 + }, + { + "epoch": 3.1607664826242288, + "grad_norm": 2.8015265464782715, + "learning_rate": 2.3181531400669405e-06, + "loss": 0.4032, + "step": 6489 + }, + { + "epoch": 3.161253653783696, + "grad_norm": 3.226877450942993, + "learning_rate": 2.317511713347731e-06, + "loss": 0.45, + "step": 6490 + }, + { + "epoch": 3.1617408249431636, + "grad_norm": 3.179382085800171, + "learning_rate": 2.316870298705581e-06, + "loss": 0.5091, + "step": 6491 + }, + { + "epoch": 3.1622279961026307, + "grad_norm": 2.8791050910949707, + "learning_rate": 2.316228896182936e-06, + "loss": 0.3884, + "step": 6492 + }, + { + "epoch": 3.162715167262098, + "grad_norm": 3.168748617172241, + "learning_rate": 2.3155875058222456e-06, + "loss": 0.4708, + "step": 6493 + }, + { + "epoch": 3.1632023384215655, + "grad_norm": 2.8953638076782227, + "learning_rate": 2.314946127665957e-06, + "loss": 0.454, + "step": 6494 + }, + { + "epoch": 3.1636895095810327, + "grad_norm": 2.8801920413970947, + "learning_rate": 2.314304761756517e-06, + "loss": 0.4154, + "step": 6495 + }, + { + "epoch": 3.1641766807405003, + "grad_norm": 2.7618484497070312, + "learning_rate": 2.3136634081363704e-06, + "loss": 0.4536, + "step": 6496 + }, + { + "epoch": 3.1646638518999675, + "grad_norm": 2.877284526824951, + "learning_rate": 2.3130220668479616e-06, + "loss": 0.4343, + "step": 6497 + }, + { + "epoch": 3.165151023059435, + "grad_norm": 3.0249743461608887, + "learning_rate": 2.312380737933735e-06, + "loss": 0.4352, + "step": 6498 + }, + { + "epoch": 3.1656381942189022, + "grad_norm": 3.198902130126953, + "learning_rate": 2.311739421436135e-06, + "loss": 0.4405, + "step": 6499 + }, + { + "epoch": 3.1661253653783694, + "grad_norm": 3.10052490234375, + "learning_rate": 2.311098117397601e-06, + "loss": 0.4765, + "step": 6500 + }, + { + "epoch": 3.166612536537837, + "grad_norm": 2.7854175567626953, + "learning_rate": 2.3104568258605763e-06, + "loss": 0.4631, + "step": 6501 + }, + { + "epoch": 3.167099707697304, + "grad_norm": 3.2687883377075195, + "learning_rate": 2.3098155468675017e-06, + "loss": 0.4879, + "step": 6502 + }, + { + "epoch": 3.167586878856772, + "grad_norm": 3.298579692840576, + "learning_rate": 2.3091742804608166e-06, + "loss": 0.4935, + "step": 6503 + }, + { + "epoch": 3.168074050016239, + "grad_norm": 3.014486312866211, + "learning_rate": 2.3085330266829594e-06, + "loss": 0.4946, + "step": 6504 + }, + { + "epoch": 3.1685612211757066, + "grad_norm": 2.99694561958313, + "learning_rate": 2.307891785576369e-06, + "loss": 0.4377, + "step": 6505 + }, + { + "epoch": 3.1690483923351738, + "grad_norm": 2.941824436187744, + "learning_rate": 2.3072505571834818e-06, + "loss": 0.4616, + "step": 6506 + }, + { + "epoch": 3.169535563494641, + "grad_norm": 2.663118839263916, + "learning_rate": 2.3066093415467356e-06, + "loss": 0.3941, + "step": 6507 + }, + { + "epoch": 3.1700227346541086, + "grad_norm": 2.850104570388794, + "learning_rate": 2.305968138708565e-06, + "loss": 0.4089, + "step": 6508 + }, + { + "epoch": 3.1705099058135757, + "grad_norm": 2.623734712600708, + "learning_rate": 2.305326948711404e-06, + "loss": 0.4146, + "step": 6509 + }, + { + "epoch": 3.1709970769730433, + "grad_norm": 2.9165098667144775, + "learning_rate": 2.304685771597688e-06, + "loss": 0.3708, + "step": 6510 + }, + { + "epoch": 3.1714842481325105, + "grad_norm": 3.7592391967773438, + "learning_rate": 2.30404460740985e-06, + "loss": 0.4736, + "step": 6511 + }, + { + "epoch": 3.171971419291978, + "grad_norm": 2.7936854362487793, + "learning_rate": 2.3034034561903208e-06, + "loss": 0.4284, + "step": 6512 + }, + { + "epoch": 3.1724585904514453, + "grad_norm": 2.8473174571990967, + "learning_rate": 2.3027623179815327e-06, + "loss": 0.4616, + "step": 6513 + }, + { + "epoch": 3.1729457616109125, + "grad_norm": 2.7413792610168457, + "learning_rate": 2.302121192825916e-06, + "loss": 0.4311, + "step": 6514 + }, + { + "epoch": 3.17343293277038, + "grad_norm": 2.7056472301483154, + "learning_rate": 2.3014800807659012e-06, + "loss": 0.376, + "step": 6515 + }, + { + "epoch": 3.1739201039298472, + "grad_norm": 2.839271068572998, + "learning_rate": 2.3008389818439153e-06, + "loss": 0.4323, + "step": 6516 + }, + { + "epoch": 3.174407275089315, + "grad_norm": 3.2766480445861816, + "learning_rate": 2.300197896102387e-06, + "loss": 0.4357, + "step": 6517 + }, + { + "epoch": 3.174894446248782, + "grad_norm": 3.349045991897583, + "learning_rate": 2.2995568235837443e-06, + "loss": 0.5083, + "step": 6518 + }, + { + "epoch": 3.1753816174082496, + "grad_norm": 2.882255792617798, + "learning_rate": 2.298915764330411e-06, + "loss": 0.4131, + "step": 6519 + }, + { + "epoch": 3.175868788567717, + "grad_norm": 2.859935998916626, + "learning_rate": 2.2982747183848143e-06, + "loss": 0.4393, + "step": 6520 + }, + { + "epoch": 3.176355959727184, + "grad_norm": 2.804713010787964, + "learning_rate": 2.297633685789378e-06, + "loss": 0.3961, + "step": 6521 + }, + { + "epoch": 3.1768431308866516, + "grad_norm": 3.016172409057617, + "learning_rate": 2.296992666586526e-06, + "loss": 0.47, + "step": 6522 + }, + { + "epoch": 3.1773303020461188, + "grad_norm": 3.4900431632995605, + "learning_rate": 2.29635166081868e-06, + "loss": 0.4607, + "step": 6523 + }, + { + "epoch": 3.1778174732055864, + "grad_norm": 3.0495927333831787, + "learning_rate": 2.2957106685282615e-06, + "loss": 0.438, + "step": 6524 + }, + { + "epoch": 3.1783046443650536, + "grad_norm": 2.758716106414795, + "learning_rate": 2.2950696897576925e-06, + "loss": 0.4628, + "step": 6525 + }, + { + "epoch": 3.178791815524521, + "grad_norm": 2.8732657432556152, + "learning_rate": 2.2944287245493928e-06, + "loss": 0.4124, + "step": 6526 + }, + { + "epoch": 3.1792789866839883, + "grad_norm": 2.9772186279296875, + "learning_rate": 2.29378777294578e-06, + "loss": 0.3565, + "step": 6527 + }, + { + "epoch": 3.1797661578434555, + "grad_norm": 3.094050168991089, + "learning_rate": 2.293146834989274e-06, + "loss": 0.3786, + "step": 6528 + }, + { + "epoch": 3.180253329002923, + "grad_norm": 3.1683828830718994, + "learning_rate": 2.2925059107222905e-06, + "loss": 0.4819, + "step": 6529 + }, + { + "epoch": 3.1807405001623903, + "grad_norm": 3.0262985229492188, + "learning_rate": 2.2918650001872473e-06, + "loss": 0.4368, + "step": 6530 + }, + { + "epoch": 3.181227671321858, + "grad_norm": 3.1313395500183105, + "learning_rate": 2.2912241034265587e-06, + "loss": 0.4168, + "step": 6531 + }, + { + "epoch": 3.181714842481325, + "grad_norm": 3.2713184356689453, + "learning_rate": 2.290583220482639e-06, + "loss": 0.458, + "step": 6532 + }, + { + "epoch": 3.1822020136407927, + "grad_norm": 2.769890308380127, + "learning_rate": 2.2899423513979025e-06, + "loss": 0.3679, + "step": 6533 + }, + { + "epoch": 3.18268918480026, + "grad_norm": 3.272951602935791, + "learning_rate": 2.2893014962147624e-06, + "loss": 0.4215, + "step": 6534 + }, + { + "epoch": 3.183176355959727, + "grad_norm": 2.9762344360351562, + "learning_rate": 2.2886606549756286e-06, + "loss": 0.4578, + "step": 6535 + }, + { + "epoch": 3.1836635271191946, + "grad_norm": 3.179069757461548, + "learning_rate": 2.2880198277229134e-06, + "loss": 0.4191, + "step": 6536 + }, + { + "epoch": 3.184150698278662, + "grad_norm": 2.4933204650878906, + "learning_rate": 2.287379014499026e-06, + "loss": 0.4905, + "step": 6537 + }, + { + "epoch": 3.1846378694381294, + "grad_norm": 3.6515586376190186, + "learning_rate": 2.2867382153463766e-06, + "loss": 0.4619, + "step": 6538 + }, + { + "epoch": 3.1851250405975966, + "grad_norm": 3.086937427520752, + "learning_rate": 2.286097430307371e-06, + "loss": 0.4184, + "step": 6539 + }, + { + "epoch": 3.185612211757064, + "grad_norm": 2.6670594215393066, + "learning_rate": 2.285456659424418e-06, + "loss": 0.3973, + "step": 6540 + }, + { + "epoch": 3.1860993829165314, + "grad_norm": 3.014308452606201, + "learning_rate": 2.284815902739923e-06, + "loss": 0.395, + "step": 6541 + }, + { + "epoch": 3.1865865540759986, + "grad_norm": 2.5032262802124023, + "learning_rate": 2.2841751602962927e-06, + "loss": 0.3959, + "step": 6542 + }, + { + "epoch": 3.187073725235466, + "grad_norm": 2.8540964126586914, + "learning_rate": 2.283534432135929e-06, + "loss": 0.417, + "step": 6543 + }, + { + "epoch": 3.1875608963949333, + "grad_norm": 3.115478992462158, + "learning_rate": 2.2828937183012367e-06, + "loss": 0.4338, + "step": 6544 + }, + { + "epoch": 3.188048067554401, + "grad_norm": 2.802051544189453, + "learning_rate": 2.2822530188346178e-06, + "loss": 0.4187, + "step": 6545 + }, + { + "epoch": 3.188535238713868, + "grad_norm": 2.8167014122009277, + "learning_rate": 2.281612333778475e-06, + "loss": 0.4374, + "step": 6546 + }, + { + "epoch": 3.1890224098733353, + "grad_norm": 3.4356179237365723, + "learning_rate": 2.280971663175206e-06, + "loss": 0.3917, + "step": 6547 + }, + { + "epoch": 3.189509581032803, + "grad_norm": 3.0503411293029785, + "learning_rate": 2.280331007067213e-06, + "loss": 0.4148, + "step": 6548 + }, + { + "epoch": 3.18999675219227, + "grad_norm": 3.096773624420166, + "learning_rate": 2.279690365496893e-06, + "loss": 0.3912, + "step": 6549 + }, + { + "epoch": 3.1904839233517377, + "grad_norm": 3.163715362548828, + "learning_rate": 2.279049738506644e-06, + "loss": 0.4904, + "step": 6550 + }, + { + "epoch": 3.190971094511205, + "grad_norm": 2.8849236965179443, + "learning_rate": 2.2784091261388634e-06, + "loss": 0.4334, + "step": 6551 + }, + { + "epoch": 3.1914582656706725, + "grad_norm": 2.850064516067505, + "learning_rate": 2.2777685284359456e-06, + "loss": 0.3846, + "step": 6552 + }, + { + "epoch": 3.1919454368301396, + "grad_norm": 2.8292555809020996, + "learning_rate": 2.2771279454402854e-06, + "loss": 0.3687, + "step": 6553 + }, + { + "epoch": 3.192432607989607, + "grad_norm": 3.2292447090148926, + "learning_rate": 2.2764873771942784e-06, + "loss": 0.4791, + "step": 6554 + }, + { + "epoch": 3.1929197791490744, + "grad_norm": 3.354914903640747, + "learning_rate": 2.275846823740315e-06, + "loss": 0.4756, + "step": 6555 + }, + { + "epoch": 3.1934069503085416, + "grad_norm": 3.1829519271850586, + "learning_rate": 2.275206285120788e-06, + "loss": 0.4314, + "step": 6556 + }, + { + "epoch": 3.193894121468009, + "grad_norm": 2.9391701221466064, + "learning_rate": 2.274565761378088e-06, + "loss": 0.4272, + "step": 6557 + }, + { + "epoch": 3.1943812926274764, + "grad_norm": 3.1883513927459717, + "learning_rate": 2.273925252554606e-06, + "loss": 0.4542, + "step": 6558 + }, + { + "epoch": 3.194868463786944, + "grad_norm": 2.8424720764160156, + "learning_rate": 2.2732847586927284e-06, + "loss": 0.3872, + "step": 6559 + }, + { + "epoch": 3.195355634946411, + "grad_norm": 3.3557000160217285, + "learning_rate": 2.272644279834845e-06, + "loss": 0.4197, + "step": 6560 + }, + { + "epoch": 3.1958428061058783, + "grad_norm": 2.6400678157806396, + "learning_rate": 2.272003816023341e-06, + "loss": 0.4048, + "step": 6561 + }, + { + "epoch": 3.196329977265346, + "grad_norm": 2.9541783332824707, + "learning_rate": 2.271363367300605e-06, + "loss": 0.4227, + "step": 6562 + }, + { + "epoch": 3.196817148424813, + "grad_norm": 2.777937650680542, + "learning_rate": 2.270722933709019e-06, + "loss": 0.3868, + "step": 6563 + }, + { + "epoch": 3.1973043195842807, + "grad_norm": 2.989611864089966, + "learning_rate": 2.2700825152909675e-06, + "loss": 0.4457, + "step": 6564 + }, + { + "epoch": 3.197791490743748, + "grad_norm": 2.6890602111816406, + "learning_rate": 2.269442112088834e-06, + "loss": 0.4463, + "step": 6565 + }, + { + "epoch": 3.1982786619032155, + "grad_norm": 3.4775726795196533, + "learning_rate": 2.268801724145001e-06, + "loss": 0.4653, + "step": 6566 + }, + { + "epoch": 3.1987658330626827, + "grad_norm": 3.346329927444458, + "learning_rate": 2.2681613515018474e-06, + "loss": 0.4521, + "step": 6567 + }, + { + "epoch": 3.19925300422215, + "grad_norm": 2.9671366214752197, + "learning_rate": 2.267520994201754e-06, + "loss": 0.4307, + "step": 6568 + }, + { + "epoch": 3.1997401753816175, + "grad_norm": 2.7157490253448486, + "learning_rate": 2.2668806522871005e-06, + "loss": 0.4106, + "step": 6569 + }, + { + "epoch": 3.2002273465410846, + "grad_norm": 2.8913462162017822, + "learning_rate": 2.266240325800263e-06, + "loss": 0.3757, + "step": 6570 + }, + { + "epoch": 3.2007145177005523, + "grad_norm": 3.1077544689178467, + "learning_rate": 2.265600014783619e-06, + "loss": 0.452, + "step": 6571 + }, + { + "epoch": 3.2012016888600194, + "grad_norm": 2.8101630210876465, + "learning_rate": 2.264959719279544e-06, + "loss": 0.3531, + "step": 6572 + }, + { + "epoch": 3.201688860019487, + "grad_norm": 2.6852126121520996, + "learning_rate": 2.2643194393304135e-06, + "loss": 0.4785, + "step": 6573 + }, + { + "epoch": 3.202176031178954, + "grad_norm": 2.59883189201355, + "learning_rate": 2.2636791749786e-06, + "loss": 0.3999, + "step": 6574 + }, + { + "epoch": 3.2026632023384214, + "grad_norm": 2.7945454120635986, + "learning_rate": 2.263038926266477e-06, + "loss": 0.4194, + "step": 6575 + }, + { + "epoch": 3.203150373497889, + "grad_norm": 3.115248918533325, + "learning_rate": 2.2623986932364153e-06, + "loss": 0.4332, + "step": 6576 + }, + { + "epoch": 3.203637544657356, + "grad_norm": 3.4498462677001953, + "learning_rate": 2.261758475930787e-06, + "loss": 0.4601, + "step": 6577 + }, + { + "epoch": 3.204124715816824, + "grad_norm": 3.07275390625, + "learning_rate": 2.2611182743919597e-06, + "loss": 0.4327, + "step": 6578 + }, + { + "epoch": 3.204611886976291, + "grad_norm": 2.962526321411133, + "learning_rate": 2.2604780886623026e-06, + "loss": 0.4224, + "step": 6579 + }, + { + "epoch": 3.205099058135758, + "grad_norm": 3.0867698192596436, + "learning_rate": 2.259837918784183e-06, + "loss": 0.4679, + "step": 6580 + }, + { + "epoch": 3.2055862292952257, + "grad_norm": 3.0207104682922363, + "learning_rate": 2.259197764799969e-06, + "loss": 0.4273, + "step": 6581 + }, + { + "epoch": 3.206073400454693, + "grad_norm": 3.3059020042419434, + "learning_rate": 2.258557626752024e-06, + "loss": 0.4583, + "step": 6582 + }, + { + "epoch": 3.2065605716141605, + "grad_norm": 2.9211554527282715, + "learning_rate": 2.257917504682712e-06, + "loss": 0.4385, + "step": 6583 + }, + { + "epoch": 3.2070477427736277, + "grad_norm": 2.879934072494507, + "learning_rate": 2.2572773986343967e-06, + "loss": 0.4291, + "step": 6584 + }, + { + "epoch": 3.2075349139330953, + "grad_norm": 3.3993594646453857, + "learning_rate": 2.256637308649442e-06, + "loss": 0.5085, + "step": 6585 + }, + { + "epoch": 3.2080220850925625, + "grad_norm": 3.0909855365753174, + "learning_rate": 2.255997234770207e-06, + "loss": 0.4301, + "step": 6586 + }, + { + "epoch": 3.2085092562520297, + "grad_norm": 2.725416421890259, + "learning_rate": 2.2553571770390516e-06, + "loss": 0.3816, + "step": 6587 + }, + { + "epoch": 3.2089964274114973, + "grad_norm": 3.457443952560425, + "learning_rate": 2.254717135498336e-06, + "loss": 0.4497, + "step": 6588 + }, + { + "epoch": 3.2094835985709644, + "grad_norm": 2.5888869762420654, + "learning_rate": 2.2540771101904184e-06, + "loss": 0.3917, + "step": 6589 + }, + { + "epoch": 3.209970769730432, + "grad_norm": 2.8519651889801025, + "learning_rate": 2.2534371011576533e-06, + "loss": 0.3906, + "step": 6590 + }, + { + "epoch": 3.210457940889899, + "grad_norm": 2.840663194656372, + "learning_rate": 2.2527971084423985e-06, + "loss": 0.3942, + "step": 6591 + }, + { + "epoch": 3.210945112049367, + "grad_norm": 3.140784502029419, + "learning_rate": 2.252157132087008e-06, + "loss": 0.4588, + "step": 6592 + }, + { + "epoch": 3.211432283208834, + "grad_norm": 2.9410150051116943, + "learning_rate": 2.2515171721338354e-06, + "loss": 0.4839, + "step": 6593 + }, + { + "epoch": 3.211919454368301, + "grad_norm": 3.2328760623931885, + "learning_rate": 2.250877228625233e-06, + "loss": 0.4545, + "step": 6594 + }, + { + "epoch": 3.212406625527769, + "grad_norm": 3.179218292236328, + "learning_rate": 2.250237301603553e-06, + "loss": 0.4876, + "step": 6595 + }, + { + "epoch": 3.212893796687236, + "grad_norm": 2.9282004833221436, + "learning_rate": 2.2495973911111447e-06, + "loss": 0.4423, + "step": 6596 + }, + { + "epoch": 3.2133809678467036, + "grad_norm": 3.2381138801574707, + "learning_rate": 2.2489574971903582e-06, + "loss": 0.4612, + "step": 6597 + }, + { + "epoch": 3.2138681390061707, + "grad_norm": 3.191960573196411, + "learning_rate": 2.2483176198835407e-06, + "loss": 0.464, + "step": 6598 + }, + { + "epoch": 3.2143553101656384, + "grad_norm": 2.788215398788452, + "learning_rate": 2.247677759233039e-06, + "loss": 0.4018, + "step": 6599 + }, + { + "epoch": 3.2148424813251055, + "grad_norm": 2.9801549911499023, + "learning_rate": 2.2470379152812004e-06, + "loss": 0.418, + "step": 6600 + }, + { + "epoch": 3.2153296524845727, + "grad_norm": 3.030349016189575, + "learning_rate": 2.2463980880703703e-06, + "loss": 0.462, + "step": 6601 + }, + { + "epoch": 3.2158168236440403, + "grad_norm": 2.853226661682129, + "learning_rate": 2.2457582776428894e-06, + "loss": 0.4198, + "step": 6602 + }, + { + "epoch": 3.2163039948035075, + "grad_norm": 3.603837013244629, + "learning_rate": 2.245118484041102e-06, + "loss": 0.4779, + "step": 6603 + }, + { + "epoch": 3.216791165962975, + "grad_norm": 3.4872167110443115, + "learning_rate": 2.24447870730735e-06, + "loss": 0.4073, + "step": 6604 + }, + { + "epoch": 3.2172783371224423, + "grad_norm": 2.987663984298706, + "learning_rate": 2.243838947483974e-06, + "loss": 0.4298, + "step": 6605 + }, + { + "epoch": 3.21776550828191, + "grad_norm": 3.2231857776641846, + "learning_rate": 2.2431992046133116e-06, + "loss": 0.3789, + "step": 6606 + }, + { + "epoch": 3.218252679441377, + "grad_norm": 3.012554168701172, + "learning_rate": 2.2425594787377023e-06, + "loss": 0.4788, + "step": 6607 + }, + { + "epoch": 3.218739850600844, + "grad_norm": 3.2262911796569824, + "learning_rate": 2.241919769899482e-06, + "loss": 0.4552, + "step": 6608 + }, + { + "epoch": 3.219227021760312, + "grad_norm": 3.5653765201568604, + "learning_rate": 2.241280078140989e-06, + "loss": 0.4483, + "step": 6609 + }, + { + "epoch": 3.219714192919779, + "grad_norm": 3.3389532566070557, + "learning_rate": 2.2406404035045554e-06, + "loss": 0.4763, + "step": 6610 + }, + { + "epoch": 3.2202013640792466, + "grad_norm": 2.8655009269714355, + "learning_rate": 2.2400007460325156e-06, + "loss": 0.4309, + "step": 6611 + }, + { + "epoch": 3.220688535238714, + "grad_norm": 2.7573156356811523, + "learning_rate": 2.2393611057672026e-06, + "loss": 0.3619, + "step": 6612 + }, + { + "epoch": 3.2211757063981814, + "grad_norm": 2.8661327362060547, + "learning_rate": 2.2387214827509473e-06, + "loss": 0.4485, + "step": 6613 + }, + { + "epoch": 3.2216628775576486, + "grad_norm": 3.0859899520874023, + "learning_rate": 2.2380818770260793e-06, + "loss": 0.4442, + "step": 6614 + }, + { + "epoch": 3.2221500487171157, + "grad_norm": 3.3775131702423096, + "learning_rate": 2.237442288634929e-06, + "loss": 0.4545, + "step": 6615 + }, + { + "epoch": 3.2226372198765834, + "grad_norm": 2.8482649326324463, + "learning_rate": 2.236802717619823e-06, + "loss": 0.432, + "step": 6616 + }, + { + "epoch": 3.2231243910360505, + "grad_norm": 3.3324928283691406, + "learning_rate": 2.236163164023089e-06, + "loss": 0.4063, + "step": 6617 + }, + { + "epoch": 3.223611562195518, + "grad_norm": 3.047842025756836, + "learning_rate": 2.2355236278870526e-06, + "loss": 0.5514, + "step": 6618 + }, + { + "epoch": 3.2240987333549853, + "grad_norm": 3.243129014968872, + "learning_rate": 2.2348841092540373e-06, + "loss": 0.5107, + "step": 6619 + }, + { + "epoch": 3.224585904514453, + "grad_norm": 3.0187337398529053, + "learning_rate": 2.2342446081663676e-06, + "loss": 0.4083, + "step": 6620 + }, + { + "epoch": 3.22507307567392, + "grad_norm": 3.327650308609009, + "learning_rate": 2.2336051246663647e-06, + "loss": 0.4736, + "step": 6621 + }, + { + "epoch": 3.2255602468333873, + "grad_norm": 3.514251947402954, + "learning_rate": 2.2329656587963492e-06, + "loss": 0.381, + "step": 6622 + }, + { + "epoch": 3.226047417992855, + "grad_norm": 2.834855556488037, + "learning_rate": 2.2323262105986416e-06, + "loss": 0.4237, + "step": 6623 + }, + { + "epoch": 3.226534589152322, + "grad_norm": 2.687974214553833, + "learning_rate": 2.2316867801155616e-06, + "loss": 0.39, + "step": 6624 + }, + { + "epoch": 3.2270217603117897, + "grad_norm": 2.8512511253356934, + "learning_rate": 2.231047367389425e-06, + "loss": 0.3995, + "step": 6625 + }, + { + "epoch": 3.227508931471257, + "grad_norm": 3.171241044998169, + "learning_rate": 2.2304079724625477e-06, + "loss": 0.4709, + "step": 6626 + }, + { + "epoch": 3.2279961026307245, + "grad_norm": 3.123373508453369, + "learning_rate": 2.229768595377246e-06, + "loss": 0.4553, + "step": 6627 + }, + { + "epoch": 3.2284832737901916, + "grad_norm": 3.0535757541656494, + "learning_rate": 2.229129236175835e-06, + "loss": 0.4421, + "step": 6628 + }, + { + "epoch": 3.228970444949659, + "grad_norm": 3.188267707824707, + "learning_rate": 2.228489894900624e-06, + "loss": 0.4665, + "step": 6629 + }, + { + "epoch": 3.2294576161091264, + "grad_norm": 2.899526834487915, + "learning_rate": 2.227850571593927e-06, + "loss": 0.4906, + "step": 6630 + }, + { + "epoch": 3.2299447872685936, + "grad_norm": 2.891099214553833, + "learning_rate": 2.227211266298054e-06, + "loss": 0.4734, + "step": 6631 + }, + { + "epoch": 3.230431958428061, + "grad_norm": 2.5867531299591064, + "learning_rate": 2.2265719790553147e-06, + "loss": 0.4334, + "step": 6632 + }, + { + "epoch": 3.2309191295875284, + "grad_norm": 3.068082809448242, + "learning_rate": 2.2259327099080156e-06, + "loss": 0.397, + "step": 6633 + }, + { + "epoch": 3.231406300746996, + "grad_norm": 3.2239203453063965, + "learning_rate": 2.225293458898464e-06, + "loss": 0.4636, + "step": 6634 + }, + { + "epoch": 3.231893471906463, + "grad_norm": 2.947744369506836, + "learning_rate": 2.224654226068966e-06, + "loss": 0.486, + "step": 6635 + }, + { + "epoch": 3.2323806430659303, + "grad_norm": 4.302929878234863, + "learning_rate": 2.2240150114618262e-06, + "loss": 0.4708, + "step": 6636 + }, + { + "epoch": 3.232867814225398, + "grad_norm": 2.941458225250244, + "learning_rate": 2.2233758151193467e-06, + "loss": 0.4171, + "step": 6637 + }, + { + "epoch": 3.233354985384865, + "grad_norm": 2.660174608230591, + "learning_rate": 2.22273663708383e-06, + "loss": 0.4044, + "step": 6638 + }, + { + "epoch": 3.2338421565443327, + "grad_norm": 3.337723970413208, + "learning_rate": 2.222097477397577e-06, + "loss": 0.4844, + "step": 6639 + }, + { + "epoch": 3.2343293277038, + "grad_norm": 2.9799118041992188, + "learning_rate": 2.221458336102887e-06, + "loss": 0.4626, + "step": 6640 + }, + { + "epoch": 3.2348164988632675, + "grad_norm": 3.410291910171509, + "learning_rate": 2.220819213242059e-06, + "loss": 0.4176, + "step": 6641 + }, + { + "epoch": 3.2353036700227347, + "grad_norm": 3.140043258666992, + "learning_rate": 2.220180108857388e-06, + "loss": 0.4566, + "step": 6642 + }, + { + "epoch": 3.235790841182202, + "grad_norm": 3.163177251815796, + "learning_rate": 2.219541022991172e-06, + "loss": 0.3868, + "step": 6643 + }, + { + "epoch": 3.2362780123416695, + "grad_norm": 3.101372718811035, + "learning_rate": 2.2189019556857063e-06, + "loss": 0.423, + "step": 6644 + }, + { + "epoch": 3.2367651835011366, + "grad_norm": 2.816145420074463, + "learning_rate": 2.218262906983281e-06, + "loss": 0.3838, + "step": 6645 + }, + { + "epoch": 3.2372523546606042, + "grad_norm": 2.963357925415039, + "learning_rate": 2.217623876926191e-06, + "loss": 0.4032, + "step": 6646 + }, + { + "epoch": 3.2377395258200714, + "grad_norm": 2.9992899894714355, + "learning_rate": 2.2169848655567265e-06, + "loss": 0.4194, + "step": 6647 + }, + { + "epoch": 3.238226696979539, + "grad_norm": 3.2130484580993652, + "learning_rate": 2.216345872917178e-06, + "loss": 0.4727, + "step": 6648 + }, + { + "epoch": 3.238713868139006, + "grad_norm": 2.585078001022339, + "learning_rate": 2.215706899049832e-06, + "loss": 0.4214, + "step": 6649 + }, + { + "epoch": 3.2392010392984734, + "grad_norm": 2.828054189682007, + "learning_rate": 2.2150679439969777e-06, + "loss": 0.4346, + "step": 6650 + }, + { + "epoch": 3.239688210457941, + "grad_norm": 2.7623653411865234, + "learning_rate": 2.2144290078009e-06, + "loss": 0.4087, + "step": 6651 + }, + { + "epoch": 3.240175381617408, + "grad_norm": 3.0152578353881836, + "learning_rate": 2.213790090503885e-06, + "loss": 0.4731, + "step": 6652 + }, + { + "epoch": 3.2406625527768758, + "grad_norm": 2.710582971572876, + "learning_rate": 2.213151192148214e-06, + "loss": 0.3921, + "step": 6653 + }, + { + "epoch": 3.241149723936343, + "grad_norm": 2.5994255542755127, + "learning_rate": 2.2125123127761704e-06, + "loss": 0.3741, + "step": 6654 + }, + { + "epoch": 3.2416368950958105, + "grad_norm": 2.815115451812744, + "learning_rate": 2.211873452430035e-06, + "loss": 0.4344, + "step": 6655 + }, + { + "epoch": 3.2421240662552777, + "grad_norm": 2.936253070831299, + "learning_rate": 2.211234611152089e-06, + "loss": 0.4579, + "step": 6656 + }, + { + "epoch": 3.242611237414745, + "grad_norm": 3.279989719390869, + "learning_rate": 2.2105957889846085e-06, + "loss": 0.4643, + "step": 6657 + }, + { + "epoch": 3.2430984085742125, + "grad_norm": 2.9686684608459473, + "learning_rate": 2.2099569859698724e-06, + "loss": 0.4365, + "step": 6658 + }, + { + "epoch": 3.2435855797336797, + "grad_norm": 2.9090347290039062, + "learning_rate": 2.209318202150155e-06, + "loss": 0.3762, + "step": 6659 + }, + { + "epoch": 3.2440727508931473, + "grad_norm": 3.158108949661255, + "learning_rate": 2.2086794375677327e-06, + "loss": 0.4836, + "step": 6660 + }, + { + "epoch": 3.2445599220526145, + "grad_norm": 2.76961612701416, + "learning_rate": 2.2080406922648783e-06, + "loss": 0.4471, + "step": 6661 + }, + { + "epoch": 3.245047093212082, + "grad_norm": 3.1869254112243652, + "learning_rate": 2.2074019662838627e-06, + "loss": 0.4361, + "step": 6662 + }, + { + "epoch": 3.2455342643715492, + "grad_norm": 3.15557599067688, + "learning_rate": 2.206763259666958e-06, + "loss": 0.4493, + "step": 6663 + }, + { + "epoch": 3.2460214355310164, + "grad_norm": 3.059182643890381, + "learning_rate": 2.2061245724564344e-06, + "loss": 0.4488, + "step": 6664 + }, + { + "epoch": 3.246508606690484, + "grad_norm": 3.2997193336486816, + "learning_rate": 2.2054859046945585e-06, + "loss": 0.411, + "step": 6665 + }, + { + "epoch": 3.246995777849951, + "grad_norm": 3.379263162612915, + "learning_rate": 2.2048472564235977e-06, + "loss": 0.4045, + "step": 6666 + }, + { + "epoch": 3.247482949009419, + "grad_norm": 3.0069692134857178, + "learning_rate": 2.204208627685818e-06, + "loss": 0.4057, + "step": 6667 + }, + { + "epoch": 3.247970120168886, + "grad_norm": 2.8289918899536133, + "learning_rate": 2.203570018523485e-06, + "loss": 0.3555, + "step": 6668 + }, + { + "epoch": 3.2484572913283536, + "grad_norm": 3.30332088470459, + "learning_rate": 2.202931428978859e-06, + "loss": 0.4355, + "step": 6669 + }, + { + "epoch": 3.2489444624878208, + "grad_norm": 3.09824800491333, + "learning_rate": 2.202292859094204e-06, + "loss": 0.3886, + "step": 6670 + }, + { + "epoch": 3.249431633647288, + "grad_norm": 3.1882565021514893, + "learning_rate": 2.2016543089117796e-06, + "loss": 0.3893, + "step": 6671 + }, + { + "epoch": 3.2499188048067555, + "grad_norm": 3.050288200378418, + "learning_rate": 2.2010157784738464e-06, + "loss": 0.4271, + "step": 6672 + }, + { + "epoch": 3.2504059759662227, + "grad_norm": 2.7660574913024902, + "learning_rate": 2.20037726782266e-06, + "loss": 0.3493, + "step": 6673 + }, + { + "epoch": 3.2508931471256903, + "grad_norm": 2.997282028198242, + "learning_rate": 2.1997387770004784e-06, + "loss": 0.463, + "step": 6674 + }, + { + "epoch": 3.2513803182851575, + "grad_norm": 3.028203248977661, + "learning_rate": 2.1991003060495572e-06, + "loss": 0.4283, + "step": 6675 + }, + { + "epoch": 3.251867489444625, + "grad_norm": 3.0311899185180664, + "learning_rate": 2.198461855012149e-06, + "loss": 0.378, + "step": 6676 + }, + { + "epoch": 3.2523546606040923, + "grad_norm": 3.1812920570373535, + "learning_rate": 2.1978234239305067e-06, + "loss": 0.4527, + "step": 6677 + }, + { + "epoch": 3.2528418317635595, + "grad_norm": 2.7816712856292725, + "learning_rate": 2.1971850128468823e-06, + "loss": 0.4252, + "step": 6678 + }, + { + "epoch": 3.253329002923027, + "grad_norm": 2.7411320209503174, + "learning_rate": 2.196546621803527e-06, + "loss": 0.4015, + "step": 6679 + }, + { + "epoch": 3.2538161740824942, + "grad_norm": 3.092290163040161, + "learning_rate": 2.1959082508426865e-06, + "loss": 0.4173, + "step": 6680 + }, + { + "epoch": 3.254303345241962, + "grad_norm": 2.9722886085510254, + "learning_rate": 2.19526990000661e-06, + "loss": 0.4741, + "step": 6681 + }, + { + "epoch": 3.254790516401429, + "grad_norm": 3.1731817722320557, + "learning_rate": 2.1946315693375432e-06, + "loss": 0.4352, + "step": 6682 + }, + { + "epoch": 3.2552776875608966, + "grad_norm": 3.637716293334961, + "learning_rate": 2.1939932588777307e-06, + "loss": 0.4098, + "step": 6683 + }, + { + "epoch": 3.255764858720364, + "grad_norm": 3.361420154571533, + "learning_rate": 2.1933549686694162e-06, + "loss": 0.5232, + "step": 6684 + }, + { + "epoch": 3.256252029879831, + "grad_norm": 3.098644495010376, + "learning_rate": 2.1927166987548406e-06, + "loss": 0.4473, + "step": 6685 + }, + { + "epoch": 3.2567392010392986, + "grad_norm": 2.7592310905456543, + "learning_rate": 2.1920784491762453e-06, + "loss": 0.4393, + "step": 6686 + }, + { + "epoch": 3.2572263721987658, + "grad_norm": 3.16201114654541, + "learning_rate": 2.1914402199758707e-06, + "loss": 0.4082, + "step": 6687 + }, + { + "epoch": 3.2577135433582334, + "grad_norm": 2.7322378158569336, + "learning_rate": 2.1908020111959526e-06, + "loss": 0.385, + "step": 6688 + }, + { + "epoch": 3.2582007145177005, + "grad_norm": 2.7995316982269287, + "learning_rate": 2.1901638228787286e-06, + "loss": 0.4265, + "step": 6689 + }, + { + "epoch": 3.258687885677168, + "grad_norm": 3.048276424407959, + "learning_rate": 2.1895256550664336e-06, + "loss": 0.412, + "step": 6690 + }, + { + "epoch": 3.2591750568366353, + "grad_norm": 3.2627484798431396, + "learning_rate": 2.1888875078013033e-06, + "loss": 0.4126, + "step": 6691 + }, + { + "epoch": 3.2596622279961025, + "grad_norm": 3.065408706665039, + "learning_rate": 2.1882493811255675e-06, + "loss": 0.4887, + "step": 6692 + }, + { + "epoch": 3.26014939915557, + "grad_norm": 2.8797171115875244, + "learning_rate": 2.1876112750814587e-06, + "loss": 0.4772, + "step": 6693 + }, + { + "epoch": 3.2606365703150373, + "grad_norm": 2.949061155319214, + "learning_rate": 2.1869731897112066e-06, + "loss": 0.4371, + "step": 6694 + }, + { + "epoch": 3.261123741474505, + "grad_norm": 3.4730825424194336, + "learning_rate": 2.186335125057041e-06, + "loss": 0.4878, + "step": 6695 + }, + { + "epoch": 3.261610912633972, + "grad_norm": 3.1271090507507324, + "learning_rate": 2.185697081161186e-06, + "loss": 0.4484, + "step": 6696 + }, + { + "epoch": 3.2620980837934392, + "grad_norm": 2.8288161754608154, + "learning_rate": 2.185059058065869e-06, + "loss": 0.4034, + "step": 6697 + }, + { + "epoch": 3.262585254952907, + "grad_norm": 3.4781100749969482, + "learning_rate": 2.1844210558133143e-06, + "loss": 0.4694, + "step": 6698 + }, + { + "epoch": 3.263072426112374, + "grad_norm": 3.1448798179626465, + "learning_rate": 2.183783074445746e-06, + "loss": 0.4539, + "step": 6699 + }, + { + "epoch": 3.2635595972718416, + "grad_norm": 3.3949127197265625, + "learning_rate": 2.183145114005383e-06, + "loss": 0.4954, + "step": 6700 + }, + { + "epoch": 3.264046768431309, + "grad_norm": 3.0423648357391357, + "learning_rate": 2.182507174534447e-06, + "loss": 0.3681, + "step": 6701 + }, + { + "epoch": 3.264533939590776, + "grad_norm": 2.775106191635132, + "learning_rate": 2.1818692560751573e-06, + "loss": 0.3848, + "step": 6702 + }, + { + "epoch": 3.2650211107502436, + "grad_norm": 2.74167537689209, + "learning_rate": 2.1812313586697307e-06, + "loss": 0.402, + "step": 6703 + }, + { + "epoch": 3.2655082819097108, + "grad_norm": 3.210022211074829, + "learning_rate": 2.1805934823603827e-06, + "loss": 0.4549, + "step": 6704 + }, + { + "epoch": 3.2659954530691784, + "grad_norm": 2.9060330390930176, + "learning_rate": 2.1799556271893283e-06, + "loss": 0.3866, + "step": 6705 + }, + { + "epoch": 3.2664826242286455, + "grad_norm": 3.1070244312286377, + "learning_rate": 2.1793177931987805e-06, + "loss": 0.4623, + "step": 6706 + }, + { + "epoch": 3.266969795388113, + "grad_norm": 2.5187370777130127, + "learning_rate": 2.1786799804309524e-06, + "loss": 0.3665, + "step": 6707 + }, + { + "epoch": 3.2674569665475803, + "grad_norm": 2.5691914558410645, + "learning_rate": 2.178042188928052e-06, + "loss": 0.3979, + "step": 6708 + }, + { + "epoch": 3.2679441377070475, + "grad_norm": 3.008624315261841, + "learning_rate": 2.1774044187322895e-06, + "loss": 0.4792, + "step": 6709 + }, + { + "epoch": 3.268431308866515, + "grad_norm": 3.1918206214904785, + "learning_rate": 2.1767666698858725e-06, + "loss": 0.4867, + "step": 6710 + }, + { + "epoch": 3.2689184800259823, + "grad_norm": 3.030654191970825, + "learning_rate": 2.1761289424310084e-06, + "loss": 0.487, + "step": 6711 + }, + { + "epoch": 3.26940565118545, + "grad_norm": 2.959186553955078, + "learning_rate": 2.1754912364098997e-06, + "loss": 0.4017, + "step": 6712 + }, + { + "epoch": 3.269892822344917, + "grad_norm": 3.162658452987671, + "learning_rate": 2.1748535518647506e-06, + "loss": 0.419, + "step": 6713 + }, + { + "epoch": 3.2703799935043847, + "grad_norm": 3.144378900527954, + "learning_rate": 2.1742158888377633e-06, + "loss": 0.3795, + "step": 6714 + }, + { + "epoch": 3.270867164663852, + "grad_norm": 2.6965813636779785, + "learning_rate": 2.173578247371139e-06, + "loss": 0.3919, + "step": 6715 + }, + { + "epoch": 3.271354335823319, + "grad_norm": 2.7201454639434814, + "learning_rate": 2.172940627507075e-06, + "loss": 0.414, + "step": 6716 + }, + { + "epoch": 3.2718415069827866, + "grad_norm": 5.024057865142822, + "learning_rate": 2.17230302928777e-06, + "loss": 0.4908, + "step": 6717 + }, + { + "epoch": 3.272328678142254, + "grad_norm": 3.1637959480285645, + "learning_rate": 2.1716654527554195e-06, + "loss": 0.5066, + "step": 6718 + }, + { + "epoch": 3.2728158493017214, + "grad_norm": 2.7932868003845215, + "learning_rate": 2.1710278979522204e-06, + "loss": 0.4091, + "step": 6719 + }, + { + "epoch": 3.2733030204611886, + "grad_norm": 3.009317398071289, + "learning_rate": 2.170390364920363e-06, + "loss": 0.3877, + "step": 6720 + }, + { + "epoch": 3.273790191620656, + "grad_norm": 3.0927352905273438, + "learning_rate": 2.1697528537020407e-06, + "loss": 0.4351, + "step": 6721 + }, + { + "epoch": 3.2742773627801234, + "grad_norm": 3.1376454830169678, + "learning_rate": 2.169115364339444e-06, + "loss": 0.4601, + "step": 6722 + }, + { + "epoch": 3.2747645339395905, + "grad_norm": 2.895845890045166, + "learning_rate": 2.1684778968747618e-06, + "loss": 0.3894, + "step": 6723 + }, + { + "epoch": 3.275251705099058, + "grad_norm": 2.915257453918457, + "learning_rate": 2.1678404513501813e-06, + "loss": 0.4536, + "step": 6724 + }, + { + "epoch": 3.2757388762585253, + "grad_norm": 2.9959020614624023, + "learning_rate": 2.1672030278078893e-06, + "loss": 0.407, + "step": 6725 + }, + { + "epoch": 3.276226047417993, + "grad_norm": 3.1185011863708496, + "learning_rate": 2.1665656262900696e-06, + "loss": 0.4564, + "step": 6726 + }, + { + "epoch": 3.27671321857746, + "grad_norm": 2.8067026138305664, + "learning_rate": 2.1659282468389053e-06, + "loss": 0.4234, + "step": 6727 + }, + { + "epoch": 3.2772003897369277, + "grad_norm": 3.300004005432129, + "learning_rate": 2.165290889496579e-06, + "loss": 0.4688, + "step": 6728 + }, + { + "epoch": 3.277687560896395, + "grad_norm": 3.4306886196136475, + "learning_rate": 2.16465355430527e-06, + "loss": 0.3717, + "step": 6729 + }, + { + "epoch": 3.278174732055862, + "grad_norm": 3.2309699058532715, + "learning_rate": 2.164016241307158e-06, + "loss": 0.4304, + "step": 6730 + }, + { + "epoch": 3.2786619032153297, + "grad_norm": 2.880849838256836, + "learning_rate": 2.16337895054442e-06, + "loss": 0.3615, + "step": 6731 + }, + { + "epoch": 3.279149074374797, + "grad_norm": 2.9665160179138184, + "learning_rate": 2.162741682059231e-06, + "loss": 0.4389, + "step": 6732 + }, + { + "epoch": 3.2796362455342645, + "grad_norm": 2.859846830368042, + "learning_rate": 2.1621044358937664e-06, + "loss": 0.388, + "step": 6733 + }, + { + "epoch": 3.2801234166937316, + "grad_norm": 3.1917951107025146, + "learning_rate": 2.1614672120901997e-06, + "loss": 0.4337, + "step": 6734 + }, + { + "epoch": 3.2806105878531993, + "grad_norm": 2.805053949356079, + "learning_rate": 2.1608300106907002e-06, + "loss": 0.4204, + "step": 6735 + }, + { + "epoch": 3.2810977590126664, + "grad_norm": 2.801485061645508, + "learning_rate": 2.1601928317374388e-06, + "loss": 0.4055, + "step": 6736 + }, + { + "epoch": 3.2815849301721336, + "grad_norm": 2.830080032348633, + "learning_rate": 2.1595556752725844e-06, + "loss": 0.4481, + "step": 6737 + }, + { + "epoch": 3.282072101331601, + "grad_norm": 3.0509884357452393, + "learning_rate": 2.158918541338305e-06, + "loss": 0.4594, + "step": 6738 + }, + { + "epoch": 3.2825592724910684, + "grad_norm": 3.1305480003356934, + "learning_rate": 2.1582814299767637e-06, + "loss": 0.4391, + "step": 6739 + }, + { + "epoch": 3.283046443650536, + "grad_norm": 2.8908538818359375, + "learning_rate": 2.157644341230125e-06, + "loss": 0.4263, + "step": 6740 + }, + { + "epoch": 3.283533614810003, + "grad_norm": 2.8748726844787598, + "learning_rate": 2.157007275140552e-06, + "loss": 0.3818, + "step": 6741 + }, + { + "epoch": 3.284020785969471, + "grad_norm": 3.0963737964630127, + "learning_rate": 2.1563702317502073e-06, + "loss": 0.4279, + "step": 6742 + }, + { + "epoch": 3.284507957128938, + "grad_norm": 3.0983476638793945, + "learning_rate": 2.155733211101247e-06, + "loss": 0.4277, + "step": 6743 + }, + { + "epoch": 3.284995128288405, + "grad_norm": 2.857806921005249, + "learning_rate": 2.155096213235831e-06, + "loss": 0.4361, + "step": 6744 + }, + { + "epoch": 3.2854822994478727, + "grad_norm": 3.1322054862976074, + "learning_rate": 2.154459238196116e-06, + "loss": 0.4557, + "step": 6745 + }, + { + "epoch": 3.28596947060734, + "grad_norm": 3.0718767642974854, + "learning_rate": 2.1538222860242563e-06, + "loss": 0.5195, + "step": 6746 + }, + { + "epoch": 3.2864566417668075, + "grad_norm": 2.97299861907959, + "learning_rate": 2.153185356762405e-06, + "loss": 0.4074, + "step": 6747 + }, + { + "epoch": 3.2869438129262747, + "grad_norm": 2.8915536403656006, + "learning_rate": 2.152548450452715e-06, + "loss": 0.4456, + "step": 6748 + }, + { + "epoch": 3.2874309840857423, + "grad_norm": 2.8495724201202393, + "learning_rate": 2.151911567137336e-06, + "loss": 0.3806, + "step": 6749 + }, + { + "epoch": 3.2879181552452095, + "grad_norm": 2.8796193599700928, + "learning_rate": 2.151274706858417e-06, + "loss": 0.3963, + "step": 6750 + }, + { + "epoch": 3.2884053264046766, + "grad_norm": 3.2332208156585693, + "learning_rate": 2.150637869658106e-06, + "loss": 0.4357, + "step": 6751 + }, + { + "epoch": 3.2888924975641443, + "grad_norm": 2.8857545852661133, + "learning_rate": 2.150001055578547e-06, + "loss": 0.3756, + "step": 6752 + }, + { + "epoch": 3.2893796687236114, + "grad_norm": 3.039684295654297, + "learning_rate": 2.1493642646618865e-06, + "loss": 0.4107, + "step": 6753 + }, + { + "epoch": 3.289866839883079, + "grad_norm": 2.983132839202881, + "learning_rate": 2.148727496950267e-06, + "loss": 0.4319, + "step": 6754 + }, + { + "epoch": 3.290354011042546, + "grad_norm": 3.573072671890259, + "learning_rate": 2.1480907524858278e-06, + "loss": 0.4204, + "step": 6755 + }, + { + "epoch": 3.290841182202014, + "grad_norm": 3.17810320854187, + "learning_rate": 2.14745403131071e-06, + "loss": 0.4422, + "step": 6756 + }, + { + "epoch": 3.291328353361481, + "grad_norm": 3.2709975242614746, + "learning_rate": 2.146817333467051e-06, + "loss": 0.4552, + "step": 6757 + }, + { + "epoch": 3.291815524520948, + "grad_norm": 3.0084047317504883, + "learning_rate": 2.1461806589969898e-06, + "loss": 0.4067, + "step": 6758 + }, + { + "epoch": 3.292302695680416, + "grad_norm": 2.9187633991241455, + "learning_rate": 2.145544007942658e-06, + "loss": 0.4488, + "step": 6759 + }, + { + "epoch": 3.292789866839883, + "grad_norm": 2.9209508895874023, + "learning_rate": 2.1449073803461913e-06, + "loss": 0.4352, + "step": 6760 + }, + { + "epoch": 3.2932770379993506, + "grad_norm": 2.9318604469299316, + "learning_rate": 2.144270776249721e-06, + "loss": 0.4274, + "step": 6761 + }, + { + "epoch": 3.2937642091588177, + "grad_norm": 2.674283266067505, + "learning_rate": 2.1436341956953786e-06, + "loss": 0.4197, + "step": 6762 + }, + { + "epoch": 3.2942513803182853, + "grad_norm": 2.781141996383667, + "learning_rate": 2.1429976387252908e-06, + "loss": 0.3879, + "step": 6763 + }, + { + "epoch": 3.2947385514777525, + "grad_norm": 2.9557387828826904, + "learning_rate": 2.142361105381586e-06, + "loss": 0.4114, + "step": 6764 + }, + { + "epoch": 3.2952257226372197, + "grad_norm": 3.060633420944214, + "learning_rate": 2.1417245957063896e-06, + "loss": 0.4558, + "step": 6765 + }, + { + "epoch": 3.2957128937966873, + "grad_norm": 2.9990744590759277, + "learning_rate": 2.1410881097418277e-06, + "loss": 0.4499, + "step": 6766 + }, + { + "epoch": 3.2962000649561545, + "grad_norm": 3.1703760623931885, + "learning_rate": 2.1404516475300204e-06, + "loss": 0.4208, + "step": 6767 + }, + { + "epoch": 3.296687236115622, + "grad_norm": 2.896451711654663, + "learning_rate": 2.139815209113089e-06, + "loss": 0.4536, + "step": 6768 + }, + { + "epoch": 3.2971744072750893, + "grad_norm": 3.3590500354766846, + "learning_rate": 2.1391787945331544e-06, + "loss": 0.4726, + "step": 6769 + }, + { + "epoch": 3.297661578434557, + "grad_norm": 3.0012145042419434, + "learning_rate": 2.1385424038323332e-06, + "loss": 0.4453, + "step": 6770 + }, + { + "epoch": 3.298148749594024, + "grad_norm": 3.549285888671875, + "learning_rate": 2.1379060370527423e-06, + "loss": 0.5325, + "step": 6771 + }, + { + "epoch": 3.298635920753491, + "grad_norm": 2.7595601081848145, + "learning_rate": 2.1372696942364957e-06, + "loss": 0.414, + "step": 6772 + }, + { + "epoch": 3.299123091912959, + "grad_norm": 3.2732901573181152, + "learning_rate": 2.136633375425707e-06, + "loss": 0.4302, + "step": 6773 + }, + { + "epoch": 3.299610263072426, + "grad_norm": 3.3239662647247314, + "learning_rate": 2.1359970806624886e-06, + "loss": 0.4309, + "step": 6774 + }, + { + "epoch": 3.3000974342318936, + "grad_norm": 3.0926496982574463, + "learning_rate": 2.135360809988948e-06, + "loss": 0.3767, + "step": 6775 + }, + { + "epoch": 3.300584605391361, + "grad_norm": 2.9269087314605713, + "learning_rate": 2.134724563447196e-06, + "loss": 0.4706, + "step": 6776 + }, + { + "epoch": 3.3010717765508284, + "grad_norm": 3.2354650497436523, + "learning_rate": 2.134088341079339e-06, + "loss": 0.5128, + "step": 6777 + }, + { + "epoch": 3.3015589477102956, + "grad_norm": 3.0139400959014893, + "learning_rate": 2.1334521429274806e-06, + "loss": 0.4235, + "step": 6778 + }, + { + "epoch": 3.3020461188697627, + "grad_norm": 2.887834072113037, + "learning_rate": 2.1328159690337246e-06, + "loss": 0.4145, + "step": 6779 + }, + { + "epoch": 3.3025332900292304, + "grad_norm": 3.295517683029175, + "learning_rate": 2.1321798194401745e-06, + "loss": 0.4576, + "step": 6780 + }, + { + "epoch": 3.3030204611886975, + "grad_norm": 2.953895330429077, + "learning_rate": 2.13154369418893e-06, + "loss": 0.4731, + "step": 6781 + }, + { + "epoch": 3.303507632348165, + "grad_norm": 3.13308048248291, + "learning_rate": 2.1309075933220886e-06, + "loss": 0.3839, + "step": 6782 + }, + { + "epoch": 3.3039948035076323, + "grad_norm": 2.9018137454986572, + "learning_rate": 2.1302715168817488e-06, + "loss": 0.3683, + "step": 6783 + }, + { + "epoch": 3.3044819746671, + "grad_norm": 2.7263805866241455, + "learning_rate": 2.129635464910005e-06, + "loss": 0.4042, + "step": 6784 + }, + { + "epoch": 3.304969145826567, + "grad_norm": 3.010035276412964, + "learning_rate": 2.128999437448953e-06, + "loss": 0.4442, + "step": 6785 + }, + { + "epoch": 3.3054563169860343, + "grad_norm": 3.002807855606079, + "learning_rate": 2.1283634345406827e-06, + "loss": 0.4416, + "step": 6786 + }, + { + "epoch": 3.305943488145502, + "grad_norm": 3.3973751068115234, + "learning_rate": 2.1277274562272852e-06, + "loss": 0.4647, + "step": 6787 + }, + { + "epoch": 3.306430659304969, + "grad_norm": 2.9691007137298584, + "learning_rate": 2.1270915025508503e-06, + "loss": 0.4092, + "step": 6788 + }, + { + "epoch": 3.3069178304644367, + "grad_norm": 2.7528605461120605, + "learning_rate": 2.1264555735534666e-06, + "loss": 0.4557, + "step": 6789 + }, + { + "epoch": 3.307405001623904, + "grad_norm": 2.854022979736328, + "learning_rate": 2.1258196692772167e-06, + "loss": 0.4232, + "step": 6790 + }, + { + "epoch": 3.3078921727833714, + "grad_norm": 3.016949415206909, + "learning_rate": 2.125183789764186e-06, + "loss": 0.3686, + "step": 6791 + }, + { + "epoch": 3.3083793439428386, + "grad_norm": 3.411463737487793, + "learning_rate": 2.124547935056458e-06, + "loss": 0.4438, + "step": 6792 + }, + { + "epoch": 3.308866515102306, + "grad_norm": 3.6352484226226807, + "learning_rate": 2.1239121051961126e-06, + "loss": 0.3946, + "step": 6793 + }, + { + "epoch": 3.3093536862617734, + "grad_norm": 2.7570226192474365, + "learning_rate": 2.1232763002252295e-06, + "loss": 0.4404, + "step": 6794 + }, + { + "epoch": 3.3098408574212406, + "grad_norm": 3.1125638484954834, + "learning_rate": 2.1226405201858846e-06, + "loss": 0.44, + "step": 6795 + }, + { + "epoch": 3.310328028580708, + "grad_norm": 3.3740127086639404, + "learning_rate": 2.1220047651201554e-06, + "loss": 0.426, + "step": 6796 + }, + { + "epoch": 3.3108151997401754, + "grad_norm": 3.1907522678375244, + "learning_rate": 2.1213690350701163e-06, + "loss": 0.4358, + "step": 6797 + }, + { + "epoch": 3.311302370899643, + "grad_norm": 3.0151045322418213, + "learning_rate": 2.1207333300778384e-06, + "loss": 0.4679, + "step": 6798 + }, + { + "epoch": 3.31178954205911, + "grad_norm": 3.2759063243865967, + "learning_rate": 2.120097650185393e-06, + "loss": 0.5248, + "step": 6799 + }, + { + "epoch": 3.3122767132185773, + "grad_norm": 3.0159499645233154, + "learning_rate": 2.1194619954348507e-06, + "loss": 0.4024, + "step": 6800 + }, + { + "epoch": 3.312763884378045, + "grad_norm": 2.653252601623535, + "learning_rate": 2.1188263658682784e-06, + "loss": 0.3919, + "step": 6801 + }, + { + "epoch": 3.313251055537512, + "grad_norm": 2.9269838333129883, + "learning_rate": 2.1181907615277406e-06, + "loss": 0.4905, + "step": 6802 + }, + { + "epoch": 3.3137382266969797, + "grad_norm": 3.0242865085601807, + "learning_rate": 2.1175551824553028e-06, + "loss": 0.4031, + "step": 6803 + }, + { + "epoch": 3.314225397856447, + "grad_norm": 2.7911105155944824, + "learning_rate": 2.1169196286930276e-06, + "loss": 0.4242, + "step": 6804 + }, + { + "epoch": 3.3147125690159145, + "grad_norm": 3.0950961112976074, + "learning_rate": 2.1162841002829767e-06, + "loss": 0.4341, + "step": 6805 + }, + { + "epoch": 3.3151997401753817, + "grad_norm": 2.6646034717559814, + "learning_rate": 2.115648597267207e-06, + "loss": 0.4488, + "step": 6806 + }, + { + "epoch": 3.315686911334849, + "grad_norm": 2.753606081008911, + "learning_rate": 2.1150131196877775e-06, + "loss": 0.3913, + "step": 6807 + }, + { + "epoch": 3.3161740824943164, + "grad_norm": 3.1903607845306396, + "learning_rate": 2.114377667586744e-06, + "loss": 0.4274, + "step": 6808 + }, + { + "epoch": 3.3166612536537836, + "grad_norm": 3.068371057510376, + "learning_rate": 2.1137422410061613e-06, + "loss": 0.4786, + "step": 6809 + }, + { + "epoch": 3.3171484248132512, + "grad_norm": 2.687919855117798, + "learning_rate": 2.1131068399880805e-06, + "loss": 0.393, + "step": 6810 + }, + { + "epoch": 3.3176355959727184, + "grad_norm": 2.9097890853881836, + "learning_rate": 2.112471464574553e-06, + "loss": 0.4353, + "step": 6811 + }, + { + "epoch": 3.318122767132186, + "grad_norm": 2.699496269226074, + "learning_rate": 2.111836114807628e-06, + "loss": 0.4323, + "step": 6812 + }, + { + "epoch": 3.318609938291653, + "grad_norm": 3.0991439819335938, + "learning_rate": 2.111200790729353e-06, + "loss": 0.4251, + "step": 6813 + }, + { + "epoch": 3.3190971094511204, + "grad_norm": 2.899622917175293, + "learning_rate": 2.110565492381773e-06, + "loss": 0.416, + "step": 6814 + }, + { + "epoch": 3.319584280610588, + "grad_norm": 2.872802495956421, + "learning_rate": 2.1099302198069327e-06, + "loss": 0.3908, + "step": 6815 + }, + { + "epoch": 3.320071451770055, + "grad_norm": 2.930147886276245, + "learning_rate": 2.109294973046874e-06, + "loss": 0.3856, + "step": 6816 + }, + { + "epoch": 3.3205586229295228, + "grad_norm": 3.033634662628174, + "learning_rate": 2.1086597521436377e-06, + "loss": 0.4761, + "step": 6817 + }, + { + "epoch": 3.32104579408899, + "grad_norm": 3.0027894973754883, + "learning_rate": 2.108024557139263e-06, + "loss": 0.4415, + "step": 6818 + }, + { + "epoch": 3.3215329652484575, + "grad_norm": 2.8185009956359863, + "learning_rate": 2.1073893880757857e-06, + "loss": 0.3759, + "step": 6819 + }, + { + "epoch": 3.3220201364079247, + "grad_norm": 2.9725544452667236, + "learning_rate": 2.1067542449952425e-06, + "loss": 0.447, + "step": 6820 + }, + { + "epoch": 3.322507307567392, + "grad_norm": 3.1715550422668457, + "learning_rate": 2.1061191279396675e-06, + "loss": 0.4629, + "step": 6821 + }, + { + "epoch": 3.3229944787268595, + "grad_norm": 2.9665536880493164, + "learning_rate": 2.105484036951091e-06, + "loss": 0.4392, + "step": 6822 + }, + { + "epoch": 3.3234816498863267, + "grad_norm": 3.030409097671509, + "learning_rate": 2.104848972071544e-06, + "loss": 0.446, + "step": 6823 + }, + { + "epoch": 3.3239688210457943, + "grad_norm": 3.0027778148651123, + "learning_rate": 2.1042139333430557e-06, + "loss": 0.3987, + "step": 6824 + }, + { + "epoch": 3.3244559922052614, + "grad_norm": 3.1084377765655518, + "learning_rate": 2.1035789208076534e-06, + "loss": 0.4296, + "step": 6825 + }, + { + "epoch": 3.3249431633647286, + "grad_norm": 2.726851224899292, + "learning_rate": 2.1029439345073598e-06, + "loss": 0.4187, + "step": 6826 + }, + { + "epoch": 3.3254303345241962, + "grad_norm": 3.0765419006347656, + "learning_rate": 2.1023089744841995e-06, + "loss": 0.4519, + "step": 6827 + }, + { + "epoch": 3.3259175056836634, + "grad_norm": 3.045745372772217, + "learning_rate": 2.1016740407801957e-06, + "loss": 0.4263, + "step": 6828 + }, + { + "epoch": 3.326404676843131, + "grad_norm": 2.761040449142456, + "learning_rate": 2.1010391334373657e-06, + "loss": 0.4245, + "step": 6829 + }, + { + "epoch": 3.326891848002598, + "grad_norm": 2.8245437145233154, + "learning_rate": 2.1004042524977284e-06, + "loss": 0.3552, + "step": 6830 + }, + { + "epoch": 3.327379019162066, + "grad_norm": 3.1309075355529785, + "learning_rate": 2.099769398003301e-06, + "loss": 0.4192, + "step": 6831 + }, + { + "epoch": 3.327866190321533, + "grad_norm": 2.7415521144866943, + "learning_rate": 2.099134569996098e-06, + "loss": 0.4298, + "step": 6832 + }, + { + "epoch": 3.328353361481, + "grad_norm": 3.491976737976074, + "learning_rate": 2.0984997685181312e-06, + "loss": 0.4915, + "step": 6833 + }, + { + "epoch": 3.3288405326404678, + "grad_norm": 3.0911946296691895, + "learning_rate": 2.0978649936114123e-06, + "loss": 0.425, + "step": 6834 + }, + { + "epoch": 3.329327703799935, + "grad_norm": 2.8578097820281982, + "learning_rate": 2.097230245317951e-06, + "loss": 0.4119, + "step": 6835 + }, + { + "epoch": 3.3298148749594025, + "grad_norm": 2.9308595657348633, + "learning_rate": 2.096595523679755e-06, + "loss": 0.3831, + "step": 6836 + }, + { + "epoch": 3.3303020461188697, + "grad_norm": 3.124505043029785, + "learning_rate": 2.0959608287388284e-06, + "loss": 0.3981, + "step": 6837 + }, + { + "epoch": 3.330789217278337, + "grad_norm": 2.9704740047454834, + "learning_rate": 2.095326160537178e-06, + "loss": 0.4071, + "step": 6838 + }, + { + "epoch": 3.3312763884378045, + "grad_norm": 2.7128190994262695, + "learning_rate": 2.094691519116803e-06, + "loss": 0.4058, + "step": 6839 + }, + { + "epoch": 3.3317635595972717, + "grad_norm": 2.7184555530548096, + "learning_rate": 2.0940569045197067e-06, + "loss": 0.4367, + "step": 6840 + }, + { + "epoch": 3.3322507307567393, + "grad_norm": 3.0938377380371094, + "learning_rate": 2.0934223167878865e-06, + "loss": 0.4463, + "step": 6841 + }, + { + "epoch": 3.3327379019162064, + "grad_norm": 3.017120599746704, + "learning_rate": 2.0927877559633393e-06, + "loss": 0.3671, + "step": 6842 + }, + { + "epoch": 3.333225073075674, + "grad_norm": 3.915295124053955, + "learning_rate": 2.09215322208806e-06, + "loss": 0.4772, + "step": 6843 + }, + { + "epoch": 3.3337122442351412, + "grad_norm": 3.3954195976257324, + "learning_rate": 2.091518715204044e-06, + "loss": 0.4337, + "step": 6844 + }, + { + "epoch": 3.3341994153946084, + "grad_norm": 2.8962814807891846, + "learning_rate": 2.0908842353532803e-06, + "loss": 0.4408, + "step": 6845 + }, + { + "epoch": 3.334686586554076, + "grad_norm": 2.9870781898498535, + "learning_rate": 2.0902497825777598e-06, + "loss": 0.4471, + "step": 6846 + }, + { + "epoch": 3.335173757713543, + "grad_norm": 2.6248619556427, + "learning_rate": 2.08961535691947e-06, + "loss": 0.4423, + "step": 6847 + }, + { + "epoch": 3.335660928873011, + "grad_norm": 3.064199686050415, + "learning_rate": 2.088980958420399e-06, + "loss": 0.3685, + "step": 6848 + }, + { + "epoch": 3.336148100032478, + "grad_norm": 2.8769333362579346, + "learning_rate": 2.088346587122529e-06, + "loss": 0.4057, + "step": 6849 + }, + { + "epoch": 3.3366352711919456, + "grad_norm": 3.3336403369903564, + "learning_rate": 2.0877122430678435e-06, + "loss": 0.4608, + "step": 6850 + }, + { + "epoch": 3.3371224423514128, + "grad_norm": 3.337892770767212, + "learning_rate": 2.0870779262983234e-06, + "loss": 0.526, + "step": 6851 + }, + { + "epoch": 3.33760961351088, + "grad_norm": 3.32401704788208, + "learning_rate": 2.0864436368559483e-06, + "loss": 0.4521, + "step": 6852 + }, + { + "epoch": 3.3380967846703475, + "grad_norm": 3.2695846557617188, + "learning_rate": 2.0858093747826944e-06, + "loss": 0.4603, + "step": 6853 + }, + { + "epoch": 3.3385839558298147, + "grad_norm": 3.4659509658813477, + "learning_rate": 2.0851751401205374e-06, + "loss": 0.4813, + "step": 6854 + }, + { + "epoch": 3.3390711269892823, + "grad_norm": 3.0972626209259033, + "learning_rate": 2.084540932911451e-06, + "loss": 0.4972, + "step": 6855 + }, + { + "epoch": 3.3395582981487495, + "grad_norm": 2.8884730339050293, + "learning_rate": 2.0839067531974073e-06, + "loss": 0.4551, + "step": 6856 + }, + { + "epoch": 3.340045469308217, + "grad_norm": 3.0576722621917725, + "learning_rate": 2.0832726010203754e-06, + "loss": 0.4727, + "step": 6857 + }, + { + "epoch": 3.3405326404676843, + "grad_norm": 3.278252601623535, + "learning_rate": 2.0826384764223247e-06, + "loss": 0.4784, + "step": 6858 + }, + { + "epoch": 3.3410198116271514, + "grad_norm": 3.004270553588867, + "learning_rate": 2.08200437944522e-06, + "loss": 0.4404, + "step": 6859 + }, + { + "epoch": 3.341506982786619, + "grad_norm": 3.077392101287842, + "learning_rate": 2.081370310131027e-06, + "loss": 0.4366, + "step": 6860 + }, + { + "epoch": 3.3419941539460862, + "grad_norm": 2.8598239421844482, + "learning_rate": 2.080736268521708e-06, + "loss": 0.45, + "step": 6861 + }, + { + "epoch": 3.342481325105554, + "grad_norm": 3.1953134536743164, + "learning_rate": 2.080102254659223e-06, + "loss": 0.4835, + "step": 6862 + }, + { + "epoch": 3.342968496265021, + "grad_norm": 3.3203580379486084, + "learning_rate": 2.079468268585532e-06, + "loss": 0.4773, + "step": 6863 + }, + { + "epoch": 3.3434556674244886, + "grad_norm": 2.8592820167541504, + "learning_rate": 2.0788343103425927e-06, + "loss": 0.4778, + "step": 6864 + }, + { + "epoch": 3.343942838583956, + "grad_norm": 2.9332902431488037, + "learning_rate": 2.0782003799723587e-06, + "loss": 0.4355, + "step": 6865 + }, + { + "epoch": 3.344430009743423, + "grad_norm": 2.923360824584961, + "learning_rate": 2.077566477516784e-06, + "loss": 0.433, + "step": 6866 + }, + { + "epoch": 3.3449171809028906, + "grad_norm": 2.7878520488739014, + "learning_rate": 2.0769326030178205e-06, + "loss": 0.3909, + "step": 6867 + }, + { + "epoch": 3.3454043520623578, + "grad_norm": 2.9337053298950195, + "learning_rate": 2.076298756517419e-06, + "loss": 0.3412, + "step": 6868 + }, + { + "epoch": 3.3458915232218254, + "grad_norm": 3.2596182823181152, + "learning_rate": 2.0756649380575254e-06, + "loss": 0.3965, + "step": 6869 + }, + { + "epoch": 3.3463786943812925, + "grad_norm": 2.842313528060913, + "learning_rate": 2.075031147680087e-06, + "loss": 0.4521, + "step": 6870 + }, + { + "epoch": 3.34686586554076, + "grad_norm": 2.813758611679077, + "learning_rate": 2.074397385427047e-06, + "loss": 0.4277, + "step": 6871 + }, + { + "epoch": 3.3473530367002273, + "grad_norm": 2.743195056915283, + "learning_rate": 2.0737636513403496e-06, + "loss": 0.3143, + "step": 6872 + }, + { + "epoch": 3.3478402078596945, + "grad_norm": 3.104469060897827, + "learning_rate": 2.0731299454619334e-06, + "loss": 0.4012, + "step": 6873 + }, + { + "epoch": 3.348327379019162, + "grad_norm": 2.9657187461853027, + "learning_rate": 2.0724962678337373e-06, + "loss": 0.4413, + "step": 6874 + }, + { + "epoch": 3.3488145501786293, + "grad_norm": 3.045670509338379, + "learning_rate": 2.0718626184976985e-06, + "loss": 0.4497, + "step": 6875 + }, + { + "epoch": 3.349301721338097, + "grad_norm": 3.043114423751831, + "learning_rate": 2.071228997495753e-06, + "loss": 0.4279, + "step": 6876 + }, + { + "epoch": 3.349788892497564, + "grad_norm": 3.3509483337402344, + "learning_rate": 2.0705954048698312e-06, + "loss": 0.4157, + "step": 6877 + }, + { + "epoch": 3.3502760636570317, + "grad_norm": 2.87941837310791, + "learning_rate": 2.0699618406618658e-06, + "loss": 0.4241, + "step": 6878 + }, + { + "epoch": 3.350763234816499, + "grad_norm": 3.022745132446289, + "learning_rate": 2.0693283049137863e-06, + "loss": 0.4246, + "step": 6879 + }, + { + "epoch": 3.351250405975966, + "grad_norm": 3.322925329208374, + "learning_rate": 2.0686947976675186e-06, + "loss": 0.4481, + "step": 6880 + }, + { + "epoch": 3.3517375771354336, + "grad_norm": 3.120741128921509, + "learning_rate": 2.06806131896499e-06, + "loss": 0.4081, + "step": 6881 + }, + { + "epoch": 3.352224748294901, + "grad_norm": 3.004920721054077, + "learning_rate": 2.067427868848122e-06, + "loss": 0.4488, + "step": 6882 + }, + { + "epoch": 3.3527119194543684, + "grad_norm": 3.7546675205230713, + "learning_rate": 2.0667944473588384e-06, + "loss": 0.4582, + "step": 6883 + }, + { + "epoch": 3.3531990906138356, + "grad_norm": 3.096203565597534, + "learning_rate": 2.0661610545390575e-06, + "loss": 0.4082, + "step": 6884 + }, + { + "epoch": 3.353686261773303, + "grad_norm": 2.940032720565796, + "learning_rate": 2.0655276904306974e-06, + "loss": 0.3737, + "step": 6885 + }, + { + "epoch": 3.3541734329327704, + "grad_norm": 2.986473798751831, + "learning_rate": 2.064894355075674e-06, + "loss": 0.4231, + "step": 6886 + }, + { + "epoch": 3.3546606040922375, + "grad_norm": 3.2420668601989746, + "learning_rate": 2.064261048515903e-06, + "loss": 0.4673, + "step": 6887 + }, + { + "epoch": 3.355147775251705, + "grad_norm": 3.3953402042388916, + "learning_rate": 2.063627770793294e-06, + "loss": 0.4502, + "step": 6888 + }, + { + "epoch": 3.3556349464111723, + "grad_norm": 2.9802417755126953, + "learning_rate": 2.062994521949758e-06, + "loss": 0.4304, + "step": 6889 + }, + { + "epoch": 3.35612211757064, + "grad_norm": 2.818358898162842, + "learning_rate": 2.0623613020272045e-06, + "loss": 0.454, + "step": 6890 + }, + { + "epoch": 3.356609288730107, + "grad_norm": 3.4160070419311523, + "learning_rate": 2.0617281110675404e-06, + "loss": 0.4383, + "step": 6891 + }, + { + "epoch": 3.3570964598895747, + "grad_norm": 3.0863394737243652, + "learning_rate": 2.061094949112668e-06, + "loss": 0.378, + "step": 6892 + }, + { + "epoch": 3.357583631049042, + "grad_norm": 3.053161144256592, + "learning_rate": 2.0604618162044907e-06, + "loss": 0.4258, + "step": 6893 + }, + { + "epoch": 3.358070802208509, + "grad_norm": 3.1239988803863525, + "learning_rate": 2.0598287123849097e-06, + "loss": 0.4823, + "step": 6894 + }, + { + "epoch": 3.3585579733679767, + "grad_norm": 3.045820474624634, + "learning_rate": 2.0591956376958243e-06, + "loss": 0.4043, + "step": 6895 + }, + { + "epoch": 3.359045144527444, + "grad_norm": 3.516514778137207, + "learning_rate": 2.0585625921791296e-06, + "loss": 0.4411, + "step": 6896 + }, + { + "epoch": 3.3595323156869115, + "grad_norm": 2.97811222076416, + "learning_rate": 2.0579295758767215e-06, + "loss": 0.3853, + "step": 6897 + }, + { + "epoch": 3.3600194868463786, + "grad_norm": 3.656956195831299, + "learning_rate": 2.057296588830493e-06, + "loss": 0.4538, + "step": 6898 + }, + { + "epoch": 3.3605066580058462, + "grad_norm": 3.1071271896362305, + "learning_rate": 2.056663631082336e-06, + "loss": 0.4708, + "step": 6899 + }, + { + "epoch": 3.3609938291653134, + "grad_norm": 2.808359384536743, + "learning_rate": 2.0560307026741375e-06, + "loss": 0.424, + "step": 6900 + }, + { + "epoch": 3.3614810003247806, + "grad_norm": 2.963728427886963, + "learning_rate": 2.055397803647786e-06, + "loss": 0.4139, + "step": 6901 + }, + { + "epoch": 3.361968171484248, + "grad_norm": 3.1006510257720947, + "learning_rate": 2.0547649340451666e-06, + "loss": 0.4802, + "step": 6902 + }, + { + "epoch": 3.3624553426437154, + "grad_norm": 3.138824462890625, + "learning_rate": 2.0541320939081627e-06, + "loss": 0.4583, + "step": 6903 + }, + { + "epoch": 3.362942513803183, + "grad_norm": 3.0037765502929688, + "learning_rate": 2.0534992832786547e-06, + "loss": 0.4295, + "step": 6904 + }, + { + "epoch": 3.36342968496265, + "grad_norm": 2.982694625854492, + "learning_rate": 2.0528665021985226e-06, + "loss": 0.4399, + "step": 6905 + }, + { + "epoch": 3.3639168561221178, + "grad_norm": 2.9703495502471924, + "learning_rate": 2.052233750709644e-06, + "loss": 0.3693, + "step": 6906 + }, + { + "epoch": 3.364404027281585, + "grad_norm": 3.1004016399383545, + "learning_rate": 2.0516010288538947e-06, + "loss": 0.4307, + "step": 6907 + }, + { + "epoch": 3.364891198441052, + "grad_norm": 2.88413667678833, + "learning_rate": 2.0509683366731467e-06, + "loss": 0.3961, + "step": 6908 + }, + { + "epoch": 3.3653783696005197, + "grad_norm": 3.0805773735046387, + "learning_rate": 2.0503356742092723e-06, + "loss": 0.4196, + "step": 6909 + }, + { + "epoch": 3.365865540759987, + "grad_norm": 2.9464924335479736, + "learning_rate": 2.0497030415041413e-06, + "loss": 0.4212, + "step": 6910 + }, + { + "epoch": 3.3663527119194545, + "grad_norm": 2.8591012954711914, + "learning_rate": 2.049070438599622e-06, + "loss": 0.4211, + "step": 6911 + }, + { + "epoch": 3.3668398830789217, + "grad_norm": 2.882464647293091, + "learning_rate": 2.0484378655375777e-06, + "loss": 0.4132, + "step": 6912 + }, + { + "epoch": 3.3673270542383893, + "grad_norm": 2.9223718643188477, + "learning_rate": 2.0478053223598735e-06, + "loss": 0.3695, + "step": 6913 + }, + { + "epoch": 3.3678142253978565, + "grad_norm": 2.782140016555786, + "learning_rate": 2.047172809108371e-06, + "loss": 0.4308, + "step": 6914 + }, + { + "epoch": 3.3683013965573236, + "grad_norm": 3.1624643802642822, + "learning_rate": 2.0465403258249316e-06, + "loss": 0.4611, + "step": 6915 + }, + { + "epoch": 3.3687885677167912, + "grad_norm": 3.160050630569458, + "learning_rate": 2.045907872551409e-06, + "loss": 0.4419, + "step": 6916 + }, + { + "epoch": 3.3692757388762584, + "grad_norm": 3.212024211883545, + "learning_rate": 2.045275449329662e-06, + "loss": 0.5187, + "step": 6917 + }, + { + "epoch": 3.369762910035726, + "grad_norm": 3.4279701709747314, + "learning_rate": 2.044643056201543e-06, + "loss": 0.5058, + "step": 6918 + }, + { + "epoch": 3.370250081195193, + "grad_norm": 2.9178037643432617, + "learning_rate": 2.0440106932089054e-06, + "loss": 0.4147, + "step": 6919 + }, + { + "epoch": 3.370737252354661, + "grad_norm": 3.2004823684692383, + "learning_rate": 2.0433783603935964e-06, + "loss": 0.4542, + "step": 6920 + }, + { + "epoch": 3.371224423514128, + "grad_norm": 3.2103874683380127, + "learning_rate": 2.042746057797465e-06, + "loss": 0.4428, + "step": 6921 + }, + { + "epoch": 3.371711594673595, + "grad_norm": 3.0982213020324707, + "learning_rate": 2.0421137854623574e-06, + "loss": 0.4276, + "step": 6922 + }, + { + "epoch": 3.3721987658330628, + "grad_norm": 2.98573899269104, + "learning_rate": 2.0414815434301173e-06, + "loss": 0.4144, + "step": 6923 + }, + { + "epoch": 3.37268593699253, + "grad_norm": 3.2774300575256348, + "learning_rate": 2.040849331742585e-06, + "loss": 0.4281, + "step": 6924 + }, + { + "epoch": 3.3731731081519976, + "grad_norm": 3.359243154525757, + "learning_rate": 2.040217150441602e-06, + "loss": 0.5203, + "step": 6925 + }, + { + "epoch": 3.3736602793114647, + "grad_norm": 3.167165517807007, + "learning_rate": 2.0395849995690044e-06, + "loss": 0.4328, + "step": 6926 + }, + { + "epoch": 3.3741474504709323, + "grad_norm": 3.053866386413574, + "learning_rate": 2.038952879166629e-06, + "loss": 0.4448, + "step": 6927 + }, + { + "epoch": 3.3746346216303995, + "grad_norm": 3.0651278495788574, + "learning_rate": 2.0383207892763093e-06, + "loss": 0.4301, + "step": 6928 + }, + { + "epoch": 3.3751217927898667, + "grad_norm": 3.2135252952575684, + "learning_rate": 2.0376887299398763e-06, + "loss": 0.4993, + "step": 6929 + }, + { + "epoch": 3.3756089639493343, + "grad_norm": 3.0971081256866455, + "learning_rate": 2.0370567011991598e-06, + "loss": 0.4378, + "step": 6930 + }, + { + "epoch": 3.3760961351088015, + "grad_norm": 2.806074380874634, + "learning_rate": 2.036424703095989e-06, + "loss": 0.4344, + "step": 6931 + }, + { + "epoch": 3.376583306268269, + "grad_norm": 3.1244750022888184, + "learning_rate": 2.035792735672187e-06, + "loss": 0.4701, + "step": 6932 + }, + { + "epoch": 3.3770704774277363, + "grad_norm": 2.959825038909912, + "learning_rate": 2.035160798969579e-06, + "loss": 0.4623, + "step": 6933 + }, + { + "epoch": 3.377557648587204, + "grad_norm": 3.2786147594451904, + "learning_rate": 2.034528893029987e-06, + "loss": 0.4424, + "step": 6934 + }, + { + "epoch": 3.378044819746671, + "grad_norm": 3.3671154975891113, + "learning_rate": 2.0338970178952285e-06, + "loss": 0.4453, + "step": 6935 + }, + { + "epoch": 3.378531990906138, + "grad_norm": 3.3578524589538574, + "learning_rate": 2.0332651736071213e-06, + "loss": 0.4427, + "step": 6936 + }, + { + "epoch": 3.379019162065606, + "grad_norm": 2.813936948776245, + "learning_rate": 2.0326333602074817e-06, + "loss": 0.4251, + "step": 6937 + }, + { + "epoch": 3.379506333225073, + "grad_norm": 2.931091070175171, + "learning_rate": 2.0320015777381243e-06, + "loss": 0.4385, + "step": 6938 + }, + { + "epoch": 3.3799935043845406, + "grad_norm": 2.825216054916382, + "learning_rate": 2.0313698262408574e-06, + "loss": 0.4026, + "step": 6939 + }, + { + "epoch": 3.3804806755440078, + "grad_norm": 3.3027663230895996, + "learning_rate": 2.030738105757492e-06, + "loss": 0.3848, + "step": 6940 + }, + { + "epoch": 3.3809678467034754, + "grad_norm": 3.7981176376342773, + "learning_rate": 2.030106416329835e-06, + "loss": 0.4706, + "step": 6941 + }, + { + "epoch": 3.3814550178629426, + "grad_norm": 3.178455352783203, + "learning_rate": 2.0294747579996924e-06, + "loss": 0.4023, + "step": 6942 + }, + { + "epoch": 3.3819421890224097, + "grad_norm": 2.899405002593994, + "learning_rate": 2.0288431308088656e-06, + "loss": 0.4451, + "step": 6943 + }, + { + "epoch": 3.3824293601818773, + "grad_norm": 3.0982158184051514, + "learning_rate": 2.028211534799157e-06, + "loss": 0.4727, + "step": 6944 + }, + { + "epoch": 3.3829165313413445, + "grad_norm": 2.7750778198242188, + "learning_rate": 2.027579970012365e-06, + "loss": 0.4718, + "step": 6945 + }, + { + "epoch": 3.383403702500812, + "grad_norm": 3.2608072757720947, + "learning_rate": 2.0269484364902868e-06, + "loss": 0.4307, + "step": 6946 + }, + { + "epoch": 3.3838908736602793, + "grad_norm": 3.202890634536743, + "learning_rate": 2.0263169342747167e-06, + "loss": 0.4743, + "step": 6947 + }, + { + "epoch": 3.384378044819747, + "grad_norm": 2.9402480125427246, + "learning_rate": 2.025685463407448e-06, + "loss": 0.389, + "step": 6948 + }, + { + "epoch": 3.384865215979214, + "grad_norm": 3.3228824138641357, + "learning_rate": 2.0250540239302705e-06, + "loss": 0.4439, + "step": 6949 + }, + { + "epoch": 3.3853523871386813, + "grad_norm": 3.4926671981811523, + "learning_rate": 2.0244226158849745e-06, + "loss": 0.409, + "step": 6950 + }, + { + "epoch": 3.385839558298149, + "grad_norm": 2.98103404045105, + "learning_rate": 2.0237912393133455e-06, + "loss": 0.4827, + "step": 6951 + }, + { + "epoch": 3.386326729457616, + "grad_norm": 2.8400495052337646, + "learning_rate": 2.023159894257167e-06, + "loss": 0.4024, + "step": 6952 + }, + { + "epoch": 3.3868139006170837, + "grad_norm": 3.3700485229492188, + "learning_rate": 2.0225285807582224e-06, + "loss": 0.4588, + "step": 6953 + }, + { + "epoch": 3.387301071776551, + "grad_norm": 3.2876739501953125, + "learning_rate": 2.021897298858293e-06, + "loss": 0.3502, + "step": 6954 + }, + { + "epoch": 3.3877882429360184, + "grad_norm": 2.9399960041046143, + "learning_rate": 2.021266048599155e-06, + "loss": 0.4976, + "step": 6955 + }, + { + "epoch": 3.3882754140954856, + "grad_norm": 3.162545680999756, + "learning_rate": 2.020634830022585e-06, + "loss": 0.4316, + "step": 6956 + }, + { + "epoch": 3.3887625852549528, + "grad_norm": 3.4833948612213135, + "learning_rate": 2.0200036431703573e-06, + "loss": 0.4796, + "step": 6957 + }, + { + "epoch": 3.3892497564144204, + "grad_norm": 3.1820571422576904, + "learning_rate": 2.0193724880842453e-06, + "loss": 0.4533, + "step": 6958 + }, + { + "epoch": 3.3897369275738876, + "grad_norm": 2.6705784797668457, + "learning_rate": 2.0187413648060158e-06, + "loss": 0.3695, + "step": 6959 + }, + { + "epoch": 3.390224098733355, + "grad_norm": 3.3248226642608643, + "learning_rate": 2.018110273377438e-06, + "loss": 0.4664, + "step": 6960 + }, + { + "epoch": 3.3907112698928223, + "grad_norm": 2.9569880962371826, + "learning_rate": 2.0174792138402775e-06, + "loss": 0.4463, + "step": 6961 + }, + { + "epoch": 3.3911984410522895, + "grad_norm": 3.176683187484741, + "learning_rate": 2.016848186236299e-06, + "loss": 0.4271, + "step": 6962 + }, + { + "epoch": 3.391685612211757, + "grad_norm": 2.886734962463379, + "learning_rate": 2.016217190607261e-06, + "loss": 0.413, + "step": 6963 + }, + { + "epoch": 3.3921727833712243, + "grad_norm": 3.2841269969940186, + "learning_rate": 2.0155862269949245e-06, + "loss": 0.3866, + "step": 6964 + }, + { + "epoch": 3.392659954530692, + "grad_norm": 3.354401111602783, + "learning_rate": 2.014955295441046e-06, + "loss": 0.4778, + "step": 6965 + }, + { + "epoch": 3.393147125690159, + "grad_norm": 3.4992055892944336, + "learning_rate": 2.0143243959873825e-06, + "loss": 0.439, + "step": 6966 + }, + { + "epoch": 3.3936342968496263, + "grad_norm": 2.7979564666748047, + "learning_rate": 2.013693528675684e-06, + "loss": 0.4256, + "step": 6967 + }, + { + "epoch": 3.394121468009094, + "grad_norm": 3.1714730262756348, + "learning_rate": 2.0130626935477033e-06, + "loss": 0.4646, + "step": 6968 + }, + { + "epoch": 3.394608639168561, + "grad_norm": 2.875222682952881, + "learning_rate": 2.0124318906451874e-06, + "loss": 0.4463, + "step": 6969 + }, + { + "epoch": 3.3950958103280287, + "grad_norm": 2.6494672298431396, + "learning_rate": 2.011801120009884e-06, + "loss": 0.3852, + "step": 6970 + }, + { + "epoch": 3.395582981487496, + "grad_norm": 3.0545308589935303, + "learning_rate": 2.011170381683537e-06, + "loss": 0.4264, + "step": 6971 + }, + { + "epoch": 3.3960701526469634, + "grad_norm": 3.203718662261963, + "learning_rate": 2.0105396757078884e-06, + "loss": 0.3699, + "step": 6972 + }, + { + "epoch": 3.3965573238064306, + "grad_norm": 3.0995893478393555, + "learning_rate": 2.009909002124678e-06, + "loss": 0.3956, + "step": 6973 + }, + { + "epoch": 3.3970444949658978, + "grad_norm": 2.6654741764068604, + "learning_rate": 2.0092783609756454e-06, + "loss": 0.3751, + "step": 6974 + }, + { + "epoch": 3.3975316661253654, + "grad_norm": 2.877145767211914, + "learning_rate": 2.0086477523025243e-06, + "loss": 0.4002, + "step": 6975 + }, + { + "epoch": 3.3980188372848326, + "grad_norm": 3.080890417098999, + "learning_rate": 2.0080171761470493e-06, + "loss": 0.4469, + "step": 6976 + }, + { + "epoch": 3.3985060084443, + "grad_norm": 3.000033140182495, + "learning_rate": 2.0073866325509516e-06, + "loss": 0.4225, + "step": 6977 + }, + { + "epoch": 3.3989931796037673, + "grad_norm": 3.3309707641601562, + "learning_rate": 2.006756121555962e-06, + "loss": 0.5077, + "step": 6978 + }, + { + "epoch": 3.399480350763235, + "grad_norm": 3.1966629028320312, + "learning_rate": 2.006125643203805e-06, + "loss": 0.4124, + "step": 6979 + }, + { + "epoch": 3.399967521922702, + "grad_norm": 3.4054806232452393, + "learning_rate": 2.005495197536207e-06, + "loss": 0.3879, + "step": 6980 + }, + { + "epoch": 3.4004546930821693, + "grad_norm": 3.0516750812530518, + "learning_rate": 2.0048647845948905e-06, + "loss": 0.4348, + "step": 6981 + }, + { + "epoch": 3.400941864241637, + "grad_norm": 3.0930020809173584, + "learning_rate": 2.004234404421578e-06, + "loss": 0.4682, + "step": 6982 + }, + { + "epoch": 3.401429035401104, + "grad_norm": 3.123298406600952, + "learning_rate": 2.0036040570579853e-06, + "loss": 0.4923, + "step": 6983 + }, + { + "epoch": 3.4019162065605717, + "grad_norm": 2.811669111251831, + "learning_rate": 2.0029737425458297e-06, + "loss": 0.3894, + "step": 6984 + }, + { + "epoch": 3.402403377720039, + "grad_norm": 2.9847640991210938, + "learning_rate": 2.002343460926827e-06, + "loss": 0.4678, + "step": 6985 + }, + { + "epoch": 3.4028905488795065, + "grad_norm": 3.2703096866607666, + "learning_rate": 2.001713212242686e-06, + "loss": 0.4553, + "step": 6986 + }, + { + "epoch": 3.4033777200389737, + "grad_norm": 3.0280771255493164, + "learning_rate": 2.0010829965351185e-06, + "loss": 0.3947, + "step": 6987 + }, + { + "epoch": 3.403864891198441, + "grad_norm": 3.0596797466278076, + "learning_rate": 2.000452813845832e-06, + "loss": 0.4401, + "step": 6988 + }, + { + "epoch": 3.4043520623579084, + "grad_norm": 3.151484966278076, + "learning_rate": 1.9998226642165325e-06, + "loss": 0.3862, + "step": 6989 + }, + { + "epoch": 3.4048392335173756, + "grad_norm": 3.1538736820220947, + "learning_rate": 1.999192547688922e-06, + "loss": 0.4178, + "step": 6990 + }, + { + "epoch": 3.405326404676843, + "grad_norm": 2.9495060443878174, + "learning_rate": 1.998562464304702e-06, + "loss": 0.4235, + "step": 6991 + }, + { + "epoch": 3.4058135758363104, + "grad_norm": 2.973815679550171, + "learning_rate": 1.997932414105571e-06, + "loss": 0.4094, + "step": 6992 + }, + { + "epoch": 3.406300746995778, + "grad_norm": 3.394670009613037, + "learning_rate": 1.997302397133227e-06, + "loss": 0.4553, + "step": 6993 + }, + { + "epoch": 3.406787918155245, + "grad_norm": 3.259704828262329, + "learning_rate": 1.996672413429363e-06, + "loss": 0.4122, + "step": 6994 + }, + { + "epoch": 3.4072750893147123, + "grad_norm": 2.839536190032959, + "learning_rate": 1.9960424630356715e-06, + "loss": 0.3883, + "step": 6995 + }, + { + "epoch": 3.40776226047418, + "grad_norm": 2.821457862854004, + "learning_rate": 1.995412545993843e-06, + "loss": 0.4502, + "step": 6996 + }, + { + "epoch": 3.408249431633647, + "grad_norm": 2.90908145904541, + "learning_rate": 1.994782662345566e-06, + "loss": 0.3627, + "step": 6997 + }, + { + "epoch": 3.4087366027931147, + "grad_norm": 2.7570199966430664, + "learning_rate": 1.9941528121325245e-06, + "loss": 0.4096, + "step": 6998 + }, + { + "epoch": 3.409223773952582, + "grad_norm": 3.1786248683929443, + "learning_rate": 1.9935229953964027e-06, + "loss": 0.466, + "step": 6999 + }, + { + "epoch": 3.4097109451120495, + "grad_norm": 2.708771228790283, + "learning_rate": 1.9928932121788823e-06, + "loss": 0.4077, + "step": 7000 + }, + { + "epoch": 3.4101981162715167, + "grad_norm": 2.85648512840271, + "learning_rate": 1.9922634625216425e-06, + "loss": 0.4495, + "step": 7001 + }, + { + "epoch": 3.410685287430984, + "grad_norm": 3.013468027114868, + "learning_rate": 1.9916337464663585e-06, + "loss": 0.479, + "step": 7002 + }, + { + "epoch": 3.4111724585904515, + "grad_norm": 3.444247007369995, + "learning_rate": 1.9910040640547056e-06, + "loss": 0.4547, + "step": 7003 + }, + { + "epoch": 3.4116596297499187, + "grad_norm": 2.9787325859069824, + "learning_rate": 1.990374415328357e-06, + "loss": 0.4126, + "step": 7004 + }, + { + "epoch": 3.4121468009093863, + "grad_norm": 3.513378858566284, + "learning_rate": 1.9897448003289826e-06, + "loss": 0.4645, + "step": 7005 + }, + { + "epoch": 3.4126339720688534, + "grad_norm": 2.81638765335083, + "learning_rate": 1.989115219098249e-06, + "loss": 0.3699, + "step": 7006 + }, + { + "epoch": 3.413121143228321, + "grad_norm": 2.8006060123443604, + "learning_rate": 1.9884856716778223e-06, + "loss": 0.4295, + "step": 7007 + }, + { + "epoch": 3.4136083143877882, + "grad_norm": 3.2103826999664307, + "learning_rate": 1.987856158109367e-06, + "loss": 0.4208, + "step": 7008 + }, + { + "epoch": 3.4140954855472554, + "grad_norm": 3.087021827697754, + "learning_rate": 1.987226678434544e-06, + "loss": 0.4269, + "step": 7009 + }, + { + "epoch": 3.414582656706723, + "grad_norm": 2.8218541145324707, + "learning_rate": 1.9865972326950103e-06, + "loss": 0.4042, + "step": 7010 + }, + { + "epoch": 3.41506982786619, + "grad_norm": 3.031276226043701, + "learning_rate": 1.9859678209324243e-06, + "loss": 0.5006, + "step": 7011 + }, + { + "epoch": 3.415556999025658, + "grad_norm": 2.998175621032715, + "learning_rate": 1.9853384431884408e-06, + "loss": 0.4249, + "step": 7012 + }, + { + "epoch": 3.416044170185125, + "grad_norm": 2.7796857357025146, + "learning_rate": 1.984709099504711e-06, + "loss": 0.4272, + "step": 7013 + }, + { + "epoch": 3.4165313413445926, + "grad_norm": 2.7009074687957764, + "learning_rate": 1.9840797899228844e-06, + "loss": 0.3835, + "step": 7014 + }, + { + "epoch": 3.4170185125040597, + "grad_norm": 2.995699882507324, + "learning_rate": 1.9834505144846096e-06, + "loss": 0.4849, + "step": 7015 + }, + { + "epoch": 3.417505683663527, + "grad_norm": 3.4611198902130127, + "learning_rate": 1.9828212732315312e-06, + "loss": 0.471, + "step": 7016 + }, + { + "epoch": 3.4179928548229945, + "grad_norm": 2.925145149230957, + "learning_rate": 1.982192066205294e-06, + "loss": 0.3992, + "step": 7017 + }, + { + "epoch": 3.4184800259824617, + "grad_norm": 2.6309049129486084, + "learning_rate": 1.981562893447537e-06, + "loss": 0.3851, + "step": 7018 + }, + { + "epoch": 3.4189671971419293, + "grad_norm": 3.767211437225342, + "learning_rate": 1.9809337549998985e-06, + "loss": 0.4745, + "step": 7019 + }, + { + "epoch": 3.4194543683013965, + "grad_norm": 2.8913235664367676, + "learning_rate": 1.9803046509040163e-06, + "loss": 0.3999, + "step": 7020 + }, + { + "epoch": 3.419941539460864, + "grad_norm": 2.983694314956665, + "learning_rate": 1.979675581201525e-06, + "loss": 0.4402, + "step": 7021 + }, + { + "epoch": 3.4204287106203313, + "grad_norm": 2.916640281677246, + "learning_rate": 1.979046545934054e-06, + "loss": 0.4947, + "step": 7022 + }, + { + "epoch": 3.4209158817797984, + "grad_norm": 2.9151787757873535, + "learning_rate": 1.978417545143234e-06, + "loss": 0.4031, + "step": 7023 + }, + { + "epoch": 3.421403052939266, + "grad_norm": 3.2557172775268555, + "learning_rate": 1.9777885788706925e-06, + "loss": 0.3993, + "step": 7024 + }, + { + "epoch": 3.4218902240987332, + "grad_norm": 3.034432888031006, + "learning_rate": 1.9771596471580557e-06, + "loss": 0.4755, + "step": 7025 + }, + { + "epoch": 3.422377395258201, + "grad_norm": 3.180548667907715, + "learning_rate": 1.976530750046943e-06, + "loss": 0.4235, + "step": 7026 + }, + { + "epoch": 3.422864566417668, + "grad_norm": 2.9152321815490723, + "learning_rate": 1.975901887578977e-06, + "loss": 0.4511, + "step": 7027 + }, + { + "epoch": 3.4233517375771356, + "grad_norm": 3.101881265640259, + "learning_rate": 1.975273059795775e-06, + "loss": 0.4821, + "step": 7028 + }, + { + "epoch": 3.423838908736603, + "grad_norm": 3.052544355392456, + "learning_rate": 1.974644266738954e-06, + "loss": 0.4552, + "step": 7029 + }, + { + "epoch": 3.42432607989607, + "grad_norm": 3.331411361694336, + "learning_rate": 1.974015508450126e-06, + "loss": 0.4253, + "step": 7030 + }, + { + "epoch": 3.4248132510555376, + "grad_norm": 3.041771650314331, + "learning_rate": 1.973386784970902e-06, + "loss": 0.3886, + "step": 7031 + }, + { + "epoch": 3.4253004222150047, + "grad_norm": 3.0103883743286133, + "learning_rate": 1.9727580963428925e-06, + "loss": 0.5187, + "step": 7032 + }, + { + "epoch": 3.4257875933744724, + "grad_norm": 2.7562828063964844, + "learning_rate": 1.972129442607703e-06, + "loss": 0.4495, + "step": 7033 + }, + { + "epoch": 3.4262747645339395, + "grad_norm": 2.815096139907837, + "learning_rate": 1.971500823806937e-06, + "loss": 0.4269, + "step": 7034 + }, + { + "epoch": 3.426761935693407, + "grad_norm": 3.144317388534546, + "learning_rate": 1.9708722399821985e-06, + "loss": 0.4672, + "step": 7035 + }, + { + "epoch": 3.4272491068528743, + "grad_norm": 2.9518632888793945, + "learning_rate": 1.970243691175085e-06, + "loss": 0.4005, + "step": 7036 + }, + { + "epoch": 3.4277362780123415, + "grad_norm": 2.819613456726074, + "learning_rate": 1.9696151774271948e-06, + "loss": 0.4377, + "step": 7037 + }, + { + "epoch": 3.428223449171809, + "grad_norm": 3.1083974838256836, + "learning_rate": 1.968986698780123e-06, + "loss": 0.4902, + "step": 7038 + }, + { + "epoch": 3.4287106203312763, + "grad_norm": 3.031930446624756, + "learning_rate": 1.9683582552754614e-06, + "loss": 0.4341, + "step": 7039 + }, + { + "epoch": 3.429197791490744, + "grad_norm": 3.2239198684692383, + "learning_rate": 1.9677298469548017e-06, + "loss": 0.4295, + "step": 7040 + }, + { + "epoch": 3.429684962650211, + "grad_norm": 3.1628942489624023, + "learning_rate": 1.967101473859731e-06, + "loss": 0.4164, + "step": 7041 + }, + { + "epoch": 3.4301721338096787, + "grad_norm": 3.0360026359558105, + "learning_rate": 1.966473136031835e-06, + "loss": 0.396, + "step": 7042 + }, + { + "epoch": 3.430659304969146, + "grad_norm": 2.927499532699585, + "learning_rate": 1.965844833512697e-06, + "loss": 0.342, + "step": 7043 + }, + { + "epoch": 3.431146476128613, + "grad_norm": 2.83981990814209, + "learning_rate": 1.965216566343899e-06, + "loss": 0.3932, + "step": 7044 + }, + { + "epoch": 3.4316336472880806, + "grad_norm": 3.254018545150757, + "learning_rate": 1.964588334567018e-06, + "loss": 0.5039, + "step": 7045 + }, + { + "epoch": 3.432120818447548, + "grad_norm": 2.9149580001831055, + "learning_rate": 1.9639601382236313e-06, + "loss": 0.4189, + "step": 7046 + }, + { + "epoch": 3.4326079896070154, + "grad_norm": 2.9345755577087402, + "learning_rate": 1.9633319773553123e-06, + "loss": 0.3999, + "step": 7047 + }, + { + "epoch": 3.4330951607664826, + "grad_norm": 3.278862237930298, + "learning_rate": 1.9627038520036345e-06, + "loss": 0.3511, + "step": 7048 + }, + { + "epoch": 3.43358233192595, + "grad_norm": 2.9745264053344727, + "learning_rate": 1.9620757622101643e-06, + "loss": 0.4333, + "step": 7049 + }, + { + "epoch": 3.4340695030854174, + "grad_norm": 3.1128830909729004, + "learning_rate": 1.9614477080164704e-06, + "loss": 0.4392, + "step": 7050 + }, + { + "epoch": 3.4345566742448845, + "grad_norm": 4.63845157623291, + "learning_rate": 1.960819689464117e-06, + "loss": 0.3926, + "step": 7051 + }, + { + "epoch": 3.435043845404352, + "grad_norm": 3.243102788925171, + "learning_rate": 1.9601917065946676e-06, + "loss": 0.4644, + "step": 7052 + }, + { + "epoch": 3.4355310165638193, + "grad_norm": 2.8567707538604736, + "learning_rate": 1.959563759449679e-06, + "loss": 0.4334, + "step": 7053 + }, + { + "epoch": 3.436018187723287, + "grad_norm": 3.1509015560150146, + "learning_rate": 1.958935848070711e-06, + "loss": 0.4507, + "step": 7054 + }, + { + "epoch": 3.436505358882754, + "grad_norm": 2.9328951835632324, + "learning_rate": 1.958307972499319e-06, + "loss": 0.4252, + "step": 7055 + }, + { + "epoch": 3.4369925300422217, + "grad_norm": 3.2440366744995117, + "learning_rate": 1.9576801327770543e-06, + "loss": 0.474, + "step": 7056 + }, + { + "epoch": 3.437479701201689, + "grad_norm": 3.0789358615875244, + "learning_rate": 1.957052328945468e-06, + "loss": 0.4472, + "step": 7057 + }, + { + "epoch": 3.437966872361156, + "grad_norm": 3.2267186641693115, + "learning_rate": 1.956424561046108e-06, + "loss": 0.3861, + "step": 7058 + }, + { + "epoch": 3.4384540435206237, + "grad_norm": 3.0919265747070312, + "learning_rate": 1.95579682912052e-06, + "loss": 0.474, + "step": 7059 + }, + { + "epoch": 3.438941214680091, + "grad_norm": 3.1667237281799316, + "learning_rate": 1.9551691332102472e-06, + "loss": 0.4506, + "step": 7060 + }, + { + "epoch": 3.4394283858395585, + "grad_norm": 2.8905389308929443, + "learning_rate": 1.9545414733568305e-06, + "loss": 0.3629, + "step": 7061 + }, + { + "epoch": 3.4399155569990256, + "grad_norm": 2.5751261711120605, + "learning_rate": 1.953913849601808e-06, + "loss": 0.3764, + "step": 7062 + }, + { + "epoch": 3.4404027281584932, + "grad_norm": 3.0038399696350098, + "learning_rate": 1.9532862619867155e-06, + "loss": 0.4223, + "step": 7063 + }, + { + "epoch": 3.4408898993179604, + "grad_norm": 2.9580163955688477, + "learning_rate": 1.952658710553089e-06, + "loss": 0.441, + "step": 7064 + }, + { + "epoch": 3.4413770704774276, + "grad_norm": 3.103675127029419, + "learning_rate": 1.9520311953424566e-06, + "loss": 0.4949, + "step": 7065 + }, + { + "epoch": 3.441864241636895, + "grad_norm": 3.0967273712158203, + "learning_rate": 1.951403716396349e-06, + "loss": 0.4355, + "step": 7066 + }, + { + "epoch": 3.4423514127963624, + "grad_norm": 3.12019944190979, + "learning_rate": 1.950776273756292e-06, + "loss": 0.4342, + "step": 7067 + }, + { + "epoch": 3.44283858395583, + "grad_norm": 3.507032871246338, + "learning_rate": 1.950148867463811e-06, + "loss": 0.4413, + "step": 7068 + }, + { + "epoch": 3.443325755115297, + "grad_norm": 3.1479034423828125, + "learning_rate": 1.9495214975604263e-06, + "loss": 0.4891, + "step": 7069 + }, + { + "epoch": 3.4438129262747648, + "grad_norm": 2.8649232387542725, + "learning_rate": 1.9488941640876574e-06, + "loss": 0.3747, + "step": 7070 + }, + { + "epoch": 3.444300097434232, + "grad_norm": 2.8824915885925293, + "learning_rate": 1.9482668670870214e-06, + "loss": 0.4029, + "step": 7071 + }, + { + "epoch": 3.444787268593699, + "grad_norm": 2.9267289638519287, + "learning_rate": 1.9476396066000336e-06, + "loss": 0.4526, + "step": 7072 + }, + { + "epoch": 3.4452744397531667, + "grad_norm": 3.0683882236480713, + "learning_rate": 1.947012382668204e-06, + "loss": 0.4303, + "step": 7073 + }, + { + "epoch": 3.445761610912634, + "grad_norm": 3.662552833557129, + "learning_rate": 1.946385195333044e-06, + "loss": 0.4732, + "step": 7074 + }, + { + "epoch": 3.4462487820721015, + "grad_norm": 3.3053274154663086, + "learning_rate": 1.94575804463606e-06, + "loss": 0.4913, + "step": 7075 + }, + { + "epoch": 3.4467359532315687, + "grad_norm": 3.016244888305664, + "learning_rate": 1.945130930618758e-06, + "loss": 0.4926, + "step": 7076 + }, + { + "epoch": 3.4472231243910363, + "grad_norm": 3.630915641784668, + "learning_rate": 1.9445038533226383e-06, + "loss": 0.5141, + "step": 7077 + }, + { + "epoch": 3.4477102955505035, + "grad_norm": 3.8869950771331787, + "learning_rate": 1.943876812789202e-06, + "loss": 0.4351, + "step": 7078 + }, + { + "epoch": 3.4481974667099706, + "grad_norm": 2.6787056922912598, + "learning_rate": 1.9432498090599463e-06, + "loss": 0.3885, + "step": 7079 + }, + { + "epoch": 3.4486846378694382, + "grad_norm": 2.975038528442383, + "learning_rate": 1.942622842176367e-06, + "loss": 0.442, + "step": 7080 + }, + { + "epoch": 3.4491718090289054, + "grad_norm": 3.47965669631958, + "learning_rate": 1.9419959121799557e-06, + "loss": 0.4601, + "step": 7081 + }, + { + "epoch": 3.449658980188373, + "grad_norm": 2.676077365875244, + "learning_rate": 1.941369019112203e-06, + "loss": 0.4021, + "step": 7082 + }, + { + "epoch": 3.45014615134784, + "grad_norm": 3.2048590183258057, + "learning_rate": 1.940742163014596e-06, + "loss": 0.4615, + "step": 7083 + }, + { + "epoch": 3.450633322507308, + "grad_norm": 2.9073092937469482, + "learning_rate": 1.9401153439286223e-06, + "loss": 0.3446, + "step": 7084 + }, + { + "epoch": 3.451120493666775, + "grad_norm": 2.978586435317993, + "learning_rate": 1.9394885618957616e-06, + "loss": 0.3608, + "step": 7085 + }, + { + "epoch": 3.451607664826242, + "grad_norm": 2.7081336975097656, + "learning_rate": 1.938861816957496e-06, + "loss": 0.4314, + "step": 7086 + }, + { + "epoch": 3.4520948359857098, + "grad_norm": 3.693612575531006, + "learning_rate": 1.9382351091553036e-06, + "loss": 0.4348, + "step": 7087 + }, + { + "epoch": 3.452582007145177, + "grad_norm": 2.9114573001861572, + "learning_rate": 1.9376084385306583e-06, + "loss": 0.4254, + "step": 7088 + }, + { + "epoch": 3.4530691783046445, + "grad_norm": 3.143775463104248, + "learning_rate": 1.9369818051250346e-06, + "loss": 0.3863, + "step": 7089 + }, + { + "epoch": 3.4535563494641117, + "grad_norm": 2.8994245529174805, + "learning_rate": 1.9363552089799023e-06, + "loss": 0.4196, + "step": 7090 + }, + { + "epoch": 3.4540435206235793, + "grad_norm": 2.9283435344696045, + "learning_rate": 1.9357286501367314e-06, + "loss": 0.4105, + "step": 7091 + }, + { + "epoch": 3.4545306917830465, + "grad_norm": 3.270876407623291, + "learning_rate": 1.935102128636984e-06, + "loss": 0.4841, + "step": 7092 + }, + { + "epoch": 3.4550178629425137, + "grad_norm": 2.79980206489563, + "learning_rate": 1.9344756445221255e-06, + "loss": 0.4189, + "step": 7093 + }, + { + "epoch": 3.4555050341019813, + "grad_norm": 3.0420899391174316, + "learning_rate": 1.9338491978336164e-06, + "loss": 0.4028, + "step": 7094 + }, + { + "epoch": 3.4559922052614485, + "grad_norm": 4.442075729370117, + "learning_rate": 1.933222788612915e-06, + "loss": 0.4821, + "step": 7095 + }, + { + "epoch": 3.456479376420916, + "grad_norm": 3.44950795173645, + "learning_rate": 1.9325964169014765e-06, + "loss": 0.4735, + "step": 7096 + }, + { + "epoch": 3.4569665475803832, + "grad_norm": 3.4115262031555176, + "learning_rate": 1.931970082740754e-06, + "loss": 0.4975, + "step": 7097 + }, + { + "epoch": 3.4574537187398504, + "grad_norm": 3.4754693508148193, + "learning_rate": 1.9313437861721983e-06, + "loss": 0.4162, + "step": 7098 + }, + { + "epoch": 3.457940889899318, + "grad_norm": 3.3160853385925293, + "learning_rate": 1.930717527237259e-06, + "loss": 0.4714, + "step": 7099 + }, + { + "epoch": 3.458428061058785, + "grad_norm": 3.0906784534454346, + "learning_rate": 1.93009130597738e-06, + "loss": 0.4402, + "step": 7100 + }, + { + "epoch": 3.458915232218253, + "grad_norm": 3.1549620628356934, + "learning_rate": 1.929465122434005e-06, + "loss": 0.4298, + "step": 7101 + }, + { + "epoch": 3.45940240337772, + "grad_norm": 3.273026704788208, + "learning_rate": 1.9288389766485757e-06, + "loss": 0.4687, + "step": 7102 + }, + { + "epoch": 3.459889574537187, + "grad_norm": 3.020538091659546, + "learning_rate": 1.9282128686625294e-06, + "loss": 0.4339, + "step": 7103 + }, + { + "epoch": 3.4603767456966548, + "grad_norm": 3.0531082153320312, + "learning_rate": 1.9275867985173027e-06, + "loss": 0.458, + "step": 7104 + }, + { + "epoch": 3.460863916856122, + "grad_norm": 3.2798235416412354, + "learning_rate": 1.9269607662543277e-06, + "loss": 0.4542, + "step": 7105 + }, + { + "epoch": 3.4613510880155895, + "grad_norm": 3.724971294403076, + "learning_rate": 1.926334771915036e-06, + "loss": 0.5418, + "step": 7106 + }, + { + "epoch": 3.4618382591750567, + "grad_norm": 2.7933309078216553, + "learning_rate": 1.9257088155408567e-06, + "loss": 0.4174, + "step": 7107 + }, + { + "epoch": 3.4623254303345243, + "grad_norm": 3.204275369644165, + "learning_rate": 1.925082897173213e-06, + "loss": 0.4726, + "step": 7108 + }, + { + "epoch": 3.4628126014939915, + "grad_norm": 3.604065179824829, + "learning_rate": 1.92445701685353e-06, + "loss": 0.4879, + "step": 7109 + }, + { + "epoch": 3.4632997726534587, + "grad_norm": 3.177347183227539, + "learning_rate": 1.9238311746232277e-06, + "loss": 0.4702, + "step": 7110 + }, + { + "epoch": 3.4637869438129263, + "grad_norm": 3.3019673824310303, + "learning_rate": 1.923205370523726e-06, + "loss": 0.4512, + "step": 7111 + }, + { + "epoch": 3.4642741149723935, + "grad_norm": 2.8091816902160645, + "learning_rate": 1.922579604596438e-06, + "loss": 0.4491, + "step": 7112 + }, + { + "epoch": 3.464761286131861, + "grad_norm": 3.319275379180908, + "learning_rate": 1.921953876882778e-06, + "loss": 0.4623, + "step": 7113 + }, + { + "epoch": 3.4652484572913282, + "grad_norm": 2.8791556358337402, + "learning_rate": 1.9213281874241564e-06, + "loss": 0.4108, + "step": 7114 + }, + { + "epoch": 3.465735628450796, + "grad_norm": 3.116682291030884, + "learning_rate": 1.920702536261983e-06, + "loss": 0.4956, + "step": 7115 + }, + { + "epoch": 3.466222799610263, + "grad_norm": 3.287045478820801, + "learning_rate": 1.9200769234376605e-06, + "loss": 0.4611, + "step": 7116 + }, + { + "epoch": 3.46670997076973, + "grad_norm": 2.6450674533843994, + "learning_rate": 1.919451348992593e-06, + "loss": 0.3338, + "step": 7117 + }, + { + "epoch": 3.467197141929198, + "grad_norm": 2.7304439544677734, + "learning_rate": 1.9188258129681813e-06, + "loss": 0.4442, + "step": 7118 + }, + { + "epoch": 3.467684313088665, + "grad_norm": 2.9624946117401123, + "learning_rate": 1.918200315405824e-06, + "loss": 0.4434, + "step": 7119 + }, + { + "epoch": 3.4681714842481326, + "grad_norm": 2.694962978363037, + "learning_rate": 1.9175748563469154e-06, + "loss": 0.3823, + "step": 7120 + }, + { + "epoch": 3.4686586554075998, + "grad_norm": 3.033590078353882, + "learning_rate": 1.916949435832848e-06, + "loss": 0.4139, + "step": 7121 + }, + { + "epoch": 3.4691458265670674, + "grad_norm": 3.1819183826446533, + "learning_rate": 1.9163240539050133e-06, + "loss": 0.46, + "step": 7122 + }, + { + "epoch": 3.4696329977265346, + "grad_norm": 2.9033615589141846, + "learning_rate": 1.915698710604798e-06, + "loss": 0.4285, + "step": 7123 + }, + { + "epoch": 3.4701201688860017, + "grad_norm": 2.586334705352783, + "learning_rate": 1.915073405973588e-06, + "loss": 0.3798, + "step": 7124 + }, + { + "epoch": 3.4706073400454693, + "grad_norm": 2.7296221256256104, + "learning_rate": 1.9144481400527646e-06, + "loss": 0.3856, + "step": 7125 + }, + { + "epoch": 3.4710945112049365, + "grad_norm": 2.8941028118133545, + "learning_rate": 1.9138229128837094e-06, + "loss": 0.4325, + "step": 7126 + }, + { + "epoch": 3.471581682364404, + "grad_norm": 2.6750974655151367, + "learning_rate": 1.9131977245078e-06, + "loss": 0.429, + "step": 7127 + }, + { + "epoch": 3.4720688535238713, + "grad_norm": 2.98732852935791, + "learning_rate": 1.9125725749664095e-06, + "loss": 0.4257, + "step": 7128 + }, + { + "epoch": 3.472556024683339, + "grad_norm": 3.065648078918457, + "learning_rate": 1.911947464300911e-06, + "loss": 0.4424, + "step": 7129 + }, + { + "epoch": 3.473043195842806, + "grad_norm": 2.9528729915618896, + "learning_rate": 1.911322392552675e-06, + "loss": 0.3979, + "step": 7130 + }, + { + "epoch": 3.4735303670022732, + "grad_norm": 3.304936170578003, + "learning_rate": 1.910697359763069e-06, + "loss": 0.4308, + "step": 7131 + }, + { + "epoch": 3.474017538161741, + "grad_norm": 3.0445151329040527, + "learning_rate": 1.910072365973456e-06, + "loss": 0.4043, + "step": 7132 + }, + { + "epoch": 3.474504709321208, + "grad_norm": 2.8225252628326416, + "learning_rate": 1.9094474112251986e-06, + "loss": 0.4314, + "step": 7133 + }, + { + "epoch": 3.4749918804806756, + "grad_norm": 3.3952813148498535, + "learning_rate": 1.908822495559657e-06, + "loss": 0.4552, + "step": 7134 + }, + { + "epoch": 3.475479051640143, + "grad_norm": 2.83392333984375, + "learning_rate": 1.9081976190181885e-06, + "loss": 0.4379, + "step": 7135 + }, + { + "epoch": 3.4759662227996104, + "grad_norm": 3.468628406524658, + "learning_rate": 1.907572781642145e-06, + "loss": 0.4564, + "step": 7136 + }, + { + "epoch": 3.4764533939590776, + "grad_norm": 3.1166439056396484, + "learning_rate": 1.9069479834728799e-06, + "loss": 0.4476, + "step": 7137 + }, + { + "epoch": 3.4769405651185448, + "grad_norm": 2.845937490463257, + "learning_rate": 1.9063232245517433e-06, + "loss": 0.4086, + "step": 7138 + }, + { + "epoch": 3.4774277362780124, + "grad_norm": 3.2396957874298096, + "learning_rate": 1.9056985049200788e-06, + "loss": 0.4784, + "step": 7139 + }, + { + "epoch": 3.4779149074374796, + "grad_norm": 2.957245349884033, + "learning_rate": 1.9050738246192324e-06, + "loss": 0.3834, + "step": 7140 + }, + { + "epoch": 3.478402078596947, + "grad_norm": 3.040264844894409, + "learning_rate": 1.9044491836905448e-06, + "loss": 0.4098, + "step": 7141 + }, + { + "epoch": 3.4788892497564143, + "grad_norm": 2.8753087520599365, + "learning_rate": 1.9038245821753556e-06, + "loss": 0.4045, + "step": 7142 + }, + { + "epoch": 3.479376420915882, + "grad_norm": 2.9366226196289062, + "learning_rate": 1.9032000201149992e-06, + "loss": 0.4121, + "step": 7143 + }, + { + "epoch": 3.479863592075349, + "grad_norm": 2.9048938751220703, + "learning_rate": 1.9025754975508098e-06, + "loss": 0.3721, + "step": 7144 + }, + { + "epoch": 3.4803507632348163, + "grad_norm": 2.843667507171631, + "learning_rate": 1.9019510145241191e-06, + "loss": 0.3874, + "step": 7145 + }, + { + "epoch": 3.480837934394284, + "grad_norm": 2.683807611465454, + "learning_rate": 1.9013265710762544e-06, + "loss": 0.4036, + "step": 7146 + }, + { + "epoch": 3.481325105553751, + "grad_norm": 3.201554536819458, + "learning_rate": 1.900702167248541e-06, + "loss": 0.4265, + "step": 7147 + }, + { + "epoch": 3.4818122767132187, + "grad_norm": 3.1533868312835693, + "learning_rate": 1.9000778030823027e-06, + "loss": 0.4443, + "step": 7148 + }, + { + "epoch": 3.482299447872686, + "grad_norm": 3.1167032718658447, + "learning_rate": 1.899453478618859e-06, + "loss": 0.3836, + "step": 7149 + }, + { + "epoch": 3.4827866190321535, + "grad_norm": 3.1774659156799316, + "learning_rate": 1.898829193899529e-06, + "loss": 0.4266, + "step": 7150 + }, + { + "epoch": 3.4832737901916206, + "grad_norm": 6.066186904907227, + "learning_rate": 1.8982049489656267e-06, + "loss": 0.4207, + "step": 7151 + }, + { + "epoch": 3.483760961351088, + "grad_norm": 3.417304515838623, + "learning_rate": 1.8975807438584643e-06, + "loss": 0.4672, + "step": 7152 + }, + { + "epoch": 3.4842481325105554, + "grad_norm": 3.264727830886841, + "learning_rate": 1.8969565786193522e-06, + "loss": 0.4491, + "step": 7153 + }, + { + "epoch": 3.4847353036700226, + "grad_norm": 3.2344253063201904, + "learning_rate": 1.8963324532895987e-06, + "loss": 0.4256, + "step": 7154 + }, + { + "epoch": 3.48522247482949, + "grad_norm": 3.006875991821289, + "learning_rate": 1.8957083679105063e-06, + "loss": 0.4668, + "step": 7155 + }, + { + "epoch": 3.4857096459889574, + "grad_norm": 2.8162360191345215, + "learning_rate": 1.8950843225233778e-06, + "loss": 0.3947, + "step": 7156 + }, + { + "epoch": 3.486196817148425, + "grad_norm": 2.917609214782715, + "learning_rate": 1.8944603171695122e-06, + "loss": 0.3255, + "step": 7157 + }, + { + "epoch": 3.486683988307892, + "grad_norm": 3.4592065811157227, + "learning_rate": 1.8938363518902077e-06, + "loss": 0.4254, + "step": 7158 + }, + { + "epoch": 3.4871711594673593, + "grad_norm": 3.2546677589416504, + "learning_rate": 1.893212426726756e-06, + "loss": 0.3973, + "step": 7159 + }, + { + "epoch": 3.487658330626827, + "grad_norm": 3.149360179901123, + "learning_rate": 1.8925885417204493e-06, + "loss": 0.4643, + "step": 7160 + }, + { + "epoch": 3.488145501786294, + "grad_norm": 2.5543429851531982, + "learning_rate": 1.8919646969125765e-06, + "loss": 0.3965, + "step": 7161 + }, + { + "epoch": 3.4886326729457617, + "grad_norm": 3.016550302505493, + "learning_rate": 1.8913408923444242e-06, + "loss": 0.4568, + "step": 7162 + }, + { + "epoch": 3.489119844105229, + "grad_norm": 2.810849905014038, + "learning_rate": 1.8907171280572743e-06, + "loss": 0.3864, + "step": 7163 + }, + { + "epoch": 3.4896070152646965, + "grad_norm": 2.872424364089966, + "learning_rate": 1.8900934040924076e-06, + "loss": 0.4299, + "step": 7164 + }, + { + "epoch": 3.4900941864241637, + "grad_norm": 3.1891698837280273, + "learning_rate": 1.8894697204911037e-06, + "loss": 0.4123, + "step": 7165 + }, + { + "epoch": 3.490581357583631, + "grad_norm": 3.133967161178589, + "learning_rate": 1.8888460772946366e-06, + "loss": 0.4145, + "step": 7166 + }, + { + "epoch": 3.4910685287430985, + "grad_norm": 2.95444917678833, + "learning_rate": 1.8882224745442784e-06, + "loss": 0.3984, + "step": 7167 + }, + { + "epoch": 3.4915556999025656, + "grad_norm": 2.977743625640869, + "learning_rate": 1.8875989122813007e-06, + "loss": 0.4338, + "step": 7168 + }, + { + "epoch": 3.4920428710620333, + "grad_norm": 2.9847614765167236, + "learning_rate": 1.8869753905469696e-06, + "loss": 0.4762, + "step": 7169 + }, + { + "epoch": 3.4925300422215004, + "grad_norm": 2.9780290126800537, + "learning_rate": 1.8863519093825504e-06, + "loss": 0.4498, + "step": 7170 + }, + { + "epoch": 3.493017213380968, + "grad_norm": 3.1105399131774902, + "learning_rate": 1.8857284688293045e-06, + "loss": 0.4582, + "step": 7171 + }, + { + "epoch": 3.493504384540435, + "grad_norm": 2.9965946674346924, + "learning_rate": 1.8851050689284912e-06, + "loss": 0.4707, + "step": 7172 + }, + { + "epoch": 3.4939915556999024, + "grad_norm": 2.9257256984710693, + "learning_rate": 1.8844817097213667e-06, + "loss": 0.3776, + "step": 7173 + }, + { + "epoch": 3.49447872685937, + "grad_norm": 2.7341787815093994, + "learning_rate": 1.8838583912491867e-06, + "loss": 0.4092, + "step": 7174 + }, + { + "epoch": 3.494965898018837, + "grad_norm": 2.7726709842681885, + "learning_rate": 1.8832351135531998e-06, + "loss": 0.3785, + "step": 7175 + }, + { + "epoch": 3.495453069178305, + "grad_norm": 2.9145348072052, + "learning_rate": 1.8826118766746557e-06, + "loss": 0.4942, + "step": 7176 + }, + { + "epoch": 3.495940240337772, + "grad_norm": 3.084083318710327, + "learning_rate": 1.8819886806548003e-06, + "loss": 0.439, + "step": 7177 + }, + { + "epoch": 3.4964274114972396, + "grad_norm": 3.256538152694702, + "learning_rate": 1.8813655255348773e-06, + "loss": 0.4249, + "step": 7178 + }, + { + "epoch": 3.4969145826567067, + "grad_norm": 3.265118360519409, + "learning_rate": 1.880742411356125e-06, + "loss": 0.4414, + "step": 7179 + }, + { + "epoch": 3.497401753816174, + "grad_norm": 3.335611343383789, + "learning_rate": 1.8801193381597827e-06, + "loss": 0.5129, + "step": 7180 + }, + { + "epoch": 3.4978889249756415, + "grad_norm": 3.0063905715942383, + "learning_rate": 1.8794963059870846e-06, + "loss": 0.4472, + "step": 7181 + }, + { + "epoch": 3.4983760961351087, + "grad_norm": 3.3909103870391846, + "learning_rate": 1.8788733148792643e-06, + "loss": 0.4647, + "step": 7182 + }, + { + "epoch": 3.4988632672945763, + "grad_norm": 3.0762269496917725, + "learning_rate": 1.878250364877549e-06, + "loss": 0.4185, + "step": 7183 + }, + { + "epoch": 3.4993504384540435, + "grad_norm": 3.042421817779541, + "learning_rate": 1.8776274560231669e-06, + "loss": 0.4466, + "step": 7184 + }, + { + "epoch": 3.499837609613511, + "grad_norm": 2.9577527046203613, + "learning_rate": 1.8770045883573417e-06, + "loss": 0.4491, + "step": 7185 + }, + { + "epoch": 3.5003247807729783, + "grad_norm": 2.9421424865722656, + "learning_rate": 1.8763817619212959e-06, + "loss": 0.395, + "step": 7186 + }, + { + "epoch": 3.5008119519324454, + "grad_norm": 3.4276463985443115, + "learning_rate": 1.875758976756246e-06, + "loss": 0.4178, + "step": 7187 + }, + { + "epoch": 3.501299123091913, + "grad_norm": 3.1286778450012207, + "learning_rate": 1.8751362329034087e-06, + "loss": 0.4317, + "step": 7188 + }, + { + "epoch": 3.50178629425138, + "grad_norm": 2.9889180660247803, + "learning_rate": 1.874513530403998e-06, + "loss": 0.4498, + "step": 7189 + }, + { + "epoch": 3.502273465410848, + "grad_norm": 2.922480583190918, + "learning_rate": 1.8738908692992237e-06, + "loss": 0.4535, + "step": 7190 + }, + { + "epoch": 3.502760636570315, + "grad_norm": 3.568760871887207, + "learning_rate": 1.8732682496302931e-06, + "loss": 0.4273, + "step": 7191 + }, + { + "epoch": 3.5032478077297826, + "grad_norm": 2.678013324737549, + "learning_rate": 1.8726456714384112e-06, + "loss": 0.3925, + "step": 7192 + }, + { + "epoch": 3.50373497888925, + "grad_norm": 3.0312018394470215, + "learning_rate": 1.8720231347647804e-06, + "loss": 0.4322, + "step": 7193 + }, + { + "epoch": 3.504222150048717, + "grad_norm": 3.1688578128814697, + "learning_rate": 1.8714006396506004e-06, + "loss": 0.4306, + "step": 7194 + }, + { + "epoch": 3.5047093212081846, + "grad_norm": 2.928257465362549, + "learning_rate": 1.8707781861370669e-06, + "loss": 0.4157, + "step": 7195 + }, + { + "epoch": 3.5051964923676517, + "grad_norm": 3.3066353797912598, + "learning_rate": 1.8701557742653746e-06, + "loss": 0.4437, + "step": 7196 + }, + { + "epoch": 3.5056836635271194, + "grad_norm": 3.111915111541748, + "learning_rate": 1.8695334040767155e-06, + "loss": 0.4204, + "step": 7197 + }, + { + "epoch": 3.5061708346865865, + "grad_norm": 3.3180742263793945, + "learning_rate": 1.8689110756122757e-06, + "loss": 0.3915, + "step": 7198 + }, + { + "epoch": 3.506658005846054, + "grad_norm": 3.1680915355682373, + "learning_rate": 1.8682887889132423e-06, + "loss": 0.3938, + "step": 7199 + }, + { + "epoch": 3.5071451770055213, + "grad_norm": 3.2876107692718506, + "learning_rate": 1.8676665440207982e-06, + "loss": 0.4319, + "step": 7200 + }, + { + "epoch": 3.5076323481649885, + "grad_norm": 3.2390637397766113, + "learning_rate": 1.8670443409761241e-06, + "loss": 0.4599, + "step": 7201 + }, + { + "epoch": 3.508119519324456, + "grad_norm": 3.2408058643341064, + "learning_rate": 1.8664221798203958e-06, + "loss": 0.4308, + "step": 7202 + }, + { + "epoch": 3.5086066904839233, + "grad_norm": 2.9338934421539307, + "learning_rate": 1.8658000605947884e-06, + "loss": 0.3926, + "step": 7203 + }, + { + "epoch": 3.509093861643391, + "grad_norm": 3.0109341144561768, + "learning_rate": 1.8651779833404742e-06, + "loss": 0.4006, + "step": 7204 + }, + { + "epoch": 3.509581032802858, + "grad_norm": 3.204465389251709, + "learning_rate": 1.864555948098623e-06, + "loss": 0.395, + "step": 7205 + }, + { + "epoch": 3.5100682039623257, + "grad_norm": 3.2565743923187256, + "learning_rate": 1.8639339549103986e-06, + "loss": 0.4721, + "step": 7206 + }, + { + "epoch": 3.510555375121793, + "grad_norm": 3.2003040313720703, + "learning_rate": 1.863312003816966e-06, + "loss": 0.4546, + "step": 7207 + }, + { + "epoch": 3.51104254628126, + "grad_norm": 3.148937463760376, + "learning_rate": 1.8626900948594856e-06, + "loss": 0.4408, + "step": 7208 + }, + { + "epoch": 3.5115297174407276, + "grad_norm": 2.9944164752960205, + "learning_rate": 1.862068228079117e-06, + "loss": 0.4203, + "step": 7209 + }, + { + "epoch": 3.512016888600195, + "grad_norm": 3.0832157135009766, + "learning_rate": 1.8614464035170121e-06, + "loss": 0.4011, + "step": 7210 + }, + { + "epoch": 3.512504059759662, + "grad_norm": 3.2917258739471436, + "learning_rate": 1.8608246212143246e-06, + "loss": 0.5001, + "step": 7211 + }, + { + "epoch": 3.5129912309191296, + "grad_norm": 3.1487159729003906, + "learning_rate": 1.8602028812122048e-06, + "loss": 0.452, + "step": 7212 + }, + { + "epoch": 3.513478402078597, + "grad_norm": 3.0680277347564697, + "learning_rate": 1.859581183551799e-06, + "loss": 0.4673, + "step": 7213 + }, + { + "epoch": 3.5139655732380644, + "grad_norm": 2.816089391708374, + "learning_rate": 1.8589595282742507e-06, + "loss": 0.4175, + "step": 7214 + }, + { + "epoch": 3.5144527443975315, + "grad_norm": 3.1103458404541016, + "learning_rate": 1.8583379154207004e-06, + "loss": 0.5233, + "step": 7215 + }, + { + "epoch": 3.514939915556999, + "grad_norm": 3.1740787029266357, + "learning_rate": 1.8577163450322873e-06, + "loss": 0.4127, + "step": 7216 + }, + { + "epoch": 3.5154270867164663, + "grad_norm": 2.771796226501465, + "learning_rate": 1.857094817150148e-06, + "loss": 0.43, + "step": 7217 + }, + { + "epoch": 3.5159142578759335, + "grad_norm": 2.779846429824829, + "learning_rate": 1.8564733318154122e-06, + "loss": 0.4523, + "step": 7218 + }, + { + "epoch": 3.516401429035401, + "grad_norm": 2.36971116065979, + "learning_rate": 1.8558518890692119e-06, + "loss": 0.3633, + "step": 7219 + }, + { + "epoch": 3.5168886001948687, + "grad_norm": 2.988417625427246, + "learning_rate": 1.855230488952673e-06, + "loss": 0.4384, + "step": 7220 + }, + { + "epoch": 3.517375771354336, + "grad_norm": 3.5314395427703857, + "learning_rate": 1.854609131506922e-06, + "loss": 0.4546, + "step": 7221 + }, + { + "epoch": 3.517862942513803, + "grad_norm": 2.783647298812866, + "learning_rate": 1.853987816773077e-06, + "loss": 0.3791, + "step": 7222 + }, + { + "epoch": 3.5183501136732707, + "grad_norm": 3.0616800785064697, + "learning_rate": 1.8533665447922584e-06, + "loss": 0.3699, + "step": 7223 + }, + { + "epoch": 3.518837284832738, + "grad_norm": 2.874037504196167, + "learning_rate": 1.8527453156055816e-06, + "loss": 0.4215, + "step": 7224 + }, + { + "epoch": 3.519324455992205, + "grad_norm": 3.0880324840545654, + "learning_rate": 1.852124129254161e-06, + "loss": 0.4467, + "step": 7225 + }, + { + "epoch": 3.5198116271516726, + "grad_norm": 3.4483611583709717, + "learning_rate": 1.8515029857791033e-06, + "loss": 0.4845, + "step": 7226 + }, + { + "epoch": 3.5202987983111402, + "grad_norm": 3.2928736209869385, + "learning_rate": 1.8508818852215181e-06, + "loss": 0.4403, + "step": 7227 + }, + { + "epoch": 3.5207859694706074, + "grad_norm": 3.1452815532684326, + "learning_rate": 1.8502608276225093e-06, + "loss": 0.3979, + "step": 7228 + }, + { + "epoch": 3.5212731406300746, + "grad_norm": 2.9758715629577637, + "learning_rate": 1.8496398130231796e-06, + "loss": 0.4184, + "step": 7229 + }, + { + "epoch": 3.521760311789542, + "grad_norm": 2.8131563663482666, + "learning_rate": 1.8490188414646252e-06, + "loss": 0.4163, + "step": 7230 + }, + { + "epoch": 3.5222474829490094, + "grad_norm": 2.9211294651031494, + "learning_rate": 1.8483979129879432e-06, + "loss": 0.4674, + "step": 7231 + }, + { + "epoch": 3.5227346541084765, + "grad_norm": 2.724412202835083, + "learning_rate": 1.8477770276342275e-06, + "loss": 0.4013, + "step": 7232 + }, + { + "epoch": 3.523221825267944, + "grad_norm": 3.2159321308135986, + "learning_rate": 1.8471561854445672e-06, + "loss": 0.4573, + "step": 7233 + }, + { + "epoch": 3.5237089964274118, + "grad_norm": 3.854623794555664, + "learning_rate": 1.846535386460049e-06, + "loss": 0.4552, + "step": 7234 + }, + { + "epoch": 3.524196167586879, + "grad_norm": 3.122382879257202, + "learning_rate": 1.845914630721759e-06, + "loss": 0.429, + "step": 7235 + }, + { + "epoch": 3.524683338746346, + "grad_norm": 3.2215328216552734, + "learning_rate": 1.8452939182707768e-06, + "loss": 0.3969, + "step": 7236 + }, + { + "epoch": 3.5251705099058137, + "grad_norm": 2.585955858230591, + "learning_rate": 1.8446732491481833e-06, + "loss": 0.454, + "step": 7237 + }, + { + "epoch": 3.525657681065281, + "grad_norm": 3.234034538269043, + "learning_rate": 1.8440526233950528e-06, + "loss": 0.3751, + "step": 7238 + }, + { + "epoch": 3.526144852224748, + "grad_norm": 4.026361465454102, + "learning_rate": 1.843432041052458e-06, + "loss": 0.409, + "step": 7239 + }, + { + "epoch": 3.5266320233842157, + "grad_norm": 3.108464002609253, + "learning_rate": 1.8428115021614694e-06, + "loss": 0.4046, + "step": 7240 + }, + { + "epoch": 3.5271191945436833, + "grad_norm": 3.206225633621216, + "learning_rate": 1.842191006763156e-06, + "loss": 0.4765, + "step": 7241 + }, + { + "epoch": 3.5276063657031504, + "grad_norm": 3.27494478225708, + "learning_rate": 1.8415705548985791e-06, + "loss": 0.4456, + "step": 7242 + }, + { + "epoch": 3.5280935368626176, + "grad_norm": 2.974926710128784, + "learning_rate": 1.8409501466088018e-06, + "loss": 0.4296, + "step": 7243 + }, + { + "epoch": 3.5285807080220852, + "grad_norm": 3.356860876083374, + "learning_rate": 1.840329781934883e-06, + "loss": 0.4666, + "step": 7244 + }, + { + "epoch": 3.5290678791815524, + "grad_norm": 3.4049973487854004, + "learning_rate": 1.8397094609178772e-06, + "loss": 0.4684, + "step": 7245 + }, + { + "epoch": 3.5295550503410196, + "grad_norm": 3.3817083835601807, + "learning_rate": 1.8390891835988378e-06, + "loss": 0.5125, + "step": 7246 + }, + { + "epoch": 3.530042221500487, + "grad_norm": 2.9852359294891357, + "learning_rate": 1.8384689500188146e-06, + "loss": 0.3887, + "step": 7247 + }, + { + "epoch": 3.530529392659955, + "grad_norm": 2.773294687271118, + "learning_rate": 1.8378487602188556e-06, + "loss": 0.4378, + "step": 7248 + }, + { + "epoch": 3.531016563819422, + "grad_norm": 2.7876248359680176, + "learning_rate": 1.8372286142400032e-06, + "loss": 0.4048, + "step": 7249 + }, + { + "epoch": 3.531503734978889, + "grad_norm": 3.854243040084839, + "learning_rate": 1.836608512123299e-06, + "loss": 0.3464, + "step": 7250 + }, + { + "epoch": 3.5319909061383568, + "grad_norm": 3.410994529724121, + "learning_rate": 1.8359884539097822e-06, + "loss": 0.4372, + "step": 7251 + }, + { + "epoch": 3.532478077297824, + "grad_norm": 3.203171968460083, + "learning_rate": 1.8353684396404884e-06, + "loss": 0.4343, + "step": 7252 + }, + { + "epoch": 3.532965248457291, + "grad_norm": 2.8815040588378906, + "learning_rate": 1.8347484693564481e-06, + "loss": 0.4456, + "step": 7253 + }, + { + "epoch": 3.5334524196167587, + "grad_norm": 3.1134281158447266, + "learning_rate": 1.8341285430986927e-06, + "loss": 0.4484, + "step": 7254 + }, + { + "epoch": 3.5339395907762263, + "grad_norm": 3.5420820713043213, + "learning_rate": 1.8335086609082486e-06, + "loss": 0.4047, + "step": 7255 + }, + { + "epoch": 3.5344267619356935, + "grad_norm": 3.0902976989746094, + "learning_rate": 1.8328888228261391e-06, + "loss": 0.392, + "step": 7256 + }, + { + "epoch": 3.5349139330951607, + "grad_norm": 2.9665634632110596, + "learning_rate": 1.8322690288933848e-06, + "loss": 0.4139, + "step": 7257 + }, + { + "epoch": 3.5354011042546283, + "grad_norm": 2.7503554821014404, + "learning_rate": 1.8316492791510043e-06, + "loss": 0.3941, + "step": 7258 + }, + { + "epoch": 3.5358882754140954, + "grad_norm": 2.730581045150757, + "learning_rate": 1.8310295736400119e-06, + "loss": 0.4277, + "step": 7259 + }, + { + "epoch": 3.5363754465735626, + "grad_norm": 3.1083242893218994, + "learning_rate": 1.8304099124014208e-06, + "loss": 0.4441, + "step": 7260 + }, + { + "epoch": 3.5368626177330302, + "grad_norm": 3.1541035175323486, + "learning_rate": 1.8297902954762393e-06, + "loss": 0.4046, + "step": 7261 + }, + { + "epoch": 3.5373497888924974, + "grad_norm": 2.813416004180908, + "learning_rate": 1.8291707229054728e-06, + "loss": 0.4521, + "step": 7262 + }, + { + "epoch": 3.537836960051965, + "grad_norm": 3.1947028636932373, + "learning_rate": 1.8285511947301254e-06, + "loss": 0.4425, + "step": 7263 + }, + { + "epoch": 3.538324131211432, + "grad_norm": 3.4535951614379883, + "learning_rate": 1.8279317109911987e-06, + "loss": 0.5101, + "step": 7264 + }, + { + "epoch": 3.5388113023709, + "grad_norm": 3.025423765182495, + "learning_rate": 1.8273122717296875e-06, + "loss": 0.4903, + "step": 7265 + }, + { + "epoch": 3.539298473530367, + "grad_norm": 3.3129611015319824, + "learning_rate": 1.8266928769865878e-06, + "loss": 0.4556, + "step": 7266 + }, + { + "epoch": 3.539785644689834, + "grad_norm": 2.715830087661743, + "learning_rate": 1.8260735268028905e-06, + "loss": 0.3892, + "step": 7267 + }, + { + "epoch": 3.5402728158493018, + "grad_norm": 3.357029438018799, + "learning_rate": 1.8254542212195859e-06, + "loss": 0.4177, + "step": 7268 + }, + { + "epoch": 3.540759987008769, + "grad_norm": 3.141732931137085, + "learning_rate": 1.8248349602776566e-06, + "loss": 0.4455, + "step": 7269 + }, + { + "epoch": 3.5412471581682365, + "grad_norm": 3.369354248046875, + "learning_rate": 1.8242157440180869e-06, + "loss": 0.4401, + "step": 7270 + }, + { + "epoch": 3.5417343293277037, + "grad_norm": 3.1423819065093994, + "learning_rate": 1.8235965724818563e-06, + "loss": 0.4582, + "step": 7271 + }, + { + "epoch": 3.5422215004871713, + "grad_norm": 3.6844496726989746, + "learning_rate": 1.8229774457099428e-06, + "loss": 0.4338, + "step": 7272 + }, + { + "epoch": 3.5427086716466385, + "grad_norm": 2.9593400955200195, + "learning_rate": 1.822358363743318e-06, + "loss": 0.436, + "step": 7273 + }, + { + "epoch": 3.5431958428061057, + "grad_norm": 3.4055044651031494, + "learning_rate": 1.821739326622953e-06, + "loss": 0.4656, + "step": 7274 + }, + { + "epoch": 3.5436830139655733, + "grad_norm": 2.544193744659424, + "learning_rate": 1.8211203343898165e-06, + "loss": 0.3976, + "step": 7275 + }, + { + "epoch": 3.5441701851250405, + "grad_norm": 2.7859513759613037, + "learning_rate": 1.8205013870848742e-06, + "loss": 0.4131, + "step": 7276 + }, + { + "epoch": 3.544657356284508, + "grad_norm": 2.8727762699127197, + "learning_rate": 1.8198824847490856e-06, + "loss": 0.441, + "step": 7277 + }, + { + "epoch": 3.5451445274439752, + "grad_norm": 3.03285551071167, + "learning_rate": 1.8192636274234116e-06, + "loss": 0.4247, + "step": 7278 + }, + { + "epoch": 3.545631698603443, + "grad_norm": 3.1292121410369873, + "learning_rate": 1.8186448151488068e-06, + "loss": 0.4616, + "step": 7279 + }, + { + "epoch": 3.54611886976291, + "grad_norm": 2.987816572189331, + "learning_rate": 1.8180260479662248e-06, + "loss": 0.4764, + "step": 7280 + }, + { + "epoch": 3.546606040922377, + "grad_norm": 3.0867691040039062, + "learning_rate": 1.8174073259166158e-06, + "loss": 0.4495, + "step": 7281 + }, + { + "epoch": 3.547093212081845, + "grad_norm": 3.015153646469116, + "learning_rate": 1.8167886490409258e-06, + "loss": 0.4261, + "step": 7282 + }, + { + "epoch": 3.547580383241312, + "grad_norm": 3.2564871311187744, + "learning_rate": 1.8161700173800995e-06, + "loss": 0.4422, + "step": 7283 + }, + { + "epoch": 3.5480675544007796, + "grad_norm": 3.1481730937957764, + "learning_rate": 1.8155514309750787e-06, + "loss": 0.465, + "step": 7284 + }, + { + "epoch": 3.5485547255602468, + "grad_norm": 2.7872862815856934, + "learning_rate": 1.8149328898667995e-06, + "loss": 0.4372, + "step": 7285 + }, + { + "epoch": 3.5490418967197144, + "grad_norm": 2.8572335243225098, + "learning_rate": 1.8143143940961978e-06, + "loss": 0.361, + "step": 7286 + }, + { + "epoch": 3.5495290678791815, + "grad_norm": 3.4306070804595947, + "learning_rate": 1.8136959437042057e-06, + "loss": 0.5111, + "step": 7287 + }, + { + "epoch": 3.5500162390386487, + "grad_norm": 3.2091939449310303, + "learning_rate": 1.813077538731753e-06, + "loss": 0.4567, + "step": 7288 + }, + { + "epoch": 3.5505034101981163, + "grad_norm": 3.1011593341827393, + "learning_rate": 1.8124591792197638e-06, + "loss": 0.4083, + "step": 7289 + }, + { + "epoch": 3.5509905813575835, + "grad_norm": 2.9168691635131836, + "learning_rate": 1.8118408652091624e-06, + "loss": 0.4342, + "step": 7290 + }, + { + "epoch": 3.551477752517051, + "grad_norm": 3.271131992340088, + "learning_rate": 1.811222596740868e-06, + "loss": 0.4176, + "step": 7291 + }, + { + "epoch": 3.5519649236765183, + "grad_norm": 3.4112799167633057, + "learning_rate": 1.8106043738557993e-06, + "loss": 0.4755, + "step": 7292 + }, + { + "epoch": 3.552452094835986, + "grad_norm": 3.1747689247131348, + "learning_rate": 1.8099861965948678e-06, + "loss": 0.424, + "step": 7293 + }, + { + "epoch": 3.552939265995453, + "grad_norm": 2.998819351196289, + "learning_rate": 1.809368064998986e-06, + "loss": 0.4298, + "step": 7294 + }, + { + "epoch": 3.5534264371549202, + "grad_norm": 3.1178784370422363, + "learning_rate": 1.8087499791090617e-06, + "loss": 0.4252, + "step": 7295 + }, + { + "epoch": 3.553913608314388, + "grad_norm": 3.0627057552337646, + "learning_rate": 1.808131938965999e-06, + "loss": 0.4546, + "step": 7296 + }, + { + "epoch": 3.554400779473855, + "grad_norm": 2.969691514968872, + "learning_rate": 1.8075139446106998e-06, + "loss": 0.426, + "step": 7297 + }, + { + "epoch": 3.5548879506333226, + "grad_norm": 3.232820987701416, + "learning_rate": 1.8068959960840632e-06, + "loss": 0.3681, + "step": 7298 + }, + { + "epoch": 3.55537512179279, + "grad_norm": 2.6079843044281006, + "learning_rate": 1.8062780934269864e-06, + "loss": 0.3588, + "step": 7299 + }, + { + "epoch": 3.5558622929522574, + "grad_norm": 2.8044815063476562, + "learning_rate": 1.8056602366803594e-06, + "loss": 0.4091, + "step": 7300 + }, + { + "epoch": 3.5563494641117246, + "grad_norm": 2.887458562850952, + "learning_rate": 1.805042425885074e-06, + "loss": 0.472, + "step": 7301 + }, + { + "epoch": 3.5568366352711918, + "grad_norm": 3.561805248260498, + "learning_rate": 1.8044246610820155e-06, + "loss": 0.4394, + "step": 7302 + }, + { + "epoch": 3.5573238064306594, + "grad_norm": 3.0488553047180176, + "learning_rate": 1.8038069423120687e-06, + "loss": 0.451, + "step": 7303 + }, + { + "epoch": 3.5578109775901265, + "grad_norm": 2.9197332859039307, + "learning_rate": 1.8031892696161139e-06, + "loss": 0.4125, + "step": 7304 + }, + { + "epoch": 3.558298148749594, + "grad_norm": 2.927988052368164, + "learning_rate": 1.8025716430350276e-06, + "loss": 0.3698, + "step": 7305 + }, + { + "epoch": 3.5587853199090613, + "grad_norm": 3.166837215423584, + "learning_rate": 1.8019540626096849e-06, + "loss": 0.4208, + "step": 7306 + }, + { + "epoch": 3.559272491068529, + "grad_norm": 3.591555118560791, + "learning_rate": 1.8013365283809585e-06, + "loss": 0.4512, + "step": 7307 + }, + { + "epoch": 3.559759662227996, + "grad_norm": 3.101670742034912, + "learning_rate": 1.8007190403897146e-06, + "loss": 0.4291, + "step": 7308 + }, + { + "epoch": 3.5602468333874633, + "grad_norm": 3.0098636150360107, + "learning_rate": 1.8001015986768194e-06, + "loss": 0.4725, + "step": 7309 + }, + { + "epoch": 3.560734004546931, + "grad_norm": 3.011319637298584, + "learning_rate": 1.7994842032831352e-06, + "loss": 0.3673, + "step": 7310 + }, + { + "epoch": 3.561221175706398, + "grad_norm": 3.591597557067871, + "learning_rate": 1.7988668542495224e-06, + "loss": 0.4926, + "step": 7311 + }, + { + "epoch": 3.5617083468658657, + "grad_norm": 3.1775567531585693, + "learning_rate": 1.7982495516168347e-06, + "loss": 0.4646, + "step": 7312 + }, + { + "epoch": 3.562195518025333, + "grad_norm": 3.012538433074951, + "learning_rate": 1.7976322954259264e-06, + "loss": 0.4286, + "step": 7313 + }, + { + "epoch": 3.5626826891848005, + "grad_norm": 3.003685474395752, + "learning_rate": 1.7970150857176475e-06, + "loss": 0.4794, + "step": 7314 + }, + { + "epoch": 3.5631698603442676, + "grad_norm": 3.1992292404174805, + "learning_rate": 1.7963979225328453e-06, + "loss": 0.4737, + "step": 7315 + }, + { + "epoch": 3.563657031503735, + "grad_norm": 3.4281320571899414, + "learning_rate": 1.7957808059123627e-06, + "loss": 0.4888, + "step": 7316 + }, + { + "epoch": 3.5641442026632024, + "grad_norm": 2.9247539043426514, + "learning_rate": 1.7951637358970403e-06, + "loss": 0.4372, + "step": 7317 + }, + { + "epoch": 3.5646313738226696, + "grad_norm": 3.2876384258270264, + "learning_rate": 1.7945467125277165e-06, + "loss": 0.4747, + "step": 7318 + }, + { + "epoch": 3.565118544982137, + "grad_norm": 3.095583438873291, + "learning_rate": 1.7939297358452267e-06, + "loss": 0.4103, + "step": 7319 + }, + { + "epoch": 3.5656057161416044, + "grad_norm": 2.985093355178833, + "learning_rate": 1.7933128058903998e-06, + "loss": 0.4126, + "step": 7320 + }, + { + "epoch": 3.566092887301072, + "grad_norm": 2.9926066398620605, + "learning_rate": 1.7926959227040657e-06, + "loss": 0.439, + "step": 7321 + }, + { + "epoch": 3.566580058460539, + "grad_norm": 2.856842279434204, + "learning_rate": 1.7920790863270503e-06, + "loss": 0.4247, + "step": 7322 + }, + { + "epoch": 3.5670672296200063, + "grad_norm": 3.2604711055755615, + "learning_rate": 1.7914622968001748e-06, + "loss": 0.4189, + "step": 7323 + }, + { + "epoch": 3.567554400779474, + "grad_norm": 3.159606456756592, + "learning_rate": 1.7908455541642583e-06, + "loss": 0.4556, + "step": 7324 + }, + { + "epoch": 3.568041571938941, + "grad_norm": 3.1152632236480713, + "learning_rate": 1.7902288584601175e-06, + "loss": 0.4333, + "step": 7325 + }, + { + "epoch": 3.5685287430984087, + "grad_norm": 3.2016983032226562, + "learning_rate": 1.789612209728564e-06, + "loss": 0.4293, + "step": 7326 + }, + { + "epoch": 3.569015914257876, + "grad_norm": 2.772897958755493, + "learning_rate": 1.7889956080104095e-06, + "loss": 0.437, + "step": 7327 + }, + { + "epoch": 3.5695030854173435, + "grad_norm": 2.828794002532959, + "learning_rate": 1.7883790533464585e-06, + "loss": 0.4127, + "step": 7328 + }, + { + "epoch": 3.5699902565768107, + "grad_norm": 3.039731025695801, + "learning_rate": 1.7877625457775153e-06, + "loss": 0.4384, + "step": 7329 + }, + { + "epoch": 3.570477427736278, + "grad_norm": 3.049407958984375, + "learning_rate": 1.7871460853443805e-06, + "loss": 0.4153, + "step": 7330 + }, + { + "epoch": 3.5709645988957455, + "grad_norm": 2.8936896324157715, + "learning_rate": 1.7865296720878522e-06, + "loss": 0.41, + "step": 7331 + }, + { + "epoch": 3.5714517700552126, + "grad_norm": 3.314142942428589, + "learning_rate": 1.7859133060487234e-06, + "loss": 0.4123, + "step": 7332 + }, + { + "epoch": 3.5719389412146803, + "grad_norm": 3.377629518508911, + "learning_rate": 1.785296987267785e-06, + "loss": 0.4137, + "step": 7333 + }, + { + "epoch": 3.5724261123741474, + "grad_norm": 3.832972526550293, + "learning_rate": 1.7846807157858254e-06, + "loss": 0.4128, + "step": 7334 + }, + { + "epoch": 3.572913283533615, + "grad_norm": 3.1264615058898926, + "learning_rate": 1.7840644916436305e-06, + "loss": 0.4278, + "step": 7335 + }, + { + "epoch": 3.573400454693082, + "grad_norm": 3.0251619815826416, + "learning_rate": 1.7834483148819801e-06, + "loss": 0.4198, + "step": 7336 + }, + { + "epoch": 3.5738876258525494, + "grad_norm": 3.327409029006958, + "learning_rate": 1.782832185541653e-06, + "loss": 0.4317, + "step": 7337 + }, + { + "epoch": 3.574374797012017, + "grad_norm": 2.9553723335266113, + "learning_rate": 1.7822161036634256e-06, + "loss": 0.3882, + "step": 7338 + }, + { + "epoch": 3.574861968171484, + "grad_norm": 3.1609630584716797, + "learning_rate": 1.7816000692880706e-06, + "loss": 0.4859, + "step": 7339 + }, + { + "epoch": 3.5753491393309513, + "grad_norm": 3.1600751876831055, + "learning_rate": 1.780984082456355e-06, + "loss": 0.4447, + "step": 7340 + }, + { + "epoch": 3.575836310490419, + "grad_norm": 3.3656749725341797, + "learning_rate": 1.780368143209046e-06, + "loss": 0.4445, + "step": 7341 + }, + { + "epoch": 3.5763234816498866, + "grad_norm": 3.2122859954833984, + "learning_rate": 1.7797522515869062e-06, + "loss": 0.4615, + "step": 7342 + }, + { + "epoch": 3.5768106528093537, + "grad_norm": 3.052456855773926, + "learning_rate": 1.7791364076306957e-06, + "loss": 0.3401, + "step": 7343 + }, + { + "epoch": 3.577297823968821, + "grad_norm": 3.163261890411377, + "learning_rate": 1.77852061138117e-06, + "loss": 0.4924, + "step": 7344 + }, + { + "epoch": 3.5777849951282885, + "grad_norm": 2.9527485370635986, + "learning_rate": 1.7779048628790838e-06, + "loss": 0.4054, + "step": 7345 + }, + { + "epoch": 3.5782721662877557, + "grad_norm": 3.473045587539673, + "learning_rate": 1.7772891621651867e-06, + "loss": 0.4389, + "step": 7346 + }, + { + "epoch": 3.578759337447223, + "grad_norm": 3.0008656978607178, + "learning_rate": 1.7766735092802246e-06, + "loss": 0.4642, + "step": 7347 + }, + { + "epoch": 3.5792465086066905, + "grad_norm": 2.9345054626464844, + "learning_rate": 1.7760579042649428e-06, + "loss": 0.424, + "step": 7348 + }, + { + "epoch": 3.579733679766158, + "grad_norm": 3.0782477855682373, + "learning_rate": 1.7754423471600811e-06, + "loss": 0.3895, + "step": 7349 + }, + { + "epoch": 3.5802208509256253, + "grad_norm": 3.136533737182617, + "learning_rate": 1.7748268380063785e-06, + "loss": 0.4435, + "step": 7350 + }, + { + "epoch": 3.5807080220850924, + "grad_norm": 3.2859458923339844, + "learning_rate": 1.7742113768445673e-06, + "loss": 0.4311, + "step": 7351 + }, + { + "epoch": 3.58119519324456, + "grad_norm": 2.703775644302368, + "learning_rate": 1.7735959637153794e-06, + "loss": 0.3822, + "step": 7352 + }, + { + "epoch": 3.581682364404027, + "grad_norm": 3.177765369415283, + "learning_rate": 1.772980598659543e-06, + "loss": 0.4267, + "step": 7353 + }, + { + "epoch": 3.5821695355634944, + "grad_norm": 2.938070774078369, + "learning_rate": 1.7723652817177839e-06, + "loss": 0.3987, + "step": 7354 + }, + { + "epoch": 3.582656706722962, + "grad_norm": 2.814131021499634, + "learning_rate": 1.771750012930822e-06, + "loss": 0.435, + "step": 7355 + }, + { + "epoch": 3.5831438778824296, + "grad_norm": 2.9602341651916504, + "learning_rate": 1.7711347923393757e-06, + "loss": 0.448, + "step": 7356 + }, + { + "epoch": 3.5836310490418968, + "grad_norm": 3.5146491527557373, + "learning_rate": 1.7705196199841616e-06, + "loss": 0.519, + "step": 7357 + }, + { + "epoch": 3.584118220201364, + "grad_norm": 2.9435925483703613, + "learning_rate": 1.7699044959058919e-06, + "loss": 0.3731, + "step": 7358 + }, + { + "epoch": 3.5846053913608316, + "grad_norm": 3.236400842666626, + "learning_rate": 1.7692894201452737e-06, + "loss": 0.3857, + "step": 7359 + }, + { + "epoch": 3.5850925625202987, + "grad_norm": 3.2062761783599854, + "learning_rate": 1.7686743927430137e-06, + "loss": 0.5063, + "step": 7360 + }, + { + "epoch": 3.585579733679766, + "grad_norm": 3.636906862258911, + "learning_rate": 1.7680594137398144e-06, + "loss": 0.5146, + "step": 7361 + }, + { + "epoch": 3.5860669048392335, + "grad_norm": 3.0687999725341797, + "learning_rate": 1.7674444831763755e-06, + "loss": 0.426, + "step": 7362 + }, + { + "epoch": 3.586554075998701, + "grad_norm": 3.4812819957733154, + "learning_rate": 1.7668296010933916e-06, + "loss": 0.459, + "step": 7363 + }, + { + "epoch": 3.5870412471581683, + "grad_norm": 3.1012258529663086, + "learning_rate": 1.7662147675315566e-06, + "loss": 0.3885, + "step": 7364 + }, + { + "epoch": 3.5875284183176355, + "grad_norm": 2.954350709915161, + "learning_rate": 1.7655999825315606e-06, + "loss": 0.5186, + "step": 7365 + }, + { + "epoch": 3.588015589477103, + "grad_norm": 3.000703811645508, + "learning_rate": 1.7649852461340894e-06, + "loss": 0.3975, + "step": 7366 + }, + { + "epoch": 3.5885027606365703, + "grad_norm": 3.301825761795044, + "learning_rate": 1.7643705583798254e-06, + "loss": 0.4755, + "step": 7367 + }, + { + "epoch": 3.5889899317960374, + "grad_norm": 3.0967559814453125, + "learning_rate": 1.76375591930945e-06, + "loss": 0.4944, + "step": 7368 + }, + { + "epoch": 3.589477102955505, + "grad_norm": 2.7529103755950928, + "learning_rate": 1.7631413289636384e-06, + "loss": 0.3856, + "step": 7369 + }, + { + "epoch": 3.5899642741149727, + "grad_norm": 3.168273448944092, + "learning_rate": 1.762526787383066e-06, + "loss": 0.4006, + "step": 7370 + }, + { + "epoch": 3.59045144527444, + "grad_norm": 3.262075901031494, + "learning_rate": 1.761912294608402e-06, + "loss": 0.4377, + "step": 7371 + }, + { + "epoch": 3.590938616433907, + "grad_norm": 3.144972801208496, + "learning_rate": 1.761297850680313e-06, + "loss": 0.44, + "step": 7372 + }, + { + "epoch": 3.5914257875933746, + "grad_norm": 3.233616828918457, + "learning_rate": 1.7606834556394637e-06, + "loss": 0.4757, + "step": 7373 + }, + { + "epoch": 3.591912958752842, + "grad_norm": 2.9262568950653076, + "learning_rate": 1.7600691095265154e-06, + "loss": 0.4297, + "step": 7374 + }, + { + "epoch": 3.592400129912309, + "grad_norm": 3.151799201965332, + "learning_rate": 1.7594548123821235e-06, + "loss": 0.4484, + "step": 7375 + }, + { + "epoch": 3.5928873010717766, + "grad_norm": 2.9254109859466553, + "learning_rate": 1.7588405642469431e-06, + "loss": 0.3585, + "step": 7376 + }, + { + "epoch": 3.593374472231244, + "grad_norm": 3.0416414737701416, + "learning_rate": 1.7582263651616249e-06, + "loss": 0.3936, + "step": 7377 + }, + { + "epoch": 3.5938616433907113, + "grad_norm": 2.9949936866760254, + "learning_rate": 1.7576122151668181e-06, + "loss": 0.4347, + "step": 7378 + }, + { + "epoch": 3.5943488145501785, + "grad_norm": 3.1116416454315186, + "learning_rate": 1.7569981143031647e-06, + "loss": 0.475, + "step": 7379 + }, + { + "epoch": 3.594835985709646, + "grad_norm": 3.0654447078704834, + "learning_rate": 1.7563840626113069e-06, + "loss": 0.3694, + "step": 7380 + }, + { + "epoch": 3.5953231568691133, + "grad_norm": 3.0079572200775146, + "learning_rate": 1.7557700601318828e-06, + "loss": 0.4206, + "step": 7381 + }, + { + "epoch": 3.5958103280285805, + "grad_norm": 2.8206543922424316, + "learning_rate": 1.7551561069055273e-06, + "loss": 0.3617, + "step": 7382 + }, + { + "epoch": 3.596297499188048, + "grad_norm": 2.935228109359741, + "learning_rate": 1.7545422029728706e-06, + "loss": 0.4532, + "step": 7383 + }, + { + "epoch": 3.5967846703475157, + "grad_norm": 3.0998120307922363, + "learning_rate": 1.7539283483745416e-06, + "loss": 0.4286, + "step": 7384 + }, + { + "epoch": 3.597271841506983, + "grad_norm": 3.2463533878326416, + "learning_rate": 1.753314543151165e-06, + "loss": 0.4343, + "step": 7385 + }, + { + "epoch": 3.59775901266645, + "grad_norm": 3.0960657596588135, + "learning_rate": 1.7527007873433632e-06, + "loss": 0.427, + "step": 7386 + }, + { + "epoch": 3.5982461838259177, + "grad_norm": 2.8488564491271973, + "learning_rate": 1.7520870809917528e-06, + "loss": 0.4126, + "step": 7387 + }, + { + "epoch": 3.598733354985385, + "grad_norm": 3.0097808837890625, + "learning_rate": 1.7514734241369502e-06, + "loss": 0.3702, + "step": 7388 + }, + { + "epoch": 3.599220526144852, + "grad_norm": 2.9596245288848877, + "learning_rate": 1.7508598168195662e-06, + "loss": 0.4186, + "step": 7389 + }, + { + "epoch": 3.5997076973043196, + "grad_norm": 3.0590429306030273, + "learning_rate": 1.7502462590802106e-06, + "loss": 0.4472, + "step": 7390 + }, + { + "epoch": 3.600194868463787, + "grad_norm": 3.2124369144439697, + "learning_rate": 1.7496327509594873e-06, + "loss": 0.4302, + "step": 7391 + }, + { + "epoch": 3.6006820396232544, + "grad_norm": 3.19429612159729, + "learning_rate": 1.7490192924979982e-06, + "loss": 0.4181, + "step": 7392 + }, + { + "epoch": 3.6011692107827216, + "grad_norm": 2.7304670810699463, + "learning_rate": 1.7484058837363426e-06, + "loss": 0.3998, + "step": 7393 + }, + { + "epoch": 3.601656381942189, + "grad_norm": 2.909898042678833, + "learning_rate": 1.7477925247151167e-06, + "loss": 0.4775, + "step": 7394 + }, + { + "epoch": 3.6021435531016563, + "grad_norm": 3.3881490230560303, + "learning_rate": 1.74717921547491e-06, + "loss": 0.4905, + "step": 7395 + }, + { + "epoch": 3.6026307242611235, + "grad_norm": 2.783644437789917, + "learning_rate": 1.7465659560563134e-06, + "loss": 0.3835, + "step": 7396 + }, + { + "epoch": 3.603117895420591, + "grad_norm": 2.9794883728027344, + "learning_rate": 1.745952746499912e-06, + "loss": 0.4395, + "step": 7397 + }, + { + "epoch": 3.6036050665800583, + "grad_norm": 2.7799229621887207, + "learning_rate": 1.7453395868462872e-06, + "loss": 0.3826, + "step": 7398 + }, + { + "epoch": 3.604092237739526, + "grad_norm": 3.1150307655334473, + "learning_rate": 1.7447264771360178e-06, + "loss": 0.4174, + "step": 7399 + }, + { + "epoch": 3.604579408898993, + "grad_norm": 3.0483686923980713, + "learning_rate": 1.7441134174096802e-06, + "loss": 0.4048, + "step": 7400 + }, + { + "epoch": 3.6050665800584607, + "grad_norm": 3.212028741836548, + "learning_rate": 1.743500407707847e-06, + "loss": 0.4576, + "step": 7401 + }, + { + "epoch": 3.605553751217928, + "grad_norm": 3.0060126781463623, + "learning_rate": 1.7428874480710856e-06, + "loss": 0.4523, + "step": 7402 + }, + { + "epoch": 3.606040922377395, + "grad_norm": 3.132950782775879, + "learning_rate": 1.742274538539962e-06, + "loss": 0.3872, + "step": 7403 + }, + { + "epoch": 3.6065280935368627, + "grad_norm": 3.014347791671753, + "learning_rate": 1.741661679155039e-06, + "loss": 0.4475, + "step": 7404 + }, + { + "epoch": 3.60701526469633, + "grad_norm": 2.823896884918213, + "learning_rate": 1.7410488699568767e-06, + "loss": 0.3581, + "step": 7405 + }, + { + "epoch": 3.6075024358557974, + "grad_norm": 2.705146551132202, + "learning_rate": 1.7404361109860285e-06, + "loss": 0.414, + "step": 7406 + }, + { + "epoch": 3.6079896070152646, + "grad_norm": 3.0934696197509766, + "learning_rate": 1.7398234022830473e-06, + "loss": 0.4228, + "step": 7407 + }, + { + "epoch": 3.6084767781747322, + "grad_norm": 2.860454797744751, + "learning_rate": 1.739210743888483e-06, + "loss": 0.4358, + "step": 7408 + }, + { + "epoch": 3.6089639493341994, + "grad_norm": 2.7251181602478027, + "learning_rate": 1.7385981358428817e-06, + "loss": 0.3673, + "step": 7409 + }, + { + "epoch": 3.6094511204936666, + "grad_norm": 2.899898052215576, + "learning_rate": 1.7379855781867838e-06, + "loss": 0.4442, + "step": 7410 + }, + { + "epoch": 3.609938291653134, + "grad_norm": 3.2206788063049316, + "learning_rate": 1.7373730709607296e-06, + "loss": 0.4767, + "step": 7411 + }, + { + "epoch": 3.6104254628126013, + "grad_norm": 3.1746621131896973, + "learning_rate": 1.7367606142052546e-06, + "loss": 0.4019, + "step": 7412 + }, + { + "epoch": 3.610912633972069, + "grad_norm": 3.066789150238037, + "learning_rate": 1.7361482079608916e-06, + "loss": 0.4927, + "step": 7413 + }, + { + "epoch": 3.611399805131536, + "grad_norm": 2.9459292888641357, + "learning_rate": 1.7355358522681687e-06, + "loss": 0.4338, + "step": 7414 + }, + { + "epoch": 3.6118869762910037, + "grad_norm": 2.9874236583709717, + "learning_rate": 1.7349235471676116e-06, + "loss": 0.3963, + "step": 7415 + }, + { + "epoch": 3.612374147450471, + "grad_norm": 3.4131112098693848, + "learning_rate": 1.7343112926997428e-06, + "loss": 0.5005, + "step": 7416 + }, + { + "epoch": 3.612861318609938, + "grad_norm": 2.813884973526001, + "learning_rate": 1.7336990889050823e-06, + "loss": 0.4006, + "step": 7417 + }, + { + "epoch": 3.6133484897694057, + "grad_norm": 3.2637956142425537, + "learning_rate": 1.7330869358241437e-06, + "loss": 0.4866, + "step": 7418 + }, + { + "epoch": 3.613835660928873, + "grad_norm": 3.027163028717041, + "learning_rate": 1.7324748334974406e-06, + "loss": 0.4706, + "step": 7419 + }, + { + "epoch": 3.6143228320883405, + "grad_norm": 2.79740309715271, + "learning_rate": 1.7318627819654813e-06, + "loss": 0.3811, + "step": 7420 + }, + { + "epoch": 3.6148100032478077, + "grad_norm": 3.047179937362671, + "learning_rate": 1.7312507812687726e-06, + "loss": 0.4314, + "step": 7421 + }, + { + "epoch": 3.6152971744072753, + "grad_norm": 3.3353774547576904, + "learning_rate": 1.7306388314478146e-06, + "loss": 0.3935, + "step": 7422 + }, + { + "epoch": 3.6157843455667424, + "grad_norm": 3.1675403118133545, + "learning_rate": 1.7300269325431068e-06, + "loss": 0.4254, + "step": 7423 + }, + { + "epoch": 3.6162715167262096, + "grad_norm": 3.1458914279937744, + "learning_rate": 1.7294150845951455e-06, + "loss": 0.4059, + "step": 7424 + }, + { + "epoch": 3.6167586878856772, + "grad_norm": 2.6670994758605957, + "learning_rate": 1.7288032876444227e-06, + "loss": 0.3781, + "step": 7425 + }, + { + "epoch": 3.6172458590451444, + "grad_norm": 2.877713680267334, + "learning_rate": 1.7281915417314258e-06, + "loss": 0.4422, + "step": 7426 + }, + { + "epoch": 3.617733030204612, + "grad_norm": 2.9105677604675293, + "learning_rate": 1.7275798468966404e-06, + "loss": 0.3981, + "step": 7427 + }, + { + "epoch": 3.618220201364079, + "grad_norm": 3.4857490062713623, + "learning_rate": 1.726968203180549e-06, + "loss": 0.4297, + "step": 7428 + }, + { + "epoch": 3.618707372523547, + "grad_norm": 2.8006746768951416, + "learning_rate": 1.7263566106236312e-06, + "loss": 0.4288, + "step": 7429 + }, + { + "epoch": 3.619194543683014, + "grad_norm": 3.5172367095947266, + "learning_rate": 1.7257450692663596e-06, + "loss": 0.4507, + "step": 7430 + }, + { + "epoch": 3.619681714842481, + "grad_norm": 3.0851101875305176, + "learning_rate": 1.7251335791492073e-06, + "loss": 0.407, + "step": 7431 + }, + { + "epoch": 3.6201688860019487, + "grad_norm": 3.347689628601074, + "learning_rate": 1.7245221403126433e-06, + "loss": 0.4566, + "step": 7432 + }, + { + "epoch": 3.620656057161416, + "grad_norm": 3.4965527057647705, + "learning_rate": 1.7239107527971317e-06, + "loss": 0.4785, + "step": 7433 + }, + { + "epoch": 3.6211432283208835, + "grad_norm": 2.988111734390259, + "learning_rate": 1.7232994166431334e-06, + "loss": 0.5025, + "step": 7434 + }, + { + "epoch": 3.6216303994803507, + "grad_norm": 3.2309720516204834, + "learning_rate": 1.7226881318911082e-06, + "loss": 0.5034, + "step": 7435 + }, + { + "epoch": 3.6221175706398183, + "grad_norm": 3.0622899532318115, + "learning_rate": 1.7220768985815091e-06, + "loss": 0.4435, + "step": 7436 + }, + { + "epoch": 3.6226047417992855, + "grad_norm": 3.034590482711792, + "learning_rate": 1.7214657167547899e-06, + "loss": 0.3989, + "step": 7437 + }, + { + "epoch": 3.6230919129587527, + "grad_norm": 3.60367751121521, + "learning_rate": 1.7208545864513958e-06, + "loss": 0.434, + "step": 7438 + }, + { + "epoch": 3.6235790841182203, + "grad_norm": 2.7870914936065674, + "learning_rate": 1.720243507711773e-06, + "loss": 0.4181, + "step": 7439 + }, + { + "epoch": 3.6240662552776874, + "grad_norm": 3.1929237842559814, + "learning_rate": 1.7196324805763618e-06, + "loss": 0.48, + "step": 7440 + }, + { + "epoch": 3.624553426437155, + "grad_norm": 3.3388471603393555, + "learning_rate": 1.7190215050856015e-06, + "loss": 0.4789, + "step": 7441 + }, + { + "epoch": 3.6250405975966222, + "grad_norm": 3.006239414215088, + "learning_rate": 1.7184105812799244e-06, + "loss": 0.4507, + "step": 7442 + }, + { + "epoch": 3.62552776875609, + "grad_norm": 3.3956706523895264, + "learning_rate": 1.7177997091997623e-06, + "loss": 0.4667, + "step": 7443 + }, + { + "epoch": 3.626014939915557, + "grad_norm": 3.34938907623291, + "learning_rate": 1.717188888885542e-06, + "loss": 0.4888, + "step": 7444 + }, + { + "epoch": 3.626502111075024, + "grad_norm": 3.460838556289673, + "learning_rate": 1.71657812037769e-06, + "loss": 0.461, + "step": 7445 + }, + { + "epoch": 3.626989282234492, + "grad_norm": 2.919677495956421, + "learning_rate": 1.7159674037166235e-06, + "loss": 0.3956, + "step": 7446 + }, + { + "epoch": 3.627476453393959, + "grad_norm": 3.3868775367736816, + "learning_rate": 1.7153567389427613e-06, + "loss": 0.4372, + "step": 7447 + }, + { + "epoch": 3.6279636245534266, + "grad_norm": 2.9536826610565186, + "learning_rate": 1.7147461260965182e-06, + "loss": 0.4184, + "step": 7448 + }, + { + "epoch": 3.6284507957128938, + "grad_norm": 3.0161056518554688, + "learning_rate": 1.7141355652183022e-06, + "loss": 0.4393, + "step": 7449 + }, + { + "epoch": 3.6289379668723614, + "grad_norm": 3.0955028533935547, + "learning_rate": 1.7135250563485215e-06, + "loss": 0.4471, + "step": 7450 + }, + { + "epoch": 3.6294251380318285, + "grad_norm": 3.0920348167419434, + "learning_rate": 1.7129145995275792e-06, + "loss": 0.4366, + "step": 7451 + }, + { + "epoch": 3.6299123091912957, + "grad_norm": 2.9475181102752686, + "learning_rate": 1.7123041947958763e-06, + "loss": 0.4091, + "step": 7452 + }, + { + "epoch": 3.6303994803507633, + "grad_norm": 2.7195656299591064, + "learning_rate": 1.7116938421938077e-06, + "loss": 0.4233, + "step": 7453 + }, + { + "epoch": 3.6308866515102305, + "grad_norm": 3.057427167892456, + "learning_rate": 1.711083541761767e-06, + "loss": 0.4435, + "step": 7454 + }, + { + "epoch": 3.631373822669698, + "grad_norm": 2.777400493621826, + "learning_rate": 1.7104732935401446e-06, + "loss": 0.3767, + "step": 7455 + }, + { + "epoch": 3.6318609938291653, + "grad_norm": 2.8378841876983643, + "learning_rate": 1.7098630975693262e-06, + "loss": 0.4194, + "step": 7456 + }, + { + "epoch": 3.632348164988633, + "grad_norm": 2.8610920906066895, + "learning_rate": 1.7092529538896938e-06, + "loss": 0.4275, + "step": 7457 + }, + { + "epoch": 3.6328353361481, + "grad_norm": 3.3611900806427, + "learning_rate": 1.708642862541628e-06, + "loss": 0.4731, + "step": 7458 + }, + { + "epoch": 3.6333225073075672, + "grad_norm": 2.9456589221954346, + "learning_rate": 1.708032823565503e-06, + "loss": 0.439, + "step": 7459 + }, + { + "epoch": 3.633809678467035, + "grad_norm": 2.8561041355133057, + "learning_rate": 1.7074228370016927e-06, + "loss": 0.4196, + "step": 7460 + }, + { + "epoch": 3.634296849626502, + "grad_norm": 2.6004655361175537, + "learning_rate": 1.706812902890565e-06, + "loss": 0.3937, + "step": 7461 + }, + { + "epoch": 3.6347840207859696, + "grad_norm": 3.0451436042785645, + "learning_rate": 1.7062030212724855e-06, + "loss": 0.4154, + "step": 7462 + }, + { + "epoch": 3.635271191945437, + "grad_norm": 3.0197057723999023, + "learning_rate": 1.7055931921878157e-06, + "loss": 0.4317, + "step": 7463 + }, + { + "epoch": 3.6357583631049044, + "grad_norm": 3.134531021118164, + "learning_rate": 1.7049834156769158e-06, + "loss": 0.4462, + "step": 7464 + }, + { + "epoch": 3.6362455342643716, + "grad_norm": 3.13899827003479, + "learning_rate": 1.7043736917801384e-06, + "loss": 0.4653, + "step": 7465 + }, + { + "epoch": 3.6367327054238388, + "grad_norm": 2.7785191535949707, + "learning_rate": 1.7037640205378359e-06, + "loss": 0.4289, + "step": 7466 + }, + { + "epoch": 3.6372198765833064, + "grad_norm": 3.0817105770111084, + "learning_rate": 1.7031544019903561e-06, + "loss": 0.4651, + "step": 7467 + }, + { + "epoch": 3.6377070477427735, + "grad_norm": 3.413494348526001, + "learning_rate": 1.702544836178045e-06, + "loss": 0.48, + "step": 7468 + }, + { + "epoch": 3.638194218902241, + "grad_norm": 3.2191684246063232, + "learning_rate": 1.7019353231412416e-06, + "loss": 0.4636, + "step": 7469 + }, + { + "epoch": 3.6386813900617083, + "grad_norm": 3.4107630252838135, + "learning_rate": 1.7013258629202839e-06, + "loss": 0.4758, + "step": 7470 + }, + { + "epoch": 3.639168561221176, + "grad_norm": 2.6918113231658936, + "learning_rate": 1.7007164555555062e-06, + "loss": 0.4418, + "step": 7471 + }, + { + "epoch": 3.639655732380643, + "grad_norm": 3.3165249824523926, + "learning_rate": 1.7001071010872402e-06, + "loss": 0.487, + "step": 7472 + }, + { + "epoch": 3.6401429035401103, + "grad_norm": 2.6800050735473633, + "learning_rate": 1.6994977995558107e-06, + "loss": 0.388, + "step": 7473 + }, + { + "epoch": 3.640630074699578, + "grad_norm": 3.061213493347168, + "learning_rate": 1.6988885510015425e-06, + "loss": 0.4342, + "step": 7474 + }, + { + "epoch": 3.641117245859045, + "grad_norm": 3.360556125640869, + "learning_rate": 1.6982793554647554e-06, + "loss": 0.4406, + "step": 7475 + }, + { + "epoch": 3.6416044170185122, + "grad_norm": 3.0284667015075684, + "learning_rate": 1.6976702129857664e-06, + "loss": 0.4498, + "step": 7476 + }, + { + "epoch": 3.64209158817798, + "grad_norm": 2.883190870285034, + "learning_rate": 1.6970611236048873e-06, + "loss": 0.4256, + "step": 7477 + }, + { + "epoch": 3.6425787593374475, + "grad_norm": 3.578981876373291, + "learning_rate": 1.6964520873624286e-06, + "loss": 0.4298, + "step": 7478 + }, + { + "epoch": 3.6430659304969146, + "grad_norm": 2.7977004051208496, + "learning_rate": 1.6958431042986956e-06, + "loss": 0.4128, + "step": 7479 + }, + { + "epoch": 3.643553101656382, + "grad_norm": 3.338883876800537, + "learning_rate": 1.6952341744539916e-06, + "loss": 0.4675, + "step": 7480 + }, + { + "epoch": 3.6440402728158494, + "grad_norm": 3.161386728286743, + "learning_rate": 1.6946252978686151e-06, + "loss": 0.411, + "step": 7481 + }, + { + "epoch": 3.6445274439753166, + "grad_norm": 3.110182523727417, + "learning_rate": 1.6940164745828608e-06, + "loss": 0.4817, + "step": 7482 + }, + { + "epoch": 3.6450146151347838, + "grad_norm": 2.94553279876709, + "learning_rate": 1.6934077046370208e-06, + "loss": 0.4607, + "step": 7483 + }, + { + "epoch": 3.6455017862942514, + "grad_norm": 3.3181662559509277, + "learning_rate": 1.6927989880713852e-06, + "loss": 0.462, + "step": 7484 + }, + { + "epoch": 3.645988957453719, + "grad_norm": 3.2693562507629395, + "learning_rate": 1.6921903249262362e-06, + "loss": 0.4575, + "step": 7485 + }, + { + "epoch": 3.646476128613186, + "grad_norm": 4.012955665588379, + "learning_rate": 1.6915817152418566e-06, + "loss": 0.4804, + "step": 7486 + }, + { + "epoch": 3.6469632997726533, + "grad_norm": 2.7666335105895996, + "learning_rate": 1.6909731590585235e-06, + "loss": 0.3637, + "step": 7487 + }, + { + "epoch": 3.647450470932121, + "grad_norm": 2.9521491527557373, + "learning_rate": 1.6903646564165124e-06, + "loss": 0.4323, + "step": 7488 + }, + { + "epoch": 3.647937642091588, + "grad_norm": 3.2948458194732666, + "learning_rate": 1.689756207356092e-06, + "loss": 0.4522, + "step": 7489 + }, + { + "epoch": 3.6484248132510553, + "grad_norm": 2.989386796951294, + "learning_rate": 1.6891478119175304e-06, + "loss": 0.4358, + "step": 7490 + }, + { + "epoch": 3.648911984410523, + "grad_norm": 2.981910467147827, + "learning_rate": 1.6885394701410912e-06, + "loss": 0.3926, + "step": 7491 + }, + { + "epoch": 3.6493991555699905, + "grad_norm": 3.4974536895751953, + "learning_rate": 1.6879311820670352e-06, + "loss": 0.4857, + "step": 7492 + }, + { + "epoch": 3.6498863267294577, + "grad_norm": 2.9589591026306152, + "learning_rate": 1.6873229477356168e-06, + "loss": 0.4158, + "step": 7493 + }, + { + "epoch": 3.650373497888925, + "grad_norm": 3.086703300476074, + "learning_rate": 1.6867147671870903e-06, + "loss": 0.4226, + "step": 7494 + }, + { + "epoch": 3.6508606690483925, + "grad_norm": 3.3450632095336914, + "learning_rate": 1.6861066404617043e-06, + "loss": 0.4897, + "step": 7495 + }, + { + "epoch": 3.6513478402078596, + "grad_norm": 2.9660816192626953, + "learning_rate": 1.6854985675997065e-06, + "loss": 0.4331, + "step": 7496 + }, + { + "epoch": 3.651835011367327, + "grad_norm": 2.8734567165374756, + "learning_rate": 1.6848905486413367e-06, + "loss": 0.4201, + "step": 7497 + }, + { + "epoch": 3.6523221825267944, + "grad_norm": 2.9485905170440674, + "learning_rate": 1.6842825836268348e-06, + "loss": 0.4389, + "step": 7498 + }, + { + "epoch": 3.652809353686262, + "grad_norm": 3.11431884765625, + "learning_rate": 1.6836746725964352e-06, + "loss": 0.4453, + "step": 7499 + }, + { + "epoch": 3.653296524845729, + "grad_norm": 2.9840049743652344, + "learning_rate": 1.6830668155903701e-06, + "loss": 0.4193, + "step": 7500 + }, + { + "epoch": 3.6537836960051964, + "grad_norm": 3.177183151245117, + "learning_rate": 1.6824590126488672e-06, + "loss": 0.4188, + "step": 7501 + }, + { + "epoch": 3.654270867164664, + "grad_norm": 2.9769482612609863, + "learning_rate": 1.6818512638121499e-06, + "loss": 0.4099, + "step": 7502 + }, + { + "epoch": 3.654758038324131, + "grad_norm": 3.202101707458496, + "learning_rate": 1.6812435691204403e-06, + "loss": 0.4543, + "step": 7503 + }, + { + "epoch": 3.6552452094835983, + "grad_norm": 3.1965320110321045, + "learning_rate": 1.6806359286139557e-06, + "loss": 0.4817, + "step": 7504 + }, + { + "epoch": 3.655732380643066, + "grad_norm": 3.070432662963867, + "learning_rate": 1.6800283423329078e-06, + "loss": 0.4602, + "step": 7505 + }, + { + "epoch": 3.6562195518025336, + "grad_norm": 3.537637948989868, + "learning_rate": 1.679420810317508e-06, + "loss": 0.3693, + "step": 7506 + }, + { + "epoch": 3.6567067229620007, + "grad_norm": 2.929816722869873, + "learning_rate": 1.6788133326079638e-06, + "loss": 0.4239, + "step": 7507 + }, + { + "epoch": 3.657193894121468, + "grad_norm": 3.2219574451446533, + "learning_rate": 1.6782059092444758e-06, + "loss": 0.4489, + "step": 7508 + }, + { + "epoch": 3.6576810652809355, + "grad_norm": 3.3590145111083984, + "learning_rate": 1.677598540267244e-06, + "loss": 0.5027, + "step": 7509 + }, + { + "epoch": 3.6581682364404027, + "grad_norm": 2.722712278366089, + "learning_rate": 1.6769912257164645e-06, + "loss": 0.4354, + "step": 7510 + }, + { + "epoch": 3.65865540759987, + "grad_norm": 4.175169467926025, + "learning_rate": 1.67638396563233e-06, + "loss": 0.4472, + "step": 7511 + }, + { + "epoch": 3.6591425787593375, + "grad_norm": 3.1683146953582764, + "learning_rate": 1.6757767600550267e-06, + "loss": 0.4262, + "step": 7512 + }, + { + "epoch": 3.659629749918805, + "grad_norm": 3.1020054817199707, + "learning_rate": 1.6751696090247404e-06, + "loss": 0.4236, + "step": 7513 + }, + { + "epoch": 3.6601169210782722, + "grad_norm": 2.935365915298462, + "learning_rate": 1.6745625125816533e-06, + "loss": 0.4428, + "step": 7514 + }, + { + "epoch": 3.6606040922377394, + "grad_norm": 3.3273305892944336, + "learning_rate": 1.6739554707659427e-06, + "loss": 0.4111, + "step": 7515 + }, + { + "epoch": 3.661091263397207, + "grad_norm": 3.0748050212860107, + "learning_rate": 1.6733484836177815e-06, + "loss": 0.4606, + "step": 7516 + }, + { + "epoch": 3.661578434556674, + "grad_norm": 2.895674228668213, + "learning_rate": 1.6727415511773405e-06, + "loss": 0.4487, + "step": 7517 + }, + { + "epoch": 3.6620656057161414, + "grad_norm": 2.777451276779175, + "learning_rate": 1.6721346734847865e-06, + "loss": 0.3715, + "step": 7518 + }, + { + "epoch": 3.662552776875609, + "grad_norm": 3.309791088104248, + "learning_rate": 1.671527850580284e-06, + "loss": 0.477, + "step": 7519 + }, + { + "epoch": 3.6630399480350766, + "grad_norm": 3.1986374855041504, + "learning_rate": 1.6709210825039899e-06, + "loss": 0.4229, + "step": 7520 + }, + { + "epoch": 3.6635271191945438, + "grad_norm": 2.915512800216675, + "learning_rate": 1.6703143692960614e-06, + "loss": 0.415, + "step": 7521 + }, + { + "epoch": 3.664014290354011, + "grad_norm": 2.926910877227783, + "learning_rate": 1.6697077109966515e-06, + "loss": 0.428, + "step": 7522 + }, + { + "epoch": 3.6645014615134786, + "grad_norm": 3.0849859714508057, + "learning_rate": 1.6691011076459074e-06, + "loss": 0.4826, + "step": 7523 + }, + { + "epoch": 3.6649886326729457, + "grad_norm": 3.1488890647888184, + "learning_rate": 1.6684945592839753e-06, + "loss": 0.3848, + "step": 7524 + }, + { + "epoch": 3.665475803832413, + "grad_norm": 3.064849376678467, + "learning_rate": 1.667888065950995e-06, + "loss": 0.4038, + "step": 7525 + }, + { + "epoch": 3.6659629749918805, + "grad_norm": 3.142543315887451, + "learning_rate": 1.6672816276871045e-06, + "loss": 0.3723, + "step": 7526 + }, + { + "epoch": 3.6664501461513477, + "grad_norm": 2.8625435829162598, + "learning_rate": 1.6666752445324403e-06, + "loss": 0.4103, + "step": 7527 + }, + { + "epoch": 3.6669373173108153, + "grad_norm": 2.839536190032959, + "learning_rate": 1.6660689165271293e-06, + "loss": 0.4101, + "step": 7528 + }, + { + "epoch": 3.6674244884702825, + "grad_norm": 3.0392262935638428, + "learning_rate": 1.6654626437112998e-06, + "loss": 0.458, + "step": 7529 + }, + { + "epoch": 3.66791165962975, + "grad_norm": 3.233319044113159, + "learning_rate": 1.6648564261250746e-06, + "loss": 0.4852, + "step": 7530 + }, + { + "epoch": 3.6683988307892172, + "grad_norm": 3.6729302406311035, + "learning_rate": 1.6642502638085745e-06, + "loss": 0.4229, + "step": 7531 + }, + { + "epoch": 3.6688860019486844, + "grad_norm": 3.2656772136688232, + "learning_rate": 1.6636441568019135e-06, + "loss": 0.4322, + "step": 7532 + }, + { + "epoch": 3.669373173108152, + "grad_norm": 2.858934164047241, + "learning_rate": 1.6630381051452037e-06, + "loss": 0.4384, + "step": 7533 + }, + { + "epoch": 3.669860344267619, + "grad_norm": 2.905162811279297, + "learning_rate": 1.6624321088785544e-06, + "loss": 0.4357, + "step": 7534 + }, + { + "epoch": 3.670347515427087, + "grad_norm": 3.2593114376068115, + "learning_rate": 1.6618261680420717e-06, + "loss": 0.4729, + "step": 7535 + }, + { + "epoch": 3.670834686586554, + "grad_norm": 3.2693872451782227, + "learning_rate": 1.6612202826758533e-06, + "loss": 0.4722, + "step": 7536 + }, + { + "epoch": 3.6713218577460216, + "grad_norm": 3.2383034229278564, + "learning_rate": 1.6606144528199992e-06, + "loss": 0.4842, + "step": 7537 + }, + { + "epoch": 3.6718090289054888, + "grad_norm": 3.1309831142425537, + "learning_rate": 1.6600086785146019e-06, + "loss": 0.4185, + "step": 7538 + }, + { + "epoch": 3.672296200064956, + "grad_norm": 2.7284069061279297, + "learning_rate": 1.6594029597997532e-06, + "loss": 0.3695, + "step": 7539 + }, + { + "epoch": 3.6727833712244236, + "grad_norm": 3.275826930999756, + "learning_rate": 1.6587972967155375e-06, + "loss": 0.4098, + "step": 7540 + }, + { + "epoch": 3.6732705423838907, + "grad_norm": 2.812617301940918, + "learning_rate": 1.658191689302038e-06, + "loss": 0.4063, + "step": 7541 + }, + { + "epoch": 3.6737577135433583, + "grad_norm": 3.118597984313965, + "learning_rate": 1.6575861375993351e-06, + "loss": 0.4241, + "step": 7542 + }, + { + "epoch": 3.6742448847028255, + "grad_norm": 3.430473804473877, + "learning_rate": 1.6569806416475026e-06, + "loss": 0.4054, + "step": 7543 + }, + { + "epoch": 3.674732055862293, + "grad_norm": 3.1584107875823975, + "learning_rate": 1.6563752014866127e-06, + "loss": 0.5118, + "step": 7544 + }, + { + "epoch": 3.6752192270217603, + "grad_norm": 3.276399850845337, + "learning_rate": 1.655769817156734e-06, + "loss": 0.4181, + "step": 7545 + }, + { + "epoch": 3.6757063981812275, + "grad_norm": 2.889866828918457, + "learning_rate": 1.6551644886979293e-06, + "loss": 0.4164, + "step": 7546 + }, + { + "epoch": 3.676193569340695, + "grad_norm": 2.94511079788208, + "learning_rate": 1.6545592161502608e-06, + "loss": 0.3965, + "step": 7547 + }, + { + "epoch": 3.6766807405001622, + "grad_norm": 2.5806427001953125, + "learning_rate": 1.6539539995537845e-06, + "loss": 0.3579, + "step": 7548 + }, + { + "epoch": 3.67716791165963, + "grad_norm": 3.4122304916381836, + "learning_rate": 1.6533488389485533e-06, + "loss": 0.5277, + "step": 7549 + }, + { + "epoch": 3.677655082819097, + "grad_norm": 3.0279054641723633, + "learning_rate": 1.6527437343746172e-06, + "loss": 0.4252, + "step": 7550 + }, + { + "epoch": 3.6781422539785646, + "grad_norm": 3.622462034225464, + "learning_rate": 1.6521386858720228e-06, + "loss": 0.5055, + "step": 7551 + }, + { + "epoch": 3.678629425138032, + "grad_norm": 3.158522605895996, + "learning_rate": 1.6515336934808101e-06, + "loss": 0.5033, + "step": 7552 + }, + { + "epoch": 3.679116596297499, + "grad_norm": 2.9070627689361572, + "learning_rate": 1.6509287572410187e-06, + "loss": 0.4139, + "step": 7553 + }, + { + "epoch": 3.6796037674569666, + "grad_norm": 3.1886820793151855, + "learning_rate": 1.650323877192684e-06, + "loss": 0.432, + "step": 7554 + }, + { + "epoch": 3.6800909386164338, + "grad_norm": 3.151998519897461, + "learning_rate": 1.6497190533758351e-06, + "loss": 0.4272, + "step": 7555 + }, + { + "epoch": 3.6805781097759014, + "grad_norm": 3.062177896499634, + "learning_rate": 1.6491142858304998e-06, + "loss": 0.4563, + "step": 7556 + }, + { + "epoch": 3.6810652809353686, + "grad_norm": 3.2367136478424072, + "learning_rate": 1.6485095745967019e-06, + "loss": 0.405, + "step": 7557 + }, + { + "epoch": 3.681552452094836, + "grad_norm": 3.089688539505005, + "learning_rate": 1.6479049197144622e-06, + "loss": 0.4298, + "step": 7558 + }, + { + "epoch": 3.6820396232543033, + "grad_norm": 2.94069766998291, + "learning_rate": 1.6473003212237945e-06, + "loss": 0.4538, + "step": 7559 + }, + { + "epoch": 3.6825267944137705, + "grad_norm": 3.314694881439209, + "learning_rate": 1.646695779164712e-06, + "loss": 0.4789, + "step": 7560 + }, + { + "epoch": 3.683013965573238, + "grad_norm": 2.936753034591675, + "learning_rate": 1.6460912935772238e-06, + "loss": 0.4024, + "step": 7561 + }, + { + "epoch": 3.6835011367327053, + "grad_norm": 3.010382890701294, + "learning_rate": 1.645486864501335e-06, + "loss": 0.4133, + "step": 7562 + }, + { + "epoch": 3.683988307892173, + "grad_norm": 3.143770694732666, + "learning_rate": 1.6448824919770451e-06, + "loss": 0.4577, + "step": 7563 + }, + { + "epoch": 3.68447547905164, + "grad_norm": 2.7382333278656006, + "learning_rate": 1.6442781760443518e-06, + "loss": 0.4154, + "step": 7564 + }, + { + "epoch": 3.6849626502111077, + "grad_norm": 3.3767216205596924, + "learning_rate": 1.64367391674325e-06, + "loss": 0.4481, + "step": 7565 + }, + { + "epoch": 3.685449821370575, + "grad_norm": 2.942309617996216, + "learning_rate": 1.6430697141137286e-06, + "loss": 0.396, + "step": 7566 + }, + { + "epoch": 3.685936992530042, + "grad_norm": 3.2206666469573975, + "learning_rate": 1.6424655681957732e-06, + "loss": 0.4273, + "step": 7567 + }, + { + "epoch": 3.6864241636895096, + "grad_norm": 3.1545941829681396, + "learning_rate": 1.641861479029367e-06, + "loss": 0.4565, + "step": 7568 + }, + { + "epoch": 3.686911334848977, + "grad_norm": 3.3347063064575195, + "learning_rate": 1.6412574466544878e-06, + "loss": 0.4795, + "step": 7569 + }, + { + "epoch": 3.6873985060084444, + "grad_norm": 2.65281081199646, + "learning_rate": 1.6406534711111116e-06, + "loss": 0.3964, + "step": 7570 + }, + { + "epoch": 3.6878856771679116, + "grad_norm": 3.2148585319519043, + "learning_rate": 1.6400495524392085e-06, + "loss": 0.4548, + "step": 7571 + }, + { + "epoch": 3.688372848327379, + "grad_norm": 3.201951503753662, + "learning_rate": 1.6394456906787454e-06, + "loss": 0.4122, + "step": 7572 + }, + { + "epoch": 3.6888600194868464, + "grad_norm": 2.651837110519409, + "learning_rate": 1.638841885869686e-06, + "loss": 0.3417, + "step": 7573 + }, + { + "epoch": 3.6893471906463136, + "grad_norm": 2.804543972015381, + "learning_rate": 1.6382381380519918e-06, + "loss": 0.4268, + "step": 7574 + }, + { + "epoch": 3.689834361805781, + "grad_norm": 3.2157907485961914, + "learning_rate": 1.6376344472656163e-06, + "loss": 0.4231, + "step": 7575 + }, + { + "epoch": 3.6903215329652483, + "grad_norm": 3.3626816272735596, + "learning_rate": 1.6370308135505128e-06, + "loss": 0.4215, + "step": 7576 + }, + { + "epoch": 3.690808704124716, + "grad_norm": 3.33205509185791, + "learning_rate": 1.6364272369466297e-06, + "loss": 0.4032, + "step": 7577 + }, + { + "epoch": 3.691295875284183, + "grad_norm": 3.3090577125549316, + "learning_rate": 1.6358237174939126e-06, + "loss": 0.3545, + "step": 7578 + }, + { + "epoch": 3.6917830464436507, + "grad_norm": 2.8054122924804688, + "learning_rate": 1.6352202552323004e-06, + "loss": 0.3808, + "step": 7579 + }, + { + "epoch": 3.692270217603118, + "grad_norm": 3.302318572998047, + "learning_rate": 1.6346168502017312e-06, + "loss": 0.4317, + "step": 7580 + }, + { + "epoch": 3.692757388762585, + "grad_norm": 3.4874649047851562, + "learning_rate": 1.6340135024421383e-06, + "loss": 0.4404, + "step": 7581 + }, + { + "epoch": 3.6932445599220527, + "grad_norm": 3.155346632003784, + "learning_rate": 1.6334102119934523e-06, + "loss": 0.4157, + "step": 7582 + }, + { + "epoch": 3.69373173108152, + "grad_norm": 2.7995004653930664, + "learning_rate": 1.6328069788955963e-06, + "loss": 0.4091, + "step": 7583 + }, + { + "epoch": 3.6942189022409875, + "grad_norm": 3.004915237426758, + "learning_rate": 1.632203803188494e-06, + "loss": 0.4444, + "step": 7584 + }, + { + "epoch": 3.6947060734004546, + "grad_norm": 3.395784616470337, + "learning_rate": 1.6316006849120638e-06, + "loss": 0.5156, + "step": 7585 + }, + { + "epoch": 3.6951932445599223, + "grad_norm": 3.5061545372009277, + "learning_rate": 1.6309976241062192e-06, + "loss": 0.4432, + "step": 7586 + }, + { + "epoch": 3.6956804157193894, + "grad_norm": 3.6518149375915527, + "learning_rate": 1.6303946208108706e-06, + "loss": 0.4116, + "step": 7587 + }, + { + "epoch": 3.6961675868788566, + "grad_norm": 3.1923327445983887, + "learning_rate": 1.6297916750659254e-06, + "loss": 0.4691, + "step": 7588 + }, + { + "epoch": 3.696654758038324, + "grad_norm": 2.817512273788452, + "learning_rate": 1.6291887869112857e-06, + "loss": 0.4034, + "step": 7589 + }, + { + "epoch": 3.6971419291977914, + "grad_norm": 2.872545003890991, + "learning_rate": 1.6285859563868514e-06, + "loss": 0.4473, + "step": 7590 + }, + { + "epoch": 3.697629100357259, + "grad_norm": 3.0123348236083984, + "learning_rate": 1.6279831835325175e-06, + "loss": 0.4319, + "step": 7591 + }, + { + "epoch": 3.698116271516726, + "grad_norm": 2.8889358043670654, + "learning_rate": 1.6273804683881746e-06, + "loss": 0.3756, + "step": 7592 + }, + { + "epoch": 3.698603442676194, + "grad_norm": 2.9083445072174072, + "learning_rate": 1.6267778109937111e-06, + "loss": 0.371, + "step": 7593 + }, + { + "epoch": 3.699090613835661, + "grad_norm": 3.4194953441619873, + "learning_rate": 1.626175211389012e-06, + "loss": 0.5221, + "step": 7594 + }, + { + "epoch": 3.699577784995128, + "grad_norm": 2.849522113800049, + "learning_rate": 1.625572669613955e-06, + "loss": 0.3749, + "step": 7595 + }, + { + "epoch": 3.7000649561545957, + "grad_norm": 3.207091808319092, + "learning_rate": 1.624970185708417e-06, + "loss": 0.4184, + "step": 7596 + }, + { + "epoch": 3.700552127314063, + "grad_norm": 2.813518524169922, + "learning_rate": 1.6243677597122705e-06, + "loss": 0.3547, + "step": 7597 + }, + { + "epoch": 3.7010392984735305, + "grad_norm": 3.217020034790039, + "learning_rate": 1.6237653916653856e-06, + "loss": 0.425, + "step": 7598 + }, + { + "epoch": 3.7015264696329977, + "grad_norm": 3.228830337524414, + "learning_rate": 1.6231630816076244e-06, + "loss": 0.4898, + "step": 7599 + }, + { + "epoch": 3.7020136407924653, + "grad_norm": 3.5944581031799316, + "learning_rate": 1.6225608295788486e-06, + "loss": 0.4373, + "step": 7600 + }, + { + "epoch": 3.7025008119519325, + "grad_norm": 2.9511756896972656, + "learning_rate": 1.6219586356189154e-06, + "loss": 0.3958, + "step": 7601 + }, + { + "epoch": 3.7029879831113997, + "grad_norm": 2.7343432903289795, + "learning_rate": 1.621356499767679e-06, + "loss": 0.4144, + "step": 7602 + }, + { + "epoch": 3.7034751542708673, + "grad_norm": 2.867464780807495, + "learning_rate": 1.6207544220649868e-06, + "loss": 0.4456, + "step": 7603 + }, + { + "epoch": 3.7039623254303344, + "grad_norm": 2.646960735321045, + "learning_rate": 1.6201524025506848e-06, + "loss": 0.4, + "step": 7604 + }, + { + "epoch": 3.704449496589802, + "grad_norm": 2.6288390159606934, + "learning_rate": 1.6195504412646159e-06, + "loss": 0.4452, + "step": 7605 + }, + { + "epoch": 3.704936667749269, + "grad_norm": 3.755241632461548, + "learning_rate": 1.6189485382466157e-06, + "loss": 0.4823, + "step": 7606 + }, + { + "epoch": 3.705423838908737, + "grad_norm": 2.910734176635742, + "learning_rate": 1.6183466935365195e-06, + "loss": 0.4751, + "step": 7607 + }, + { + "epoch": 3.705911010068204, + "grad_norm": 2.8179609775543213, + "learning_rate": 1.617744907174157e-06, + "loss": 0.3527, + "step": 7608 + }, + { + "epoch": 3.706398181227671, + "grad_norm": 3.0469777584075928, + "learning_rate": 1.6171431791993553e-06, + "loss": 0.3586, + "step": 7609 + }, + { + "epoch": 3.706885352387139, + "grad_norm": 4.083987712860107, + "learning_rate": 1.6165415096519348e-06, + "loss": 0.4472, + "step": 7610 + }, + { + "epoch": 3.707372523546606, + "grad_norm": 3.0828816890716553, + "learning_rate": 1.6159398985717154e-06, + "loss": 0.4537, + "step": 7611 + }, + { + "epoch": 3.707859694706073, + "grad_norm": 3.079228401184082, + "learning_rate": 1.6153383459985105e-06, + "loss": 0.5055, + "step": 7612 + }, + { + "epoch": 3.7083468658655407, + "grad_norm": 2.9508838653564453, + "learning_rate": 1.6147368519721324e-06, + "loss": 0.4147, + "step": 7613 + }, + { + "epoch": 3.7088340370250084, + "grad_norm": 2.8538544178009033, + "learning_rate": 1.6141354165323868e-06, + "loss": 0.4184, + "step": 7614 + }, + { + "epoch": 3.7093212081844755, + "grad_norm": 3.065559148788452, + "learning_rate": 1.6135340397190763e-06, + "loss": 0.4028, + "step": 7615 + }, + { + "epoch": 3.7098083793439427, + "grad_norm": 3.384873867034912, + "learning_rate": 1.6129327215720003e-06, + "loss": 0.4349, + "step": 7616 + }, + { + "epoch": 3.7102955505034103, + "grad_norm": 3.114579439163208, + "learning_rate": 1.6123314621309555e-06, + "loss": 0.4684, + "step": 7617 + }, + { + "epoch": 3.7107827216628775, + "grad_norm": 3.0596539974212646, + "learning_rate": 1.6117302614357308e-06, + "loss": 0.4485, + "step": 7618 + }, + { + "epoch": 3.7112698928223447, + "grad_norm": 3.030611753463745, + "learning_rate": 1.6111291195261148e-06, + "loss": 0.4599, + "step": 7619 + }, + { + "epoch": 3.7117570639818123, + "grad_norm": 4.910580635070801, + "learning_rate": 1.6105280364418907e-06, + "loss": 0.3432, + "step": 7620 + }, + { + "epoch": 3.71224423514128, + "grad_norm": 2.953813314437866, + "learning_rate": 1.6099270122228395e-06, + "loss": 0.3633, + "step": 7621 + }, + { + "epoch": 3.712731406300747, + "grad_norm": 3.2420878410339355, + "learning_rate": 1.6093260469087342e-06, + "loss": 0.469, + "step": 7622 + }, + { + "epoch": 3.713218577460214, + "grad_norm": 3.043346643447876, + "learning_rate": 1.608725140539349e-06, + "loss": 0.4155, + "step": 7623 + }, + { + "epoch": 3.713705748619682, + "grad_norm": 2.7903096675872803, + "learning_rate": 1.6081242931544506e-06, + "loss": 0.4377, + "step": 7624 + }, + { + "epoch": 3.714192919779149, + "grad_norm": 3.1451144218444824, + "learning_rate": 1.6075235047938043e-06, + "loss": 0.4374, + "step": 7625 + }, + { + "epoch": 3.714680090938616, + "grad_norm": 2.8861372470855713, + "learning_rate": 1.6069227754971683e-06, + "loss": 0.4286, + "step": 7626 + }, + { + "epoch": 3.715167262098084, + "grad_norm": 2.756500005722046, + "learning_rate": 1.6063221053043002e-06, + "loss": 0.3534, + "step": 7627 + }, + { + "epoch": 3.7156544332575514, + "grad_norm": 3.155569076538086, + "learning_rate": 1.6057214942549516e-06, + "loss": 0.4314, + "step": 7628 + }, + { + "epoch": 3.7161416044170186, + "grad_norm": 3.161106586456299, + "learning_rate": 1.6051209423888724e-06, + "loss": 0.4525, + "step": 7629 + }, + { + "epoch": 3.7166287755764857, + "grad_norm": 2.711552381515503, + "learning_rate": 1.6045204497458051e-06, + "loss": 0.394, + "step": 7630 + }, + { + "epoch": 3.7171159467359534, + "grad_norm": 3.308797597885132, + "learning_rate": 1.6039200163654909e-06, + "loss": 0.3997, + "step": 7631 + }, + { + "epoch": 3.7176031178954205, + "grad_norm": 2.8732573986053467, + "learning_rate": 1.6033196422876669e-06, + "loss": 0.3967, + "step": 7632 + }, + { + "epoch": 3.7180902890548877, + "grad_norm": 2.854794979095459, + "learning_rate": 1.6027193275520658e-06, + "loss": 0.4664, + "step": 7633 + }, + { + "epoch": 3.7185774602143553, + "grad_norm": 3.057246208190918, + "learning_rate": 1.6021190721984158e-06, + "loss": 0.4655, + "step": 7634 + }, + { + "epoch": 3.719064631373823, + "grad_norm": 3.023790121078491, + "learning_rate": 1.6015188762664413e-06, + "loss": 0.4338, + "step": 7635 + }, + { + "epoch": 3.71955180253329, + "grad_norm": 3.244497060775757, + "learning_rate": 1.6009187397958642e-06, + "loss": 0.4924, + "step": 7636 + }, + { + "epoch": 3.7200389736927573, + "grad_norm": 3.4847280979156494, + "learning_rate": 1.6003186628264023e-06, + "loss": 0.4123, + "step": 7637 + }, + { + "epoch": 3.720526144852225, + "grad_norm": 2.967583179473877, + "learning_rate": 1.5997186453977663e-06, + "loss": 0.4384, + "step": 7638 + }, + { + "epoch": 3.721013316011692, + "grad_norm": 2.9519577026367188, + "learning_rate": 1.5991186875496668e-06, + "loss": 0.3987, + "step": 7639 + }, + { + "epoch": 3.721500487171159, + "grad_norm": 3.0020499229431152, + "learning_rate": 1.5985187893218082e-06, + "loss": 0.433, + "step": 7640 + }, + { + "epoch": 3.721987658330627, + "grad_norm": 3.1785645484924316, + "learning_rate": 1.5979189507538933e-06, + "loss": 0.4634, + "step": 7641 + }, + { + "epoch": 3.7224748294900945, + "grad_norm": 3.3139736652374268, + "learning_rate": 1.5973191718856173e-06, + "loss": 0.4201, + "step": 7642 + }, + { + "epoch": 3.7229620006495616, + "grad_norm": 3.112790584564209, + "learning_rate": 1.5967194527566748e-06, + "loss": 0.4777, + "step": 7643 + }, + { + "epoch": 3.723449171809029, + "grad_norm": 3.3636233806610107, + "learning_rate": 1.5961197934067543e-06, + "loss": 0.487, + "step": 7644 + }, + { + "epoch": 3.7239363429684964, + "grad_norm": 2.912043333053589, + "learning_rate": 1.595520193875543e-06, + "loss": 0.366, + "step": 7645 + }, + { + "epoch": 3.7244235141279636, + "grad_norm": 3.2575294971466064, + "learning_rate": 1.5949206542027201e-06, + "loss": 0.4448, + "step": 7646 + }, + { + "epoch": 3.7249106852874307, + "grad_norm": 3.2594897747039795, + "learning_rate": 1.5943211744279635e-06, + "loss": 0.417, + "step": 7647 + }, + { + "epoch": 3.7253978564468984, + "grad_norm": 2.915203094482422, + "learning_rate": 1.5937217545909478e-06, + "loss": 0.425, + "step": 7648 + }, + { + "epoch": 3.725885027606366, + "grad_norm": 3.1763885021209717, + "learning_rate": 1.5931223947313428e-06, + "loss": 0.5121, + "step": 7649 + }, + { + "epoch": 3.726372198765833, + "grad_norm": 3.404231071472168, + "learning_rate": 1.5925230948888123e-06, + "loss": 0.5044, + "step": 7650 + }, + { + "epoch": 3.7268593699253003, + "grad_norm": 3.1476879119873047, + "learning_rate": 1.5919238551030187e-06, + "loss": 0.4386, + "step": 7651 + }, + { + "epoch": 3.727346541084768, + "grad_norm": 3.0741844177246094, + "learning_rate": 1.5913246754136205e-06, + "loss": 0.4163, + "step": 7652 + }, + { + "epoch": 3.727833712244235, + "grad_norm": 2.774064540863037, + "learning_rate": 1.5907255558602707e-06, + "loss": 0.4176, + "step": 7653 + }, + { + "epoch": 3.7283208834037023, + "grad_norm": 3.2709591388702393, + "learning_rate": 1.5901264964826184e-06, + "loss": 0.4249, + "step": 7654 + }, + { + "epoch": 3.72880805456317, + "grad_norm": 2.9974920749664307, + "learning_rate": 1.58952749732031e-06, + "loss": 0.4493, + "step": 7655 + }, + { + "epoch": 3.7292952257226375, + "grad_norm": 2.828796625137329, + "learning_rate": 1.5889285584129876e-06, + "loss": 0.459, + "step": 7656 + }, + { + "epoch": 3.7297823968821047, + "grad_norm": 3.097978353500366, + "learning_rate": 1.5883296798002876e-06, + "loss": 0.4251, + "step": 7657 + }, + { + "epoch": 3.730269568041572, + "grad_norm": 3.2411837577819824, + "learning_rate": 1.587730861521845e-06, + "loss": 0.5042, + "step": 7658 + }, + { + "epoch": 3.7307567392010395, + "grad_norm": 3.041897773742676, + "learning_rate": 1.5871321036172885e-06, + "loss": 0.4346, + "step": 7659 + }, + { + "epoch": 3.7312439103605066, + "grad_norm": 3.4255568981170654, + "learning_rate": 1.5865334061262454e-06, + "loss": 0.5194, + "step": 7660 + }, + { + "epoch": 3.731731081519974, + "grad_norm": 3.1190249919891357, + "learning_rate": 1.5859347690883353e-06, + "loss": 0.5151, + "step": 7661 + }, + { + "epoch": 3.7322182526794414, + "grad_norm": 2.9931249618530273, + "learning_rate": 1.5853361925431773e-06, + "loss": 0.4796, + "step": 7662 + }, + { + "epoch": 3.7327054238389086, + "grad_norm": 2.926821708679199, + "learning_rate": 1.5847376765303848e-06, + "loss": 0.4216, + "step": 7663 + }, + { + "epoch": 3.733192594998376, + "grad_norm": 3.115840196609497, + "learning_rate": 1.5841392210895687e-06, + "loss": 0.3592, + "step": 7664 + }, + { + "epoch": 3.7336797661578434, + "grad_norm": 4.480978488922119, + "learning_rate": 1.5835408262603324e-06, + "loss": 0.4508, + "step": 7665 + }, + { + "epoch": 3.734166937317311, + "grad_norm": 3.1380348205566406, + "learning_rate": 1.5829424920822792e-06, + "loss": 0.451, + "step": 7666 + }, + { + "epoch": 3.734654108476778, + "grad_norm": 3.210757255554199, + "learning_rate": 1.5823442185950063e-06, + "loss": 0.4913, + "step": 7667 + }, + { + "epoch": 3.7351412796362453, + "grad_norm": 2.7214484214782715, + "learning_rate": 1.5817460058381088e-06, + "loss": 0.462, + "step": 7668 + }, + { + "epoch": 3.735628450795713, + "grad_norm": 2.878943920135498, + "learning_rate": 1.5811478538511737e-06, + "loss": 0.4069, + "step": 7669 + }, + { + "epoch": 3.73611562195518, + "grad_norm": 3.0367445945739746, + "learning_rate": 1.5805497626737885e-06, + "loss": 0.4164, + "step": 7670 + }, + { + "epoch": 3.7366027931146477, + "grad_norm": 3.0961194038391113, + "learning_rate": 1.5799517323455343e-06, + "loss": 0.3848, + "step": 7671 + }, + { + "epoch": 3.737089964274115, + "grad_norm": 2.969615936279297, + "learning_rate": 1.5793537629059897e-06, + "loss": 0.4158, + "step": 7672 + }, + { + "epoch": 3.7375771354335825, + "grad_norm": 2.923771619796753, + "learning_rate": 1.5787558543947262e-06, + "loss": 0.4565, + "step": 7673 + }, + { + "epoch": 3.7380643065930497, + "grad_norm": 3.3920440673828125, + "learning_rate": 1.5781580068513146e-06, + "loss": 0.4413, + "step": 7674 + }, + { + "epoch": 3.738551477752517, + "grad_norm": 3.1647539138793945, + "learning_rate": 1.5775602203153206e-06, + "loss": 0.5225, + "step": 7675 + }, + { + "epoch": 3.7390386489119845, + "grad_norm": 3.0245649814605713, + "learning_rate": 1.5769624948263056e-06, + "loss": 0.468, + "step": 7676 + }, + { + "epoch": 3.7395258200714516, + "grad_norm": 2.883899450302124, + "learning_rate": 1.576364830423826e-06, + "loss": 0.4741, + "step": 7677 + }, + { + "epoch": 3.7400129912309192, + "grad_norm": 3.177891969680786, + "learning_rate": 1.5757672271474367e-06, + "loss": 0.3835, + "step": 7678 + }, + { + "epoch": 3.7405001623903864, + "grad_norm": 3.072892189025879, + "learning_rate": 1.5751696850366854e-06, + "loss": 0.4489, + "step": 7679 + }, + { + "epoch": 3.740987333549854, + "grad_norm": 3.196406364440918, + "learning_rate": 1.574572204131119e-06, + "loss": 0.4322, + "step": 7680 + }, + { + "epoch": 3.741474504709321, + "grad_norm": 2.9535818099975586, + "learning_rate": 1.5739747844702779e-06, + "loss": 0.3871, + "step": 7681 + }, + { + "epoch": 3.7419616758687884, + "grad_norm": 2.9410324096679688, + "learning_rate": 1.5733774260936985e-06, + "loss": 0.4016, + "step": 7682 + }, + { + "epoch": 3.742448847028256, + "grad_norm": 3.110713005065918, + "learning_rate": 1.572780129040915e-06, + "loss": 0.3927, + "step": 7683 + }, + { + "epoch": 3.742936018187723, + "grad_norm": 3.2974867820739746, + "learning_rate": 1.5721828933514573e-06, + "loss": 0.4309, + "step": 7684 + }, + { + "epoch": 3.7434231893471908, + "grad_norm": 2.825032949447632, + "learning_rate": 1.5715857190648485e-06, + "loss": 0.3796, + "step": 7685 + }, + { + "epoch": 3.743910360506658, + "grad_norm": 3.194899797439575, + "learning_rate": 1.57098860622061e-06, + "loss": 0.487, + "step": 7686 + }, + { + "epoch": 3.7443975316661255, + "grad_norm": 2.7252378463745117, + "learning_rate": 1.5703915548582594e-06, + "loss": 0.4459, + "step": 7687 + }, + { + "epoch": 3.7448847028255927, + "grad_norm": 3.2377138137817383, + "learning_rate": 1.5697945650173096e-06, + "loss": 0.437, + "step": 7688 + }, + { + "epoch": 3.74537187398506, + "grad_norm": 2.924211263656616, + "learning_rate": 1.5691976367372683e-06, + "loss": 0.4458, + "step": 7689 + }, + { + "epoch": 3.7458590451445275, + "grad_norm": 3.2741873264312744, + "learning_rate": 1.5686007700576405e-06, + "loss": 0.4675, + "step": 7690 + }, + { + "epoch": 3.7463462163039947, + "grad_norm": 3.2849550247192383, + "learning_rate": 1.5680039650179272e-06, + "loss": 0.4405, + "step": 7691 + }, + { + "epoch": 3.7468333874634623, + "grad_norm": 3.0003175735473633, + "learning_rate": 1.5674072216576256e-06, + "loss": 0.4448, + "step": 7692 + }, + { + "epoch": 3.7473205586229295, + "grad_norm": 2.928877115249634, + "learning_rate": 1.5668105400162263e-06, + "loss": 0.4184, + "step": 7693 + }, + { + "epoch": 3.747807729782397, + "grad_norm": 2.862079381942749, + "learning_rate": 1.5662139201332182e-06, + "loss": 0.4545, + "step": 7694 + }, + { + "epoch": 3.7482949009418642, + "grad_norm": 3.1729440689086914, + "learning_rate": 1.5656173620480862e-06, + "loss": 0.4546, + "step": 7695 + }, + { + "epoch": 3.7487820721013314, + "grad_norm": 2.953535795211792, + "learning_rate": 1.5650208658003113e-06, + "loss": 0.3838, + "step": 7696 + }, + { + "epoch": 3.749269243260799, + "grad_norm": 3.8887834548950195, + "learning_rate": 1.5644244314293672e-06, + "loss": 0.4063, + "step": 7697 + }, + { + "epoch": 3.749756414420266, + "grad_norm": 2.8845818042755127, + "learning_rate": 1.563828058974728e-06, + "loss": 0.4441, + "step": 7698 + }, + { + "epoch": 3.750243585579734, + "grad_norm": 2.5913357734680176, + "learning_rate": 1.5632317484758596e-06, + "loss": 0.3865, + "step": 7699 + }, + { + "epoch": 3.750730756739201, + "grad_norm": 3.092327117919922, + "learning_rate": 1.5626354999722276e-06, + "loss": 0.456, + "step": 7700 + }, + { + "epoch": 3.7512179278986686, + "grad_norm": 2.9826114177703857, + "learning_rate": 1.5620393135032913e-06, + "loss": 0.4358, + "step": 7701 + }, + { + "epoch": 3.7517050990581358, + "grad_norm": 2.8611624240875244, + "learning_rate": 1.5614431891085048e-06, + "loss": 0.4257, + "step": 7702 + }, + { + "epoch": 3.752192270217603, + "grad_norm": 3.3699984550476074, + "learning_rate": 1.560847126827321e-06, + "loss": 0.3586, + "step": 7703 + }, + { + "epoch": 3.7526794413770705, + "grad_norm": 3.1255533695220947, + "learning_rate": 1.5602511266991877e-06, + "loss": 0.4237, + "step": 7704 + }, + { + "epoch": 3.7531666125365377, + "grad_norm": 2.7619566917419434, + "learning_rate": 1.5596551887635464e-06, + "loss": 0.3608, + "step": 7705 + }, + { + "epoch": 3.7536537836960053, + "grad_norm": 2.921088457107544, + "learning_rate": 1.5590593130598373e-06, + "loss": 0.433, + "step": 7706 + }, + { + "epoch": 3.7541409548554725, + "grad_norm": 2.700742483139038, + "learning_rate": 1.558463499627496e-06, + "loss": 0.4202, + "step": 7707 + }, + { + "epoch": 3.75462812601494, + "grad_norm": 3.186877965927124, + "learning_rate": 1.5578677485059519e-06, + "loss": 0.438, + "step": 7708 + }, + { + "epoch": 3.7551152971744073, + "grad_norm": 3.258312940597534, + "learning_rate": 1.5572720597346322e-06, + "loss": 0.4866, + "step": 7709 + }, + { + "epoch": 3.7556024683338745, + "grad_norm": 3.1685400009155273, + "learning_rate": 1.55667643335296e-06, + "loss": 0.4206, + "step": 7710 + }, + { + "epoch": 3.756089639493342, + "grad_norm": 3.234851360321045, + "learning_rate": 1.5560808694003543e-06, + "loss": 0.4541, + "step": 7711 + }, + { + "epoch": 3.7565768106528092, + "grad_norm": 3.257974147796631, + "learning_rate": 1.5554853679162279e-06, + "loss": 0.4298, + "step": 7712 + }, + { + "epoch": 3.757063981812277, + "grad_norm": 3.0056111812591553, + "learning_rate": 1.554889928939992e-06, + "loss": 0.4571, + "step": 7713 + }, + { + "epoch": 3.757551152971744, + "grad_norm": 3.0585737228393555, + "learning_rate": 1.5542945525110525e-06, + "loss": 0.4801, + "step": 7714 + }, + { + "epoch": 3.7580383241312116, + "grad_norm": 3.1381382942199707, + "learning_rate": 1.5536992386688126e-06, + "loss": 0.461, + "step": 7715 + }, + { + "epoch": 3.758525495290679, + "grad_norm": 3.074636220932007, + "learning_rate": 1.553103987452668e-06, + "loss": 0.519, + "step": 7716 + }, + { + "epoch": 3.759012666450146, + "grad_norm": 2.7303202152252197, + "learning_rate": 1.5525087989020133e-06, + "loss": 0.4325, + "step": 7717 + }, + { + "epoch": 3.7594998376096136, + "grad_norm": 3.1875345706939697, + "learning_rate": 1.5519136730562378e-06, + "loss": 0.4395, + "step": 7718 + }, + { + "epoch": 3.7599870087690808, + "grad_norm": 3.0812747478485107, + "learning_rate": 1.5513186099547285e-06, + "loss": 0.4346, + "step": 7719 + }, + { + "epoch": 3.7604741799285484, + "grad_norm": 3.203015089035034, + "learning_rate": 1.5507236096368638e-06, + "loss": 0.45, + "step": 7720 + }, + { + "epoch": 3.7609613510880155, + "grad_norm": 2.948901891708374, + "learning_rate": 1.550128672142023e-06, + "loss": 0.432, + "step": 7721 + }, + { + "epoch": 3.761448522247483, + "grad_norm": 3.070431709289551, + "learning_rate": 1.5495337975095776e-06, + "loss": 0.3909, + "step": 7722 + }, + { + "epoch": 3.7619356934069503, + "grad_norm": 3.1975321769714355, + "learning_rate": 1.5489389857788978e-06, + "loss": 0.4176, + "step": 7723 + }, + { + "epoch": 3.7624228645664175, + "grad_norm": 2.695573329925537, + "learning_rate": 1.548344236989347e-06, + "loss": 0.3863, + "step": 7724 + }, + { + "epoch": 3.762910035725885, + "grad_norm": 2.651211977005005, + "learning_rate": 1.5477495511802855e-06, + "loss": 0.428, + "step": 7725 + }, + { + "epoch": 3.7633972068853523, + "grad_norm": 3.124929904937744, + "learning_rate": 1.5471549283910702e-06, + "loss": 0.3974, + "step": 7726 + }, + { + "epoch": 3.76388437804482, + "grad_norm": 3.23445987701416, + "learning_rate": 1.5465603686610541e-06, + "loss": 0.4668, + "step": 7727 + }, + { + "epoch": 3.764371549204287, + "grad_norm": 2.92790150642395, + "learning_rate": 1.5459658720295828e-06, + "loss": 0.4093, + "step": 7728 + }, + { + "epoch": 3.7648587203637547, + "grad_norm": 2.8986172676086426, + "learning_rate": 1.5453714385360014e-06, + "loss": 0.4153, + "step": 7729 + }, + { + "epoch": 3.765345891523222, + "grad_norm": 3.623318672180176, + "learning_rate": 1.5447770682196488e-06, + "loss": 0.4949, + "step": 7730 + }, + { + "epoch": 3.765833062682689, + "grad_norm": 3.3329923152923584, + "learning_rate": 1.5441827611198626e-06, + "loss": 0.4528, + "step": 7731 + }, + { + "epoch": 3.7663202338421566, + "grad_norm": 2.970212697982788, + "learning_rate": 1.5435885172759706e-06, + "loss": 0.3815, + "step": 7732 + }, + { + "epoch": 3.766807405001624, + "grad_norm": 2.8142855167388916, + "learning_rate": 1.542994336727302e-06, + "loss": 0.3306, + "step": 7733 + }, + { + "epoch": 3.7672945761610914, + "grad_norm": 2.802112102508545, + "learning_rate": 1.542400219513179e-06, + "loss": 0.3907, + "step": 7734 + }, + { + "epoch": 3.7677817473205586, + "grad_norm": 3.0366570949554443, + "learning_rate": 1.5418061656729211e-06, + "loss": 0.4309, + "step": 7735 + }, + { + "epoch": 3.768268918480026, + "grad_norm": 3.3802361488342285, + "learning_rate": 1.5412121752458411e-06, + "loss": 0.4447, + "step": 7736 + }, + { + "epoch": 3.7687560896394934, + "grad_norm": 3.0076496601104736, + "learning_rate": 1.54061824827125e-06, + "loss": 0.4349, + "step": 7737 + }, + { + "epoch": 3.7692432607989605, + "grad_norm": 3.7115161418914795, + "learning_rate": 1.5400243847884539e-06, + "loss": 0.5006, + "step": 7738 + }, + { + "epoch": 3.769730431958428, + "grad_norm": 3.0791893005371094, + "learning_rate": 1.5394305848367557e-06, + "loss": 0.4434, + "step": 7739 + }, + { + "epoch": 3.7702176031178953, + "grad_norm": 3.1937062740325928, + "learning_rate": 1.5388368484554505e-06, + "loss": 0.4347, + "step": 7740 + }, + { + "epoch": 3.7707047742773625, + "grad_norm": 3.2141973972320557, + "learning_rate": 1.5382431756838336e-06, + "loss": 0.4593, + "step": 7741 + }, + { + "epoch": 3.77119194543683, + "grad_norm": 3.1589083671569824, + "learning_rate": 1.5376495665611944e-06, + "loss": 0.5515, + "step": 7742 + }, + { + "epoch": 3.7716791165962977, + "grad_norm": 3.424978494644165, + "learning_rate": 1.537056021126817e-06, + "loss": 0.5048, + "step": 7743 + }, + { + "epoch": 3.772166287755765, + "grad_norm": 2.9448282718658447, + "learning_rate": 1.536462539419982e-06, + "loss": 0.3914, + "step": 7744 + }, + { + "epoch": 3.772653458915232, + "grad_norm": 3.0355842113494873, + "learning_rate": 1.535869121479967e-06, + "loss": 0.4529, + "step": 7745 + }, + { + "epoch": 3.7731406300746997, + "grad_norm": 3.7161524295806885, + "learning_rate": 1.535275767346043e-06, + "loss": 0.4449, + "step": 7746 + }, + { + "epoch": 3.773627801234167, + "grad_norm": 3.429079532623291, + "learning_rate": 1.5346824770574803e-06, + "loss": 0.44, + "step": 7747 + }, + { + "epoch": 3.774114972393634, + "grad_norm": 3.3180713653564453, + "learning_rate": 1.53408925065354e-06, + "loss": 0.4258, + "step": 7748 + }, + { + "epoch": 3.7746021435531016, + "grad_norm": 3.042306661605835, + "learning_rate": 1.5334960881734834e-06, + "loss": 0.4359, + "step": 7749 + }, + { + "epoch": 3.7750893147125693, + "grad_norm": 3.094189167022705, + "learning_rate": 1.5329029896565656e-06, + "loss": 0.4618, + "step": 7750 + }, + { + "epoch": 3.7755764858720364, + "grad_norm": 3.289642572402954, + "learning_rate": 1.5323099551420387e-06, + "loss": 0.4164, + "step": 7751 + }, + { + "epoch": 3.7760636570315036, + "grad_norm": 2.8806610107421875, + "learning_rate": 1.5317169846691483e-06, + "loss": 0.4232, + "step": 7752 + }, + { + "epoch": 3.776550828190971, + "grad_norm": 3.438140630722046, + "learning_rate": 1.5311240782771373e-06, + "loss": 0.5063, + "step": 7753 + }, + { + "epoch": 3.7770379993504384, + "grad_norm": 2.770939826965332, + "learning_rate": 1.5305312360052444e-06, + "loss": 0.4027, + "step": 7754 + }, + { + "epoch": 3.7775251705099055, + "grad_norm": 3.2236690521240234, + "learning_rate": 1.5299384578927054e-06, + "loss": 0.4446, + "step": 7755 + }, + { + "epoch": 3.778012341669373, + "grad_norm": 3.2393221855163574, + "learning_rate": 1.529345743978748e-06, + "loss": 0.4837, + "step": 7756 + }, + { + "epoch": 3.778499512828841, + "grad_norm": 2.987550973892212, + "learning_rate": 1.5287530943025985e-06, + "loss": 0.4637, + "step": 7757 + }, + { + "epoch": 3.778986683988308, + "grad_norm": 2.9988481998443604, + "learning_rate": 1.528160508903479e-06, + "loss": 0.357, + "step": 7758 + }, + { + "epoch": 3.779473855147775, + "grad_norm": 2.9824843406677246, + "learning_rate": 1.5275679878206077e-06, + "loss": 0.3901, + "step": 7759 + }, + { + "epoch": 3.7799610263072427, + "grad_norm": 3.0836904048919678, + "learning_rate": 1.526975531093195e-06, + "loss": 0.4622, + "step": 7760 + }, + { + "epoch": 3.78044819746671, + "grad_norm": 2.9972753524780273, + "learning_rate": 1.5263831387604513e-06, + "loss": 0.4127, + "step": 7761 + }, + { + "epoch": 3.780935368626177, + "grad_norm": 3.351268768310547, + "learning_rate": 1.5257908108615818e-06, + "loss": 0.4078, + "step": 7762 + }, + { + "epoch": 3.7814225397856447, + "grad_norm": 3.28444504737854, + "learning_rate": 1.5251985474357846e-06, + "loss": 0.4598, + "step": 7763 + }, + { + "epoch": 3.7819097109451123, + "grad_norm": 3.327718496322632, + "learning_rate": 1.524606348522257e-06, + "loss": 0.4253, + "step": 7764 + }, + { + "epoch": 3.7823968821045795, + "grad_norm": 3.095791816711426, + "learning_rate": 1.5240142141601907e-06, + "loss": 0.4319, + "step": 7765 + }, + { + "epoch": 3.7828840532640466, + "grad_norm": 3.387449026107788, + "learning_rate": 1.523422144388773e-06, + "loss": 0.4186, + "step": 7766 + }, + { + "epoch": 3.7833712244235143, + "grad_norm": 3.0569896697998047, + "learning_rate": 1.5228301392471862e-06, + "loss": 0.3628, + "step": 7767 + }, + { + "epoch": 3.7838583955829814, + "grad_norm": 3.106145143508911, + "learning_rate": 1.5222381987746104e-06, + "loss": 0.4813, + "step": 7768 + }, + { + "epoch": 3.7843455667424486, + "grad_norm": 2.8333661556243896, + "learning_rate": 1.521646323010219e-06, + "loss": 0.437, + "step": 7769 + }, + { + "epoch": 3.784832737901916, + "grad_norm": 3.07450532913208, + "learning_rate": 1.5210545119931838e-06, + "loss": 0.4781, + "step": 7770 + }, + { + "epoch": 3.785319909061384, + "grad_norm": 3.054664373397827, + "learning_rate": 1.520462765762669e-06, + "loss": 0.4036, + "step": 7771 + }, + { + "epoch": 3.785807080220851, + "grad_norm": 3.155552625656128, + "learning_rate": 1.5198710843578369e-06, + "loss": 0.4061, + "step": 7772 + }, + { + "epoch": 3.786294251380318, + "grad_norm": 2.9414522647857666, + "learning_rate": 1.5192794678178452e-06, + "loss": 0.4241, + "step": 7773 + }, + { + "epoch": 3.786781422539786, + "grad_norm": 2.814432382583618, + "learning_rate": 1.5186879161818482e-06, + "loss": 0.433, + "step": 7774 + }, + { + "epoch": 3.787268593699253, + "grad_norm": 3.373624801635742, + "learning_rate": 1.5180964294889925e-06, + "loss": 0.4176, + "step": 7775 + }, + { + "epoch": 3.78775576485872, + "grad_norm": 3.0722391605377197, + "learning_rate": 1.5175050077784236e-06, + "loss": 0.4328, + "step": 7776 + }, + { + "epoch": 3.7882429360181877, + "grad_norm": 2.9537651538848877, + "learning_rate": 1.5169136510892818e-06, + "loss": 0.3925, + "step": 7777 + }, + { + "epoch": 3.7887301071776553, + "grad_norm": 3.6436030864715576, + "learning_rate": 1.516322359460704e-06, + "loss": 0.4293, + "step": 7778 + }, + { + "epoch": 3.7892172783371225, + "grad_norm": 3.1841442584991455, + "learning_rate": 1.5157311329318197e-06, + "loss": 0.4112, + "step": 7779 + }, + { + "epoch": 3.7897044494965897, + "grad_norm": 3.0677907466888428, + "learning_rate": 1.5151399715417576e-06, + "loss": 0.4582, + "step": 7780 + }, + { + "epoch": 3.7901916206560573, + "grad_norm": 3.3636879920959473, + "learning_rate": 1.5145488753296403e-06, + "loss": 0.3977, + "step": 7781 + }, + { + "epoch": 3.7906787918155245, + "grad_norm": 2.835350751876831, + "learning_rate": 1.5139578443345875e-06, + "loss": 0.4241, + "step": 7782 + }, + { + "epoch": 3.7911659629749916, + "grad_norm": 3.2585818767547607, + "learning_rate": 1.5133668785957122e-06, + "loss": 0.4468, + "step": 7783 + }, + { + "epoch": 3.7916531341344593, + "grad_norm": 3.73309588432312, + "learning_rate": 1.5127759781521248e-06, + "loss": 0.4778, + "step": 7784 + }, + { + "epoch": 3.792140305293927, + "grad_norm": 3.49433970451355, + "learning_rate": 1.5121851430429318e-06, + "loss": 0.4805, + "step": 7785 + }, + { + "epoch": 3.792627476453394, + "grad_norm": 2.9134578704833984, + "learning_rate": 1.5115943733072342e-06, + "loss": 0.4097, + "step": 7786 + }, + { + "epoch": 3.793114647612861, + "grad_norm": 2.9547038078308105, + "learning_rate": 1.511003668984128e-06, + "loss": 0.4325, + "step": 7787 + }, + { + "epoch": 3.793601818772329, + "grad_norm": 2.9561851024627686, + "learning_rate": 1.5104130301127077e-06, + "loss": 0.4105, + "step": 7788 + }, + { + "epoch": 3.794088989931796, + "grad_norm": 3.641265392303467, + "learning_rate": 1.5098224567320602e-06, + "loss": 0.4144, + "step": 7789 + }, + { + "epoch": 3.794576161091263, + "grad_norm": 3.197195291519165, + "learning_rate": 1.509231948881271e-06, + "loss": 0.4311, + "step": 7790 + }, + { + "epoch": 3.795063332250731, + "grad_norm": 3.069098949432373, + "learning_rate": 1.508641506599419e-06, + "loss": 0.4202, + "step": 7791 + }, + { + "epoch": 3.795550503410198, + "grad_norm": 2.5587127208709717, + "learning_rate": 1.5080511299255797e-06, + "loss": 0.3959, + "step": 7792 + }, + { + "epoch": 3.7960376745696656, + "grad_norm": 3.0077030658721924, + "learning_rate": 1.5074608188988234e-06, + "loss": 0.4846, + "step": 7793 + }, + { + "epoch": 3.7965248457291327, + "grad_norm": 2.6954407691955566, + "learning_rate": 1.5068705735582195e-06, + "loss": 0.4333, + "step": 7794 + }, + { + "epoch": 3.7970120168886003, + "grad_norm": 3.0608577728271484, + "learning_rate": 1.5062803939428273e-06, + "loss": 0.4716, + "step": 7795 + }, + { + "epoch": 3.7974991880480675, + "grad_norm": 2.841020107269287, + "learning_rate": 1.5056902800917059e-06, + "loss": 0.4002, + "step": 7796 + }, + { + "epoch": 3.7979863592075347, + "grad_norm": 2.8879501819610596, + "learning_rate": 1.5051002320439093e-06, + "loss": 0.4314, + "step": 7797 + }, + { + "epoch": 3.7984735303670023, + "grad_norm": 2.8729844093322754, + "learning_rate": 1.5045102498384878e-06, + "loss": 0.4172, + "step": 7798 + }, + { + "epoch": 3.7989607015264695, + "grad_norm": 3.5676205158233643, + "learning_rate": 1.5039203335144843e-06, + "loss": 0.4585, + "step": 7799 + }, + { + "epoch": 3.799447872685937, + "grad_norm": 2.6848154067993164, + "learning_rate": 1.5033304831109401e-06, + "loss": 0.3651, + "step": 7800 + }, + { + "epoch": 3.7999350438454043, + "grad_norm": 3.0618269443511963, + "learning_rate": 1.502740698666892e-06, + "loss": 0.4698, + "step": 7801 + }, + { + "epoch": 3.800422215004872, + "grad_norm": 2.975595474243164, + "learning_rate": 1.5021509802213723e-06, + "loss": 0.4298, + "step": 7802 + }, + { + "epoch": 3.800909386164339, + "grad_norm": 2.934424638748169, + "learning_rate": 1.5015613278134072e-06, + "loss": 0.4204, + "step": 7803 + }, + { + "epoch": 3.801396557323806, + "grad_norm": 2.9542927742004395, + "learning_rate": 1.5009717414820202e-06, + "loss": 0.391, + "step": 7804 + }, + { + "epoch": 3.801883728483274, + "grad_norm": 3.4049625396728516, + "learning_rate": 1.5003822212662303e-06, + "loss": 0.429, + "step": 7805 + }, + { + "epoch": 3.802370899642741, + "grad_norm": 2.8645217418670654, + "learning_rate": 1.4997927672050526e-06, + "loss": 0.4045, + "step": 7806 + }, + { + "epoch": 3.8028580708022086, + "grad_norm": 2.9303126335144043, + "learning_rate": 1.499203379337496e-06, + "loss": 0.4728, + "step": 7807 + }, + { + "epoch": 3.803345241961676, + "grad_norm": 3.0411734580993652, + "learning_rate": 1.498614057702567e-06, + "loss": 0.3533, + "step": 7808 + }, + { + "epoch": 3.8038324131211434, + "grad_norm": 3.00921893119812, + "learning_rate": 1.4980248023392658e-06, + "loss": 0.4095, + "step": 7809 + }, + { + "epoch": 3.8043195842806106, + "grad_norm": 3.0689237117767334, + "learning_rate": 1.4974356132865902e-06, + "loss": 0.4872, + "step": 7810 + }, + { + "epoch": 3.8048067554400777, + "grad_norm": 3.389204502105713, + "learning_rate": 1.4968464905835325e-06, + "loss": 0.4488, + "step": 7811 + }, + { + "epoch": 3.8052939265995454, + "grad_norm": 3.3024981021881104, + "learning_rate": 1.4962574342690803e-06, + "loss": 0.423, + "step": 7812 + }, + { + "epoch": 3.8057810977590125, + "grad_norm": 3.444566488265991, + "learning_rate": 1.4956684443822177e-06, + "loss": 0.4081, + "step": 7813 + }, + { + "epoch": 3.80626826891848, + "grad_norm": 3.038463592529297, + "learning_rate": 1.4950795209619246e-06, + "loss": 0.435, + "step": 7814 + }, + { + "epoch": 3.8067554400779473, + "grad_norm": 3.7227964401245117, + "learning_rate": 1.494490664047174e-06, + "loss": 0.5393, + "step": 7815 + }, + { + "epoch": 3.807242611237415, + "grad_norm": 2.9989676475524902, + "learning_rate": 1.493901873676938e-06, + "loss": 0.3949, + "step": 7816 + }, + { + "epoch": 3.807729782396882, + "grad_norm": 3.4248924255371094, + "learning_rate": 1.4933131498901833e-06, + "loss": 0.4627, + "step": 7817 + }, + { + "epoch": 3.8082169535563493, + "grad_norm": 3.4101617336273193, + "learning_rate": 1.4927244927258696e-06, + "loss": 0.41, + "step": 7818 + }, + { + "epoch": 3.808704124715817, + "grad_norm": 2.730227470397949, + "learning_rate": 1.492135902222955e-06, + "loss": 0.38, + "step": 7819 + }, + { + "epoch": 3.809191295875284, + "grad_norm": 2.947181224822998, + "learning_rate": 1.4915473784203926e-06, + "loss": 0.4175, + "step": 7820 + }, + { + "epoch": 3.8096784670347517, + "grad_norm": 3.081657648086548, + "learning_rate": 1.4909589213571321e-06, + "loss": 0.4879, + "step": 7821 + }, + { + "epoch": 3.810165638194219, + "grad_norm": 2.8939051628112793, + "learning_rate": 1.490370531072115e-06, + "loss": 0.4303, + "step": 7822 + }, + { + "epoch": 3.8106528093536864, + "grad_norm": 3.574436902999878, + "learning_rate": 1.489782207604282e-06, + "loss": 0.4068, + "step": 7823 + }, + { + "epoch": 3.8111399805131536, + "grad_norm": 2.9912450313568115, + "learning_rate": 1.4891939509925684e-06, + "loss": 0.418, + "step": 7824 + }, + { + "epoch": 3.811627151672621, + "grad_norm": 3.2242891788482666, + "learning_rate": 1.4886057612759064e-06, + "loss": 0.4355, + "step": 7825 + }, + { + "epoch": 3.8121143228320884, + "grad_norm": 3.2282702922821045, + "learning_rate": 1.4880176384932197e-06, + "loss": 0.4346, + "step": 7826 + }, + { + "epoch": 3.8126014939915556, + "grad_norm": 3.2877843379974365, + "learning_rate": 1.4874295826834312e-06, + "loss": 0.461, + "step": 7827 + }, + { + "epoch": 3.813088665151023, + "grad_norm": 3.074786901473999, + "learning_rate": 1.4868415938854591e-06, + "loss": 0.4043, + "step": 7828 + }, + { + "epoch": 3.8135758363104904, + "grad_norm": 3.060413360595703, + "learning_rate": 1.4862536721382164e-06, + "loss": 0.4131, + "step": 7829 + }, + { + "epoch": 3.814063007469958, + "grad_norm": 3.105736017227173, + "learning_rate": 1.485665817480611e-06, + "loss": 0.4331, + "step": 7830 + }, + { + "epoch": 3.814550178629425, + "grad_norm": 3.3588502407073975, + "learning_rate": 1.4850780299515467e-06, + "loss": 0.4593, + "step": 7831 + }, + { + "epoch": 3.8150373497888923, + "grad_norm": 3.259681463241577, + "learning_rate": 1.4844903095899248e-06, + "loss": 0.4521, + "step": 7832 + }, + { + "epoch": 3.81552452094836, + "grad_norm": 3.2412972450256348, + "learning_rate": 1.4839026564346393e-06, + "loss": 0.4993, + "step": 7833 + }, + { + "epoch": 3.816011692107827, + "grad_norm": 3.1562392711639404, + "learning_rate": 1.4833150705245816e-06, + "loss": 0.3883, + "step": 7834 + }, + { + "epoch": 3.8164988632672947, + "grad_norm": 2.9957971572875977, + "learning_rate": 1.4827275518986373e-06, + "loss": 0.4326, + "step": 7835 + }, + { + "epoch": 3.816986034426762, + "grad_norm": 2.663252353668213, + "learning_rate": 1.482140100595689e-06, + "loss": 0.3965, + "step": 7836 + }, + { + "epoch": 3.8174732055862295, + "grad_norm": 2.8283238410949707, + "learning_rate": 1.4815527166546147e-06, + "loss": 0.4878, + "step": 7837 + }, + { + "epoch": 3.8179603767456967, + "grad_norm": 3.383478879928589, + "learning_rate": 1.4809654001142863e-06, + "loss": 0.391, + "step": 7838 + }, + { + "epoch": 3.818447547905164, + "grad_norm": 2.6385602951049805, + "learning_rate": 1.4803781510135723e-06, + "loss": 0.4129, + "step": 7839 + }, + { + "epoch": 3.8189347190646314, + "grad_norm": 3.254030704498291, + "learning_rate": 1.4797909693913377e-06, + "loss": 0.4751, + "step": 7840 + }, + { + "epoch": 3.8194218902240986, + "grad_norm": 2.996774911880493, + "learning_rate": 1.4792038552864427e-06, + "loss": 0.4315, + "step": 7841 + }, + { + "epoch": 3.8199090613835662, + "grad_norm": 3.0521695613861084, + "learning_rate": 1.4786168087377404e-06, + "loss": 0.4196, + "step": 7842 + }, + { + "epoch": 3.8203962325430334, + "grad_norm": 2.8332958221435547, + "learning_rate": 1.4780298297840825e-06, + "loss": 0.4828, + "step": 7843 + }, + { + "epoch": 3.820883403702501, + "grad_norm": 3.097383975982666, + "learning_rate": 1.4774429184643154e-06, + "loss": 0.427, + "step": 7844 + }, + { + "epoch": 3.821370574861968, + "grad_norm": 3.176569938659668, + "learning_rate": 1.4768560748172817e-06, + "loss": 0.4781, + "step": 7845 + }, + { + "epoch": 3.8218577460214354, + "grad_norm": 2.972285032272339, + "learning_rate": 1.4762692988818167e-06, + "loss": 0.4584, + "step": 7846 + }, + { + "epoch": 3.822344917180903, + "grad_norm": 3.226297378540039, + "learning_rate": 1.4756825906967543e-06, + "loss": 0.4494, + "step": 7847 + }, + { + "epoch": 3.82283208834037, + "grad_norm": 3.0768818855285645, + "learning_rate": 1.4750959503009226e-06, + "loss": 0.437, + "step": 7848 + }, + { + "epoch": 3.8233192594998378, + "grad_norm": 3.474822521209717, + "learning_rate": 1.4745093777331465e-06, + "loss": 0.4606, + "step": 7849 + }, + { + "epoch": 3.823806430659305, + "grad_norm": 2.9268980026245117, + "learning_rate": 1.473922873032243e-06, + "loss": 0.411, + "step": 7850 + }, + { + "epoch": 3.8242936018187725, + "grad_norm": 2.833102226257324, + "learning_rate": 1.4733364362370284e-06, + "loss": 0.4439, + "step": 7851 + }, + { + "epoch": 3.8247807729782397, + "grad_norm": 2.9723448753356934, + "learning_rate": 1.4727500673863134e-06, + "loss": 0.4198, + "step": 7852 + }, + { + "epoch": 3.825267944137707, + "grad_norm": 2.884549856185913, + "learning_rate": 1.4721637665189033e-06, + "loss": 0.4633, + "step": 7853 + }, + { + "epoch": 3.8257551152971745, + "grad_norm": 2.9873335361480713, + "learning_rate": 1.4715775336735989e-06, + "loss": 0.4389, + "step": 7854 + }, + { + "epoch": 3.8262422864566417, + "grad_norm": 3.1541850566864014, + "learning_rate": 1.470991368889198e-06, + "loss": 0.4349, + "step": 7855 + }, + { + "epoch": 3.8267294576161093, + "grad_norm": 3.3774333000183105, + "learning_rate": 1.470405272204492e-06, + "loss": 0.448, + "step": 7856 + }, + { + "epoch": 3.8272166287755764, + "grad_norm": 2.76275372505188, + "learning_rate": 1.4698192436582703e-06, + "loss": 0.3982, + "step": 7857 + }, + { + "epoch": 3.827703799935044, + "grad_norm": 2.780426502227783, + "learning_rate": 1.469233283289314e-06, + "loss": 0.4061, + "step": 7858 + }, + { + "epoch": 3.8281909710945112, + "grad_norm": 3.3645503520965576, + "learning_rate": 1.4686473911364033e-06, + "loss": 0.4174, + "step": 7859 + }, + { + "epoch": 3.8286781422539784, + "grad_norm": 3.0797951221466064, + "learning_rate": 1.4680615672383119e-06, + "loss": 0.4496, + "step": 7860 + }, + { + "epoch": 3.829165313413446, + "grad_norm": 2.675617218017578, + "learning_rate": 1.4674758116338112e-06, + "loss": 0.4284, + "step": 7861 + }, + { + "epoch": 3.829652484572913, + "grad_norm": 2.811264753341675, + "learning_rate": 1.4668901243616636e-06, + "loss": 0.3952, + "step": 7862 + }, + { + "epoch": 3.830139655732381, + "grad_norm": 2.879026412963867, + "learning_rate": 1.4663045054606317e-06, + "loss": 0.4395, + "step": 7863 + }, + { + "epoch": 3.830626826891848, + "grad_norm": 2.533405303955078, + "learning_rate": 1.4657189549694726e-06, + "loss": 0.3436, + "step": 7864 + }, + { + "epoch": 3.8311139980513156, + "grad_norm": 3.2404086589813232, + "learning_rate": 1.4651334729269353e-06, + "loss": 0.4032, + "step": 7865 + }, + { + "epoch": 3.8316011692107828, + "grad_norm": 3.097702741622925, + "learning_rate": 1.4645480593717688e-06, + "loss": 0.4463, + "step": 7866 + }, + { + "epoch": 3.83208834037025, + "grad_norm": 2.9251492023468018, + "learning_rate": 1.4639627143427148e-06, + "loss": 0.401, + "step": 7867 + }, + { + "epoch": 3.8325755115297175, + "grad_norm": 3.239361047744751, + "learning_rate": 1.4633774378785134e-06, + "loss": 0.4377, + "step": 7868 + }, + { + "epoch": 3.8330626826891847, + "grad_norm": 3.4612603187561035, + "learning_rate": 1.4627922300178955e-06, + "loss": 0.4722, + "step": 7869 + }, + { + "epoch": 3.8335498538486523, + "grad_norm": 2.942763090133667, + "learning_rate": 1.462207090799591e-06, + "loss": 0.3805, + "step": 7870 + }, + { + "epoch": 3.8340370250081195, + "grad_norm": 3.047434091567993, + "learning_rate": 1.461622020262325e-06, + "loss": 0.4674, + "step": 7871 + }, + { + "epoch": 3.834524196167587, + "grad_norm": 2.822443723678589, + "learning_rate": 1.4610370184448175e-06, + "loss": 0.4077, + "step": 7872 + }, + { + "epoch": 3.8350113673270543, + "grad_norm": 3.5074684619903564, + "learning_rate": 1.4604520853857828e-06, + "loss": 0.4741, + "step": 7873 + }, + { + "epoch": 3.8354985384865214, + "grad_norm": 2.8313560485839844, + "learning_rate": 1.4598672211239321e-06, + "loss": 0.4355, + "step": 7874 + }, + { + "epoch": 3.835985709645989, + "grad_norm": 3.036362409591675, + "learning_rate": 1.4592824256979723e-06, + "loss": 0.3739, + "step": 7875 + }, + { + "epoch": 3.8364728808054562, + "grad_norm": 2.8627076148986816, + "learning_rate": 1.458697699146605e-06, + "loss": 0.4662, + "step": 7876 + }, + { + "epoch": 3.8369600519649234, + "grad_norm": 2.982013463973999, + "learning_rate": 1.4581130415085264e-06, + "loss": 0.4492, + "step": 7877 + }, + { + "epoch": 3.837447223124391, + "grad_norm": 2.9565775394439697, + "learning_rate": 1.4575284528224295e-06, + "loss": 0.4173, + "step": 7878 + }, + { + "epoch": 3.8379343942838586, + "grad_norm": 3.2190053462982178, + "learning_rate": 1.4569439331270024e-06, + "loss": 0.43, + "step": 7879 + }, + { + "epoch": 3.838421565443326, + "grad_norm": 2.8498568534851074, + "learning_rate": 1.45635948246093e-06, + "loss": 0.3904, + "step": 7880 + }, + { + "epoch": 3.838908736602793, + "grad_norm": 2.8812615871429443, + "learning_rate": 1.4557751008628888e-06, + "loss": 0.4082, + "step": 7881 + }, + { + "epoch": 3.8393959077622606, + "grad_norm": 3.1175830364227295, + "learning_rate": 1.4551907883715539e-06, + "loss": 0.4177, + "step": 7882 + }, + { + "epoch": 3.8398830789217278, + "grad_norm": 2.933824300765991, + "learning_rate": 1.4546065450255956e-06, + "loss": 0.4437, + "step": 7883 + }, + { + "epoch": 3.840370250081195, + "grad_norm": 3.2531697750091553, + "learning_rate": 1.4540223708636797e-06, + "loss": 0.4822, + "step": 7884 + }, + { + "epoch": 3.8408574212406625, + "grad_norm": 2.8176815509796143, + "learning_rate": 1.453438265924465e-06, + "loss": 0.3908, + "step": 7885 + }, + { + "epoch": 3.84134459240013, + "grad_norm": 3.103785991668701, + "learning_rate": 1.452854230246608e-06, + "loss": 0.4324, + "step": 7886 + }, + { + "epoch": 3.8418317635595973, + "grad_norm": 3.6276118755340576, + "learning_rate": 1.4522702638687611e-06, + "loss": 0.4465, + "step": 7887 + }, + { + "epoch": 3.8423189347190645, + "grad_norm": 2.965442419052124, + "learning_rate": 1.451686366829571e-06, + "loss": 0.3854, + "step": 7888 + }, + { + "epoch": 3.842806105878532, + "grad_norm": 2.9453437328338623, + "learning_rate": 1.4511025391676792e-06, + "loss": 0.4214, + "step": 7889 + }, + { + "epoch": 3.8432932770379993, + "grad_norm": 3.1656417846679688, + "learning_rate": 1.4505187809217231e-06, + "loss": 0.4419, + "step": 7890 + }, + { + "epoch": 3.8437804481974664, + "grad_norm": 3.098186731338501, + "learning_rate": 1.4499350921303369e-06, + "loss": 0.4757, + "step": 7891 + }, + { + "epoch": 3.844267619356934, + "grad_norm": 3.3902125358581543, + "learning_rate": 1.4493514728321489e-06, + "loss": 0.4106, + "step": 7892 + }, + { + "epoch": 3.8447547905164017, + "grad_norm": 3.0175790786743164, + "learning_rate": 1.4487679230657819e-06, + "loss": 0.429, + "step": 7893 + }, + { + "epoch": 3.845241961675869, + "grad_norm": 3.354400157928467, + "learning_rate": 1.448184442869856e-06, + "loss": 0.4173, + "step": 7894 + }, + { + "epoch": 3.845729132835336, + "grad_norm": 3.2377769947052, + "learning_rate": 1.4476010322829853e-06, + "loss": 0.4383, + "step": 7895 + }, + { + "epoch": 3.8462163039948036, + "grad_norm": 3.2990474700927734, + "learning_rate": 1.4470176913437816e-06, + "loss": 0.4249, + "step": 7896 + }, + { + "epoch": 3.846703475154271, + "grad_norm": 2.8390448093414307, + "learning_rate": 1.446434420090848e-06, + "loss": 0.4446, + "step": 7897 + }, + { + "epoch": 3.847190646313738, + "grad_norm": 3.4437077045440674, + "learning_rate": 1.4458512185627865e-06, + "loss": 0.4023, + "step": 7898 + }, + { + "epoch": 3.8476778174732056, + "grad_norm": 2.9170989990234375, + "learning_rate": 1.445268086798193e-06, + "loss": 0.3737, + "step": 7899 + }, + { + "epoch": 3.848164988632673, + "grad_norm": 3.0699844360351562, + "learning_rate": 1.4446850248356603e-06, + "loss": 0.4885, + "step": 7900 + }, + { + "epoch": 3.8486521597921404, + "grad_norm": 7.71252965927124, + "learning_rate": 1.4441020327137733e-06, + "loss": 0.4716, + "step": 7901 + }, + { + "epoch": 3.8491393309516075, + "grad_norm": 3.116717576980591, + "learning_rate": 1.4435191104711158e-06, + "loss": 0.4383, + "step": 7902 + }, + { + "epoch": 3.849626502111075, + "grad_norm": 2.8823082447052, + "learning_rate": 1.442936258146265e-06, + "loss": 0.3764, + "step": 7903 + }, + { + "epoch": 3.8501136732705423, + "grad_norm": 3.254899024963379, + "learning_rate": 1.4423534757777953e-06, + "loss": 0.3569, + "step": 7904 + }, + { + "epoch": 3.8506008444300095, + "grad_norm": 3.1027116775512695, + "learning_rate": 1.4417707634042728e-06, + "loss": 0.4228, + "step": 7905 + }, + { + "epoch": 3.851088015589477, + "grad_norm": 2.8686561584472656, + "learning_rate": 1.4411881210642628e-06, + "loss": 0.4199, + "step": 7906 + }, + { + "epoch": 3.8515751867489447, + "grad_norm": 2.9528791904449463, + "learning_rate": 1.4406055487963254e-06, + "loss": 0.4516, + "step": 7907 + }, + { + "epoch": 3.852062357908412, + "grad_norm": 3.106926679611206, + "learning_rate": 1.440023046639013e-06, + "loss": 0.4221, + "step": 7908 + }, + { + "epoch": 3.852549529067879, + "grad_norm": 2.7853095531463623, + "learning_rate": 1.4394406146308768e-06, + "loss": 0.3563, + "step": 7909 + }, + { + "epoch": 3.8530367002273467, + "grad_norm": 2.6103837490081787, + "learning_rate": 1.4388582528104628e-06, + "loss": 0.3959, + "step": 7910 + }, + { + "epoch": 3.853523871386814, + "grad_norm": 3.642730951309204, + "learning_rate": 1.43827596121631e-06, + "loss": 0.4583, + "step": 7911 + }, + { + "epoch": 3.854011042546281, + "grad_norm": 3.2844009399414062, + "learning_rate": 1.4376937398869562e-06, + "loss": 0.4028, + "step": 7912 + }, + { + "epoch": 3.8544982137057486, + "grad_norm": 3.3279459476470947, + "learning_rate": 1.4371115888609305e-06, + "loss": 0.471, + "step": 7913 + }, + { + "epoch": 3.8549853848652162, + "grad_norm": 3.589212417602539, + "learning_rate": 1.436529508176761e-06, + "loss": 0.505, + "step": 7914 + }, + { + "epoch": 3.8554725560246834, + "grad_norm": 3.195343494415283, + "learning_rate": 1.435947497872971e-06, + "loss": 0.4527, + "step": 7915 + }, + { + "epoch": 3.8559597271841506, + "grad_norm": 3.1220803260803223, + "learning_rate": 1.435365557988075e-06, + "loss": 0.4249, + "step": 7916 + }, + { + "epoch": 3.856446898343618, + "grad_norm": 3.4061899185180664, + "learning_rate": 1.4347836885605875e-06, + "loss": 0.4962, + "step": 7917 + }, + { + "epoch": 3.8569340695030854, + "grad_norm": 3.233151912689209, + "learning_rate": 1.4342018896290163e-06, + "loss": 0.4148, + "step": 7918 + }, + { + "epoch": 3.8574212406625525, + "grad_norm": 3.308363676071167, + "learning_rate": 1.4336201612318656e-06, + "loss": 0.4672, + "step": 7919 + }, + { + "epoch": 3.85790841182202, + "grad_norm": 2.9992072582244873, + "learning_rate": 1.4330385034076328e-06, + "loss": 0.4599, + "step": 7920 + }, + { + "epoch": 3.8583955829814878, + "grad_norm": 3.257810354232788, + "learning_rate": 1.4324569161948121e-06, + "loss": 0.4383, + "step": 7921 + }, + { + "epoch": 3.858882754140955, + "grad_norm": 2.758373260498047, + "learning_rate": 1.4318753996318932e-06, + "loss": 0.4202, + "step": 7922 + }, + { + "epoch": 3.859369925300422, + "grad_norm": 3.0226588249206543, + "learning_rate": 1.4312939537573622e-06, + "loss": 0.3832, + "step": 7923 + }, + { + "epoch": 3.8598570964598897, + "grad_norm": 3.0850605964660645, + "learning_rate": 1.4307125786096964e-06, + "loss": 0.4185, + "step": 7924 + }, + { + "epoch": 3.860344267619357, + "grad_norm": 3.3105356693267822, + "learning_rate": 1.4301312742273729e-06, + "loss": 0.4292, + "step": 7925 + }, + { + "epoch": 3.860831438778824, + "grad_norm": 2.840318202972412, + "learning_rate": 1.4295500406488623e-06, + "loss": 0.4232, + "step": 7926 + }, + { + "epoch": 3.8613186099382917, + "grad_norm": 3.1390645503997803, + "learning_rate": 1.4289688779126306e-06, + "loss": 0.3761, + "step": 7927 + }, + { + "epoch": 3.861805781097759, + "grad_norm": 3.2329864501953125, + "learning_rate": 1.4283877860571382e-06, + "loss": 0.4421, + "step": 7928 + }, + { + "epoch": 3.8622929522572265, + "grad_norm": 3.3889541625976562, + "learning_rate": 1.4278067651208424e-06, + "loss": 0.4724, + "step": 7929 + }, + { + "epoch": 3.8627801234166936, + "grad_norm": 3.6075191497802734, + "learning_rate": 1.427225815142195e-06, + "loss": 0.4895, + "step": 7930 + }, + { + "epoch": 3.8632672945761612, + "grad_norm": 3.847154378890991, + "learning_rate": 1.4266449361596443e-06, + "loss": 0.4501, + "step": 7931 + }, + { + "epoch": 3.8637544657356284, + "grad_norm": 2.7366292476654053, + "learning_rate": 1.4260641282116306e-06, + "loss": 0.4136, + "step": 7932 + }, + { + "epoch": 3.8642416368950956, + "grad_norm": 3.580754518508911, + "learning_rate": 1.425483391336593e-06, + "loss": 0.4353, + "step": 7933 + }, + { + "epoch": 3.864728808054563, + "grad_norm": 3.0183234214782715, + "learning_rate": 1.4249027255729648e-06, + "loss": 0.4513, + "step": 7934 + }, + { + "epoch": 3.8652159792140304, + "grad_norm": 3.151472568511963, + "learning_rate": 1.424322130959175e-06, + "loss": 0.4712, + "step": 7935 + }, + { + "epoch": 3.865703150373498, + "grad_norm": 3.23100209236145, + "learning_rate": 1.423741607533645e-06, + "loss": 0.4838, + "step": 7936 + }, + { + "epoch": 3.866190321532965, + "grad_norm": 2.9956510066986084, + "learning_rate": 1.423161155334795e-06, + "loss": 0.3916, + "step": 7937 + }, + { + "epoch": 3.8666774926924328, + "grad_norm": 2.838939666748047, + "learning_rate": 1.4225807744010403e-06, + "loss": 0.3956, + "step": 7938 + }, + { + "epoch": 3.8671646638519, + "grad_norm": 3.5391645431518555, + "learning_rate": 1.4220004647707903e-06, + "loss": 0.4897, + "step": 7939 + }, + { + "epoch": 3.867651835011367, + "grad_norm": 2.879232168197632, + "learning_rate": 1.421420226482448e-06, + "loss": 0.4069, + "step": 7940 + }, + { + "epoch": 3.8681390061708347, + "grad_norm": 2.8260433673858643, + "learning_rate": 1.420840059574415e-06, + "loss": 0.4016, + "step": 7941 + }, + { + "epoch": 3.868626177330302, + "grad_norm": 2.9803011417388916, + "learning_rate": 1.4202599640850855e-06, + "loss": 0.4382, + "step": 7942 + }, + { + "epoch": 3.8691133484897695, + "grad_norm": 2.7332794666290283, + "learning_rate": 1.419679940052853e-06, + "loss": 0.376, + "step": 7943 + }, + { + "epoch": 3.8696005196492367, + "grad_norm": 2.9411487579345703, + "learning_rate": 1.4190999875160999e-06, + "loss": 0.3751, + "step": 7944 + }, + { + "epoch": 3.8700876908087043, + "grad_norm": 3.025571346282959, + "learning_rate": 1.4185201065132094e-06, + "loss": 0.4444, + "step": 7945 + }, + { + "epoch": 3.8705748619681715, + "grad_norm": 2.8672494888305664, + "learning_rate": 1.4179402970825573e-06, + "loss": 0.4451, + "step": 7946 + }, + { + "epoch": 3.8710620331276386, + "grad_norm": 2.777097702026367, + "learning_rate": 1.4173605592625166e-06, + "loss": 0.3793, + "step": 7947 + }, + { + "epoch": 3.8715492042871062, + "grad_norm": 3.146941661834717, + "learning_rate": 1.4167808930914522e-06, + "loss": 0.411, + "step": 7948 + }, + { + "epoch": 3.8720363754465734, + "grad_norm": 2.882216453552246, + "learning_rate": 1.4162012986077274e-06, + "loss": 0.4254, + "step": 7949 + }, + { + "epoch": 3.872523546606041, + "grad_norm": 2.826995849609375, + "learning_rate": 1.415621775849701e-06, + "loss": 0.3595, + "step": 7950 + }, + { + "epoch": 3.873010717765508, + "grad_norm": 2.7499442100524902, + "learning_rate": 1.4150423248557233e-06, + "loss": 0.4812, + "step": 7951 + }, + { + "epoch": 3.873497888924976, + "grad_norm": 3.177870750427246, + "learning_rate": 1.4144629456641434e-06, + "loss": 0.4998, + "step": 7952 + }, + { + "epoch": 3.873985060084443, + "grad_norm": 3.4241397380828857, + "learning_rate": 1.413883638313306e-06, + "loss": 0.4323, + "step": 7953 + }, + { + "epoch": 3.87447223124391, + "grad_norm": 2.95570969581604, + "learning_rate": 1.413304402841547e-06, + "loss": 0.4126, + "step": 7954 + }, + { + "epoch": 3.8749594024033778, + "grad_norm": 3.25215744972229, + "learning_rate": 1.4127252392872016e-06, + "loss": 0.4908, + "step": 7955 + }, + { + "epoch": 3.875446573562845, + "grad_norm": 2.8891313076019287, + "learning_rate": 1.4121461476885995e-06, + "loss": 0.4252, + "step": 7956 + }, + { + "epoch": 3.8759337447223126, + "grad_norm": 3.0817270278930664, + "learning_rate": 1.411567128084063e-06, + "loss": 0.3482, + "step": 7957 + }, + { + "epoch": 3.8764209158817797, + "grad_norm": 3.0980384349823, + "learning_rate": 1.4109881805119126e-06, + "loss": 0.4562, + "step": 7958 + }, + { + "epoch": 3.8769080870412473, + "grad_norm": 3.1406214237213135, + "learning_rate": 1.410409305010464e-06, + "loss": 0.4368, + "step": 7959 + }, + { + "epoch": 3.8773952582007145, + "grad_norm": 3.8550729751586914, + "learning_rate": 1.4098305016180253e-06, + "loss": 0.4711, + "step": 7960 + }, + { + "epoch": 3.8778824293601817, + "grad_norm": 3.2590465545654297, + "learning_rate": 1.4092517703729024e-06, + "loss": 0.4388, + "step": 7961 + }, + { + "epoch": 3.8783696005196493, + "grad_norm": 3.020078659057617, + "learning_rate": 1.4086731113133962e-06, + "loss": 0.4468, + "step": 7962 + }, + { + "epoch": 3.8788567716791165, + "grad_norm": 3.2214598655700684, + "learning_rate": 1.4080945244778021e-06, + "loss": 0.4368, + "step": 7963 + }, + { + "epoch": 3.879343942838584, + "grad_norm": 3.529888391494751, + "learning_rate": 1.40751600990441e-06, + "loss": 0.5324, + "step": 7964 + }, + { + "epoch": 3.8798311139980513, + "grad_norm": 3.3595526218414307, + "learning_rate": 1.4069375676315067e-06, + "loss": 0.4267, + "step": 7965 + }, + { + "epoch": 3.880318285157519, + "grad_norm": 3.2111825942993164, + "learning_rate": 1.4063591976973744e-06, + "loss": 0.4525, + "step": 7966 + }, + { + "epoch": 3.880805456316986, + "grad_norm": 3.0071823596954346, + "learning_rate": 1.4057809001402874e-06, + "loss": 0.4397, + "step": 7967 + }, + { + "epoch": 3.881292627476453, + "grad_norm": 2.9466888904571533, + "learning_rate": 1.4052026749985187e-06, + "loss": 0.3725, + "step": 7968 + }, + { + "epoch": 3.881779798635921, + "grad_norm": 3.4896934032440186, + "learning_rate": 1.404624522310335e-06, + "loss": 0.3895, + "step": 7969 + }, + { + "epoch": 3.882266969795388, + "grad_norm": 3.058164119720459, + "learning_rate": 1.4040464421139993e-06, + "loss": 0.3641, + "step": 7970 + }, + { + "epoch": 3.8827541409548556, + "grad_norm": 2.7717981338500977, + "learning_rate": 1.4034684344477673e-06, + "loss": 0.4186, + "step": 7971 + }, + { + "epoch": 3.8832413121143228, + "grad_norm": 3.311260938644409, + "learning_rate": 1.402890499349892e-06, + "loss": 0.4454, + "step": 7972 + }, + { + "epoch": 3.8837284832737904, + "grad_norm": 3.2275140285491943, + "learning_rate": 1.4023126368586212e-06, + "loss": 0.4767, + "step": 7973 + }, + { + "epoch": 3.8842156544332576, + "grad_norm": 3.1997196674346924, + "learning_rate": 1.401734847012199e-06, + "loss": 0.4477, + "step": 7974 + }, + { + "epoch": 3.8847028255927247, + "grad_norm": 3.022003412246704, + "learning_rate": 1.4011571298488613e-06, + "loss": 0.4427, + "step": 7975 + }, + { + "epoch": 3.8851899967521923, + "grad_norm": 3.1721973419189453, + "learning_rate": 1.4005794854068424e-06, + "loss": 0.3488, + "step": 7976 + }, + { + "epoch": 3.8856771679116595, + "grad_norm": 4.500248908996582, + "learning_rate": 1.400001913724371e-06, + "loss": 0.4638, + "step": 7977 + }, + { + "epoch": 3.886164339071127, + "grad_norm": 3.0945684909820557, + "learning_rate": 1.3994244148396713e-06, + "loss": 0.4422, + "step": 7978 + }, + { + "epoch": 3.8866515102305943, + "grad_norm": 3.0870518684387207, + "learning_rate": 1.39884698879096e-06, + "loss": 0.468, + "step": 7979 + }, + { + "epoch": 3.887138681390062, + "grad_norm": 3.083306312561035, + "learning_rate": 1.3982696356164529e-06, + "loss": 0.4333, + "step": 7980 + }, + { + "epoch": 3.887625852549529, + "grad_norm": 3.0855517387390137, + "learning_rate": 1.3976923553543587e-06, + "loss": 0.4265, + "step": 7981 + }, + { + "epoch": 3.8881130237089963, + "grad_norm": 3.195143699645996, + "learning_rate": 1.397115148042883e-06, + "loss": 0.4661, + "step": 7982 + }, + { + "epoch": 3.888600194868464, + "grad_norm": 3.0407235622406006, + "learning_rate": 1.3965380137202226e-06, + "loss": 0.4295, + "step": 7983 + }, + { + "epoch": 3.889087366027931, + "grad_norm": 3.1145715713500977, + "learning_rate": 1.3959609524245737e-06, + "loss": 0.4216, + "step": 7984 + }, + { + "epoch": 3.8895745371873987, + "grad_norm": 3.035979986190796, + "learning_rate": 1.3953839641941263e-06, + "loss": 0.4583, + "step": 7985 + }, + { + "epoch": 3.890061708346866, + "grad_norm": 2.7009694576263428, + "learning_rate": 1.3948070490670668e-06, + "loss": 0.385, + "step": 7986 + }, + { + "epoch": 3.8905488795063334, + "grad_norm": 4.121170997619629, + "learning_rate": 1.3942302070815722e-06, + "loss": 0.4797, + "step": 7987 + }, + { + "epoch": 3.8910360506658006, + "grad_norm": 3.1125593185424805, + "learning_rate": 1.39365343827582e-06, + "loss": 0.4478, + "step": 7988 + }, + { + "epoch": 3.8915232218252678, + "grad_norm": 3.0063750743865967, + "learning_rate": 1.3930767426879798e-06, + "loss": 0.4389, + "step": 7989 + }, + { + "epoch": 3.8920103929847354, + "grad_norm": 2.930541515350342, + "learning_rate": 1.3925001203562189e-06, + "loss": 0.4154, + "step": 7990 + }, + { + "epoch": 3.8924975641442026, + "grad_norm": 3.3865294456481934, + "learning_rate": 1.3919235713186957e-06, + "loss": 0.5247, + "step": 7991 + }, + { + "epoch": 3.89298473530367, + "grad_norm": 3.077697992324829, + "learning_rate": 1.3913470956135678e-06, + "loss": 0.4565, + "step": 7992 + }, + { + "epoch": 3.8934719064631373, + "grad_norm": 3.1887073516845703, + "learning_rate": 1.3907706932789855e-06, + "loss": 0.3604, + "step": 7993 + }, + { + "epoch": 3.893959077622605, + "grad_norm": 3.1238529682159424, + "learning_rate": 1.3901943643530966e-06, + "loss": 0.3921, + "step": 7994 + }, + { + "epoch": 3.894446248782072, + "grad_norm": 3.0706512928009033, + "learning_rate": 1.3896181088740402e-06, + "loss": 0.4378, + "step": 7995 + }, + { + "epoch": 3.8949334199415393, + "grad_norm": 2.9444737434387207, + "learning_rate": 1.389041926879954e-06, + "loss": 0.44, + "step": 7996 + }, + { + "epoch": 3.895420591101007, + "grad_norm": 3.0149424076080322, + "learning_rate": 1.3884658184089706e-06, + "loss": 0.3918, + "step": 7997 + }, + { + "epoch": 3.895907762260474, + "grad_norm": 3.400815963745117, + "learning_rate": 1.387889783499215e-06, + "loss": 0.4667, + "step": 7998 + }, + { + "epoch": 3.8963949334199417, + "grad_norm": 3.0383646488189697, + "learning_rate": 1.387313822188811e-06, + "loss": 0.4804, + "step": 7999 + }, + { + "epoch": 3.896882104579409, + "grad_norm": 2.998734951019287, + "learning_rate": 1.3867379345158733e-06, + "loss": 0.4767, + "step": 8000 + }, + { + "epoch": 3.8973692757388765, + "grad_norm": 3.1641619205474854, + "learning_rate": 1.3861621205185154e-06, + "loss": 0.4588, + "step": 8001 + }, + { + "epoch": 3.8978564468983437, + "grad_norm": 3.275045394897461, + "learning_rate": 1.3855863802348463e-06, + "loss": 0.453, + "step": 8002 + }, + { + "epoch": 3.898343618057811, + "grad_norm": 3.047304630279541, + "learning_rate": 1.385010713702965e-06, + "loss": 0.3942, + "step": 8003 + }, + { + "epoch": 3.8988307892172784, + "grad_norm": 2.8949100971221924, + "learning_rate": 1.3844351209609713e-06, + "loss": 0.4137, + "step": 8004 + }, + { + "epoch": 3.8993179603767456, + "grad_norm": 3.1278369426727295, + "learning_rate": 1.3838596020469574e-06, + "loss": 0.4213, + "step": 8005 + }, + { + "epoch": 3.899805131536213, + "grad_norm": 3.1370890140533447, + "learning_rate": 1.383284156999012e-06, + "loss": 0.446, + "step": 8006 + }, + { + "epoch": 3.9002923026956804, + "grad_norm": 3.038285970687866, + "learning_rate": 1.3827087858552162e-06, + "loss": 0.4459, + "step": 8007 + }, + { + "epoch": 3.900779473855148, + "grad_norm": 3.00495982170105, + "learning_rate": 1.3821334886536486e-06, + "loss": 0.4061, + "step": 8008 + }, + { + "epoch": 3.901266645014615, + "grad_norm": 2.8374617099761963, + "learning_rate": 1.3815582654323827e-06, + "loss": 0.4021, + "step": 8009 + }, + { + "epoch": 3.9017538161740823, + "grad_norm": 2.9755215644836426, + "learning_rate": 1.3809831162294878e-06, + "loss": 0.4249, + "step": 8010 + }, + { + "epoch": 3.90224098733355, + "grad_norm": 3.122234344482422, + "learning_rate": 1.3804080410830251e-06, + "loss": 0.4176, + "step": 8011 + }, + { + "epoch": 3.902728158493017, + "grad_norm": 3.223078727722168, + "learning_rate": 1.3798330400310538e-06, + "loss": 0.4537, + "step": 8012 + }, + { + "epoch": 3.9032153296524843, + "grad_norm": 3.1280715465545654, + "learning_rate": 1.3792581131116278e-06, + "loss": 0.4185, + "step": 8013 + }, + { + "epoch": 3.903702500811952, + "grad_norm": 2.901075601577759, + "learning_rate": 1.3786832603627968e-06, + "loss": 0.3711, + "step": 8014 + }, + { + "epoch": 3.9041896719714195, + "grad_norm": 3.2820680141448975, + "learning_rate": 1.3781084818226017e-06, + "loss": 0.508, + "step": 8015 + }, + { + "epoch": 3.9046768431308867, + "grad_norm": 2.6312553882598877, + "learning_rate": 1.3775337775290832e-06, + "loss": 0.3575, + "step": 8016 + }, + { + "epoch": 3.905164014290354, + "grad_norm": 3.0956294536590576, + "learning_rate": 1.3769591475202744e-06, + "loss": 0.4703, + "step": 8017 + }, + { + "epoch": 3.9056511854498215, + "grad_norm": 3.228503942489624, + "learning_rate": 1.3763845918342057e-06, + "loss": 0.4431, + "step": 8018 + }, + { + "epoch": 3.9061383566092887, + "grad_norm": 2.8350141048431396, + "learning_rate": 1.3758101105088992e-06, + "loss": 0.3811, + "step": 8019 + }, + { + "epoch": 3.906625527768756, + "grad_norm": 3.0907528400421143, + "learning_rate": 1.375235703582375e-06, + "loss": 0.3667, + "step": 8020 + }, + { + "epoch": 3.9071126989282234, + "grad_norm": 3.07734751701355, + "learning_rate": 1.3746613710926477e-06, + "loss": 0.4071, + "step": 8021 + }, + { + "epoch": 3.907599870087691, + "grad_norm": 3.07486629486084, + "learning_rate": 1.3740871130777255e-06, + "loss": 0.4586, + "step": 8022 + }, + { + "epoch": 3.908087041247158, + "grad_norm": 3.0568203926086426, + "learning_rate": 1.3735129295756134e-06, + "loss": 0.4125, + "step": 8023 + }, + { + "epoch": 3.9085742124066254, + "grad_norm": 2.8839147090911865, + "learning_rate": 1.3729388206243105e-06, + "loss": 0.3814, + "step": 8024 + }, + { + "epoch": 3.909061383566093, + "grad_norm": 3.0964338779449463, + "learning_rate": 1.3723647862618125e-06, + "loss": 0.4727, + "step": 8025 + }, + { + "epoch": 3.90954855472556, + "grad_norm": 3.466881275177002, + "learning_rate": 1.371790826526107e-06, + "loss": 0.4281, + "step": 8026 + }, + { + "epoch": 3.9100357258850273, + "grad_norm": 3.420269250869751, + "learning_rate": 1.3712169414551797e-06, + "loss": 0.4441, + "step": 8027 + }, + { + "epoch": 3.910522897044495, + "grad_norm": 3.073456048965454, + "learning_rate": 1.37064313108701e-06, + "loss": 0.4212, + "step": 8028 + }, + { + "epoch": 3.9110100682039626, + "grad_norm": 3.046858310699463, + "learning_rate": 1.3700693954595735e-06, + "loss": 0.422, + "step": 8029 + }, + { + "epoch": 3.9114972393634297, + "grad_norm": 3.6231985092163086, + "learning_rate": 1.3694957346108384e-06, + "loss": 0.4111, + "step": 8030 + }, + { + "epoch": 3.911984410522897, + "grad_norm": 2.9515442848205566, + "learning_rate": 1.3689221485787702e-06, + "loss": 0.4104, + "step": 8031 + }, + { + "epoch": 3.9124715816823645, + "grad_norm": 2.958402395248413, + "learning_rate": 1.368348637401329e-06, + "loss": 0.4452, + "step": 8032 + }, + { + "epoch": 3.9129587528418317, + "grad_norm": 3.1032111644744873, + "learning_rate": 1.3677752011164704e-06, + "loss": 0.4461, + "step": 8033 + }, + { + "epoch": 3.913445924001299, + "grad_norm": 3.0376665592193604, + "learning_rate": 1.367201839762143e-06, + "loss": 0.4387, + "step": 8034 + }, + { + "epoch": 3.9139330951607665, + "grad_norm": 3.1319644451141357, + "learning_rate": 1.366628553376292e-06, + "loss": 0.3982, + "step": 8035 + }, + { + "epoch": 3.914420266320234, + "grad_norm": 3.196770668029785, + "learning_rate": 1.3660553419968574e-06, + "loss": 0.4431, + "step": 8036 + }, + { + "epoch": 3.9149074374797013, + "grad_norm": 3.130801200866699, + "learning_rate": 1.3654822056617765e-06, + "loss": 0.4262, + "step": 8037 + }, + { + "epoch": 3.9153946086391684, + "grad_norm": 3.0953242778778076, + "learning_rate": 1.3649091444089763e-06, + "loss": 0.4347, + "step": 8038 + }, + { + "epoch": 3.915881779798636, + "grad_norm": 3.37490177154541, + "learning_rate": 1.3643361582763831e-06, + "loss": 0.4923, + "step": 8039 + }, + { + "epoch": 3.9163689509581032, + "grad_norm": 2.6543760299682617, + "learning_rate": 1.363763247301918e-06, + "loss": 0.3766, + "step": 8040 + }, + { + "epoch": 3.9168561221175704, + "grad_norm": 3.0525968074798584, + "learning_rate": 1.3631904115234945e-06, + "loss": 0.4563, + "step": 8041 + }, + { + "epoch": 3.917343293277038, + "grad_norm": 3.1483402252197266, + "learning_rate": 1.362617650979024e-06, + "loss": 0.4574, + "step": 8042 + }, + { + "epoch": 3.9178304644365056, + "grad_norm": 3.0714004039764404, + "learning_rate": 1.3620449657064121e-06, + "loss": 0.4442, + "step": 8043 + }, + { + "epoch": 3.918317635595973, + "grad_norm": 2.9714996814727783, + "learning_rate": 1.3614723557435577e-06, + "loss": 0.4572, + "step": 8044 + }, + { + "epoch": 3.91880480675544, + "grad_norm": 3.0072007179260254, + "learning_rate": 1.3608998211283567e-06, + "loss": 0.3774, + "step": 8045 + }, + { + "epoch": 3.9192919779149076, + "grad_norm": 3.308168411254883, + "learning_rate": 1.3603273618987004e-06, + "loss": 0.4226, + "step": 8046 + }, + { + "epoch": 3.9197791490743747, + "grad_norm": 3.6994779109954834, + "learning_rate": 1.3597549780924721e-06, + "loss": 0.5495, + "step": 8047 + }, + { + "epoch": 3.920266320233842, + "grad_norm": 3.302874803543091, + "learning_rate": 1.3591826697475533e-06, + "loss": 0.4714, + "step": 8048 + }, + { + "epoch": 3.9207534913933095, + "grad_norm": 2.7330496311187744, + "learning_rate": 1.3586104369018198e-06, + "loss": 0.416, + "step": 8049 + }, + { + "epoch": 3.921240662552777, + "grad_norm": 2.989780902862549, + "learning_rate": 1.3580382795931408e-06, + "loss": 0.4641, + "step": 8050 + }, + { + "epoch": 3.9217278337122443, + "grad_norm": 3.256533622741699, + "learning_rate": 1.357466197859382e-06, + "loss": 0.4265, + "step": 8051 + }, + { + "epoch": 3.9222150048717115, + "grad_norm": 3.098686933517456, + "learning_rate": 1.3568941917384038e-06, + "loss": 0.4898, + "step": 8052 + }, + { + "epoch": 3.922702176031179, + "grad_norm": 3.1517090797424316, + "learning_rate": 1.3563222612680626e-06, + "loss": 0.4474, + "step": 8053 + }, + { + "epoch": 3.9231893471906463, + "grad_norm": 3.4324686527252197, + "learning_rate": 1.3557504064862065e-06, + "loss": 0.3891, + "step": 8054 + }, + { + "epoch": 3.9236765183501134, + "grad_norm": 3.114811658859253, + "learning_rate": 1.3551786274306822e-06, + "loss": 0.4169, + "step": 8055 + }, + { + "epoch": 3.924163689509581, + "grad_norm": 3.1915581226348877, + "learning_rate": 1.3546069241393296e-06, + "loss": 0.4315, + "step": 8056 + }, + { + "epoch": 3.9246508606690487, + "grad_norm": 3.1000161170959473, + "learning_rate": 1.3540352966499847e-06, + "loss": 0.4137, + "step": 8057 + }, + { + "epoch": 3.925138031828516, + "grad_norm": 2.933974504470825, + "learning_rate": 1.3534637450004768e-06, + "loss": 0.4294, + "step": 8058 + }, + { + "epoch": 3.925625202987983, + "grad_norm": 3.0178818702697754, + "learning_rate": 1.3528922692286311e-06, + "loss": 0.4404, + "step": 8059 + }, + { + "epoch": 3.9261123741474506, + "grad_norm": 2.974508285522461, + "learning_rate": 1.3523208693722684e-06, + "loss": 0.4823, + "step": 8060 + }, + { + "epoch": 3.926599545306918, + "grad_norm": 3.301525354385376, + "learning_rate": 1.3517495454692043e-06, + "loss": 0.3687, + "step": 8061 + }, + { + "epoch": 3.927086716466385, + "grad_norm": 2.7360990047454834, + "learning_rate": 1.351178297557248e-06, + "loss": 0.3808, + "step": 8062 + }, + { + "epoch": 3.9275738876258526, + "grad_norm": 2.8901126384735107, + "learning_rate": 1.3506071256742043e-06, + "loss": 0.4009, + "step": 8063 + }, + { + "epoch": 3.9280610587853197, + "grad_norm": 3.427300214767456, + "learning_rate": 1.3500360298578747e-06, + "loss": 0.4691, + "step": 8064 + }, + { + "epoch": 3.9285482299447874, + "grad_norm": 3.1762807369232178, + "learning_rate": 1.3494650101460538e-06, + "loss": 0.4064, + "step": 8065 + }, + { + "epoch": 3.9290354011042545, + "grad_norm": 2.6956119537353516, + "learning_rate": 1.348894066576531e-06, + "loss": 0.409, + "step": 8066 + }, + { + "epoch": 3.929522572263722, + "grad_norm": 2.957592487335205, + "learning_rate": 1.3483231991870916e-06, + "loss": 0.3642, + "step": 8067 + }, + { + "epoch": 3.9300097434231893, + "grad_norm": 3.079820394515991, + "learning_rate": 1.3477524080155153e-06, + "loss": 0.4681, + "step": 8068 + }, + { + "epoch": 3.9304969145826565, + "grad_norm": 3.47464919090271, + "learning_rate": 1.3471816930995784e-06, + "loss": 0.4723, + "step": 8069 + }, + { + "epoch": 3.930984085742124, + "grad_norm": 3.3978843688964844, + "learning_rate": 1.3466110544770488e-06, + "loss": 0.3441, + "step": 8070 + }, + { + "epoch": 3.9314712569015913, + "grad_norm": 3.3846333026885986, + "learning_rate": 1.3460404921856923e-06, + "loss": 0.4134, + "step": 8071 + }, + { + "epoch": 3.931958428061059, + "grad_norm": 2.8442142009735107, + "learning_rate": 1.3454700062632698e-06, + "loss": 0.4068, + "step": 8072 + }, + { + "epoch": 3.932445599220526, + "grad_norm": 3.0354678630828857, + "learning_rate": 1.344899596747533e-06, + "loss": 0.4066, + "step": 8073 + }, + { + "epoch": 3.9329327703799937, + "grad_norm": 2.9514122009277344, + "learning_rate": 1.344329263676234e-06, + "loss": 0.4031, + "step": 8074 + }, + { + "epoch": 3.933419941539461, + "grad_norm": 3.083582639694214, + "learning_rate": 1.3437590070871165e-06, + "loss": 0.4643, + "step": 8075 + }, + { + "epoch": 3.933907112698928, + "grad_norm": 3.2914786338806152, + "learning_rate": 1.3431888270179214e-06, + "loss": 0.4621, + "step": 8076 + }, + { + "epoch": 3.9343942838583956, + "grad_norm": 2.859750986099243, + "learning_rate": 1.342618723506381e-06, + "loss": 0.424, + "step": 8077 + }, + { + "epoch": 3.934881455017863, + "grad_norm": 3.106013774871826, + "learning_rate": 1.3420486965902255e-06, + "loss": 0.4276, + "step": 8078 + }, + { + "epoch": 3.9353686261773304, + "grad_norm": 3.3073887825012207, + "learning_rate": 1.3414787463071794e-06, + "loss": 0.4514, + "step": 8079 + }, + { + "epoch": 3.9358557973367976, + "grad_norm": 3.0567803382873535, + "learning_rate": 1.3409088726949632e-06, + "loss": 0.4064, + "step": 8080 + }, + { + "epoch": 3.936342968496265, + "grad_norm": 2.945647954940796, + "learning_rate": 1.3403390757912887e-06, + "loss": 0.4411, + "step": 8081 + }, + { + "epoch": 3.9368301396557324, + "grad_norm": 3.270015001296997, + "learning_rate": 1.3397693556338665e-06, + "loss": 0.4466, + "step": 8082 + }, + { + "epoch": 3.9373173108151995, + "grad_norm": 2.9531877040863037, + "learning_rate": 1.3391997122604e-06, + "loss": 0.4011, + "step": 8083 + }, + { + "epoch": 3.937804481974667, + "grad_norm": 3.086620807647705, + "learning_rate": 1.3386301457085894e-06, + "loss": 0.447, + "step": 8084 + }, + { + "epoch": 3.9382916531341343, + "grad_norm": 2.8460779190063477, + "learning_rate": 1.3380606560161266e-06, + "loss": 0.3842, + "step": 8085 + }, + { + "epoch": 3.938778824293602, + "grad_norm": 3.6304855346679688, + "learning_rate": 1.3374912432207027e-06, + "loss": 0.5362, + "step": 8086 + }, + { + "epoch": 3.939265995453069, + "grad_norm": 2.8329195976257324, + "learning_rate": 1.3369219073599988e-06, + "loss": 0.3614, + "step": 8087 + }, + { + "epoch": 3.9397531666125367, + "grad_norm": 3.0578880310058594, + "learning_rate": 1.3363526484716949e-06, + "loss": 0.4563, + "step": 8088 + }, + { + "epoch": 3.940240337772004, + "grad_norm": 3.429544687271118, + "learning_rate": 1.3357834665934649e-06, + "loss": 0.4381, + "step": 8089 + }, + { + "epoch": 3.940727508931471, + "grad_norm": 3.110882520675659, + "learning_rate": 1.3352143617629764e-06, + "loss": 0.481, + "step": 8090 + }, + { + "epoch": 3.9412146800909387, + "grad_norm": 3.064354181289673, + "learning_rate": 1.3346453340178925e-06, + "loss": 0.4066, + "step": 8091 + }, + { + "epoch": 3.941701851250406, + "grad_norm": 3.0300405025482178, + "learning_rate": 1.3340763833958732e-06, + "loss": 0.4131, + "step": 8092 + }, + { + "epoch": 3.9421890224098735, + "grad_norm": 2.8142504692077637, + "learning_rate": 1.333507509934569e-06, + "loss": 0.4423, + "step": 8093 + }, + { + "epoch": 3.9426761935693406, + "grad_norm": 2.9367451667785645, + "learning_rate": 1.3329387136716293e-06, + "loss": 0.4629, + "step": 8094 + }, + { + "epoch": 3.9431633647288082, + "grad_norm": 3.247776508331299, + "learning_rate": 1.3323699946446966e-06, + "loss": 0.3938, + "step": 8095 + }, + { + "epoch": 3.9436505358882754, + "grad_norm": 3.323133707046509, + "learning_rate": 1.3318013528914104e-06, + "loss": 0.4055, + "step": 8096 + }, + { + "epoch": 3.9441377070477426, + "grad_norm": 2.806861400604248, + "learning_rate": 1.3312327884494003e-06, + "loss": 0.4253, + "step": 8097 + }, + { + "epoch": 3.94462487820721, + "grad_norm": 3.130797863006592, + "learning_rate": 1.330664301356296e-06, + "loss": 0.4439, + "step": 8098 + }, + { + "epoch": 3.9451120493666774, + "grad_norm": 2.8771486282348633, + "learning_rate": 1.330095891649719e-06, + "loss": 0.4118, + "step": 8099 + }, + { + "epoch": 3.945599220526145, + "grad_norm": 3.033430814743042, + "learning_rate": 1.329527559367288e-06, + "loss": 0.4141, + "step": 8100 + }, + { + "epoch": 3.946086391685612, + "grad_norm": 2.937354326248169, + "learning_rate": 1.328959304546613e-06, + "loss": 0.4399, + "step": 8101 + }, + { + "epoch": 3.9465735628450798, + "grad_norm": 2.8365273475646973, + "learning_rate": 1.3283911272253025e-06, + "loss": 0.3729, + "step": 8102 + }, + { + "epoch": 3.947060734004547, + "grad_norm": 3.757066011428833, + "learning_rate": 1.3278230274409582e-06, + "loss": 0.4365, + "step": 8103 + }, + { + "epoch": 3.947547905164014, + "grad_norm": 3.08404803276062, + "learning_rate": 1.3272550052311772e-06, + "loss": 0.3812, + "step": 8104 + }, + { + "epoch": 3.9480350763234817, + "grad_norm": 2.953537940979004, + "learning_rate": 1.3266870606335502e-06, + "loss": 0.3547, + "step": 8105 + }, + { + "epoch": 3.948522247482949, + "grad_norm": 3.1095216274261475, + "learning_rate": 1.3261191936856642e-06, + "loss": 0.414, + "step": 8106 + }, + { + "epoch": 3.9490094186424165, + "grad_norm": 3.017343044281006, + "learning_rate": 1.3255514044251006e-06, + "loss": 0.4707, + "step": 8107 + }, + { + "epoch": 3.9494965898018837, + "grad_norm": 2.987724542617798, + "learning_rate": 1.3249836928894366e-06, + "loss": 0.4625, + "step": 8108 + }, + { + "epoch": 3.9499837609613513, + "grad_norm": 3.0601842403411865, + "learning_rate": 1.3244160591162414e-06, + "loss": 0.4, + "step": 8109 + }, + { + "epoch": 3.9504709321208185, + "grad_norm": 3.250667095184326, + "learning_rate": 1.323848503143082e-06, + "loss": 0.4053, + "step": 8110 + }, + { + "epoch": 3.9509581032802856, + "grad_norm": 3.032761812210083, + "learning_rate": 1.3232810250075193e-06, + "loss": 0.4011, + "step": 8111 + }, + { + "epoch": 3.9514452744397532, + "grad_norm": 3.209730625152588, + "learning_rate": 1.3227136247471095e-06, + "loss": 0.4619, + "step": 8112 + }, + { + "epoch": 3.9519324455992204, + "grad_norm": 3.5451323986053467, + "learning_rate": 1.3221463023994017e-06, + "loss": 0.4746, + "step": 8113 + }, + { + "epoch": 3.952419616758688, + "grad_norm": 3.433812141418457, + "learning_rate": 1.3215790580019416e-06, + "loss": 0.4781, + "step": 8114 + }, + { + "epoch": 3.952906787918155, + "grad_norm": 3.0965986251831055, + "learning_rate": 1.3210118915922698e-06, + "loss": 0.4201, + "step": 8115 + }, + { + "epoch": 3.953393959077623, + "grad_norm": 2.9918487071990967, + "learning_rate": 1.3204448032079225e-06, + "loss": 0.4213, + "step": 8116 + }, + { + "epoch": 3.95388113023709, + "grad_norm": 3.0821166038513184, + "learning_rate": 1.319877792886427e-06, + "loss": 0.4868, + "step": 8117 + }, + { + "epoch": 3.954368301396557, + "grad_norm": 3.0294601917266846, + "learning_rate": 1.3193108606653093e-06, + "loss": 0.4856, + "step": 8118 + }, + { + "epoch": 3.9548554725560248, + "grad_norm": 2.9528534412384033, + "learning_rate": 1.3187440065820888e-06, + "loss": 0.3982, + "step": 8119 + }, + { + "epoch": 3.955342643715492, + "grad_norm": 3.2834055423736572, + "learning_rate": 1.3181772306742813e-06, + "loss": 0.4182, + "step": 8120 + }, + { + "epoch": 3.9558298148749595, + "grad_norm": 3.033596992492676, + "learning_rate": 1.3176105329793935e-06, + "loss": 0.4135, + "step": 8121 + }, + { + "epoch": 3.9563169860344267, + "grad_norm": 3.032360315322876, + "learning_rate": 1.3170439135349305e-06, + "loss": 0.4593, + "step": 8122 + }, + { + "epoch": 3.9568041571938943, + "grad_norm": 3.1714248657226562, + "learning_rate": 1.316477372378392e-06, + "loss": 0.3825, + "step": 8123 + }, + { + "epoch": 3.9572913283533615, + "grad_norm": 3.2308390140533447, + "learning_rate": 1.31591090954727e-06, + "loss": 0.4317, + "step": 8124 + }, + { + "epoch": 3.9577784995128287, + "grad_norm": 2.793046712875366, + "learning_rate": 1.3153445250790536e-06, + "loss": 0.4428, + "step": 8125 + }, + { + "epoch": 3.9582656706722963, + "grad_norm": 2.6561806201934814, + "learning_rate": 1.3147782190112263e-06, + "loss": 0.4064, + "step": 8126 + }, + { + "epoch": 3.9587528418317635, + "grad_norm": 3.0931663513183594, + "learning_rate": 1.3142119913812667e-06, + "loss": 0.4041, + "step": 8127 + }, + { + "epoch": 3.959240012991231, + "grad_norm": 3.1228063106536865, + "learning_rate": 1.3136458422266466e-06, + "loss": 0.4675, + "step": 8128 + }, + { + "epoch": 3.9597271841506982, + "grad_norm": 2.9364728927612305, + "learning_rate": 1.3130797715848339e-06, + "loss": 0.4219, + "step": 8129 + }, + { + "epoch": 3.960214355310166, + "grad_norm": 3.0687851905822754, + "learning_rate": 1.3125137794932923e-06, + "loss": 0.4704, + "step": 8130 + }, + { + "epoch": 3.960701526469633, + "grad_norm": 3.177506446838379, + "learning_rate": 1.3119478659894774e-06, + "loss": 0.4303, + "step": 8131 + }, + { + "epoch": 3.9611886976291, + "grad_norm": 2.8873183727264404, + "learning_rate": 1.311382031110842e-06, + "loss": 0.4232, + "step": 8132 + }, + { + "epoch": 3.961675868788568, + "grad_norm": 2.543735980987549, + "learning_rate": 1.3108162748948344e-06, + "loss": 0.4197, + "step": 8133 + }, + { + "epoch": 3.962163039948035, + "grad_norm": 3.1485483646392822, + "learning_rate": 1.3102505973788937e-06, + "loss": 0.4129, + "step": 8134 + }, + { + "epoch": 3.9626502111075026, + "grad_norm": 2.9119062423706055, + "learning_rate": 1.309684998600459e-06, + "loss": 0.4331, + "step": 8135 + }, + { + "epoch": 3.9631373822669698, + "grad_norm": 2.9748895168304443, + "learning_rate": 1.3091194785969588e-06, + "loss": 0.4006, + "step": 8136 + }, + { + "epoch": 3.9636245534264374, + "grad_norm": 2.6897988319396973, + "learning_rate": 1.3085540374058208e-06, + "loss": 0.3874, + "step": 8137 + }, + { + "epoch": 3.9641117245859046, + "grad_norm": 3.055333375930786, + "learning_rate": 1.3079886750644657e-06, + "loss": 0.4699, + "step": 8138 + }, + { + "epoch": 3.9645988957453717, + "grad_norm": 3.4479119777679443, + "learning_rate": 1.3074233916103101e-06, + "loss": 0.4471, + "step": 8139 + }, + { + "epoch": 3.9650860669048393, + "grad_norm": 2.7770087718963623, + "learning_rate": 1.3068581870807622e-06, + "loss": 0.3827, + "step": 8140 + }, + { + "epoch": 3.9655732380643065, + "grad_norm": 2.8184759616851807, + "learning_rate": 1.3062930615132285e-06, + "loss": 0.4052, + "step": 8141 + }, + { + "epoch": 3.9660604092237737, + "grad_norm": 3.0128366947174072, + "learning_rate": 1.305728014945109e-06, + "loss": 0.4062, + "step": 8142 + }, + { + "epoch": 3.9665475803832413, + "grad_norm": 3.0356197357177734, + "learning_rate": 1.3051630474137992e-06, + "loss": 0.4407, + "step": 8143 + }, + { + "epoch": 3.967034751542709, + "grad_norm": 3.3121416568756104, + "learning_rate": 1.3045981589566863e-06, + "loss": 0.464, + "step": 8144 + }, + { + "epoch": 3.967521922702176, + "grad_norm": 3.0600810050964355, + "learning_rate": 1.3040333496111562e-06, + "loss": 0.4159, + "step": 8145 + }, + { + "epoch": 3.9680090938616432, + "grad_norm": 2.9078052043914795, + "learning_rate": 1.303468619414588e-06, + "loss": 0.3409, + "step": 8146 + }, + { + "epoch": 3.968496265021111, + "grad_norm": 3.2837893962860107, + "learning_rate": 1.3029039684043554e-06, + "loss": 0.4259, + "step": 8147 + }, + { + "epoch": 3.968983436180578, + "grad_norm": 2.9215903282165527, + "learning_rate": 1.302339396617826e-06, + "loss": 0.4433, + "step": 8148 + }, + { + "epoch": 3.969470607340045, + "grad_norm": 3.1445412635803223, + "learning_rate": 1.301774904092364e-06, + "loss": 0.4198, + "step": 8149 + }, + { + "epoch": 3.969957778499513, + "grad_norm": 2.8468775749206543, + "learning_rate": 1.3012104908653272e-06, + "loss": 0.3791, + "step": 8150 + }, + { + "epoch": 3.9704449496589804, + "grad_norm": 3.1333770751953125, + "learning_rate": 1.3006461569740697e-06, + "loss": 0.4987, + "step": 8151 + }, + { + "epoch": 3.9709321208184476, + "grad_norm": 3.38337779045105, + "learning_rate": 1.3000819024559363e-06, + "loss": 0.4325, + "step": 8152 + }, + { + "epoch": 3.9714192919779148, + "grad_norm": 3.53725266456604, + "learning_rate": 1.2995177273482712e-06, + "loss": 0.5, + "step": 8153 + }, + { + "epoch": 3.9719064631373824, + "grad_norm": 2.951308250427246, + "learning_rate": 1.298953631688411e-06, + "loss": 0.5073, + "step": 8154 + }, + { + "epoch": 3.9723936342968496, + "grad_norm": 3.1370317935943604, + "learning_rate": 1.298389615513689e-06, + "loss": 0.4382, + "step": 8155 + }, + { + "epoch": 3.9728808054563167, + "grad_norm": 3.0536301136016846, + "learning_rate": 1.2978256788614288e-06, + "loss": 0.514, + "step": 8156 + }, + { + "epoch": 3.9733679766157843, + "grad_norm": 3.1093151569366455, + "learning_rate": 1.2972618217689536e-06, + "loss": 0.4138, + "step": 8157 + }, + { + "epoch": 3.973855147775252, + "grad_norm": 3.602893590927124, + "learning_rate": 1.2966980442735788e-06, + "loss": 0.4375, + "step": 8158 + }, + { + "epoch": 3.974342318934719, + "grad_norm": 3.2723798751831055, + "learning_rate": 1.2961343464126165e-06, + "loss": 0.4043, + "step": 8159 + }, + { + "epoch": 3.9748294900941863, + "grad_norm": 3.035362720489502, + "learning_rate": 1.2955707282233698e-06, + "loss": 0.3877, + "step": 8160 + }, + { + "epoch": 3.975316661253654, + "grad_norm": 3.0119707584381104, + "learning_rate": 1.2950071897431404e-06, + "loss": 0.4361, + "step": 8161 + }, + { + "epoch": 3.975803832413121, + "grad_norm": 2.7184526920318604, + "learning_rate": 1.294443731009223e-06, + "loss": 0.4336, + "step": 8162 + }, + { + "epoch": 3.9762910035725882, + "grad_norm": 3.025301456451416, + "learning_rate": 1.2938803520589083e-06, + "loss": 0.4457, + "step": 8163 + }, + { + "epoch": 3.976778174732056, + "grad_norm": 2.740351915359497, + "learning_rate": 1.2933170529294787e-06, + "loss": 0.367, + "step": 8164 + }, + { + "epoch": 3.9772653458915235, + "grad_norm": 3.4388253688812256, + "learning_rate": 1.2927538336582135e-06, + "loss": 0.4762, + "step": 8165 + }, + { + "epoch": 3.9777525170509906, + "grad_norm": 3.040426254272461, + "learning_rate": 1.2921906942823879e-06, + "loss": 0.4029, + "step": 8166 + }, + { + "epoch": 3.978239688210458, + "grad_norm": 3.4637198448181152, + "learning_rate": 1.2916276348392702e-06, + "loss": 0.4999, + "step": 8167 + }, + { + "epoch": 3.9787268593699254, + "grad_norm": 3.114591360092163, + "learning_rate": 1.2910646553661222e-06, + "loss": 0.3905, + "step": 8168 + }, + { + "epoch": 3.9792140305293926, + "grad_norm": 3.18735933303833, + "learning_rate": 1.2905017559002032e-06, + "loss": 0.4183, + "step": 8169 + }, + { + "epoch": 3.9797012016888598, + "grad_norm": 3.1014389991760254, + "learning_rate": 1.2899389364787645e-06, + "loss": 0.4693, + "step": 8170 + }, + { + "epoch": 3.9801883728483274, + "grad_norm": 3.3563811779022217, + "learning_rate": 1.2893761971390562e-06, + "loss": 0.4359, + "step": 8171 + }, + { + "epoch": 3.980675544007795, + "grad_norm": 3.2352182865142822, + "learning_rate": 1.288813537918317e-06, + "loss": 0.4432, + "step": 8172 + }, + { + "epoch": 3.981162715167262, + "grad_norm": 3.096384048461914, + "learning_rate": 1.288250958853786e-06, + "loss": 0.394, + "step": 8173 + }, + { + "epoch": 3.9816498863267293, + "grad_norm": 3.508612632751465, + "learning_rate": 1.2876884599826928e-06, + "loss": 0.415, + "step": 8174 + }, + { + "epoch": 3.982137057486197, + "grad_norm": 2.7686080932617188, + "learning_rate": 1.2871260413422643e-06, + "loss": 0.3985, + "step": 8175 + }, + { + "epoch": 3.982624228645664, + "grad_norm": 3.312134265899658, + "learning_rate": 1.2865637029697226e-06, + "loss": 0.4399, + "step": 8176 + }, + { + "epoch": 3.9831113998051313, + "grad_norm": 2.8982326984405518, + "learning_rate": 1.286001444902281e-06, + "loss": 0.4288, + "step": 8177 + }, + { + "epoch": 3.983598570964599, + "grad_norm": 3.0290987491607666, + "learning_rate": 1.2854392671771506e-06, + "loss": 0.4368, + "step": 8178 + }, + { + "epoch": 3.9840857421240665, + "grad_norm": 3.5102665424346924, + "learning_rate": 1.2848771698315376e-06, + "loss": 0.4629, + "step": 8179 + }, + { + "epoch": 3.9845729132835337, + "grad_norm": 3.156545639038086, + "learning_rate": 1.2843151529026392e-06, + "loss": 0.436, + "step": 8180 + }, + { + "epoch": 3.985060084443001, + "grad_norm": 3.26727557182312, + "learning_rate": 1.283753216427651e-06, + "loss": 0.3819, + "step": 8181 + }, + { + "epoch": 3.9855472556024685, + "grad_norm": 3.0614013671875, + "learning_rate": 1.2831913604437623e-06, + "loss": 0.4286, + "step": 8182 + }, + { + "epoch": 3.9860344267619356, + "grad_norm": 3.2501485347747803, + "learning_rate": 1.2826295849881554e-06, + "loss": 0.4831, + "step": 8183 + }, + { + "epoch": 3.986521597921403, + "grad_norm": 3.3552873134613037, + "learning_rate": 1.2820678900980093e-06, + "loss": 0.3802, + "step": 8184 + }, + { + "epoch": 3.9870087690808704, + "grad_norm": 2.9825327396392822, + "learning_rate": 1.2815062758104965e-06, + "loss": 0.387, + "step": 8185 + }, + { + "epoch": 3.987495940240338, + "grad_norm": 2.9253830909729004, + "learning_rate": 1.280944742162786e-06, + "loss": 0.4845, + "step": 8186 + }, + { + "epoch": 3.987983111399805, + "grad_norm": 3.057753562927246, + "learning_rate": 1.2803832891920381e-06, + "loss": 0.3642, + "step": 8187 + }, + { + "epoch": 3.9884702825592724, + "grad_norm": 2.6690807342529297, + "learning_rate": 1.2798219169354103e-06, + "loss": 0.3496, + "step": 8188 + }, + { + "epoch": 3.98895745371874, + "grad_norm": 2.8226113319396973, + "learning_rate": 1.2792606254300547e-06, + "loss": 0.4008, + "step": 8189 + }, + { + "epoch": 3.989444624878207, + "grad_norm": 3.1007463932037354, + "learning_rate": 1.278699414713118e-06, + "loss": 0.3978, + "step": 8190 + }, + { + "epoch": 3.9899317960376743, + "grad_norm": 2.9973208904266357, + "learning_rate": 1.2781382848217393e-06, + "loss": 0.3797, + "step": 8191 + }, + { + "epoch": 3.990418967197142, + "grad_norm": 3.5540568828582764, + "learning_rate": 1.2775772357930553e-06, + "loss": 0.4562, + "step": 8192 + }, + { + "epoch": 3.990906138356609, + "grad_norm": 2.8782265186309814, + "learning_rate": 1.2770162676641959e-06, + "loss": 0.4442, + "step": 8193 + }, + { + "epoch": 3.9913933095160767, + "grad_norm": 3.364924430847168, + "learning_rate": 1.2764553804722868e-06, + "loss": 0.4971, + "step": 8194 + }, + { + "epoch": 3.991880480675544, + "grad_norm": 3.2158799171447754, + "learning_rate": 1.2758945742544459e-06, + "loss": 0.4012, + "step": 8195 + }, + { + "epoch": 3.9923676518350115, + "grad_norm": 3.357302188873291, + "learning_rate": 1.2753338490477885e-06, + "loss": 0.4198, + "step": 8196 + }, + { + "epoch": 3.9928548229944787, + "grad_norm": 2.995910167694092, + "learning_rate": 1.2747732048894223e-06, + "loss": 0.3928, + "step": 8197 + }, + { + "epoch": 3.993341994153946, + "grad_norm": 3.059253215789795, + "learning_rate": 1.2742126418164523e-06, + "loss": 0.3794, + "step": 8198 + }, + { + "epoch": 3.9938291653134135, + "grad_norm": 2.9359054565429688, + "learning_rate": 1.2736521598659751e-06, + "loss": 0.4241, + "step": 8199 + }, + { + "epoch": 3.9943163364728806, + "grad_norm": 3.2358601093292236, + "learning_rate": 1.2730917590750833e-06, + "loss": 0.4048, + "step": 8200 + }, + { + "epoch": 3.9948035076323483, + "grad_norm": 2.8271567821502686, + "learning_rate": 1.2725314394808647e-06, + "loss": 0.3613, + "step": 8201 + }, + { + "epoch": 3.9952906787918154, + "grad_norm": 3.5940158367156982, + "learning_rate": 1.2719712011204022e-06, + "loss": 0.4922, + "step": 8202 + }, + { + "epoch": 3.995777849951283, + "grad_norm": 3.3316891193389893, + "learning_rate": 1.2714110440307702e-06, + "loss": 0.4117, + "step": 8203 + }, + { + "epoch": 3.99626502111075, + "grad_norm": 3.1810038089752197, + "learning_rate": 1.270850968249041e-06, + "loss": 0.4525, + "step": 8204 + }, + { + "epoch": 3.9967521922702174, + "grad_norm": 3.2410290241241455, + "learning_rate": 1.2702909738122804e-06, + "loss": 0.3609, + "step": 8205 + }, + { + "epoch": 3.997239363429685, + "grad_norm": 2.988764524459839, + "learning_rate": 1.2697310607575491e-06, + "loss": 0.4018, + "step": 8206 + }, + { + "epoch": 3.997726534589152, + "grad_norm": 3.13199782371521, + "learning_rate": 1.269171229121901e-06, + "loss": 0.4705, + "step": 8207 + }, + { + "epoch": 3.99821370574862, + "grad_norm": 2.7055702209472656, + "learning_rate": 1.2686114789423866e-06, + "loss": 0.4726, + "step": 8208 + }, + { + "epoch": 3.998700876908087, + "grad_norm": 2.640314817428589, + "learning_rate": 1.2680518102560497e-06, + "loss": 0.3702, + "step": 8209 + }, + { + "epoch": 3.9991880480675546, + "grad_norm": 2.908262252807617, + "learning_rate": 1.2674922230999298e-06, + "loss": 0.4297, + "step": 8210 + }, + { + "epoch": 3.9996752192270217, + "grad_norm": 2.804546356201172, + "learning_rate": 1.2669327175110593e-06, + "loss": 0.4319, + "step": 8211 + }, + { + "epoch": 4.0, + "grad_norm": 2.804546356201172, + "learning_rate": 1.2663732935264666e-06, + "loss": 0.4465, + "step": 8212 + }, + { + "epoch": 4.000487171159468, + "grad_norm": 4.6632914543151855, + "learning_rate": 1.2658139511831746e-06, + "loss": 0.3781, + "step": 8213 + }, + { + "epoch": 4.000974342318934, + "grad_norm": 2.722215414047241, + "learning_rate": 1.2652546905182011e-06, + "loss": 0.3641, + "step": 8214 + }, + { + "epoch": 4.001461513478402, + "grad_norm": 2.7956998348236084, + "learning_rate": 1.2646955115685563e-06, + "loss": 0.3509, + "step": 8215 + }, + { + "epoch": 4.00194868463787, + "grad_norm": 2.8590898513793945, + "learning_rate": 1.2641364143712476e-06, + "loss": 0.4442, + "step": 8216 + }, + { + "epoch": 4.002435855797337, + "grad_norm": 3.174689531326294, + "learning_rate": 1.2635773989632768e-06, + "loss": 0.4156, + "step": 8217 + }, + { + "epoch": 4.002923026956804, + "grad_norm": 2.8576111793518066, + "learning_rate": 1.2630184653816377e-06, + "loss": 0.361, + "step": 8218 + }, + { + "epoch": 4.0034101981162715, + "grad_norm": 3.0831072330474854, + "learning_rate": 1.2624596136633214e-06, + "loss": 0.3459, + "step": 8219 + }, + { + "epoch": 4.003897369275739, + "grad_norm": 2.551546335220337, + "learning_rate": 1.2619008438453134e-06, + "loss": 0.3915, + "step": 8220 + }, + { + "epoch": 4.004384540435206, + "grad_norm": 2.826174736022949, + "learning_rate": 1.2613421559645916e-06, + "loss": 0.3514, + "step": 8221 + }, + { + "epoch": 4.0048717115946735, + "grad_norm": 2.6590018272399902, + "learning_rate": 1.2607835500581311e-06, + "loss": 0.3462, + "step": 8222 + }, + { + "epoch": 4.005358882754141, + "grad_norm": 3.079434394836426, + "learning_rate": 1.2602250261628996e-06, + "loss": 0.3423, + "step": 8223 + }, + { + "epoch": 4.005846053913609, + "grad_norm": 2.647125482559204, + "learning_rate": 1.2596665843158597e-06, + "loss": 0.362, + "step": 8224 + }, + { + "epoch": 4.006333225073075, + "grad_norm": 2.7947449684143066, + "learning_rate": 1.259108224553971e-06, + "loss": 0.3769, + "step": 8225 + }, + { + "epoch": 4.006820396232543, + "grad_norm": 3.2311477661132812, + "learning_rate": 1.2585499469141838e-06, + "loss": 0.3987, + "step": 8226 + }, + { + "epoch": 4.007307567392011, + "grad_norm": 3.041585922241211, + "learning_rate": 1.257991751433445e-06, + "loss": 0.3472, + "step": 8227 + }, + { + "epoch": 4.007794738551477, + "grad_norm": 2.6528890132904053, + "learning_rate": 1.2574336381486967e-06, + "loss": 0.3908, + "step": 8228 + }, + { + "epoch": 4.008281909710945, + "grad_norm": 2.865401029586792, + "learning_rate": 1.256875607096875e-06, + "loss": 0.415, + "step": 8229 + }, + { + "epoch": 4.008769080870413, + "grad_norm": 3.3730921745300293, + "learning_rate": 1.2563176583149095e-06, + "loss": 0.3895, + "step": 8230 + }, + { + "epoch": 4.00925625202988, + "grad_norm": 3.072283983230591, + "learning_rate": 1.2557597918397256e-06, + "loss": 0.3895, + "step": 8231 + }, + { + "epoch": 4.009743423189347, + "grad_norm": 3.374359607696533, + "learning_rate": 1.2552020077082428e-06, + "loss": 0.4179, + "step": 8232 + }, + { + "epoch": 4.010230594348815, + "grad_norm": 3.14017915725708, + "learning_rate": 1.2546443059573756e-06, + "loss": 0.3661, + "step": 8233 + }, + { + "epoch": 4.010717765508282, + "grad_norm": 3.018857002258301, + "learning_rate": 1.254086686624032e-06, + "loss": 0.3847, + "step": 8234 + }, + { + "epoch": 4.011204936667749, + "grad_norm": 2.949061155319214, + "learning_rate": 1.2535291497451149e-06, + "loss": 0.3899, + "step": 8235 + }, + { + "epoch": 4.0116921078272165, + "grad_norm": 3.3516452312469482, + "learning_rate": 1.2529716953575227e-06, + "loss": 0.3982, + "step": 8236 + }, + { + "epoch": 4.012179278986684, + "grad_norm": 3.266174554824829, + "learning_rate": 1.252414323498149e-06, + "loss": 0.4093, + "step": 8237 + }, + { + "epoch": 4.012666450146152, + "grad_norm": 2.8106303215026855, + "learning_rate": 1.2518570342038777e-06, + "loss": 0.4266, + "step": 8238 + }, + { + "epoch": 4.0131536213056185, + "grad_norm": 3.1785011291503906, + "learning_rate": 1.2512998275115917e-06, + "loss": 0.4224, + "step": 8239 + }, + { + "epoch": 4.013640792465086, + "grad_norm": 3.0855021476745605, + "learning_rate": 1.250742703458167e-06, + "loss": 0.4489, + "step": 8240 + }, + { + "epoch": 4.014127963624554, + "grad_norm": 3.4042398929595947, + "learning_rate": 1.250185662080475e-06, + "loss": 0.3941, + "step": 8241 + }, + { + "epoch": 4.01461513478402, + "grad_norm": 2.9463391304016113, + "learning_rate": 1.2496287034153783e-06, + "loss": 0.371, + "step": 8242 + }, + { + "epoch": 4.015102305943488, + "grad_norm": 3.047346591949463, + "learning_rate": 1.2490718274997376e-06, + "loss": 0.3915, + "step": 8243 + }, + { + "epoch": 4.015589477102956, + "grad_norm": 2.8565502166748047, + "learning_rate": 1.248515034370407e-06, + "loss": 0.2923, + "step": 8244 + }, + { + "epoch": 4.016076648262423, + "grad_norm": 2.699542284011841, + "learning_rate": 1.2479583240642358e-06, + "loss": 0.3888, + "step": 8245 + }, + { + "epoch": 4.01656381942189, + "grad_norm": 3.3862366676330566, + "learning_rate": 1.2474016966180648e-06, + "loss": 0.3573, + "step": 8246 + }, + { + "epoch": 4.017050990581358, + "grad_norm": 3.0244154930114746, + "learning_rate": 1.2468451520687333e-06, + "loss": 0.4208, + "step": 8247 + }, + { + "epoch": 4.017538161740825, + "grad_norm": 2.8430464267730713, + "learning_rate": 1.246288690453073e-06, + "loss": 0.3745, + "step": 8248 + }, + { + "epoch": 4.018025332900292, + "grad_norm": 3.1063239574432373, + "learning_rate": 1.245732311807911e-06, + "loss": 0.3739, + "step": 8249 + }, + { + "epoch": 4.01851250405976, + "grad_norm": 2.8614957332611084, + "learning_rate": 1.245176016170067e-06, + "loss": 0.4076, + "step": 8250 + }, + { + "epoch": 4.018999675219227, + "grad_norm": 3.2461559772491455, + "learning_rate": 1.2446198035763577e-06, + "loss": 0.3893, + "step": 8251 + }, + { + "epoch": 4.019486846378695, + "grad_norm": 3.576676845550537, + "learning_rate": 1.2440636740635925e-06, + "loss": 0.3992, + "step": 8252 + }, + { + "epoch": 4.0199740175381615, + "grad_norm": 3.3145291805267334, + "learning_rate": 1.2435076276685776e-06, + "loss": 0.341, + "step": 8253 + }, + { + "epoch": 4.020461188697629, + "grad_norm": 2.6492512226104736, + "learning_rate": 1.2429516644281097e-06, + "loss": 0.4011, + "step": 8254 + }, + { + "epoch": 4.020948359857097, + "grad_norm": 3.3724169731140137, + "learning_rate": 1.2423957843789838e-06, + "loss": 0.3999, + "step": 8255 + }, + { + "epoch": 4.0214355310165635, + "grad_norm": 2.9389231204986572, + "learning_rate": 1.2418399875579876e-06, + "loss": 0.3608, + "step": 8256 + }, + { + "epoch": 4.021922702176031, + "grad_norm": 2.891031503677368, + "learning_rate": 1.241284274001905e-06, + "loss": 0.3653, + "step": 8257 + }, + { + "epoch": 4.022409873335499, + "grad_norm": 2.9605724811553955, + "learning_rate": 1.2407286437475109e-06, + "loss": 0.3545, + "step": 8258 + }, + { + "epoch": 4.022897044494966, + "grad_norm": 3.769503116607666, + "learning_rate": 1.2401730968315778e-06, + "loss": 0.4333, + "step": 8259 + }, + { + "epoch": 4.023384215654433, + "grad_norm": 3.3242475986480713, + "learning_rate": 1.2396176332908729e-06, + "loss": 0.4152, + "step": 8260 + }, + { + "epoch": 4.023871386813901, + "grad_norm": 3.2650978565216064, + "learning_rate": 1.2390622531621548e-06, + "loss": 0.4243, + "step": 8261 + }, + { + "epoch": 4.024358557973368, + "grad_norm": 3.085453748703003, + "learning_rate": 1.2385069564821791e-06, + "loss": 0.3606, + "step": 8262 + }, + { + "epoch": 4.024845729132835, + "grad_norm": 3.0725576877593994, + "learning_rate": 1.2379517432876969e-06, + "loss": 0.404, + "step": 8263 + }, + { + "epoch": 4.025332900292303, + "grad_norm": 3.1109108924865723, + "learning_rate": 1.2373966136154498e-06, + "loss": 0.4741, + "step": 8264 + }, + { + "epoch": 4.02582007145177, + "grad_norm": 3.584644317626953, + "learning_rate": 1.236841567502177e-06, + "loss": 0.3842, + "step": 8265 + }, + { + "epoch": 4.026307242611238, + "grad_norm": 2.8645458221435547, + "learning_rate": 1.2362866049846125e-06, + "loss": 0.3933, + "step": 8266 + }, + { + "epoch": 4.026794413770705, + "grad_norm": 2.9623565673828125, + "learning_rate": 1.2357317260994819e-06, + "loss": 0.3769, + "step": 8267 + }, + { + "epoch": 4.027281584930172, + "grad_norm": 3.1504924297332764, + "learning_rate": 1.2351769308835083e-06, + "loss": 0.3971, + "step": 8268 + }, + { + "epoch": 4.02776875608964, + "grad_norm": 2.889706611633301, + "learning_rate": 1.2346222193734082e-06, + "loss": 0.4045, + "step": 8269 + }, + { + "epoch": 4.0282559272491065, + "grad_norm": 2.886854410171509, + "learning_rate": 1.2340675916058907e-06, + "loss": 0.3713, + "step": 8270 + }, + { + "epoch": 4.028743098408574, + "grad_norm": 3.3841731548309326, + "learning_rate": 1.2335130476176627e-06, + "loss": 0.3937, + "step": 8271 + }, + { + "epoch": 4.029230269568042, + "grad_norm": 3.4706766605377197, + "learning_rate": 1.2329585874454228e-06, + "loss": 0.3336, + "step": 8272 + }, + { + "epoch": 4.029717440727509, + "grad_norm": 2.6902523040771484, + "learning_rate": 1.232404211125867e-06, + "loss": 0.3713, + "step": 8273 + }, + { + "epoch": 4.030204611886976, + "grad_norm": 2.966704845428467, + "learning_rate": 1.231849918695681e-06, + "loss": 0.3988, + "step": 8274 + }, + { + "epoch": 4.030691783046444, + "grad_norm": 3.128126382827759, + "learning_rate": 1.2312957101915496e-06, + "loss": 0.3913, + "step": 8275 + }, + { + "epoch": 4.031178954205911, + "grad_norm": 2.862351894378662, + "learning_rate": 1.2307415856501506e-06, + "loss": 0.373, + "step": 8276 + }, + { + "epoch": 4.031666125365378, + "grad_norm": 2.801699161529541, + "learning_rate": 1.2301875451081546e-06, + "loss": 0.3667, + "step": 8277 + }, + { + "epoch": 4.032153296524846, + "grad_norm": 2.740565538406372, + "learning_rate": 1.2296335886022287e-06, + "loss": 0.4097, + "step": 8278 + }, + { + "epoch": 4.032640467684313, + "grad_norm": 2.934968948364258, + "learning_rate": 1.2290797161690332e-06, + "loss": 0.3897, + "step": 8279 + }, + { + "epoch": 4.03312763884378, + "grad_norm": 2.751234292984009, + "learning_rate": 1.2285259278452248e-06, + "loss": 0.3771, + "step": 8280 + }, + { + "epoch": 4.033614810003248, + "grad_norm": 3.30954909324646, + "learning_rate": 1.2279722236674517e-06, + "loss": 0.347, + "step": 8281 + }, + { + "epoch": 4.034101981162715, + "grad_norm": 3.0944526195526123, + "learning_rate": 1.227418603672358e-06, + "loss": 0.3889, + "step": 8282 + }, + { + "epoch": 4.034589152322183, + "grad_norm": 3.166062116622925, + "learning_rate": 1.2268650678965823e-06, + "loss": 0.2982, + "step": 8283 + }, + { + "epoch": 4.03507632348165, + "grad_norm": 3.014437198638916, + "learning_rate": 1.2263116163767592e-06, + "loss": 0.3855, + "step": 8284 + }, + { + "epoch": 4.035563494641117, + "grad_norm": 3.056892156600952, + "learning_rate": 1.2257582491495138e-06, + "loss": 0.3917, + "step": 8285 + }, + { + "epoch": 4.036050665800585, + "grad_norm": 3.2262203693389893, + "learning_rate": 1.2252049662514684e-06, + "loss": 0.4128, + "step": 8286 + }, + { + "epoch": 4.0365378369600515, + "grad_norm": 3.324437379837036, + "learning_rate": 1.2246517677192402e-06, + "loss": 0.4138, + "step": 8287 + }, + { + "epoch": 4.037025008119519, + "grad_norm": 3.1234328746795654, + "learning_rate": 1.2240986535894398e-06, + "loss": 0.3051, + "step": 8288 + }, + { + "epoch": 4.037512179278987, + "grad_norm": 3.1572048664093018, + "learning_rate": 1.2235456238986706e-06, + "loss": 0.3852, + "step": 8289 + }, + { + "epoch": 4.037999350438454, + "grad_norm": 3.2621777057647705, + "learning_rate": 1.2229926786835337e-06, + "loss": 0.4212, + "step": 8290 + }, + { + "epoch": 4.038486521597921, + "grad_norm": 3.2546420097351074, + "learning_rate": 1.2224398179806217e-06, + "loss": 0.3815, + "step": 8291 + }, + { + "epoch": 4.038973692757389, + "grad_norm": 2.9970383644104004, + "learning_rate": 1.2218870418265252e-06, + "loss": 0.3835, + "step": 8292 + }, + { + "epoch": 4.039460863916856, + "grad_norm": 3.3149142265319824, + "learning_rate": 1.221334350257824e-06, + "loss": 0.3786, + "step": 8293 + }, + { + "epoch": 4.039948035076323, + "grad_norm": 2.9971156120300293, + "learning_rate": 1.2207817433110963e-06, + "loss": 0.4124, + "step": 8294 + }, + { + "epoch": 4.040435206235791, + "grad_norm": 3.0592637062072754, + "learning_rate": 1.2202292210229141e-06, + "loss": 0.428, + "step": 8295 + }, + { + "epoch": 4.040922377395258, + "grad_norm": 3.3829104900360107, + "learning_rate": 1.2196767834298435e-06, + "loss": 0.4271, + "step": 8296 + }, + { + "epoch": 4.041409548554726, + "grad_norm": 3.2502520084381104, + "learning_rate": 1.2191244305684432e-06, + "loss": 0.3995, + "step": 8297 + }, + { + "epoch": 4.041896719714193, + "grad_norm": 2.9266746044158936, + "learning_rate": 1.2185721624752692e-06, + "loss": 0.3494, + "step": 8298 + }, + { + "epoch": 4.04238389087366, + "grad_norm": 3.1688108444213867, + "learning_rate": 1.2180199791868696e-06, + "loss": 0.3731, + "step": 8299 + }, + { + "epoch": 4.042871062033128, + "grad_norm": 3.4524683952331543, + "learning_rate": 1.2174678807397896e-06, + "loss": 0.3786, + "step": 8300 + }, + { + "epoch": 4.043358233192595, + "grad_norm": 3.2227072715759277, + "learning_rate": 1.216915867170565e-06, + "loss": 0.3388, + "step": 8301 + }, + { + "epoch": 4.043845404352062, + "grad_norm": 3.2382638454437256, + "learning_rate": 1.2163639385157289e-06, + "loss": 0.3656, + "step": 8302 + }, + { + "epoch": 4.04433257551153, + "grad_norm": 2.8863184452056885, + "learning_rate": 1.2158120948118075e-06, + "loss": 0.4093, + "step": 8303 + }, + { + "epoch": 4.044819746670997, + "grad_norm": 2.8634846210479736, + "learning_rate": 1.215260336095323e-06, + "loss": 0.4456, + "step": 8304 + }, + { + "epoch": 4.045306917830464, + "grad_norm": 3.1823277473449707, + "learning_rate": 1.2147086624027892e-06, + "loss": 0.4074, + "step": 8305 + }, + { + "epoch": 4.045794088989932, + "grad_norm": 2.880143642425537, + "learning_rate": 1.2141570737707161e-06, + "loss": 0.3351, + "step": 8306 + }, + { + "epoch": 4.046281260149399, + "grad_norm": 2.788872003555298, + "learning_rate": 1.2136055702356094e-06, + "loss": 0.3505, + "step": 8307 + }, + { + "epoch": 4.046768431308866, + "grad_norm": 3.1696183681488037, + "learning_rate": 1.2130541518339652e-06, + "loss": 0.4178, + "step": 8308 + }, + { + "epoch": 4.047255602468334, + "grad_norm": 2.9123291969299316, + "learning_rate": 1.2125028186022782e-06, + "loss": 0.3882, + "step": 8309 + }, + { + "epoch": 4.047742773627801, + "grad_norm": 3.2383434772491455, + "learning_rate": 1.211951570577034e-06, + "loss": 0.4094, + "step": 8310 + }, + { + "epoch": 4.048229944787269, + "grad_norm": 3.1216866970062256, + "learning_rate": 1.211400407794715e-06, + "loss": 0.3847, + "step": 8311 + }, + { + "epoch": 4.048717115946736, + "grad_norm": 3.777456045150757, + "learning_rate": 1.2108493302917975e-06, + "loss": 0.3543, + "step": 8312 + }, + { + "epoch": 4.049204287106203, + "grad_norm": 3.1935677528381348, + "learning_rate": 1.2102983381047509e-06, + "loss": 0.3389, + "step": 8313 + }, + { + "epoch": 4.049691458265671, + "grad_norm": 2.9758431911468506, + "learning_rate": 1.2097474312700399e-06, + "loss": 0.3596, + "step": 8314 + }, + { + "epoch": 4.050178629425138, + "grad_norm": 3.6137709617614746, + "learning_rate": 1.2091966098241238e-06, + "loss": 0.3647, + "step": 8315 + }, + { + "epoch": 4.050665800584605, + "grad_norm": 2.9595439434051514, + "learning_rate": 1.2086458738034567e-06, + "loss": 0.3755, + "step": 8316 + }, + { + "epoch": 4.051152971744073, + "grad_norm": 3.569035530090332, + "learning_rate": 1.2080952232444845e-06, + "loss": 0.3711, + "step": 8317 + }, + { + "epoch": 4.0516401429035405, + "grad_norm": 3.6235287189483643, + "learning_rate": 1.2075446581836504e-06, + "loss": 0.395, + "step": 8318 + }, + { + "epoch": 4.052127314063007, + "grad_norm": 2.8269543647766113, + "learning_rate": 1.2069941786573902e-06, + "loss": 0.3848, + "step": 8319 + }, + { + "epoch": 4.052614485222475, + "grad_norm": 3.0046002864837646, + "learning_rate": 1.2064437847021357e-06, + "loss": 0.396, + "step": 8320 + }, + { + "epoch": 4.053101656381942, + "grad_norm": 3.0841660499572754, + "learning_rate": 1.2058934763543103e-06, + "loss": 0.4065, + "step": 8321 + }, + { + "epoch": 4.053588827541409, + "grad_norm": 2.98189377784729, + "learning_rate": 1.2053432536503343e-06, + "loss": 0.3904, + "step": 8322 + }, + { + "epoch": 4.054075998700877, + "grad_norm": 2.803581714630127, + "learning_rate": 1.204793116626621e-06, + "loss": 0.3915, + "step": 8323 + }, + { + "epoch": 4.054563169860344, + "grad_norm": 3.0170722007751465, + "learning_rate": 1.2042430653195795e-06, + "loss": 0.3886, + "step": 8324 + }, + { + "epoch": 4.055050341019812, + "grad_norm": 3.064854621887207, + "learning_rate": 1.2036930997656105e-06, + "loss": 0.3707, + "step": 8325 + }, + { + "epoch": 4.055537512179279, + "grad_norm": 3.2037274837493896, + "learning_rate": 1.2031432200011114e-06, + "loss": 0.4307, + "step": 8326 + }, + { + "epoch": 4.056024683338746, + "grad_norm": 3.2847070693969727, + "learning_rate": 1.2025934260624731e-06, + "loss": 0.3481, + "step": 8327 + }, + { + "epoch": 4.056511854498214, + "grad_norm": 3.232783794403076, + "learning_rate": 1.2020437179860823e-06, + "loss": 0.405, + "step": 8328 + }, + { + "epoch": 4.056999025657681, + "grad_norm": 2.992337703704834, + "learning_rate": 1.201494095808316e-06, + "loss": 0.3292, + "step": 8329 + }, + { + "epoch": 4.057486196817148, + "grad_norm": 2.8274953365325928, + "learning_rate": 1.2009445595655501e-06, + "loss": 0.3391, + "step": 8330 + }, + { + "epoch": 4.057973367976616, + "grad_norm": 3.0940325260162354, + "learning_rate": 1.2003951092941526e-06, + "loss": 0.4321, + "step": 8331 + }, + { + "epoch": 4.0584605391360835, + "grad_norm": 3.3794751167297363, + "learning_rate": 1.199845745030485e-06, + "loss": 0.3934, + "step": 8332 + }, + { + "epoch": 4.05894771029555, + "grad_norm": 2.97965407371521, + "learning_rate": 1.1992964668109053e-06, + "loss": 0.4367, + "step": 8333 + }, + { + "epoch": 4.059434881455018, + "grad_norm": 3.12331223487854, + "learning_rate": 1.1987472746717639e-06, + "loss": 0.3432, + "step": 8334 + }, + { + "epoch": 4.0599220526144855, + "grad_norm": 2.8055732250213623, + "learning_rate": 1.1981981686494076e-06, + "loss": 0.3811, + "step": 8335 + }, + { + "epoch": 4.060409223773952, + "grad_norm": 2.974292755126953, + "learning_rate": 1.1976491487801747e-06, + "loss": 0.3974, + "step": 8336 + }, + { + "epoch": 4.06089639493342, + "grad_norm": 3.13305401802063, + "learning_rate": 1.1971002151003996e-06, + "loss": 0.386, + "step": 8337 + }, + { + "epoch": 4.061383566092887, + "grad_norm": 3.4738235473632812, + "learning_rate": 1.196551367646411e-06, + "loss": 0.4169, + "step": 8338 + }, + { + "epoch": 4.061870737252355, + "grad_norm": 3.5953922271728516, + "learning_rate": 1.1960026064545325e-06, + "loss": 0.433, + "step": 8339 + }, + { + "epoch": 4.062357908411822, + "grad_norm": 3.355485200881958, + "learning_rate": 1.195453931561079e-06, + "loss": 0.4188, + "step": 8340 + }, + { + "epoch": 4.062845079571289, + "grad_norm": 3.0792741775512695, + "learning_rate": 1.194905343002363e-06, + "loss": 0.3297, + "step": 8341 + }, + { + "epoch": 4.063332250730757, + "grad_norm": 3.4621543884277344, + "learning_rate": 1.1943568408146897e-06, + "loss": 0.3983, + "step": 8342 + }, + { + "epoch": 4.063819421890224, + "grad_norm": 3.950485944747925, + "learning_rate": 1.1938084250343602e-06, + "loss": 0.441, + "step": 8343 + }, + { + "epoch": 4.064306593049691, + "grad_norm": 3.126530647277832, + "learning_rate": 1.1932600956976664e-06, + "loss": 0.4095, + "step": 8344 + }, + { + "epoch": 4.064793764209159, + "grad_norm": 3.119340181350708, + "learning_rate": 1.1927118528408977e-06, + "loss": 0.3648, + "step": 8345 + }, + { + "epoch": 4.065280935368627, + "grad_norm": 3.2253730297088623, + "learning_rate": 1.1921636965003369e-06, + "loss": 0.3997, + "step": 8346 + }, + { + "epoch": 4.065768106528093, + "grad_norm": 3.68571138381958, + "learning_rate": 1.1916156267122618e-06, + "loss": 0.4159, + "step": 8347 + }, + { + "epoch": 4.066255277687561, + "grad_norm": 3.367072582244873, + "learning_rate": 1.191067643512942e-06, + "loss": 0.4013, + "step": 8348 + }, + { + "epoch": 4.0667424488470285, + "grad_norm": 2.8916120529174805, + "learning_rate": 1.1905197469386437e-06, + "loss": 0.3433, + "step": 8349 + }, + { + "epoch": 4.067229620006495, + "grad_norm": 3.0203285217285156, + "learning_rate": 1.1899719370256274e-06, + "loss": 0.3873, + "step": 8350 + }, + { + "epoch": 4.067716791165963, + "grad_norm": 2.974085569381714, + "learning_rate": 1.1894242138101454e-06, + "loss": 0.4231, + "step": 8351 + }, + { + "epoch": 4.0682039623254305, + "grad_norm": 3.3959097862243652, + "learning_rate": 1.1888765773284466e-06, + "loss": 0.4249, + "step": 8352 + }, + { + "epoch": 4.068691133484898, + "grad_norm": 3.385319948196411, + "learning_rate": 1.188329027616775e-06, + "loss": 0.4515, + "step": 8353 + }, + { + "epoch": 4.069178304644365, + "grad_norm": 3.1220662593841553, + "learning_rate": 1.1877815647113653e-06, + "loss": 0.3866, + "step": 8354 + }, + { + "epoch": 4.069665475803832, + "grad_norm": 3.3266639709472656, + "learning_rate": 1.1872341886484494e-06, + "loss": 0.381, + "step": 8355 + }, + { + "epoch": 4.0701526469633, + "grad_norm": 3.1516811847686768, + "learning_rate": 1.1866868994642535e-06, + "loss": 0.4521, + "step": 8356 + }, + { + "epoch": 4.070639818122767, + "grad_norm": 3.25844144821167, + "learning_rate": 1.1861396971949957e-06, + "loss": 0.4108, + "step": 8357 + }, + { + "epoch": 4.071126989282234, + "grad_norm": 3.3341317176818848, + "learning_rate": 1.1855925818768902e-06, + "loss": 0.3382, + "step": 8358 + }, + { + "epoch": 4.071614160441702, + "grad_norm": 3.047839641571045, + "learning_rate": 1.1850455535461461e-06, + "loss": 0.3464, + "step": 8359 + }, + { + "epoch": 4.07210133160117, + "grad_norm": 2.970466136932373, + "learning_rate": 1.184498612238964e-06, + "loss": 0.3826, + "step": 8360 + }, + { + "epoch": 4.072588502760636, + "grad_norm": 3.152893304824829, + "learning_rate": 1.183951757991541e-06, + "loss": 0.3962, + "step": 8361 + }, + { + "epoch": 4.073075673920104, + "grad_norm": 2.8231711387634277, + "learning_rate": 1.1834049908400683e-06, + "loss": 0.392, + "step": 8362 + }, + { + "epoch": 4.073562845079572, + "grad_norm": 3.7843925952911377, + "learning_rate": 1.1828583108207316e-06, + "loss": 0.3513, + "step": 8363 + }, + { + "epoch": 4.074050016239038, + "grad_norm": 3.1935696601867676, + "learning_rate": 1.1823117179697082e-06, + "loss": 0.4006, + "step": 8364 + }, + { + "epoch": 4.074537187398506, + "grad_norm": 3.587054491043091, + "learning_rate": 1.181765212323173e-06, + "loss": 0.392, + "step": 8365 + }, + { + "epoch": 4.0750243585579735, + "grad_norm": 3.100947380065918, + "learning_rate": 1.1812187939172927e-06, + "loss": 0.4678, + "step": 8366 + }, + { + "epoch": 4.075511529717441, + "grad_norm": 3.829118013381958, + "learning_rate": 1.1806724627882313e-06, + "loss": 0.3524, + "step": 8367 + }, + { + "epoch": 4.075998700876908, + "grad_norm": 2.9629249572753906, + "learning_rate": 1.1801262189721422e-06, + "loss": 0.3799, + "step": 8368 + }, + { + "epoch": 4.0764858720363755, + "grad_norm": 3.540036201477051, + "learning_rate": 1.179580062505177e-06, + "loss": 0.4147, + "step": 8369 + }, + { + "epoch": 4.076973043195843, + "grad_norm": 3.1942710876464844, + "learning_rate": 1.1790339934234805e-06, + "loss": 0.422, + "step": 8370 + }, + { + "epoch": 4.07746021435531, + "grad_norm": 3.4825406074523926, + "learning_rate": 1.1784880117631922e-06, + "loss": 0.3884, + "step": 8371 + }, + { + "epoch": 4.077947385514777, + "grad_norm": 2.9001858234405518, + "learning_rate": 1.177942117560443e-06, + "loss": 0.3565, + "step": 8372 + }, + { + "epoch": 4.078434556674245, + "grad_norm": 2.7615466117858887, + "learning_rate": 1.1773963108513616e-06, + "loss": 0.4016, + "step": 8373 + }, + { + "epoch": 4.078921727833713, + "grad_norm": 3.0842854976654053, + "learning_rate": 1.1768505916720693e-06, + "loss": 0.4095, + "step": 8374 + }, + { + "epoch": 4.079408898993179, + "grad_norm": 3.248525857925415, + "learning_rate": 1.1763049600586823e-06, + "loss": 0.4821, + "step": 8375 + }, + { + "epoch": 4.079896070152647, + "grad_norm": 3.0368242263793945, + "learning_rate": 1.1757594160473091e-06, + "loss": 0.3499, + "step": 8376 + }, + { + "epoch": 4.080383241312115, + "grad_norm": 3.0918972492218018, + "learning_rate": 1.1752139596740541e-06, + "loss": 0.4205, + "step": 8377 + }, + { + "epoch": 4.080870412471581, + "grad_norm": 3.3321948051452637, + "learning_rate": 1.1746685909750164e-06, + "loss": 0.4108, + "step": 8378 + }, + { + "epoch": 4.081357583631049, + "grad_norm": 3.164921760559082, + "learning_rate": 1.1741233099862884e-06, + "loss": 0.4213, + "step": 8379 + }, + { + "epoch": 4.081844754790517, + "grad_norm": 3.854138135910034, + "learning_rate": 1.1735781167439555e-06, + "loss": 0.3924, + "step": 8380 + }, + { + "epoch": 4.082331925949984, + "grad_norm": 3.5060932636260986, + "learning_rate": 1.1730330112840993e-06, + "loss": 0.4211, + "step": 8381 + }, + { + "epoch": 4.082819097109451, + "grad_norm": 3.119168519973755, + "learning_rate": 1.1724879936427961e-06, + "loss": 0.3539, + "step": 8382 + }, + { + "epoch": 4.0833062682689185, + "grad_norm": 3.2269554138183594, + "learning_rate": 1.1719430638561125e-06, + "loss": 0.4156, + "step": 8383 + }, + { + "epoch": 4.083793439428386, + "grad_norm": 3.3457000255584717, + "learning_rate": 1.1713982219601132e-06, + "loss": 0.3564, + "step": 8384 + }, + { + "epoch": 4.084280610587853, + "grad_norm": 3.142763614654541, + "learning_rate": 1.1708534679908561e-06, + "loss": 0.3991, + "step": 8385 + }, + { + "epoch": 4.0847677817473205, + "grad_norm": 2.6918461322784424, + "learning_rate": 1.1703088019843936e-06, + "loss": 0.355, + "step": 8386 + }, + { + "epoch": 4.085254952906788, + "grad_norm": 3.0783278942108154, + "learning_rate": 1.1697642239767697e-06, + "loss": 0.408, + "step": 8387 + }, + { + "epoch": 4.085742124066256, + "grad_norm": 3.6875414848327637, + "learning_rate": 1.169219734004026e-06, + "loss": 0.3694, + "step": 8388 + }, + { + "epoch": 4.086229295225722, + "grad_norm": 3.194359302520752, + "learning_rate": 1.168675332102196e-06, + "loss": 0.4039, + "step": 8389 + }, + { + "epoch": 4.08671646638519, + "grad_norm": 2.9728763103485107, + "learning_rate": 1.1681310183073096e-06, + "loss": 0.3727, + "step": 8390 + }, + { + "epoch": 4.087203637544658, + "grad_norm": 2.7047042846679688, + "learning_rate": 1.1675867926553875e-06, + "loss": 0.3134, + "step": 8391 + }, + { + "epoch": 4.087690808704124, + "grad_norm": 3.113790273666382, + "learning_rate": 1.1670426551824473e-06, + "loss": 0.417, + "step": 8392 + }, + { + "epoch": 4.088177979863592, + "grad_norm": 3.3934853076934814, + "learning_rate": 1.1664986059245001e-06, + "loss": 0.4574, + "step": 8393 + }, + { + "epoch": 4.08866515102306, + "grad_norm": 3.3019344806671143, + "learning_rate": 1.1659546449175518e-06, + "loss": 0.423, + "step": 8394 + }, + { + "epoch": 4.089152322182527, + "grad_norm": 3.043473958969116, + "learning_rate": 1.1654107721976e-06, + "loss": 0.3487, + "step": 8395 + }, + { + "epoch": 4.089639493341994, + "grad_norm": 3.28444242477417, + "learning_rate": 1.1648669878006402e-06, + "loss": 0.3484, + "step": 8396 + }, + { + "epoch": 4.090126664501462, + "grad_norm": 3.0107839107513428, + "learning_rate": 1.1643232917626576e-06, + "loss": 0.3577, + "step": 8397 + }, + { + "epoch": 4.090613835660929, + "grad_norm": 3.225677728652954, + "learning_rate": 1.1637796841196353e-06, + "loss": 0.3957, + "step": 8398 + }, + { + "epoch": 4.091101006820396, + "grad_norm": 3.2678303718566895, + "learning_rate": 1.1632361649075499e-06, + "loss": 0.3837, + "step": 8399 + }, + { + "epoch": 4.0915881779798635, + "grad_norm": 3.0673489570617676, + "learning_rate": 1.1626927341623699e-06, + "loss": 0.3925, + "step": 8400 + }, + { + "epoch": 4.092075349139331, + "grad_norm": 3.0040764808654785, + "learning_rate": 1.1621493919200599e-06, + "loss": 0.3807, + "step": 8401 + }, + { + "epoch": 4.092562520298799, + "grad_norm": 3.1448700428009033, + "learning_rate": 1.1616061382165796e-06, + "loss": 0.3451, + "step": 8402 + }, + { + "epoch": 4.0930496914582655, + "grad_norm": 3.147313356399536, + "learning_rate": 1.1610629730878797e-06, + "loss": 0.3589, + "step": 8403 + }, + { + "epoch": 4.093536862617733, + "grad_norm": 2.958621025085449, + "learning_rate": 1.1605198965699074e-06, + "loss": 0.3346, + "step": 8404 + }, + { + "epoch": 4.094024033777201, + "grad_norm": 3.415128231048584, + "learning_rate": 1.1599769086986038e-06, + "loss": 0.3305, + "step": 8405 + }, + { + "epoch": 4.094511204936667, + "grad_norm": 2.8129024505615234, + "learning_rate": 1.1594340095099044e-06, + "loss": 0.4056, + "step": 8406 + }, + { + "epoch": 4.094998376096135, + "grad_norm": 3.259143829345703, + "learning_rate": 1.1588911990397364e-06, + "loss": 0.3949, + "step": 8407 + }, + { + "epoch": 4.095485547255603, + "grad_norm": 3.015232563018799, + "learning_rate": 1.158348477324024e-06, + "loss": 0.4284, + "step": 8408 + }, + { + "epoch": 4.095972718415069, + "grad_norm": 3.3853695392608643, + "learning_rate": 1.1578058443986844e-06, + "loss": 0.3897, + "step": 8409 + }, + { + "epoch": 4.096459889574537, + "grad_norm": 3.3523459434509277, + "learning_rate": 1.15726330029963e-06, + "loss": 0.4454, + "step": 8410 + }, + { + "epoch": 4.096947060734005, + "grad_norm": 3.2732796669006348, + "learning_rate": 1.1567208450627645e-06, + "loss": 0.4109, + "step": 8411 + }, + { + "epoch": 4.097434231893472, + "grad_norm": 3.3436989784240723, + "learning_rate": 1.1561784787239882e-06, + "loss": 0.3672, + "step": 8412 + }, + { + "epoch": 4.097921403052939, + "grad_norm": 3.2620456218719482, + "learning_rate": 1.1556362013191952e-06, + "loss": 0.3913, + "step": 8413 + }, + { + "epoch": 4.098408574212407, + "grad_norm": 3.604508876800537, + "learning_rate": 1.155094012884274e-06, + "loss": 0.3522, + "step": 8414 + }, + { + "epoch": 4.098895745371874, + "grad_norm": 3.0812325477600098, + "learning_rate": 1.1545519134551054e-06, + "loss": 0.4181, + "step": 8415 + }, + { + "epoch": 4.099382916531342, + "grad_norm": 3.1518287658691406, + "learning_rate": 1.1540099030675652e-06, + "loss": 0.4307, + "step": 8416 + }, + { + "epoch": 4.0998700876908085, + "grad_norm": 2.9551901817321777, + "learning_rate": 1.1534679817575245e-06, + "loss": 0.3749, + "step": 8417 + }, + { + "epoch": 4.100357258850276, + "grad_norm": 3.1945250034332275, + "learning_rate": 1.1529261495608486e-06, + "loss": 0.4008, + "step": 8418 + }, + { + "epoch": 4.100844430009744, + "grad_norm": 2.9644622802734375, + "learning_rate": 1.1523844065133932e-06, + "loss": 0.3333, + "step": 8419 + }, + { + "epoch": 4.1013316011692105, + "grad_norm": 2.879148483276367, + "learning_rate": 1.1518427526510126e-06, + "loss": 0.3359, + "step": 8420 + }, + { + "epoch": 4.101818772328678, + "grad_norm": 3.19447660446167, + "learning_rate": 1.1513011880095532e-06, + "loss": 0.3466, + "step": 8421 + }, + { + "epoch": 4.102305943488146, + "grad_norm": 2.8573882579803467, + "learning_rate": 1.1507597126248562e-06, + "loss": 0.4092, + "step": 8422 + }, + { + "epoch": 4.102793114647612, + "grad_norm": 3.215305805206299, + "learning_rate": 1.1502183265327551e-06, + "loss": 0.3534, + "step": 8423 + }, + { + "epoch": 4.10328028580708, + "grad_norm": 2.940485715866089, + "learning_rate": 1.1496770297690796e-06, + "loss": 0.4272, + "step": 8424 + }, + { + "epoch": 4.103767456966548, + "grad_norm": 2.9573757648468018, + "learning_rate": 1.1491358223696522e-06, + "loss": 0.4144, + "step": 8425 + }, + { + "epoch": 4.104254628126015, + "grad_norm": 3.3718225955963135, + "learning_rate": 1.1485947043702914e-06, + "loss": 0.4176, + "step": 8426 + }, + { + "epoch": 4.104741799285482, + "grad_norm": 3.463449716567993, + "learning_rate": 1.1480536758068061e-06, + "loss": 0.3791, + "step": 8427 + }, + { + "epoch": 4.10522897044495, + "grad_norm": 3.136906862258911, + "learning_rate": 1.147512736715003e-06, + "loss": 0.3836, + "step": 8428 + }, + { + "epoch": 4.105716141604417, + "grad_norm": 2.9594597816467285, + "learning_rate": 1.1469718871306806e-06, + "loss": 0.3987, + "step": 8429 + }, + { + "epoch": 4.106203312763884, + "grad_norm": 4.123192310333252, + "learning_rate": 1.146431127089634e-06, + "loss": 0.3857, + "step": 8430 + }, + { + "epoch": 4.106690483923352, + "grad_norm": 3.02433180809021, + "learning_rate": 1.1458904566276482e-06, + "loss": 0.3732, + "step": 8431 + }, + { + "epoch": 4.107177655082819, + "grad_norm": 3.088063955307007, + "learning_rate": 1.145349875780506e-06, + "loss": 0.4067, + "step": 8432 + }, + { + "epoch": 4.107664826242287, + "grad_norm": 3.3165829181671143, + "learning_rate": 1.1448093845839837e-06, + "loss": 0.3921, + "step": 8433 + }, + { + "epoch": 4.1081519974017535, + "grad_norm": 2.9583704471588135, + "learning_rate": 1.1442689830738492e-06, + "loss": 0.3397, + "step": 8434 + }, + { + "epoch": 4.108639168561221, + "grad_norm": 3.1565797328948975, + "learning_rate": 1.1437286712858671e-06, + "loss": 0.3394, + "step": 8435 + }, + { + "epoch": 4.109126339720689, + "grad_norm": 2.873206615447998, + "learning_rate": 1.1431884492557957e-06, + "loss": 0.385, + "step": 8436 + }, + { + "epoch": 4.1096135108801555, + "grad_norm": 3.307924747467041, + "learning_rate": 1.142648317019387e-06, + "loss": 0.4094, + "step": 8437 + }, + { + "epoch": 4.110100682039623, + "grad_norm": 3.1571221351623535, + "learning_rate": 1.1421082746123851e-06, + "loss": 0.3263, + "step": 8438 + }, + { + "epoch": 4.110587853199091, + "grad_norm": 3.059178113937378, + "learning_rate": 1.1415683220705315e-06, + "loss": 0.3499, + "step": 8439 + }, + { + "epoch": 4.111075024358558, + "grad_norm": 2.9044857025146484, + "learning_rate": 1.1410284594295607e-06, + "loss": 0.3733, + "step": 8440 + }, + { + "epoch": 4.111562195518025, + "grad_norm": 3.267821788787842, + "learning_rate": 1.1404886867251989e-06, + "loss": 0.3865, + "step": 8441 + }, + { + "epoch": 4.112049366677493, + "grad_norm": 3.446633815765381, + "learning_rate": 1.1399490039931693e-06, + "loss": 0.4123, + "step": 8442 + }, + { + "epoch": 4.11253653783696, + "grad_norm": 3.069662094116211, + "learning_rate": 1.139409411269189e-06, + "loss": 0.3975, + "step": 8443 + }, + { + "epoch": 4.113023708996427, + "grad_norm": 3.40513014793396, + "learning_rate": 1.138869908588966e-06, + "loss": 0.4286, + "step": 8444 + }, + { + "epoch": 4.113510880155895, + "grad_norm": 3.32737398147583, + "learning_rate": 1.1383304959882066e-06, + "loss": 0.4226, + "step": 8445 + }, + { + "epoch": 4.113998051315362, + "grad_norm": 3.3737685680389404, + "learning_rate": 1.1377911735026073e-06, + "loss": 0.3973, + "step": 8446 + }, + { + "epoch": 4.11448522247483, + "grad_norm": 2.8607823848724365, + "learning_rate": 1.1372519411678615e-06, + "loss": 0.3964, + "step": 8447 + }, + { + "epoch": 4.114972393634297, + "grad_norm": 2.931837320327759, + "learning_rate": 1.136712799019655e-06, + "loss": 0.3903, + "step": 8448 + }, + { + "epoch": 4.115459564793764, + "grad_norm": 3.5161917209625244, + "learning_rate": 1.1361737470936697e-06, + "loss": 0.4415, + "step": 8449 + }, + { + "epoch": 4.115946735953232, + "grad_norm": 3.549802780151367, + "learning_rate": 1.1356347854255778e-06, + "loss": 0.4303, + "step": 8450 + }, + { + "epoch": 4.1164339071126985, + "grad_norm": 3.139012098312378, + "learning_rate": 1.1350959140510487e-06, + "loss": 0.4095, + "step": 8451 + }, + { + "epoch": 4.116921078272166, + "grad_norm": 3.3332502841949463, + "learning_rate": 1.1345571330057452e-06, + "loss": 0.3632, + "step": 8452 + }, + { + "epoch": 4.117408249431634, + "grad_norm": 2.852170944213867, + "learning_rate": 1.1340184423253242e-06, + "loss": 0.3738, + "step": 8453 + }, + { + "epoch": 4.117895420591101, + "grad_norm": 3.2234280109405518, + "learning_rate": 1.1334798420454344e-06, + "loss": 0.454, + "step": 8454 + }, + { + "epoch": 4.118382591750568, + "grad_norm": 3.0468292236328125, + "learning_rate": 1.1329413322017215e-06, + "loss": 0.3603, + "step": 8455 + }, + { + "epoch": 4.118869762910036, + "grad_norm": 3.1397368907928467, + "learning_rate": 1.1324029128298241e-06, + "loss": 0.4272, + "step": 8456 + }, + { + "epoch": 4.119356934069503, + "grad_norm": 3.4431703090667725, + "learning_rate": 1.1318645839653752e-06, + "loss": 0.3651, + "step": 8457 + }, + { + "epoch": 4.11984410522897, + "grad_norm": 2.887991428375244, + "learning_rate": 1.1313263456440002e-06, + "loss": 0.3854, + "step": 8458 + }, + { + "epoch": 4.120331276388438, + "grad_norm": 3.485877752304077, + "learning_rate": 1.13078819790132e-06, + "loss": 0.379, + "step": 8459 + }, + { + "epoch": 4.120818447547905, + "grad_norm": 2.8319389820098877, + "learning_rate": 1.1302501407729493e-06, + "loss": 0.3766, + "step": 8460 + }, + { + "epoch": 4.121305618707373, + "grad_norm": 3.3738701343536377, + "learning_rate": 1.1297121742944978e-06, + "loss": 0.4317, + "step": 8461 + }, + { + "epoch": 4.12179278986684, + "grad_norm": 3.0458385944366455, + "learning_rate": 1.1291742985015658e-06, + "loss": 0.3714, + "step": 8462 + }, + { + "epoch": 4.122279961026307, + "grad_norm": 3.1683056354522705, + "learning_rate": 1.1286365134297514e-06, + "loss": 0.3674, + "step": 8463 + }, + { + "epoch": 4.122767132185775, + "grad_norm": 3.104687452316284, + "learning_rate": 1.1280988191146447e-06, + "loss": 0.4224, + "step": 8464 + }, + { + "epoch": 4.123254303345242, + "grad_norm": 2.9391844272613525, + "learning_rate": 1.1275612155918315e-06, + "loss": 0.4119, + "step": 8465 + }, + { + "epoch": 4.123741474504709, + "grad_norm": 3.280057907104492, + "learning_rate": 1.1270237028968878e-06, + "loss": 0.3508, + "step": 8466 + }, + { + "epoch": 4.124228645664177, + "grad_norm": 3.363783359527588, + "learning_rate": 1.1264862810653882e-06, + "loss": 0.3934, + "step": 8467 + }, + { + "epoch": 4.124715816823644, + "grad_norm": 3.1314473152160645, + "learning_rate": 1.1259489501328982e-06, + "loss": 0.363, + "step": 8468 + }, + { + "epoch": 4.125202987983111, + "grad_norm": 3.279029130935669, + "learning_rate": 1.1254117101349798e-06, + "loss": 0.3845, + "step": 8469 + }, + { + "epoch": 4.125690159142579, + "grad_norm": 3.2133004665374756, + "learning_rate": 1.1248745611071856e-06, + "loss": 0.3209, + "step": 8470 + }, + { + "epoch": 4.126177330302046, + "grad_norm": 3.222562074661255, + "learning_rate": 1.1243375030850647e-06, + "loss": 0.3571, + "step": 8471 + }, + { + "epoch": 4.126664501461513, + "grad_norm": 3.237651824951172, + "learning_rate": 1.1238005361041598e-06, + "loss": 0.3793, + "step": 8472 + }, + { + "epoch": 4.127151672620981, + "grad_norm": 2.955432891845703, + "learning_rate": 1.1232636602000082e-06, + "loss": 0.3433, + "step": 8473 + }, + { + "epoch": 4.127638843780448, + "grad_norm": 2.763636589050293, + "learning_rate": 1.1227268754081386e-06, + "loss": 0.3837, + "step": 8474 + }, + { + "epoch": 4.128126014939916, + "grad_norm": 3.077815532684326, + "learning_rate": 1.122190181764076e-06, + "loss": 0.4112, + "step": 8475 + }, + { + "epoch": 4.128613186099383, + "grad_norm": 3.398352861404419, + "learning_rate": 1.1216535793033389e-06, + "loss": 0.373, + "step": 8476 + }, + { + "epoch": 4.12910035725885, + "grad_norm": 3.2515106201171875, + "learning_rate": 1.1211170680614406e-06, + "loss": 0.3534, + "step": 8477 + }, + { + "epoch": 4.129587528418318, + "grad_norm": 2.9030582904815674, + "learning_rate": 1.1205806480738852e-06, + "loss": 0.4311, + "step": 8478 + }, + { + "epoch": 4.130074699577785, + "grad_norm": 3.3827459812164307, + "learning_rate": 1.1200443193761743e-06, + "loss": 0.381, + "step": 8479 + }, + { + "epoch": 4.130561870737252, + "grad_norm": 3.3118093013763428, + "learning_rate": 1.1195080820038014e-06, + "loss": 0.3953, + "step": 8480 + }, + { + "epoch": 4.13104904189672, + "grad_norm": 3.4457178115844727, + "learning_rate": 1.1189719359922565e-06, + "loss": 0.4119, + "step": 8481 + }, + { + "epoch": 4.1315362130561875, + "grad_norm": 3.4089279174804688, + "learning_rate": 1.118435881377019e-06, + "loss": 0.3835, + "step": 8482 + }, + { + "epoch": 4.132023384215654, + "grad_norm": 3.2830722332000732, + "learning_rate": 1.1178999181935674e-06, + "loss": 0.3634, + "step": 8483 + }, + { + "epoch": 4.132510555375122, + "grad_norm": 3.1980581283569336, + "learning_rate": 1.1173640464773695e-06, + "loss": 0.4032, + "step": 8484 + }, + { + "epoch": 4.132997726534589, + "grad_norm": 3.483591318130493, + "learning_rate": 1.1168282662638902e-06, + "loss": 0.4448, + "step": 8485 + }, + { + "epoch": 4.133484897694056, + "grad_norm": 3.208993434906006, + "learning_rate": 1.1162925775885882e-06, + "loss": 0.4437, + "step": 8486 + }, + { + "epoch": 4.133972068853524, + "grad_norm": 3.5400335788726807, + "learning_rate": 1.115756980486914e-06, + "loss": 0.345, + "step": 8487 + }, + { + "epoch": 4.134459240012991, + "grad_norm": 3.02382493019104, + "learning_rate": 1.1152214749943139e-06, + "loss": 0.375, + "step": 8488 + }, + { + "epoch": 4.134946411172459, + "grad_norm": 3.1585230827331543, + "learning_rate": 1.1146860611462284e-06, + "loss": 0.3765, + "step": 8489 + }, + { + "epoch": 4.135433582331926, + "grad_norm": 3.193577766418457, + "learning_rate": 1.1141507389780896e-06, + "loss": 0.4066, + "step": 8490 + }, + { + "epoch": 4.135920753491393, + "grad_norm": 3.331866502761841, + "learning_rate": 1.1136155085253255e-06, + "loss": 0.377, + "step": 8491 + }, + { + "epoch": 4.136407924650861, + "grad_norm": 3.315417528152466, + "learning_rate": 1.1130803698233593e-06, + "loss": 0.3994, + "step": 8492 + }, + { + "epoch": 4.136895095810328, + "grad_norm": 3.203831911087036, + "learning_rate": 1.1125453229076039e-06, + "loss": 0.4367, + "step": 8493 + }, + { + "epoch": 4.137382266969795, + "grad_norm": 3.6465601921081543, + "learning_rate": 1.1120103678134698e-06, + "loss": 0.3575, + "step": 8494 + }, + { + "epoch": 4.137869438129263, + "grad_norm": 2.835165023803711, + "learning_rate": 1.1114755045763603e-06, + "loss": 0.3223, + "step": 8495 + }, + { + "epoch": 4.1383566092887305, + "grad_norm": 2.9712507724761963, + "learning_rate": 1.1109407332316735e-06, + "loss": 0.4408, + "step": 8496 + }, + { + "epoch": 4.138843780448197, + "grad_norm": 3.5708274841308594, + "learning_rate": 1.110406053814799e-06, + "loss": 0.3938, + "step": 8497 + }, + { + "epoch": 4.139330951607665, + "grad_norm": 2.9879634380340576, + "learning_rate": 1.1098714663611221e-06, + "loss": 0.3982, + "step": 8498 + }, + { + "epoch": 4.1398181227671325, + "grad_norm": 3.4535579681396484, + "learning_rate": 1.1093369709060222e-06, + "loss": 0.3702, + "step": 8499 + }, + { + "epoch": 4.140305293926599, + "grad_norm": 3.0769104957580566, + "learning_rate": 1.1088025674848732e-06, + "loss": 0.3731, + "step": 8500 + }, + { + "epoch": 4.140792465086067, + "grad_norm": 3.0217597484588623, + "learning_rate": 1.1082682561330395e-06, + "loss": 0.3534, + "step": 8501 + }, + { + "epoch": 4.141279636245534, + "grad_norm": 2.9592761993408203, + "learning_rate": 1.1077340368858832e-06, + "loss": 0.3631, + "step": 8502 + }, + { + "epoch": 4.141766807405002, + "grad_norm": 2.970829486846924, + "learning_rate": 1.1071999097787589e-06, + "loss": 0.3807, + "step": 8503 + }, + { + "epoch": 4.142253978564469, + "grad_norm": 3.261934995651245, + "learning_rate": 1.1066658748470154e-06, + "loss": 0.4163, + "step": 8504 + }, + { + "epoch": 4.142741149723936, + "grad_norm": 3.6208572387695312, + "learning_rate": 1.106131932125994e-06, + "loss": 0.4196, + "step": 8505 + }, + { + "epoch": 4.143228320883404, + "grad_norm": 3.129642963409424, + "learning_rate": 1.1055980816510314e-06, + "loss": 0.3778, + "step": 8506 + }, + { + "epoch": 4.143715492042871, + "grad_norm": 3.290464401245117, + "learning_rate": 1.1050643234574578e-06, + "loss": 0.3245, + "step": 8507 + }, + { + "epoch": 4.144202663202338, + "grad_norm": 2.9634594917297363, + "learning_rate": 1.1045306575805984e-06, + "loss": 0.3963, + "step": 8508 + }, + { + "epoch": 4.144689834361806, + "grad_norm": 3.234792709350586, + "learning_rate": 1.103997084055769e-06, + "loss": 0.3614, + "step": 8509 + }, + { + "epoch": 4.1451770055212735, + "grad_norm": 3.2967939376831055, + "learning_rate": 1.103463602918283e-06, + "loss": 0.4451, + "step": 8510 + }, + { + "epoch": 4.14566417668074, + "grad_norm": 3.527130365371704, + "learning_rate": 1.1029302142034454e-06, + "loss": 0.3848, + "step": 8511 + }, + { + "epoch": 4.146151347840208, + "grad_norm": 3.0866856575012207, + "learning_rate": 1.1023969179465572e-06, + "loss": 0.3388, + "step": 8512 + }, + { + "epoch": 4.1466385189996755, + "grad_norm": 3.2444770336151123, + "learning_rate": 1.1018637141829096e-06, + "loss": 0.4179, + "step": 8513 + }, + { + "epoch": 4.147125690159142, + "grad_norm": 3.613766670227051, + "learning_rate": 1.1013306029477913e-06, + "loss": 0.4283, + "step": 8514 + }, + { + "epoch": 4.14761286131861, + "grad_norm": 3.5092031955718994, + "learning_rate": 1.1007975842764837e-06, + "loss": 0.4168, + "step": 8515 + }, + { + "epoch": 4.1481000324780775, + "grad_norm": 3.18487286567688, + "learning_rate": 1.1002646582042626e-06, + "loss": 0.3132, + "step": 8516 + }, + { + "epoch": 4.148587203637545, + "grad_norm": 3.184117317199707, + "learning_rate": 1.0997318247663948e-06, + "loss": 0.3837, + "step": 8517 + }, + { + "epoch": 4.149074374797012, + "grad_norm": 3.2931323051452637, + "learning_rate": 1.0991990839981448e-06, + "loss": 0.4302, + "step": 8518 + }, + { + "epoch": 4.149561545956479, + "grad_norm": 3.3346340656280518, + "learning_rate": 1.0986664359347687e-06, + "loss": 0.3954, + "step": 8519 + }, + { + "epoch": 4.150048717115947, + "grad_norm": 3.5521552562713623, + "learning_rate": 1.0981338806115186e-06, + "loss": 0.3865, + "step": 8520 + }, + { + "epoch": 4.150535888275414, + "grad_norm": 3.5711047649383545, + "learning_rate": 1.0976014180636367e-06, + "loss": 0.3771, + "step": 8521 + }, + { + "epoch": 4.151023059434881, + "grad_norm": 3.141906261444092, + "learning_rate": 1.0970690483263624e-06, + "loss": 0.3665, + "step": 8522 + }, + { + "epoch": 4.151510230594349, + "grad_norm": 2.871859550476074, + "learning_rate": 1.0965367714349276e-06, + "loss": 0.4325, + "step": 8523 + }, + { + "epoch": 4.151997401753817, + "grad_norm": 3.2228829860687256, + "learning_rate": 1.0960045874245598e-06, + "loss": 0.3252, + "step": 8524 + }, + { + "epoch": 4.152484572913283, + "grad_norm": 2.978046417236328, + "learning_rate": 1.0954724963304766e-06, + "loss": 0.3591, + "step": 8525 + }, + { + "epoch": 4.152971744072751, + "grad_norm": 2.9903507232666016, + "learning_rate": 1.0949404981878927e-06, + "loss": 0.3695, + "step": 8526 + }, + { + "epoch": 4.1534589152322186, + "grad_norm": 3.1004538536071777, + "learning_rate": 1.094408593032017e-06, + "loss": 0.3747, + "step": 8527 + }, + { + "epoch": 4.153946086391685, + "grad_norm": 3.207862377166748, + "learning_rate": 1.0938767808980485e-06, + "loss": 0.3364, + "step": 8528 + }, + { + "epoch": 4.154433257551153, + "grad_norm": 3.064539909362793, + "learning_rate": 1.093345061821184e-06, + "loss": 0.4081, + "step": 8529 + }, + { + "epoch": 4.1549204287106205, + "grad_norm": 3.041660785675049, + "learning_rate": 1.092813435836613e-06, + "loss": 0.3617, + "step": 8530 + }, + { + "epoch": 4.155407599870088, + "grad_norm": 3.1204824447631836, + "learning_rate": 1.0922819029795168e-06, + "loss": 0.3488, + "step": 8531 + }, + { + "epoch": 4.155894771029555, + "grad_norm": 3.47397518157959, + "learning_rate": 1.0917504632850743e-06, + "loss": 0.4065, + "step": 8532 + }, + { + "epoch": 4.1563819421890225, + "grad_norm": 3.275855779647827, + "learning_rate": 1.091219116788454e-06, + "loss": 0.4418, + "step": 8533 + }, + { + "epoch": 4.15686911334849, + "grad_norm": 3.0637364387512207, + "learning_rate": 1.0906878635248213e-06, + "loss": 0.3705, + "step": 8534 + }, + { + "epoch": 4.157356284507957, + "grad_norm": 3.4373435974121094, + "learning_rate": 1.0901567035293356e-06, + "loss": 0.3675, + "step": 8535 + }, + { + "epoch": 4.157843455667424, + "grad_norm": 2.776958465576172, + "learning_rate": 1.089625636837147e-06, + "loss": 0.3846, + "step": 8536 + }, + { + "epoch": 4.158330626826892, + "grad_norm": 3.5244553089141846, + "learning_rate": 1.0890946634834024e-06, + "loss": 0.3309, + "step": 8537 + }, + { + "epoch": 4.158817797986359, + "grad_norm": 3.2065961360931396, + "learning_rate": 1.0885637835032416e-06, + "loss": 0.4609, + "step": 8538 + }, + { + "epoch": 4.159304969145826, + "grad_norm": 3.8282101154327393, + "learning_rate": 1.088032996931799e-06, + "loss": 0.3585, + "step": 8539 + }, + { + "epoch": 4.159792140305294, + "grad_norm": 2.9479079246520996, + "learning_rate": 1.0875023038042004e-06, + "loss": 0.4306, + "step": 8540 + }, + { + "epoch": 4.160279311464762, + "grad_norm": 3.974430561065674, + "learning_rate": 1.0869717041555678e-06, + "loss": 0.3651, + "step": 8541 + }, + { + "epoch": 4.160766482624228, + "grad_norm": 3.2122836112976074, + "learning_rate": 1.0864411980210162e-06, + "loss": 0.4141, + "step": 8542 + }, + { + "epoch": 4.161253653783696, + "grad_norm": 3.1042845249176025, + "learning_rate": 1.0859107854356554e-06, + "loss": 0.4194, + "step": 8543 + }, + { + "epoch": 4.1617408249431636, + "grad_norm": 2.9792730808258057, + "learning_rate": 1.0853804664345862e-06, + "loss": 0.3541, + "step": 8544 + }, + { + "epoch": 4.162227996102631, + "grad_norm": 3.314063549041748, + "learning_rate": 1.084850241052906e-06, + "loss": 0.4141, + "step": 8545 + }, + { + "epoch": 4.162715167262098, + "grad_norm": 3.195225715637207, + "learning_rate": 1.0843201093257053e-06, + "loss": 0.3603, + "step": 8546 + }, + { + "epoch": 4.1632023384215655, + "grad_norm": 3.092599868774414, + "learning_rate": 1.0837900712880687e-06, + "loss": 0.395, + "step": 8547 + }, + { + "epoch": 4.163689509581033, + "grad_norm": 3.400092840194702, + "learning_rate": 1.0832601269750722e-06, + "loss": 0.3893, + "step": 8548 + }, + { + "epoch": 4.1641766807405, + "grad_norm": 3.0033700466156006, + "learning_rate": 1.0827302764217887e-06, + "loss": 0.3762, + "step": 8549 + }, + { + "epoch": 4.1646638518999675, + "grad_norm": 3.2795732021331787, + "learning_rate": 1.0822005196632836e-06, + "loss": 0.4015, + "step": 8550 + }, + { + "epoch": 4.165151023059435, + "grad_norm": 2.953052043914795, + "learning_rate": 1.0816708567346168e-06, + "loss": 0.4177, + "step": 8551 + }, + { + "epoch": 4.165638194218902, + "grad_norm": 3.3339388370513916, + "learning_rate": 1.08114128767084e-06, + "loss": 0.3658, + "step": 8552 + }, + { + "epoch": 4.166125365378369, + "grad_norm": 2.9502923488616943, + "learning_rate": 1.0806118125070006e-06, + "loss": 0.3678, + "step": 8553 + }, + { + "epoch": 4.166612536537837, + "grad_norm": 3.623206377029419, + "learning_rate": 1.080082431278139e-06, + "loss": 0.3545, + "step": 8554 + }, + { + "epoch": 4.167099707697305, + "grad_norm": 2.879397392272949, + "learning_rate": 1.079553144019291e-06, + "loss": 0.4048, + "step": 8555 + }, + { + "epoch": 4.167586878856771, + "grad_norm": 3.1303203105926514, + "learning_rate": 1.079023950765483e-06, + "loss": 0.3614, + "step": 8556 + }, + { + "epoch": 4.168074050016239, + "grad_norm": 3.204596996307373, + "learning_rate": 1.0784948515517371e-06, + "loss": 0.3463, + "step": 8557 + }, + { + "epoch": 4.168561221175707, + "grad_norm": 2.974503993988037, + "learning_rate": 1.0779658464130699e-06, + "loss": 0.3618, + "step": 8558 + }, + { + "epoch": 4.169048392335174, + "grad_norm": 2.9843037128448486, + "learning_rate": 1.0774369353844912e-06, + "loss": 0.4364, + "step": 8559 + }, + { + "epoch": 4.169535563494641, + "grad_norm": 3.1742029190063477, + "learning_rate": 1.076908118501003e-06, + "loss": 0.3915, + "step": 8560 + }, + { + "epoch": 4.1700227346541086, + "grad_norm": 2.8825643062591553, + "learning_rate": 1.076379395797603e-06, + "loss": 0.3949, + "step": 8561 + }, + { + "epoch": 4.170509905813576, + "grad_norm": 3.4071648120880127, + "learning_rate": 1.075850767309282e-06, + "loss": 0.3792, + "step": 8562 + }, + { + "epoch": 4.170997076973043, + "grad_norm": 3.4733877182006836, + "learning_rate": 1.0753222330710254e-06, + "loss": 0.4282, + "step": 8563 + }, + { + "epoch": 4.1714842481325105, + "grad_norm": 3.4503724575042725, + "learning_rate": 1.0747937931178095e-06, + "loss": 0.3838, + "step": 8564 + }, + { + "epoch": 4.171971419291978, + "grad_norm": 3.0645062923431396, + "learning_rate": 1.0742654474846082e-06, + "loss": 0.3843, + "step": 8565 + }, + { + "epoch": 4.172458590451445, + "grad_norm": 3.460885763168335, + "learning_rate": 1.0737371962063866e-06, + "loss": 0.4152, + "step": 8566 + }, + { + "epoch": 4.1729457616109125, + "grad_norm": 3.070725440979004, + "learning_rate": 1.0732090393181056e-06, + "loss": 0.3971, + "step": 8567 + }, + { + "epoch": 4.17343293277038, + "grad_norm": 3.414149761199951, + "learning_rate": 1.0726809768547163e-06, + "loss": 0.3892, + "step": 8568 + }, + { + "epoch": 4.173920103929848, + "grad_norm": 3.204510450363159, + "learning_rate": 1.0721530088511676e-06, + "loss": 0.3935, + "step": 8569 + }, + { + "epoch": 4.174407275089314, + "grad_norm": 3.3716859817504883, + "learning_rate": 1.0716251353424e-06, + "loss": 0.3891, + "step": 8570 + }, + { + "epoch": 4.174894446248782, + "grad_norm": 3.615518808364868, + "learning_rate": 1.0710973563633476e-06, + "loss": 0.4032, + "step": 8571 + }, + { + "epoch": 4.17538161740825, + "grad_norm": 3.1614580154418945, + "learning_rate": 1.0705696719489388e-06, + "loss": 0.3666, + "step": 8572 + }, + { + "epoch": 4.175868788567716, + "grad_norm": 3.142975091934204, + "learning_rate": 1.070042082134097e-06, + "loss": 0.4297, + "step": 8573 + }, + { + "epoch": 4.176355959727184, + "grad_norm": 3.178184747695923, + "learning_rate": 1.0695145869537363e-06, + "loss": 0.3806, + "step": 8574 + }, + { + "epoch": 4.176843130886652, + "grad_norm": 3.1968793869018555, + "learning_rate": 1.0689871864427667e-06, + "loss": 0.3811, + "step": 8575 + }, + { + "epoch": 4.177330302046119, + "grad_norm": 3.1121604442596436, + "learning_rate": 1.068459880636093e-06, + "loss": 0.4337, + "step": 8576 + }, + { + "epoch": 4.177817473205586, + "grad_norm": 3.375716209411621, + "learning_rate": 1.06793266956861e-06, + "loss": 0.3826, + "step": 8577 + }, + { + "epoch": 4.1783046443650536, + "grad_norm": 3.0884363651275635, + "learning_rate": 1.0674055532752098e-06, + "loss": 0.3399, + "step": 8578 + }, + { + "epoch": 4.178791815524521, + "grad_norm": 2.8653745651245117, + "learning_rate": 1.0668785317907773e-06, + "loss": 0.4169, + "step": 8579 + }, + { + "epoch": 4.179278986683988, + "grad_norm": 3.218125820159912, + "learning_rate": 1.0663516051501895e-06, + "loss": 0.3074, + "step": 8580 + }, + { + "epoch": 4.1797661578434555, + "grad_norm": 3.0520708560943604, + "learning_rate": 1.0658247733883189e-06, + "loss": 0.3809, + "step": 8581 + }, + { + "epoch": 4.180253329002923, + "grad_norm": 3.0224244594573975, + "learning_rate": 1.0652980365400312e-06, + "loss": 0.4517, + "step": 8582 + }, + { + "epoch": 4.180740500162391, + "grad_norm": 3.3931920528411865, + "learning_rate": 1.0647713946401866e-06, + "loss": 0.3588, + "step": 8583 + }, + { + "epoch": 4.1812276713218575, + "grad_norm": 3.2184622287750244, + "learning_rate": 1.0642448477236366e-06, + "loss": 0.3578, + "step": 8584 + }, + { + "epoch": 4.181714842481325, + "grad_norm": 3.3802666664123535, + "learning_rate": 1.0637183958252293e-06, + "loss": 0.4459, + "step": 8585 + }, + { + "epoch": 4.182202013640793, + "grad_norm": 4.059115886688232, + "learning_rate": 1.0631920389798043e-06, + "loss": 0.4116, + "step": 8586 + }, + { + "epoch": 4.182689184800259, + "grad_norm": 3.18074107170105, + "learning_rate": 1.0626657772221973e-06, + "loss": 0.3781, + "step": 8587 + }, + { + "epoch": 4.183176355959727, + "grad_norm": 3.0214552879333496, + "learning_rate": 1.0621396105872347e-06, + "loss": 0.3667, + "step": 8588 + }, + { + "epoch": 4.183663527119195, + "grad_norm": 2.872102737426758, + "learning_rate": 1.0616135391097386e-06, + "loss": 0.3756, + "step": 8589 + }, + { + "epoch": 4.184150698278662, + "grad_norm": 3.3607826232910156, + "learning_rate": 1.0610875628245257e-06, + "loss": 0.4138, + "step": 8590 + }, + { + "epoch": 4.184637869438129, + "grad_norm": 3.494154214859009, + "learning_rate": 1.0605616817664027e-06, + "loss": 0.4058, + "step": 8591 + }, + { + "epoch": 4.185125040597597, + "grad_norm": 3.3295962810516357, + "learning_rate": 1.0600358959701736e-06, + "loss": 0.3957, + "step": 8592 + }, + { + "epoch": 4.185612211757064, + "grad_norm": 3.485707998275757, + "learning_rate": 1.0595102054706348e-06, + "loss": 0.3699, + "step": 8593 + }, + { + "epoch": 4.186099382916531, + "grad_norm": 3.1290369033813477, + "learning_rate": 1.0589846103025776e-06, + "loss": 0.3804, + "step": 8594 + }, + { + "epoch": 4.186586554075999, + "grad_norm": 3.2568202018737793, + "learning_rate": 1.0584591105007835e-06, + "loss": 0.4851, + "step": 8595 + }, + { + "epoch": 4.187073725235466, + "grad_norm": 3.921837329864502, + "learning_rate": 1.0579337061000314e-06, + "loss": 0.4141, + "step": 8596 + }, + { + "epoch": 4.187560896394934, + "grad_norm": 3.421656847000122, + "learning_rate": 1.0574083971350923e-06, + "loss": 0.3874, + "step": 8597 + }, + { + "epoch": 4.1880480675544005, + "grad_norm": 3.3157596588134766, + "learning_rate": 1.056883183640732e-06, + "loss": 0.4182, + "step": 8598 + }, + { + "epoch": 4.188535238713868, + "grad_norm": 3.660240650177002, + "learning_rate": 1.0563580656517076e-06, + "loss": 0.4491, + "step": 8599 + }, + { + "epoch": 4.189022409873336, + "grad_norm": 3.5081543922424316, + "learning_rate": 1.0558330432027717e-06, + "loss": 0.3628, + "step": 8600 + }, + { + "epoch": 4.1895095810328025, + "grad_norm": 2.980380058288574, + "learning_rate": 1.0553081163286705e-06, + "loss": 0.3399, + "step": 8601 + }, + { + "epoch": 4.18999675219227, + "grad_norm": 3.0029428005218506, + "learning_rate": 1.0547832850641446e-06, + "loss": 0.3491, + "step": 8602 + }, + { + "epoch": 4.190483923351738, + "grad_norm": 2.9399235248565674, + "learning_rate": 1.0542585494439254e-06, + "loss": 0.3191, + "step": 8603 + }, + { + "epoch": 4.190971094511205, + "grad_norm": 3.205719232559204, + "learning_rate": 1.0537339095027408e-06, + "loss": 0.3542, + "step": 8604 + }, + { + "epoch": 4.191458265670672, + "grad_norm": 2.978379249572754, + "learning_rate": 1.0532093652753115e-06, + "loss": 0.391, + "step": 8605 + }, + { + "epoch": 4.19194543683014, + "grad_norm": 3.117396593093872, + "learning_rate": 1.0526849167963524e-06, + "loss": 0.3868, + "step": 8606 + }, + { + "epoch": 4.192432607989607, + "grad_norm": 2.8343420028686523, + "learning_rate": 1.05216056410057e-06, + "loss": 0.4242, + "step": 8607 + }, + { + "epoch": 4.192919779149074, + "grad_norm": 3.436192512512207, + "learning_rate": 1.0516363072226664e-06, + "loss": 0.3511, + "step": 8608 + }, + { + "epoch": 4.193406950308542, + "grad_norm": 3.732836961746216, + "learning_rate": 1.0511121461973373e-06, + "loss": 0.3444, + "step": 8609 + }, + { + "epoch": 4.193894121468009, + "grad_norm": 3.1926608085632324, + "learning_rate": 1.0505880810592722e-06, + "loss": 0.3903, + "step": 8610 + }, + { + "epoch": 4.194381292627477, + "grad_norm": 3.2892374992370605, + "learning_rate": 1.0500641118431523e-06, + "loss": 0.4247, + "step": 8611 + }, + { + "epoch": 4.194868463786944, + "grad_norm": 3.4503161907196045, + "learning_rate": 1.0495402385836543e-06, + "loss": 0.3762, + "step": 8612 + }, + { + "epoch": 4.195355634946411, + "grad_norm": 3.4270286560058594, + "learning_rate": 1.0490164613154483e-06, + "loss": 0.3924, + "step": 8613 + }, + { + "epoch": 4.195842806105879, + "grad_norm": 3.474811553955078, + "learning_rate": 1.0484927800731983e-06, + "loss": 0.437, + "step": 8614 + }, + { + "epoch": 4.1963299772653455, + "grad_norm": 3.073371648788452, + "learning_rate": 1.0479691948915604e-06, + "loss": 0.3576, + "step": 8615 + }, + { + "epoch": 4.196817148424813, + "grad_norm": 3.204055070877075, + "learning_rate": 1.047445705805186e-06, + "loss": 0.3756, + "step": 8616 + }, + { + "epoch": 4.197304319584281, + "grad_norm": 3.2881860733032227, + "learning_rate": 1.0469223128487206e-06, + "loss": 0.398, + "step": 8617 + }, + { + "epoch": 4.197791490743748, + "grad_norm": 3.128858804702759, + "learning_rate": 1.0463990160568e-06, + "loss": 0.3578, + "step": 8618 + }, + { + "epoch": 4.198278661903215, + "grad_norm": 2.876124858856201, + "learning_rate": 1.045875815464058e-06, + "loss": 0.4624, + "step": 8619 + }, + { + "epoch": 4.198765833062683, + "grad_norm": 3.5013742446899414, + "learning_rate": 1.0453527111051183e-06, + "loss": 0.4256, + "step": 8620 + }, + { + "epoch": 4.19925300422215, + "grad_norm": 3.5144786834716797, + "learning_rate": 1.044829703014601e-06, + "loss": 0.3653, + "step": 8621 + }, + { + "epoch": 4.199740175381617, + "grad_norm": 3.3753535747528076, + "learning_rate": 1.044306791227119e-06, + "loss": 0.4211, + "step": 8622 + }, + { + "epoch": 4.200227346541085, + "grad_norm": 3.022879123687744, + "learning_rate": 1.043783975777277e-06, + "loss": 0.3871, + "step": 8623 + }, + { + "epoch": 4.200714517700552, + "grad_norm": 3.3334672451019287, + "learning_rate": 1.0432612566996764e-06, + "loss": 0.4097, + "step": 8624 + }, + { + "epoch": 4.20120168886002, + "grad_norm": 2.9667158126831055, + "learning_rate": 1.04273863402891e-06, + "loss": 0.4256, + "step": 8625 + }, + { + "epoch": 4.201688860019487, + "grad_norm": 2.948371648788452, + "learning_rate": 1.0422161077995657e-06, + "loss": 0.3868, + "step": 8626 + }, + { + "epoch": 4.202176031178954, + "grad_norm": 3.3946919441223145, + "learning_rate": 1.041693678046223e-06, + "loss": 0.3702, + "step": 8627 + }, + { + "epoch": 4.202663202338422, + "grad_norm": 3.107412815093994, + "learning_rate": 1.0411713448034568e-06, + "loss": 0.3976, + "step": 8628 + }, + { + "epoch": 4.203150373497889, + "grad_norm": 3.3426353931427, + "learning_rate": 1.0406491081058354e-06, + "loss": 0.4698, + "step": 8629 + }, + { + "epoch": 4.203637544657356, + "grad_norm": 3.5504119396209717, + "learning_rate": 1.0401269679879207e-06, + "loss": 0.3937, + "step": 8630 + }, + { + "epoch": 4.204124715816824, + "grad_norm": 3.258103609085083, + "learning_rate": 1.0396049244842666e-06, + "loss": 0.3384, + "step": 8631 + }, + { + "epoch": 4.204611886976291, + "grad_norm": 3.5584700107574463, + "learning_rate": 1.0390829776294223e-06, + "loss": 0.3947, + "step": 8632 + }, + { + "epoch": 4.205099058135758, + "grad_norm": 3.395575523376465, + "learning_rate": 1.0385611274579308e-06, + "loss": 0.3821, + "step": 8633 + }, + { + "epoch": 4.205586229295226, + "grad_norm": 2.943268299102783, + "learning_rate": 1.0380393740043287e-06, + "loss": 0.3961, + "step": 8634 + }, + { + "epoch": 4.206073400454693, + "grad_norm": 3.476653575897217, + "learning_rate": 1.0375177173031436e-06, + "loss": 0.39, + "step": 8635 + }, + { + "epoch": 4.20656057161416, + "grad_norm": 2.9810330867767334, + "learning_rate": 1.0369961573889e-06, + "loss": 0.3715, + "step": 8636 + }, + { + "epoch": 4.207047742773628, + "grad_norm": 3.0479063987731934, + "learning_rate": 1.0364746942961143e-06, + "loss": 0.3526, + "step": 8637 + }, + { + "epoch": 4.207534913933095, + "grad_norm": 2.9448862075805664, + "learning_rate": 1.0359533280592982e-06, + "loss": 0.3858, + "step": 8638 + }, + { + "epoch": 4.208022085092563, + "grad_norm": 3.6299357414245605, + "learning_rate": 1.0354320587129532e-06, + "loss": 0.4382, + "step": 8639 + }, + { + "epoch": 4.20850925625203, + "grad_norm": 3.48869252204895, + "learning_rate": 1.0349108862915785e-06, + "loss": 0.3833, + "step": 8640 + }, + { + "epoch": 4.208996427411497, + "grad_norm": 4.821342945098877, + "learning_rate": 1.0343898108296657e-06, + "loss": 0.4265, + "step": 8641 + }, + { + "epoch": 4.209483598570965, + "grad_norm": 3.2365705966949463, + "learning_rate": 1.033868832361698e-06, + "loss": 0.3547, + "step": 8642 + }, + { + "epoch": 4.209970769730432, + "grad_norm": 3.1865804195404053, + "learning_rate": 1.0333479509221542e-06, + "loss": 0.3911, + "step": 8643 + }, + { + "epoch": 4.210457940889899, + "grad_norm": 3.17584228515625, + "learning_rate": 1.0328271665455067e-06, + "loss": 0.3387, + "step": 8644 + }, + { + "epoch": 4.210945112049367, + "grad_norm": 3.294149398803711, + "learning_rate": 1.0323064792662213e-06, + "loss": 0.3398, + "step": 8645 + }, + { + "epoch": 4.2114322832088344, + "grad_norm": 3.6417014598846436, + "learning_rate": 1.0317858891187557e-06, + "loss": 0.4304, + "step": 8646 + }, + { + "epoch": 4.211919454368301, + "grad_norm": 3.5035204887390137, + "learning_rate": 1.0312653961375633e-06, + "loss": 0.4187, + "step": 8647 + }, + { + "epoch": 4.212406625527769, + "grad_norm": 3.5785486698150635, + "learning_rate": 1.0307450003570901e-06, + "loss": 0.4075, + "step": 8648 + }, + { + "epoch": 4.212893796687236, + "grad_norm": 3.0968387126922607, + "learning_rate": 1.0302247018117767e-06, + "loss": 0.3282, + "step": 8649 + }, + { + "epoch": 4.213380967846703, + "grad_norm": 2.867805004119873, + "learning_rate": 1.029704500536055e-06, + "loss": 0.3753, + "step": 8650 + }, + { + "epoch": 4.213868139006171, + "grad_norm": 3.090801239013672, + "learning_rate": 1.0291843965643524e-06, + "loss": 0.3846, + "step": 8651 + }, + { + "epoch": 4.214355310165638, + "grad_norm": 3.0896902084350586, + "learning_rate": 1.0286643899310896e-06, + "loss": 0.3713, + "step": 8652 + }, + { + "epoch": 4.214842481325106, + "grad_norm": 2.694638729095459, + "learning_rate": 1.0281444806706815e-06, + "loss": 0.3853, + "step": 8653 + }, + { + "epoch": 4.215329652484573, + "grad_norm": 3.2513351440429688, + "learning_rate": 1.0276246688175334e-06, + "loss": 0.3574, + "step": 8654 + }, + { + "epoch": 4.21581682364404, + "grad_norm": 3.0993151664733887, + "learning_rate": 1.027104954406048e-06, + "loss": 0.3918, + "step": 8655 + }, + { + "epoch": 4.216303994803508, + "grad_norm": 3.889984130859375, + "learning_rate": 1.0265853374706192e-06, + "loss": 0.4797, + "step": 8656 + }, + { + "epoch": 4.216791165962975, + "grad_norm": 3.406602382659912, + "learning_rate": 1.0260658180456365e-06, + "loss": 0.3496, + "step": 8657 + }, + { + "epoch": 4.217278337122442, + "grad_norm": 3.1820178031921387, + "learning_rate": 1.0255463961654799e-06, + "loss": 0.3714, + "step": 8658 + }, + { + "epoch": 4.21776550828191, + "grad_norm": 2.9371869564056396, + "learning_rate": 1.0250270718645256e-06, + "loss": 0.4434, + "step": 8659 + }, + { + "epoch": 4.2182526794413775, + "grad_norm": 3.596341848373413, + "learning_rate": 1.024507845177143e-06, + "loss": 0.3622, + "step": 8660 + }, + { + "epoch": 4.218739850600844, + "grad_norm": 3.173123359680176, + "learning_rate": 1.023988716137693e-06, + "loss": 0.375, + "step": 8661 + }, + { + "epoch": 4.219227021760312, + "grad_norm": 3.0763893127441406, + "learning_rate": 1.0234696847805326e-06, + "loss": 0.371, + "step": 8662 + }, + { + "epoch": 4.2197141929197794, + "grad_norm": 3.461402654647827, + "learning_rate": 1.0229507511400113e-06, + "loss": 0.3642, + "step": 8663 + }, + { + "epoch": 4.220201364079246, + "grad_norm": 3.261362075805664, + "learning_rate": 1.0224319152504714e-06, + "loss": 0.3336, + "step": 8664 + }, + { + "epoch": 4.220688535238714, + "grad_norm": 2.86283278465271, + "learning_rate": 1.0219131771462495e-06, + "loss": 0.3771, + "step": 8665 + }, + { + "epoch": 4.221175706398181, + "grad_norm": 3.517838954925537, + "learning_rate": 1.0213945368616768e-06, + "loss": 0.3577, + "step": 8666 + }, + { + "epoch": 4.221662877557648, + "grad_norm": 3.363615036010742, + "learning_rate": 1.020875994431075e-06, + "loss": 0.4324, + "step": 8667 + }, + { + "epoch": 4.222150048717116, + "grad_norm": 3.2769205570220947, + "learning_rate": 1.020357549888762e-06, + "loss": 0.3294, + "step": 8668 + }, + { + "epoch": 4.222637219876583, + "grad_norm": 3.1932601928710938, + "learning_rate": 1.0198392032690498e-06, + "loss": 0.3527, + "step": 8669 + }, + { + "epoch": 4.223124391036051, + "grad_norm": 2.9823591709136963, + "learning_rate": 1.0193209546062405e-06, + "loss": 0.4003, + "step": 8670 + }, + { + "epoch": 4.223611562195518, + "grad_norm": 4.592055797576904, + "learning_rate": 1.0188028039346322e-06, + "loss": 0.3832, + "step": 8671 + }, + { + "epoch": 4.224098733354985, + "grad_norm": 3.0598721504211426, + "learning_rate": 1.0182847512885167e-06, + "loss": 0.4025, + "step": 8672 + }, + { + "epoch": 4.224585904514453, + "grad_norm": 3.2672877311706543, + "learning_rate": 1.017766796702179e-06, + "loss": 0.4247, + "step": 8673 + }, + { + "epoch": 4.2250730756739205, + "grad_norm": 3.2150683403015137, + "learning_rate": 1.0172489402098959e-06, + "loss": 0.3833, + "step": 8674 + }, + { + "epoch": 4.225560246833387, + "grad_norm": 3.153343677520752, + "learning_rate": 1.01673118184594e-06, + "loss": 0.3829, + "step": 8675 + }, + { + "epoch": 4.226047417992855, + "grad_norm": 3.21108341217041, + "learning_rate": 1.0162135216445761e-06, + "loss": 0.3939, + "step": 8676 + }, + { + "epoch": 4.2265345891523225, + "grad_norm": 3.265348434448242, + "learning_rate": 1.0156959596400642e-06, + "loss": 0.412, + "step": 8677 + }, + { + "epoch": 4.227021760311789, + "grad_norm": 2.9727413654327393, + "learning_rate": 1.0151784958666543e-06, + "loss": 0.3666, + "step": 8678 + }, + { + "epoch": 4.227508931471257, + "grad_norm": 2.814617156982422, + "learning_rate": 1.0146611303585938e-06, + "loss": 0.379, + "step": 8679 + }, + { + "epoch": 4.2279961026307245, + "grad_norm": 3.275596857070923, + "learning_rate": 1.014143863150121e-06, + "loss": 0.3561, + "step": 8680 + }, + { + "epoch": 4.228483273790191, + "grad_norm": 3.0255191326141357, + "learning_rate": 1.01362669427547e-06, + "loss": 0.3945, + "step": 8681 + }, + { + "epoch": 4.228970444949659, + "grad_norm": 3.0923538208007812, + "learning_rate": 1.013109623768865e-06, + "loss": 0.3876, + "step": 8682 + }, + { + "epoch": 4.229457616109126, + "grad_norm": 3.237910747528076, + "learning_rate": 1.0125926516645268e-06, + "loss": 0.4149, + "step": 8683 + }, + { + "epoch": 4.229944787268594, + "grad_norm": 3.1589057445526123, + "learning_rate": 1.0120757779966684e-06, + "loss": 0.374, + "step": 8684 + }, + { + "epoch": 4.230431958428061, + "grad_norm": 3.2342116832733154, + "learning_rate": 1.011559002799497e-06, + "loss": 0.3741, + "step": 8685 + }, + { + "epoch": 4.230919129587528, + "grad_norm": 3.9047083854675293, + "learning_rate": 1.0110423261072117e-06, + "loss": 0.3484, + "step": 8686 + }, + { + "epoch": 4.231406300746996, + "grad_norm": 3.0656185150146484, + "learning_rate": 1.0105257479540064e-06, + "loss": 0.4558, + "step": 8687 + }, + { + "epoch": 4.231893471906464, + "grad_norm": 4.255083084106445, + "learning_rate": 1.0100092683740684e-06, + "loss": 0.3683, + "step": 8688 + }, + { + "epoch": 4.23238064306593, + "grad_norm": 3.371842861175537, + "learning_rate": 1.0094928874015793e-06, + "loss": 0.4254, + "step": 8689 + }, + { + "epoch": 4.232867814225398, + "grad_norm": 3.21913480758667, + "learning_rate": 1.008976605070711e-06, + "loss": 0.3684, + "step": 8690 + }, + { + "epoch": 4.2333549853848655, + "grad_norm": 3.09859037399292, + "learning_rate": 1.0084604214156324e-06, + "loss": 0.4007, + "step": 8691 + }, + { + "epoch": 4.233842156544332, + "grad_norm": 3.1006698608398438, + "learning_rate": 1.007944336470505e-06, + "loss": 0.4188, + "step": 8692 + }, + { + "epoch": 4.2343293277038, + "grad_norm": 3.0143699645996094, + "learning_rate": 1.0074283502694814e-06, + "loss": 0.3843, + "step": 8693 + }, + { + "epoch": 4.2348164988632675, + "grad_norm": 3.4132182598114014, + "learning_rate": 1.0069124628467108e-06, + "loss": 0.4382, + "step": 8694 + }, + { + "epoch": 4.235303670022734, + "grad_norm": 3.412775993347168, + "learning_rate": 1.0063966742363343e-06, + "loss": 0.361, + "step": 8695 + }, + { + "epoch": 4.235790841182202, + "grad_norm": 3.064220905303955, + "learning_rate": 1.005880984472488e-06, + "loss": 0.3857, + "step": 8696 + }, + { + "epoch": 4.2362780123416695, + "grad_norm": 3.2064101696014404, + "learning_rate": 1.0053653935892976e-06, + "loss": 0.3639, + "step": 8697 + }, + { + "epoch": 4.236765183501137, + "grad_norm": 3.2217047214508057, + "learning_rate": 1.0048499016208865e-06, + "loss": 0.4127, + "step": 8698 + }, + { + "epoch": 4.237252354660604, + "grad_norm": 3.126727819442749, + "learning_rate": 1.0043345086013698e-06, + "loss": 0.3946, + "step": 8699 + }, + { + "epoch": 4.237739525820071, + "grad_norm": 2.9718949794769287, + "learning_rate": 1.0038192145648567e-06, + "loss": 0.3854, + "step": 8700 + }, + { + "epoch": 4.238226696979539, + "grad_norm": 3.414464235305786, + "learning_rate": 1.003304019545448e-06, + "loss": 0.3512, + "step": 8701 + }, + { + "epoch": 4.238713868139006, + "grad_norm": 2.893866539001465, + "learning_rate": 1.0027889235772397e-06, + "loss": 0.4059, + "step": 8702 + }, + { + "epoch": 4.239201039298473, + "grad_norm": 3.0896224975585938, + "learning_rate": 1.0022739266943222e-06, + "loss": 0.3753, + "step": 8703 + }, + { + "epoch": 4.239688210457941, + "grad_norm": 3.33048939704895, + "learning_rate": 1.0017590289307758e-06, + "loss": 0.4184, + "step": 8704 + }, + { + "epoch": 4.240175381617409, + "grad_norm": 3.255495309829712, + "learning_rate": 1.0012442303206771e-06, + "loss": 0.3683, + "step": 8705 + }, + { + "epoch": 4.240662552776875, + "grad_norm": 3.4575634002685547, + "learning_rate": 1.000729530898097e-06, + "loss": 0.3985, + "step": 8706 + }, + { + "epoch": 4.241149723936343, + "grad_norm": 3.937594175338745, + "learning_rate": 1.0002149306970953e-06, + "loss": 0.4123, + "step": 8707 + }, + { + "epoch": 4.2416368950958105, + "grad_norm": 2.9831249713897705, + "learning_rate": 9.997004297517304e-07, + "loss": 0.3967, + "step": 8708 + }, + { + "epoch": 4.242124066255277, + "grad_norm": 3.312283515930176, + "learning_rate": 9.991860280960523e-07, + "loss": 0.3863, + "step": 8709 + }, + { + "epoch": 4.242611237414745, + "grad_norm": 3.5531513690948486, + "learning_rate": 9.986717257641022e-07, + "loss": 0.3747, + "step": 8710 + }, + { + "epoch": 4.2430984085742125, + "grad_norm": 3.381829261779785, + "learning_rate": 9.98157522789917e-07, + "loss": 0.3431, + "step": 8711 + }, + { + "epoch": 4.24358557973368, + "grad_norm": 3.2086219787597656, + "learning_rate": 9.976434192075284e-07, + "loss": 0.4151, + "step": 8712 + }, + { + "epoch": 4.244072750893147, + "grad_norm": 3.320885419845581, + "learning_rate": 9.971294150509575e-07, + "loss": 0.3956, + "step": 8713 + }, + { + "epoch": 4.2445599220526145, + "grad_norm": 3.2909858226776123, + "learning_rate": 9.966155103542218e-07, + "loss": 0.3622, + "step": 8714 + }, + { + "epoch": 4.245047093212082, + "grad_norm": 3.4818899631500244, + "learning_rate": 9.961017051513316e-07, + "loss": 0.4152, + "step": 8715 + }, + { + "epoch": 4.245534264371549, + "grad_norm": 3.262275457382202, + "learning_rate": 9.955879994762917e-07, + "loss": 0.37, + "step": 8716 + }, + { + "epoch": 4.246021435531016, + "grad_norm": 3.1759114265441895, + "learning_rate": 9.95074393363097e-07, + "loss": 0.3537, + "step": 8717 + }, + { + "epoch": 4.246508606690484, + "grad_norm": 2.941218137741089, + "learning_rate": 9.945608868457384e-07, + "loss": 0.344, + "step": 8718 + }, + { + "epoch": 4.246995777849952, + "grad_norm": 3.4011363983154297, + "learning_rate": 9.940474799582004e-07, + "loss": 0.4515, + "step": 8719 + }, + { + "epoch": 4.247482949009418, + "grad_norm": 3.508225679397583, + "learning_rate": 9.935341727344608e-07, + "loss": 0.3944, + "step": 8720 + }, + { + "epoch": 4.247970120168886, + "grad_norm": 3.1999402046203613, + "learning_rate": 9.930209652084885e-07, + "loss": 0.415, + "step": 8721 + }, + { + "epoch": 4.248457291328354, + "grad_norm": 3.6231184005737305, + "learning_rate": 9.925078574142485e-07, + "loss": 0.3789, + "step": 8722 + }, + { + "epoch": 4.24894446248782, + "grad_norm": 3.550844192504883, + "learning_rate": 9.91994849385698e-07, + "loss": 0.3665, + "step": 8723 + }, + { + "epoch": 4.249431633647288, + "grad_norm": 3.1874897480010986, + "learning_rate": 9.91481941156789e-07, + "loss": 0.4071, + "step": 8724 + }, + { + "epoch": 4.2499188048067555, + "grad_norm": 3.1482155323028564, + "learning_rate": 9.909691327614635e-07, + "loss": 0.3784, + "step": 8725 + }, + { + "epoch": 4.250405975966223, + "grad_norm": 3.3262252807617188, + "learning_rate": 9.904564242336602e-07, + "loss": 0.4329, + "step": 8726 + }, + { + "epoch": 4.25089314712569, + "grad_norm": 3.3602654933929443, + "learning_rate": 9.899438156073104e-07, + "loss": 0.3774, + "step": 8727 + }, + { + "epoch": 4.2513803182851575, + "grad_norm": 3.56777286529541, + "learning_rate": 9.894313069163392e-07, + "loss": 0.4173, + "step": 8728 + }, + { + "epoch": 4.251867489444625, + "grad_norm": 3.050922155380249, + "learning_rate": 9.889188981946623e-07, + "loss": 0.3842, + "step": 8729 + }, + { + "epoch": 4.252354660604092, + "grad_norm": 3.1752877235412598, + "learning_rate": 9.884065894761922e-07, + "loss": 0.3725, + "step": 8730 + }, + { + "epoch": 4.2528418317635595, + "grad_norm": 2.984711170196533, + "learning_rate": 9.878943807948333e-07, + "loss": 0.3968, + "step": 8731 + }, + { + "epoch": 4.253329002923027, + "grad_norm": 2.888632297515869, + "learning_rate": 9.873822721844841e-07, + "loss": 0.3045, + "step": 8732 + }, + { + "epoch": 4.253816174082495, + "grad_norm": 3.3965344429016113, + "learning_rate": 9.868702636790344e-07, + "loss": 0.3838, + "step": 8733 + }, + { + "epoch": 4.254303345241961, + "grad_norm": 3.2968015670776367, + "learning_rate": 9.8635835531237e-07, + "loss": 0.3505, + "step": 8734 + }, + { + "epoch": 4.254790516401429, + "grad_norm": 3.148131847381592, + "learning_rate": 9.858465471183687e-07, + "loss": 0.3988, + "step": 8735 + }, + { + "epoch": 4.255277687560897, + "grad_norm": 2.954134941101074, + "learning_rate": 9.853348391309026e-07, + "loss": 0.3839, + "step": 8736 + }, + { + "epoch": 4.255764858720363, + "grad_norm": 3.2700488567352295, + "learning_rate": 9.848232313838353e-07, + "loss": 0.4319, + "step": 8737 + }, + { + "epoch": 4.256252029879831, + "grad_norm": 3.369414806365967, + "learning_rate": 9.843117239110251e-07, + "loss": 0.4195, + "step": 8738 + }, + { + "epoch": 4.256739201039299, + "grad_norm": 3.4109506607055664, + "learning_rate": 9.838003167463236e-07, + "loss": 0.3884, + "step": 8739 + }, + { + "epoch": 4.257226372198766, + "grad_norm": 3.3883743286132812, + "learning_rate": 9.832890099235772e-07, + "loss": 0.3374, + "step": 8740 + }, + { + "epoch": 4.257713543358233, + "grad_norm": 3.103691339492798, + "learning_rate": 9.827778034766223e-07, + "loss": 0.446, + "step": 8741 + }, + { + "epoch": 4.2582007145177005, + "grad_norm": 3.1178836822509766, + "learning_rate": 9.822666974392905e-07, + "loss": 0.3452, + "step": 8742 + }, + { + "epoch": 4.258687885677168, + "grad_norm": 2.9406495094299316, + "learning_rate": 9.817556918454085e-07, + "loss": 0.4089, + "step": 8743 + }, + { + "epoch": 4.259175056836635, + "grad_norm": 3.1658151149749756, + "learning_rate": 9.812447867287925e-07, + "loss": 0.3849, + "step": 8744 + }, + { + "epoch": 4.2596622279961025, + "grad_norm": 2.9013588428497314, + "learning_rate": 9.80733982123255e-07, + "loss": 0.3954, + "step": 8745 + }, + { + "epoch": 4.26014939915557, + "grad_norm": 3.4254302978515625, + "learning_rate": 9.802232780626014e-07, + "loss": 0.3726, + "step": 8746 + }, + { + "epoch": 4.260636570315038, + "grad_norm": 2.8292133808135986, + "learning_rate": 9.797126745806304e-07, + "loss": 0.3646, + "step": 8747 + }, + { + "epoch": 4.2611237414745045, + "grad_norm": 3.049410581588745, + "learning_rate": 9.792021717111322e-07, + "loss": 0.3111, + "step": 8748 + }, + { + "epoch": 4.261610912633972, + "grad_norm": 3.1403939723968506, + "learning_rate": 9.786917694878926e-07, + "loss": 0.3697, + "step": 8749 + }, + { + "epoch": 4.26209808379344, + "grad_norm": 3.2230849266052246, + "learning_rate": 9.781814679446908e-07, + "loss": 0.4155, + "step": 8750 + }, + { + "epoch": 4.262585254952906, + "grad_norm": 3.482532024383545, + "learning_rate": 9.77671267115297e-07, + "loss": 0.3321, + "step": 8751 + }, + { + "epoch": 4.263072426112374, + "grad_norm": 3.079638957977295, + "learning_rate": 9.77161167033477e-07, + "loss": 0.3886, + "step": 8752 + }, + { + "epoch": 4.263559597271842, + "grad_norm": 3.1703734397888184, + "learning_rate": 9.766511677329902e-07, + "loss": 0.369, + "step": 8753 + }, + { + "epoch": 4.264046768431309, + "grad_norm": 3.0931169986724854, + "learning_rate": 9.76141269247586e-07, + "loss": 0.493, + "step": 8754 + }, + { + "epoch": 4.264533939590776, + "grad_norm": 3.309769630432129, + "learning_rate": 9.756314716110118e-07, + "loss": 0.331, + "step": 8755 + }, + { + "epoch": 4.265021110750244, + "grad_norm": 3.336988925933838, + "learning_rate": 9.75121774857004e-07, + "loss": 0.3487, + "step": 8756 + }, + { + "epoch": 4.265508281909711, + "grad_norm": 3.308662176132202, + "learning_rate": 9.74612179019295e-07, + "loss": 0.3995, + "step": 8757 + }, + { + "epoch": 4.265995453069178, + "grad_norm": 3.2778561115264893, + "learning_rate": 9.7410268413161e-07, + "loss": 0.421, + "step": 8758 + }, + { + "epoch": 4.2664826242286455, + "grad_norm": 3.0010933876037598, + "learning_rate": 9.735932902276684e-07, + "loss": 0.3652, + "step": 8759 + }, + { + "epoch": 4.266969795388113, + "grad_norm": 3.0256595611572266, + "learning_rate": 9.730839973411795e-07, + "loss": 0.4143, + "step": 8760 + }, + { + "epoch": 4.267456966547581, + "grad_norm": 3.050264596939087, + "learning_rate": 9.725748055058496e-07, + "loss": 0.3922, + "step": 8761 + }, + { + "epoch": 4.2679441377070475, + "grad_norm": 3.0521395206451416, + "learning_rate": 9.72065714755377e-07, + "loss": 0.3702, + "step": 8762 + }, + { + "epoch": 4.268431308866515, + "grad_norm": 3.0890097618103027, + "learning_rate": 9.715567251234538e-07, + "loss": 0.3724, + "step": 8763 + }, + { + "epoch": 4.268918480025983, + "grad_norm": 3.2754335403442383, + "learning_rate": 9.710478366437635e-07, + "loss": 0.2939, + "step": 8764 + }, + { + "epoch": 4.2694056511854495, + "grad_norm": 3.3627841472625732, + "learning_rate": 9.705390493499846e-07, + "loss": 0.3765, + "step": 8765 + }, + { + "epoch": 4.269892822344917, + "grad_norm": 3.3212838172912598, + "learning_rate": 9.700303632757892e-07, + "loss": 0.37, + "step": 8766 + }, + { + "epoch": 4.270379993504385, + "grad_norm": 3.4466984272003174, + "learning_rate": 9.69521778454843e-07, + "loss": 0.43, + "step": 8767 + }, + { + "epoch": 4.270867164663852, + "grad_norm": 3.241619348526001, + "learning_rate": 9.69013294920802e-07, + "loss": 0.3604, + "step": 8768 + }, + { + "epoch": 4.271354335823319, + "grad_norm": 4.029839992523193, + "learning_rate": 9.685049127073184e-07, + "loss": 0.4327, + "step": 8769 + }, + { + "epoch": 4.271841506982787, + "grad_norm": 3.5255069732666016, + "learning_rate": 9.679966318480372e-07, + "loss": 0.3911, + "step": 8770 + }, + { + "epoch": 4.272328678142254, + "grad_norm": 3.4387083053588867, + "learning_rate": 9.67488452376597e-07, + "loss": 0.353, + "step": 8771 + }, + { + "epoch": 4.272815849301721, + "grad_norm": 3.226374864578247, + "learning_rate": 9.669803743266275e-07, + "loss": 0.3968, + "step": 8772 + }, + { + "epoch": 4.273303020461189, + "grad_norm": 3.3064944744110107, + "learning_rate": 9.664723977317542e-07, + "loss": 0.3956, + "step": 8773 + }, + { + "epoch": 4.273790191620656, + "grad_norm": 3.088360071182251, + "learning_rate": 9.659645226255946e-07, + "loss": 0.3557, + "step": 8774 + }, + { + "epoch": 4.274277362780124, + "grad_norm": 3.6607322692871094, + "learning_rate": 9.65456749041761e-07, + "loss": 0.4205, + "step": 8775 + }, + { + "epoch": 4.2747645339395905, + "grad_norm": 3.042091131210327, + "learning_rate": 9.64949077013856e-07, + "loss": 0.3137, + "step": 8776 + }, + { + "epoch": 4.275251705099058, + "grad_norm": 2.7504286766052246, + "learning_rate": 9.64441506575478e-07, + "loss": 0.4014, + "step": 8777 + }, + { + "epoch": 4.275738876258526, + "grad_norm": 2.9496865272521973, + "learning_rate": 9.639340377602185e-07, + "loss": 0.3936, + "step": 8778 + }, + { + "epoch": 4.2762260474179925, + "grad_norm": 3.59995436668396, + "learning_rate": 9.634266706016618e-07, + "loss": 0.3946, + "step": 8779 + }, + { + "epoch": 4.27671321857746, + "grad_norm": 3.4401607513427734, + "learning_rate": 9.629194051333843e-07, + "loss": 0.3894, + "step": 8780 + }, + { + "epoch": 4.277200389736928, + "grad_norm": 2.9880073070526123, + "learning_rate": 9.624122413889575e-07, + "loss": 0.4676, + "step": 8781 + }, + { + "epoch": 4.277687560896395, + "grad_norm": 3.9691569805145264, + "learning_rate": 9.619051794019452e-07, + "loss": 0.4108, + "step": 8782 + }, + { + "epoch": 4.278174732055862, + "grad_norm": 3.409027576446533, + "learning_rate": 9.613982192059057e-07, + "loss": 0.347, + "step": 8783 + }, + { + "epoch": 4.27866190321533, + "grad_norm": 2.988454580307007, + "learning_rate": 9.608913608343882e-07, + "loss": 0.3465, + "step": 8784 + }, + { + "epoch": 4.279149074374797, + "grad_norm": 3.3646037578582764, + "learning_rate": 9.60384604320937e-07, + "loss": 0.3667, + "step": 8785 + }, + { + "epoch": 4.279636245534264, + "grad_norm": 3.310553550720215, + "learning_rate": 9.598779496990892e-07, + "loss": 0.408, + "step": 8786 + }, + { + "epoch": 4.280123416693732, + "grad_norm": 3.4133923053741455, + "learning_rate": 9.593713970023763e-07, + "loss": 0.3467, + "step": 8787 + }, + { + "epoch": 4.280610587853199, + "grad_norm": 3.2666513919830322, + "learning_rate": 9.5886494626432e-07, + "loss": 0.3711, + "step": 8788 + }, + { + "epoch": 4.281097759012667, + "grad_norm": 3.353294849395752, + "learning_rate": 9.583585975184378e-07, + "loss": 0.3104, + "step": 8789 + }, + { + "epoch": 4.281584930172134, + "grad_norm": 3.0188138484954834, + "learning_rate": 9.57852350798241e-07, + "loss": 0.3923, + "step": 8790 + }, + { + "epoch": 4.282072101331601, + "grad_norm": 3.0751054286956787, + "learning_rate": 9.573462061372312e-07, + "loss": 0.3817, + "step": 8791 + }, + { + "epoch": 4.282559272491069, + "grad_norm": 3.088550329208374, + "learning_rate": 9.568401635689059e-07, + "loss": 0.3635, + "step": 8792 + }, + { + "epoch": 4.2830464436505356, + "grad_norm": 3.4427366256713867, + "learning_rate": 9.563342231267557e-07, + "loss": 0.3307, + "step": 8793 + }, + { + "epoch": 4.283533614810003, + "grad_norm": 3.2864277362823486, + "learning_rate": 9.55828384844262e-07, + "loss": 0.3744, + "step": 8794 + }, + { + "epoch": 4.284020785969471, + "grad_norm": 3.2591586112976074, + "learning_rate": 9.553226487549022e-07, + "loss": 0.4161, + "step": 8795 + }, + { + "epoch": 4.2845079571289375, + "grad_norm": 3.8511648178100586, + "learning_rate": 9.548170148921462e-07, + "loss": 0.3328, + "step": 8796 + }, + { + "epoch": 4.284995128288405, + "grad_norm": 4.683558940887451, + "learning_rate": 9.543114832894557e-07, + "loss": 0.3427, + "step": 8797 + }, + { + "epoch": 4.285482299447873, + "grad_norm": 3.0694637298583984, + "learning_rate": 9.53806053980287e-07, + "loss": 0.3719, + "step": 8798 + }, + { + "epoch": 4.28596947060734, + "grad_norm": 3.319965124130249, + "learning_rate": 9.533007269980909e-07, + "loss": 0.4141, + "step": 8799 + }, + { + "epoch": 4.286456641766807, + "grad_norm": 3.8988606929779053, + "learning_rate": 9.527955023763075e-07, + "loss": 0.3815, + "step": 8800 + }, + { + "epoch": 4.286943812926275, + "grad_norm": 3.4860341548919678, + "learning_rate": 9.522903801483738e-07, + "loss": 0.3598, + "step": 8801 + }, + { + "epoch": 4.287430984085742, + "grad_norm": 3.2102246284484863, + "learning_rate": 9.517853603477195e-07, + "loss": 0.4433, + "step": 8802 + }, + { + "epoch": 4.28791815524521, + "grad_norm": 3.868010997772217, + "learning_rate": 9.512804430077649e-07, + "loss": 0.4088, + "step": 8803 + }, + { + "epoch": 4.288405326404677, + "grad_norm": 3.6778411865234375, + "learning_rate": 9.507756281619265e-07, + "loss": 0.3855, + "step": 8804 + }, + { + "epoch": 4.288892497564144, + "grad_norm": 3.195108413696289, + "learning_rate": 9.502709158436127e-07, + "loss": 0.3906, + "step": 8805 + }, + { + "epoch": 4.289379668723612, + "grad_norm": 3.2441089153289795, + "learning_rate": 9.497663060862261e-07, + "loss": 0.3712, + "step": 8806 + }, + { + "epoch": 4.289866839883079, + "grad_norm": 3.26930832862854, + "learning_rate": 9.492617989231603e-07, + "loss": 0.3688, + "step": 8807 + }, + { + "epoch": 4.290354011042546, + "grad_norm": 3.4559903144836426, + "learning_rate": 9.487573943878042e-07, + "loss": 0.3968, + "step": 8808 + }, + { + "epoch": 4.290841182202014, + "grad_norm": 3.882931709289551, + "learning_rate": 9.482530925135394e-07, + "loss": 0.4114, + "step": 8809 + }, + { + "epoch": 4.2913283533614806, + "grad_norm": 3.241361141204834, + "learning_rate": 9.477488933337411e-07, + "loss": 0.3514, + "step": 8810 + }, + { + "epoch": 4.291815524520948, + "grad_norm": 3.2712864875793457, + "learning_rate": 9.472447968817758e-07, + "loss": 0.3821, + "step": 8811 + }, + { + "epoch": 4.292302695680416, + "grad_norm": 3.2492010593414307, + "learning_rate": 9.467408031910053e-07, + "loss": 0.3789, + "step": 8812 + }, + { + "epoch": 4.292789866839883, + "grad_norm": 3.3154983520507812, + "learning_rate": 9.462369122947835e-07, + "loss": 0.3876, + "step": 8813 + }, + { + "epoch": 4.29327703799935, + "grad_norm": 3.432426691055298, + "learning_rate": 9.457331242264592e-07, + "loss": 0.3843, + "step": 8814 + }, + { + "epoch": 4.293764209158818, + "grad_norm": 3.998263359069824, + "learning_rate": 9.452294390193711e-07, + "loss": 0.4782, + "step": 8815 + }, + { + "epoch": 4.294251380318285, + "grad_norm": 4.347586154937744, + "learning_rate": 9.44725856706854e-07, + "loss": 0.3893, + "step": 8816 + }, + { + "epoch": 4.294738551477753, + "grad_norm": 3.2643141746520996, + "learning_rate": 9.44222377322235e-07, + "loss": 0.37, + "step": 8817 + }, + { + "epoch": 4.29522572263722, + "grad_norm": 3.0639266967773438, + "learning_rate": 9.437190008988351e-07, + "loss": 0.3927, + "step": 8818 + }, + { + "epoch": 4.295712893796687, + "grad_norm": 3.6750874519348145, + "learning_rate": 9.432157274699654e-07, + "loss": 0.4013, + "step": 8819 + }, + { + "epoch": 4.296200064956155, + "grad_norm": 3.7375810146331787, + "learning_rate": 9.427125570689344e-07, + "loss": 0.3863, + "step": 8820 + }, + { + "epoch": 4.296687236115622, + "grad_norm": 3.231240749359131, + "learning_rate": 9.422094897290412e-07, + "loss": 0.3529, + "step": 8821 + }, + { + "epoch": 4.297174407275089, + "grad_norm": 3.1647098064422607, + "learning_rate": 9.4170652548358e-07, + "loss": 0.3477, + "step": 8822 + }, + { + "epoch": 4.297661578434557, + "grad_norm": 3.1485700607299805, + "learning_rate": 9.412036643658345e-07, + "loss": 0.4482, + "step": 8823 + }, + { + "epoch": 4.298148749594024, + "grad_norm": 3.1212968826293945, + "learning_rate": 9.407009064090858e-07, + "loss": 0.2699, + "step": 8824 + }, + { + "epoch": 4.298635920753491, + "grad_norm": 3.029690980911255, + "learning_rate": 9.401982516466057e-07, + "loss": 0.3523, + "step": 8825 + }, + { + "epoch": 4.299123091912959, + "grad_norm": 3.3457460403442383, + "learning_rate": 9.396957001116608e-07, + "loss": 0.346, + "step": 8826 + }, + { + "epoch": 4.299610263072426, + "grad_norm": 3.366164445877075, + "learning_rate": 9.391932518375085e-07, + "loss": 0.4234, + "step": 8827 + }, + { + "epoch": 4.300097434231893, + "grad_norm": 3.23895525932312, + "learning_rate": 9.386909068574013e-07, + "loss": 0.4194, + "step": 8828 + }, + { + "epoch": 4.300584605391361, + "grad_norm": 3.4393978118896484, + "learning_rate": 9.381886652045846e-07, + "loss": 0.3246, + "step": 8829 + }, + { + "epoch": 4.301071776550828, + "grad_norm": 2.857783555984497, + "learning_rate": 9.376865269122975e-07, + "loss": 0.3909, + "step": 8830 + }, + { + "epoch": 4.301558947710296, + "grad_norm": 3.224212408065796, + "learning_rate": 9.371844920137695e-07, + "loss": 0.4262, + "step": 8831 + }, + { + "epoch": 4.302046118869763, + "grad_norm": 3.1884586811065674, + "learning_rate": 9.366825605422267e-07, + "loss": 0.3665, + "step": 8832 + }, + { + "epoch": 4.30253329002923, + "grad_norm": 2.9247655868530273, + "learning_rate": 9.361807325308861e-07, + "loss": 0.3967, + "step": 8833 + }, + { + "epoch": 4.303020461188698, + "grad_norm": 3.606351137161255, + "learning_rate": 9.356790080129599e-07, + "loss": 0.3874, + "step": 8834 + }, + { + "epoch": 4.303507632348165, + "grad_norm": 3.4259164333343506, + "learning_rate": 9.351773870216505e-07, + "loss": 0.3496, + "step": 8835 + }, + { + "epoch": 4.303994803507632, + "grad_norm": 3.3889386653900146, + "learning_rate": 9.346758695901559e-07, + "loss": 0.4233, + "step": 8836 + }, + { + "epoch": 4.3044819746671, + "grad_norm": 3.0695528984069824, + "learning_rate": 9.341744557516674e-07, + "loss": 0.4246, + "step": 8837 + }, + { + "epoch": 4.304969145826567, + "grad_norm": 3.654102325439453, + "learning_rate": 9.336731455393666e-07, + "loss": 0.4127, + "step": 8838 + }, + { + "epoch": 4.305456316986034, + "grad_norm": 2.866910219192505, + "learning_rate": 9.331719389864322e-07, + "loss": 0.4026, + "step": 8839 + }, + { + "epoch": 4.305943488145502, + "grad_norm": 3.2160000801086426, + "learning_rate": 9.32670836126032e-07, + "loss": 0.3939, + "step": 8840 + }, + { + "epoch": 4.3064306593049695, + "grad_norm": 3.3931586742401123, + "learning_rate": 9.321698369913301e-07, + "loss": 0.3677, + "step": 8841 + }, + { + "epoch": 4.306917830464436, + "grad_norm": 3.217216730117798, + "learning_rate": 9.316689416154832e-07, + "loss": 0.3912, + "step": 8842 + }, + { + "epoch": 4.307405001623904, + "grad_norm": 2.839096784591675, + "learning_rate": 9.311681500316385e-07, + "loss": 0.3337, + "step": 8843 + }, + { + "epoch": 4.307892172783371, + "grad_norm": 3.6311073303222656, + "learning_rate": 9.3066746227294e-07, + "loss": 0.4373, + "step": 8844 + }, + { + "epoch": 4.308379343942838, + "grad_norm": 3.3477442264556885, + "learning_rate": 9.301668783725224e-07, + "loss": 0.4561, + "step": 8845 + }, + { + "epoch": 4.308866515102306, + "grad_norm": 3.1653034687042236, + "learning_rate": 9.296663983635157e-07, + "loss": 0.3539, + "step": 8846 + }, + { + "epoch": 4.309353686261773, + "grad_norm": 2.9838597774505615, + "learning_rate": 9.291660222790394e-07, + "loss": 0.3936, + "step": 8847 + }, + { + "epoch": 4.309840857421241, + "grad_norm": 3.645542860031128, + "learning_rate": 9.286657501522095e-07, + "loss": 0.4347, + "step": 8848 + }, + { + "epoch": 4.310328028580708, + "grad_norm": 3.273838758468628, + "learning_rate": 9.281655820161351e-07, + "loss": 0.3796, + "step": 8849 + }, + { + "epoch": 4.310815199740175, + "grad_norm": 2.934917688369751, + "learning_rate": 9.276655179039148e-07, + "loss": 0.3865, + "step": 8850 + }, + { + "epoch": 4.311302370899643, + "grad_norm": 3.4662652015686035, + "learning_rate": 9.271655578486444e-07, + "loss": 0.3618, + "step": 8851 + }, + { + "epoch": 4.31178954205911, + "grad_norm": 3.300318479537964, + "learning_rate": 9.26665701883411e-07, + "loss": 0.382, + "step": 8852 + }, + { + "epoch": 4.312276713218577, + "grad_norm": 3.4199016094207764, + "learning_rate": 9.261659500412956e-07, + "loss": 0.3898, + "step": 8853 + }, + { + "epoch": 4.312763884378045, + "grad_norm": 3.1791672706604004, + "learning_rate": 9.256663023553703e-07, + "loss": 0.3563, + "step": 8854 + }, + { + "epoch": 4.3132510555375125, + "grad_norm": 3.165898323059082, + "learning_rate": 9.251667588587027e-07, + "loss": 0.3766, + "step": 8855 + }, + { + "epoch": 4.313738226696979, + "grad_norm": 3.1434359550476074, + "learning_rate": 9.24667319584352e-07, + "loss": 0.3854, + "step": 8856 + }, + { + "epoch": 4.314225397856447, + "grad_norm": 3.288461208343506, + "learning_rate": 9.241679845653722e-07, + "loss": 0.4073, + "step": 8857 + }, + { + "epoch": 4.3147125690159145, + "grad_norm": 3.3236021995544434, + "learning_rate": 9.23668753834808e-07, + "loss": 0.3661, + "step": 8858 + }, + { + "epoch": 4.315199740175381, + "grad_norm": 3.202244520187378, + "learning_rate": 9.231696274256985e-07, + "loss": 0.384, + "step": 8859 + }, + { + "epoch": 4.315686911334849, + "grad_norm": 3.4168643951416016, + "learning_rate": 9.226706053710765e-07, + "loss": 0.3625, + "step": 8860 + }, + { + "epoch": 4.316174082494316, + "grad_norm": 3.072202205657959, + "learning_rate": 9.221716877039677e-07, + "loss": 0.392, + "step": 8861 + }, + { + "epoch": 4.316661253653784, + "grad_norm": 3.539888620376587, + "learning_rate": 9.216728744573889e-07, + "loss": 0.3731, + "step": 8862 + }, + { + "epoch": 4.317148424813251, + "grad_norm": 3.168830633163452, + "learning_rate": 9.211741656643522e-07, + "loss": 0.3997, + "step": 8863 + }, + { + "epoch": 4.317635595972718, + "grad_norm": 3.234421968460083, + "learning_rate": 9.206755613578622e-07, + "loss": 0.4467, + "step": 8864 + }, + { + "epoch": 4.318122767132186, + "grad_norm": 3.3160526752471924, + "learning_rate": 9.201770615709174e-07, + "loss": 0.399, + "step": 8865 + }, + { + "epoch": 4.318609938291653, + "grad_norm": 3.0331130027770996, + "learning_rate": 9.196786663365065e-07, + "loss": 0.4282, + "step": 8866 + }, + { + "epoch": 4.31909710945112, + "grad_norm": 3.222811222076416, + "learning_rate": 9.191803756876145e-07, + "loss": 0.3853, + "step": 8867 + }, + { + "epoch": 4.319584280610588, + "grad_norm": 3.1982944011688232, + "learning_rate": 9.186821896572179e-07, + "loss": 0.3655, + "step": 8868 + }, + { + "epoch": 4.320071451770056, + "grad_norm": 2.9359307289123535, + "learning_rate": 9.181841082782877e-07, + "loss": 0.3749, + "step": 8869 + }, + { + "epoch": 4.320558622929522, + "grad_norm": 3.1834633350372314, + "learning_rate": 9.176861315837851e-07, + "loss": 0.4159, + "step": 8870 + }, + { + "epoch": 4.32104579408899, + "grad_norm": 3.2603678703308105, + "learning_rate": 9.17188259606667e-07, + "loss": 0.3842, + "step": 8871 + }, + { + "epoch": 4.3215329652484575, + "grad_norm": 3.3170933723449707, + "learning_rate": 9.166904923798822e-07, + "loss": 0.4274, + "step": 8872 + }, + { + "epoch": 4.322020136407924, + "grad_norm": 3.1959309577941895, + "learning_rate": 9.161928299363742e-07, + "loss": 0.3417, + "step": 8873 + }, + { + "epoch": 4.322507307567392, + "grad_norm": 2.8976047039031982, + "learning_rate": 9.156952723090765e-07, + "loss": 0.3216, + "step": 8874 + }, + { + "epoch": 4.3229944787268595, + "grad_norm": 3.04184889793396, + "learning_rate": 9.15197819530918e-07, + "loss": 0.4718, + "step": 8875 + }, + { + "epoch": 4.323481649886327, + "grad_norm": 3.7417500019073486, + "learning_rate": 9.147004716348204e-07, + "loss": 0.3916, + "step": 8876 + }, + { + "epoch": 4.323968821045794, + "grad_norm": 3.3566830158233643, + "learning_rate": 9.142032286536987e-07, + "loss": 0.3971, + "step": 8877 + }, + { + "epoch": 4.3244559922052614, + "grad_norm": 3.2527737617492676, + "learning_rate": 9.137060906204587e-07, + "loss": 0.4072, + "step": 8878 + }, + { + "epoch": 4.324943163364729, + "grad_norm": 3.363161325454712, + "learning_rate": 9.132090575680019e-07, + "loss": 0.3168, + "step": 8879 + }, + { + "epoch": 4.325430334524196, + "grad_norm": 3.2993557453155518, + "learning_rate": 9.127121295292229e-07, + "loss": 0.3978, + "step": 8880 + }, + { + "epoch": 4.325917505683663, + "grad_norm": 3.2496979236602783, + "learning_rate": 9.122153065370065e-07, + "loss": 0.3705, + "step": 8881 + }, + { + "epoch": 4.326404676843131, + "grad_norm": 3.258084297180176, + "learning_rate": 9.117185886242333e-07, + "loss": 0.4221, + "step": 8882 + }, + { + "epoch": 4.326891848002599, + "grad_norm": 3.2963168621063232, + "learning_rate": 9.112219758237766e-07, + "loss": 0.3746, + "step": 8883 + }, + { + "epoch": 4.327379019162065, + "grad_norm": 3.113429069519043, + "learning_rate": 9.10725468168501e-07, + "loss": 0.3821, + "step": 8884 + }, + { + "epoch": 4.327866190321533, + "grad_norm": 3.040682792663574, + "learning_rate": 9.10229065691266e-07, + "loss": 0.3734, + "step": 8885 + }, + { + "epoch": 4.328353361481001, + "grad_norm": 3.295022487640381, + "learning_rate": 9.09732768424924e-07, + "loss": 0.4318, + "step": 8886 + }, + { + "epoch": 4.328840532640467, + "grad_norm": 3.1819043159484863, + "learning_rate": 9.092365764023187e-07, + "loss": 0.3622, + "step": 8887 + }, + { + "epoch": 4.329327703799935, + "grad_norm": 3.1950526237487793, + "learning_rate": 9.087404896562884e-07, + "loss": 0.4208, + "step": 8888 + }, + { + "epoch": 4.3298148749594025, + "grad_norm": 3.068182945251465, + "learning_rate": 9.082445082196653e-07, + "loss": 0.3605, + "step": 8889 + }, + { + "epoch": 4.33030204611887, + "grad_norm": 2.957728385925293, + "learning_rate": 9.077486321252715e-07, + "loss": 0.3838, + "step": 8890 + }, + { + "epoch": 4.330789217278337, + "grad_norm": 3.283785104751587, + "learning_rate": 9.072528614059247e-07, + "loss": 0.4061, + "step": 8891 + }, + { + "epoch": 4.3312763884378045, + "grad_norm": 3.3733081817626953, + "learning_rate": 9.067571960944357e-07, + "loss": 0.41, + "step": 8892 + }, + { + "epoch": 4.331763559597272, + "grad_norm": 3.0444304943084717, + "learning_rate": 9.062616362236077e-07, + "loss": 0.3928, + "step": 8893 + }, + { + "epoch": 4.332250730756739, + "grad_norm": 3.323303461074829, + "learning_rate": 9.057661818262353e-07, + "loss": 0.3863, + "step": 8894 + }, + { + "epoch": 4.3327379019162064, + "grad_norm": 3.0553338527679443, + "learning_rate": 9.052708329351087e-07, + "loss": 0.4308, + "step": 8895 + }, + { + "epoch": 4.333225073075674, + "grad_norm": 3.2727465629577637, + "learning_rate": 9.047755895830099e-07, + "loss": 0.3612, + "step": 8896 + }, + { + "epoch": 4.333712244235142, + "grad_norm": 2.904057741165161, + "learning_rate": 9.042804518027148e-07, + "loss": 0.4011, + "step": 8897 + }, + { + "epoch": 4.334199415394608, + "grad_norm": 3.1947834491729736, + "learning_rate": 9.037854196269899e-07, + "loss": 0.4019, + "step": 8898 + }, + { + "epoch": 4.334686586554076, + "grad_norm": 4.143680095672607, + "learning_rate": 9.032904930885974e-07, + "loss": 0.3838, + "step": 8899 + }, + { + "epoch": 4.335173757713544, + "grad_norm": 3.6092922687530518, + "learning_rate": 9.027956722202921e-07, + "loss": 0.365, + "step": 8900 + }, + { + "epoch": 4.33566092887301, + "grad_norm": 3.2806999683380127, + "learning_rate": 9.023009570548197e-07, + "loss": 0.366, + "step": 8901 + }, + { + "epoch": 4.336148100032478, + "grad_norm": 3.2456090450286865, + "learning_rate": 9.018063476249211e-07, + "loss": 0.3912, + "step": 8902 + }, + { + "epoch": 4.336635271191946, + "grad_norm": 2.848466396331787, + "learning_rate": 9.013118439633298e-07, + "loss": 0.361, + "step": 8903 + }, + { + "epoch": 4.337122442351413, + "grad_norm": 3.1880457401275635, + "learning_rate": 9.008174461027724e-07, + "loss": 0.4062, + "step": 8904 + }, + { + "epoch": 4.33760961351088, + "grad_norm": 3.6805167198181152, + "learning_rate": 9.003231540759669e-07, + "loss": 0.3767, + "step": 8905 + }, + { + "epoch": 4.3380967846703475, + "grad_norm": 3.1077139377593994, + "learning_rate": 8.998289679156258e-07, + "loss": 0.4063, + "step": 8906 + }, + { + "epoch": 4.338583955829815, + "grad_norm": 2.750190258026123, + "learning_rate": 8.99334887654455e-07, + "loss": 0.3883, + "step": 8907 + }, + { + "epoch": 4.339071126989282, + "grad_norm": 3.1501657962799072, + "learning_rate": 8.988409133251527e-07, + "loss": 0.3911, + "step": 8908 + }, + { + "epoch": 4.3395582981487495, + "grad_norm": 3.3830204010009766, + "learning_rate": 8.98347044960409e-07, + "loss": 0.3871, + "step": 8909 + }, + { + "epoch": 4.340045469308217, + "grad_norm": 3.1806881427764893, + "learning_rate": 8.978532825929087e-07, + "loss": 0.4046, + "step": 8910 + }, + { + "epoch": 4.340532640467685, + "grad_norm": 3.3063011169433594, + "learning_rate": 8.973596262553291e-07, + "loss": 0.4401, + "step": 8911 + }, + { + "epoch": 4.3410198116271514, + "grad_norm": 3.1708197593688965, + "learning_rate": 8.96866075980341e-07, + "loss": 0.3565, + "step": 8912 + }, + { + "epoch": 4.341506982786619, + "grad_norm": 3.5531601905822754, + "learning_rate": 8.963726318006058e-07, + "loss": 0.3558, + "step": 8913 + }, + { + "epoch": 4.341994153946087, + "grad_norm": 3.3918957710266113, + "learning_rate": 8.958792937487803e-07, + "loss": 0.3192, + "step": 8914 + }, + { + "epoch": 4.342481325105553, + "grad_norm": 2.930818557739258, + "learning_rate": 8.953860618575141e-07, + "loss": 0.4241, + "step": 8915 + }, + { + "epoch": 4.342968496265021, + "grad_norm": 3.502345323562622, + "learning_rate": 8.948929361594497e-07, + "loss": 0.4551, + "step": 8916 + }, + { + "epoch": 4.343455667424489, + "grad_norm": 3.162384033203125, + "learning_rate": 8.943999166872203e-07, + "loss": 0.3355, + "step": 8917 + }, + { + "epoch": 4.343942838583956, + "grad_norm": 3.0118250846862793, + "learning_rate": 8.939070034734548e-07, + "loss": 0.4066, + "step": 8918 + }, + { + "epoch": 4.344430009743423, + "grad_norm": 3.306047201156616, + "learning_rate": 8.934141965507745e-07, + "loss": 0.3462, + "step": 8919 + }, + { + "epoch": 4.344917180902891, + "grad_norm": 3.1152994632720947, + "learning_rate": 8.92921495951794e-07, + "loss": 0.3819, + "step": 8920 + }, + { + "epoch": 4.345404352062358, + "grad_norm": 3.4146201610565186, + "learning_rate": 8.924289017091181e-07, + "loss": 0.3719, + "step": 8921 + }, + { + "epoch": 4.345891523221825, + "grad_norm": 3.31553053855896, + "learning_rate": 8.91936413855348e-07, + "loss": 0.4064, + "step": 8922 + }, + { + "epoch": 4.3463786943812925, + "grad_norm": 3.486168146133423, + "learning_rate": 8.914440324230761e-07, + "loss": 0.3889, + "step": 8923 + }, + { + "epoch": 4.34686586554076, + "grad_norm": 3.6154489517211914, + "learning_rate": 8.909517574448895e-07, + "loss": 0.4286, + "step": 8924 + }, + { + "epoch": 4.347353036700227, + "grad_norm": 3.577714443206787, + "learning_rate": 8.904595889533646e-07, + "loss": 0.3888, + "step": 8925 + }, + { + "epoch": 4.3478402078596945, + "grad_norm": 3.681519031524658, + "learning_rate": 8.899675269810751e-07, + "loss": 0.3889, + "step": 8926 + }, + { + "epoch": 4.348327379019162, + "grad_norm": 3.482728958129883, + "learning_rate": 8.894755715605841e-07, + "loss": 0.3347, + "step": 8927 + }, + { + "epoch": 4.34881455017863, + "grad_norm": 2.997149705886841, + "learning_rate": 8.889837227244496e-07, + "loss": 0.3299, + "step": 8928 + }, + { + "epoch": 4.3493017213380964, + "grad_norm": 2.835855722427368, + "learning_rate": 8.884919805052234e-07, + "loss": 0.4036, + "step": 8929 + }, + { + "epoch": 4.349788892497564, + "grad_norm": 2.973888397216797, + "learning_rate": 8.880003449354469e-07, + "loss": 0.3533, + "step": 8930 + }, + { + "epoch": 4.350276063657032, + "grad_norm": 3.2418935298919678, + "learning_rate": 8.875088160476572e-07, + "loss": 0.3767, + "step": 8931 + }, + { + "epoch": 4.350763234816499, + "grad_norm": 3.0025722980499268, + "learning_rate": 8.87017393874385e-07, + "loss": 0.4057, + "step": 8932 + }, + { + "epoch": 4.351250405975966, + "grad_norm": 3.2340803146362305, + "learning_rate": 8.865260784481508e-07, + "loss": 0.3507, + "step": 8933 + }, + { + "epoch": 4.351737577135434, + "grad_norm": 3.204023599624634, + "learning_rate": 8.860348698014701e-07, + "loss": 0.4172, + "step": 8934 + }, + { + "epoch": 4.352224748294901, + "grad_norm": 3.387331008911133, + "learning_rate": 8.855437679668516e-07, + "loss": 0.3557, + "step": 8935 + }, + { + "epoch": 4.352711919454368, + "grad_norm": 2.774536371231079, + "learning_rate": 8.850527729767971e-07, + "loss": 0.3676, + "step": 8936 + }, + { + "epoch": 4.353199090613836, + "grad_norm": 3.091247797012329, + "learning_rate": 8.845618848637988e-07, + "loss": 0.443, + "step": 8937 + }, + { + "epoch": 4.353686261773303, + "grad_norm": 3.452584981918335, + "learning_rate": 8.840711036603445e-07, + "loss": 0.41, + "step": 8938 + }, + { + "epoch": 4.35417343293277, + "grad_norm": 3.624983549118042, + "learning_rate": 8.835804293989142e-07, + "loss": 0.4241, + "step": 8939 + }, + { + "epoch": 4.3546606040922375, + "grad_norm": 3.275967597961426, + "learning_rate": 8.830898621119813e-07, + "loss": 0.4008, + "step": 8940 + }, + { + "epoch": 4.355147775251705, + "grad_norm": 3.7368171215057373, + "learning_rate": 8.825994018320099e-07, + "loss": 0.4186, + "step": 8941 + }, + { + "epoch": 4.355634946411173, + "grad_norm": 3.4378771781921387, + "learning_rate": 8.821090485914596e-07, + "loss": 0.4135, + "step": 8942 + }, + { + "epoch": 4.3561221175706395, + "grad_norm": 3.7970550060272217, + "learning_rate": 8.816188024227817e-07, + "loss": 0.4011, + "step": 8943 + }, + { + "epoch": 4.356609288730107, + "grad_norm": 3.7786545753479004, + "learning_rate": 8.811286633584215e-07, + "loss": 0.3608, + "step": 8944 + }, + { + "epoch": 4.357096459889575, + "grad_norm": 3.16428279876709, + "learning_rate": 8.806386314308149e-07, + "loss": 0.3864, + "step": 8945 + }, + { + "epoch": 4.357583631049042, + "grad_norm": 3.1470625400543213, + "learning_rate": 8.80148706672393e-07, + "loss": 0.3424, + "step": 8946 + }, + { + "epoch": 4.358070802208509, + "grad_norm": 3.3100993633270264, + "learning_rate": 8.796588891155786e-07, + "loss": 0.3377, + "step": 8947 + }, + { + "epoch": 4.358557973367977, + "grad_norm": 3.2827250957489014, + "learning_rate": 8.791691787927889e-07, + "loss": 0.371, + "step": 8948 + }, + { + "epoch": 4.359045144527444, + "grad_norm": 3.419351577758789, + "learning_rate": 8.786795757364314e-07, + "loss": 0.3601, + "step": 8949 + }, + { + "epoch": 4.359532315686911, + "grad_norm": 3.066441774368286, + "learning_rate": 8.781900799789084e-07, + "loss": 0.377, + "step": 8950 + }, + { + "epoch": 4.360019486846379, + "grad_norm": 3.250852584838867, + "learning_rate": 8.777006915526157e-07, + "loss": 0.3855, + "step": 8951 + }, + { + "epoch": 4.360506658005846, + "grad_norm": 3.090548515319824, + "learning_rate": 8.772114104899396e-07, + "loss": 0.4118, + "step": 8952 + }, + { + "epoch": 4.360993829165313, + "grad_norm": 3.411766529083252, + "learning_rate": 8.767222368232609e-07, + "loss": 0.3498, + "step": 8953 + }, + { + "epoch": 4.361481000324781, + "grad_norm": 3.4312644004821777, + "learning_rate": 8.762331705849536e-07, + "loss": 0.4108, + "step": 8954 + }, + { + "epoch": 4.361968171484248, + "grad_norm": 3.1633071899414062, + "learning_rate": 8.757442118073847e-07, + "loss": 0.3924, + "step": 8955 + }, + { + "epoch": 4.362455342643716, + "grad_norm": 3.3311259746551514, + "learning_rate": 8.752553605229116e-07, + "loss": 0.3504, + "step": 8956 + }, + { + "epoch": 4.3629425138031825, + "grad_norm": 3.2857658863067627, + "learning_rate": 8.747666167638877e-07, + "loss": 0.4077, + "step": 8957 + }, + { + "epoch": 4.36342968496265, + "grad_norm": 2.924485683441162, + "learning_rate": 8.742779805626575e-07, + "loss": 0.3189, + "step": 8958 + }, + { + "epoch": 4.363916856122118, + "grad_norm": 3.1298484802246094, + "learning_rate": 8.737894519515603e-07, + "loss": 0.4487, + "step": 8959 + }, + { + "epoch": 4.364404027281585, + "grad_norm": 3.474512815475464, + "learning_rate": 8.733010309629247e-07, + "loss": 0.3935, + "step": 8960 + }, + { + "epoch": 4.364891198441052, + "grad_norm": 3.470322608947754, + "learning_rate": 8.728127176290754e-07, + "loss": 0.4137, + "step": 8961 + }, + { + "epoch": 4.36537836960052, + "grad_norm": 3.3400418758392334, + "learning_rate": 8.72324511982329e-07, + "loss": 0.3784, + "step": 8962 + }, + { + "epoch": 4.365865540759987, + "grad_norm": 3.1214613914489746, + "learning_rate": 8.718364140549956e-07, + "loss": 0.4097, + "step": 8963 + }, + { + "epoch": 4.366352711919454, + "grad_norm": 3.4419045448303223, + "learning_rate": 8.713484238793757e-07, + "loss": 0.3628, + "step": 8964 + }, + { + "epoch": 4.366839883078922, + "grad_norm": 3.183767795562744, + "learning_rate": 8.708605414877657e-07, + "loss": 0.3954, + "step": 8965 + }, + { + "epoch": 4.367327054238389, + "grad_norm": 3.229807138442993, + "learning_rate": 8.703727669124534e-07, + "loss": 0.3775, + "step": 8966 + }, + { + "epoch": 4.367814225397856, + "grad_norm": 3.443837881088257, + "learning_rate": 8.698851001857203e-07, + "loss": 0.3843, + "step": 8967 + }, + { + "epoch": 4.368301396557324, + "grad_norm": 2.899824380874634, + "learning_rate": 8.693975413398387e-07, + "loss": 0.3785, + "step": 8968 + }, + { + "epoch": 4.368788567716791, + "grad_norm": 3.713561534881592, + "learning_rate": 8.68910090407076e-07, + "loss": 0.4293, + "step": 8969 + }, + { + "epoch": 4.369275738876259, + "grad_norm": 3.624744176864624, + "learning_rate": 8.684227474196924e-07, + "loss": 0.4122, + "step": 8970 + }, + { + "epoch": 4.369762910035726, + "grad_norm": 3.103705883026123, + "learning_rate": 8.679355124099387e-07, + "loss": 0.365, + "step": 8971 + }, + { + "epoch": 4.370250081195193, + "grad_norm": 3.4571399688720703, + "learning_rate": 8.674483854100607e-07, + "loss": 0.4178, + "step": 8972 + }, + { + "epoch": 4.370737252354661, + "grad_norm": 3.241440773010254, + "learning_rate": 8.669613664522974e-07, + "loss": 0.4385, + "step": 8973 + }, + { + "epoch": 4.371224423514128, + "grad_norm": 3.7578237056732178, + "learning_rate": 8.664744555688779e-07, + "loss": 0.44, + "step": 8974 + }, + { + "epoch": 4.371711594673595, + "grad_norm": 3.479661703109741, + "learning_rate": 8.659876527920277e-07, + "loss": 0.3711, + "step": 8975 + }, + { + "epoch": 4.372198765833063, + "grad_norm": 3.2630059719085693, + "learning_rate": 8.655009581539617e-07, + "loss": 0.4063, + "step": 8976 + }, + { + "epoch": 4.37268593699253, + "grad_norm": 3.493628978729248, + "learning_rate": 8.650143716868901e-07, + "loss": 0.4227, + "step": 8977 + }, + { + "epoch": 4.373173108151997, + "grad_norm": 3.2464494705200195, + "learning_rate": 8.64527893423015e-07, + "loss": 0.3639, + "step": 8978 + }, + { + "epoch": 4.373660279311465, + "grad_norm": 3.240750312805176, + "learning_rate": 8.640415233945327e-07, + "loss": 0.3368, + "step": 8979 + }, + { + "epoch": 4.374147450470932, + "grad_norm": 3.0466456413269043, + "learning_rate": 8.63555261633629e-07, + "loss": 0.3845, + "step": 8980 + }, + { + "epoch": 4.374634621630399, + "grad_norm": 3.2745492458343506, + "learning_rate": 8.630691081724857e-07, + "loss": 0.4532, + "step": 8981 + }, + { + "epoch": 4.375121792789867, + "grad_norm": 3.5284483432769775, + "learning_rate": 8.625830630432766e-07, + "loss": 0.3546, + "step": 8982 + }, + { + "epoch": 4.375608963949334, + "grad_norm": 3.0845210552215576, + "learning_rate": 8.620971262781685e-07, + "loss": 0.4304, + "step": 8983 + }, + { + "epoch": 4.376096135108802, + "grad_norm": 3.300030469894409, + "learning_rate": 8.616112979093195e-07, + "loss": 0.3744, + "step": 8984 + }, + { + "epoch": 4.376583306268269, + "grad_norm": 3.4839370250701904, + "learning_rate": 8.611255779688818e-07, + "loss": 0.4266, + "step": 8985 + }, + { + "epoch": 4.377070477427736, + "grad_norm": 3.763881206512451, + "learning_rate": 8.606399664890011e-07, + "loss": 0.3706, + "step": 8986 + }, + { + "epoch": 4.377557648587204, + "grad_norm": 3.1426868438720703, + "learning_rate": 8.601544635018155e-07, + "loss": 0.3546, + "step": 8987 + }, + { + "epoch": 4.378044819746671, + "grad_norm": 3.0810654163360596, + "learning_rate": 8.596690690394538e-07, + "loss": 0.3789, + "step": 8988 + }, + { + "epoch": 4.378531990906138, + "grad_norm": 3.059204339981079, + "learning_rate": 8.591837831340405e-07, + "loss": 0.412, + "step": 8989 + }, + { + "epoch": 4.379019162065606, + "grad_norm": 3.0803191661834717, + "learning_rate": 8.586986058176916e-07, + "loss": 0.3479, + "step": 8990 + }, + { + "epoch": 4.379506333225073, + "grad_norm": 3.487166166305542, + "learning_rate": 8.582135371225173e-07, + "loss": 0.4274, + "step": 8991 + }, + { + "epoch": 4.37999350438454, + "grad_norm": 3.2816028594970703, + "learning_rate": 8.577285770806171e-07, + "loss": 0.4298, + "step": 8992 + }, + { + "epoch": 4.380480675544008, + "grad_norm": 2.961974859237671, + "learning_rate": 8.572437257240865e-07, + "loss": 0.307, + "step": 8993 + }, + { + "epoch": 4.380967846703475, + "grad_norm": 2.974731683731079, + "learning_rate": 8.567589830850134e-07, + "loss": 0.3829, + "step": 8994 + }, + { + "epoch": 4.381455017862942, + "grad_norm": 3.235360860824585, + "learning_rate": 8.562743491954787e-07, + "loss": 0.4269, + "step": 8995 + }, + { + "epoch": 4.38194218902241, + "grad_norm": 3.2765932083129883, + "learning_rate": 8.557898240875536e-07, + "loss": 0.4219, + "step": 8996 + }, + { + "epoch": 4.382429360181877, + "grad_norm": 3.083277463912964, + "learning_rate": 8.553054077933048e-07, + "loss": 0.427, + "step": 8997 + }, + { + "epoch": 4.382916531341345, + "grad_norm": 3.670099973678589, + "learning_rate": 8.548211003447912e-07, + "loss": 0.4701, + "step": 8998 + }, + { + "epoch": 4.383403702500812, + "grad_norm": 3.3108363151550293, + "learning_rate": 8.543369017740644e-07, + "loss": 0.3189, + "step": 8999 + }, + { + "epoch": 4.383890873660279, + "grad_norm": 3.1432933807373047, + "learning_rate": 8.538528121131675e-07, + "loss": 0.369, + "step": 9000 + }, + { + "epoch": 4.384378044819747, + "grad_norm": 3.15539288520813, + "learning_rate": 8.533688313941385e-07, + "loss": 0.3854, + "step": 9001 + }, + { + "epoch": 4.384865215979214, + "grad_norm": 3.4669785499572754, + "learning_rate": 8.528849596490074e-07, + "loss": 0.3972, + "step": 9002 + }, + { + "epoch": 4.385352387138681, + "grad_norm": 3.0239696502685547, + "learning_rate": 8.524011969097956e-07, + "loss": 0.4288, + "step": 9003 + }, + { + "epoch": 4.385839558298149, + "grad_norm": 3.545590400695801, + "learning_rate": 8.519175432085191e-07, + "loss": 0.4079, + "step": 9004 + }, + { + "epoch": 4.3863267294576165, + "grad_norm": 3.4348931312561035, + "learning_rate": 8.514339985771863e-07, + "loss": 0.4218, + "step": 9005 + }, + { + "epoch": 4.386813900617083, + "grad_norm": 3.6082732677459717, + "learning_rate": 8.509505630477985e-07, + "loss": 0.4914, + "step": 9006 + }, + { + "epoch": 4.387301071776551, + "grad_norm": 3.435314416885376, + "learning_rate": 8.504672366523484e-07, + "loss": 0.4184, + "step": 9007 + }, + { + "epoch": 4.387788242936018, + "grad_norm": 3.175274133682251, + "learning_rate": 8.499840194228226e-07, + "loss": 0.3634, + "step": 9008 + }, + { + "epoch": 4.388275414095485, + "grad_norm": 3.636160373687744, + "learning_rate": 8.495009113912009e-07, + "loss": 0.382, + "step": 9009 + }, + { + "epoch": 4.388762585254953, + "grad_norm": 3.3518173694610596, + "learning_rate": 8.49017912589456e-07, + "loss": 0.3918, + "step": 9010 + }, + { + "epoch": 4.38924975641442, + "grad_norm": 3.115821599960327, + "learning_rate": 8.485350230495512e-07, + "loss": 0.407, + "step": 9011 + }, + { + "epoch": 4.389736927573888, + "grad_norm": 3.1011176109313965, + "learning_rate": 8.480522428034446e-07, + "loss": 0.284, + "step": 9012 + }, + { + "epoch": 4.390224098733355, + "grad_norm": 3.415668487548828, + "learning_rate": 8.475695718830878e-07, + "loss": 0.4656, + "step": 9013 + }, + { + "epoch": 4.390711269892822, + "grad_norm": 4.2471923828125, + "learning_rate": 8.470870103204218e-07, + "loss": 0.3701, + "step": 9014 + }, + { + "epoch": 4.39119844105229, + "grad_norm": 3.289088249206543, + "learning_rate": 8.466045581473836e-07, + "loss": 0.4071, + "step": 9015 + }, + { + "epoch": 4.391685612211757, + "grad_norm": 3.1437370777130127, + "learning_rate": 8.461222153959028e-07, + "loss": 0.381, + "step": 9016 + }, + { + "epoch": 4.392172783371224, + "grad_norm": 3.491558074951172, + "learning_rate": 8.456399820978986e-07, + "loss": 0.401, + "step": 9017 + }, + { + "epoch": 4.392659954530692, + "grad_norm": 3.660468101501465, + "learning_rate": 8.451578582852865e-07, + "loss": 0.4197, + "step": 9018 + }, + { + "epoch": 4.3931471256901595, + "grad_norm": 3.16108775138855, + "learning_rate": 8.446758439899741e-07, + "loss": 0.3846, + "step": 9019 + }, + { + "epoch": 4.393634296849626, + "grad_norm": 3.5752925872802734, + "learning_rate": 8.441939392438594e-07, + "loss": 0.3931, + "step": 9020 + }, + { + "epoch": 4.394121468009094, + "grad_norm": 3.510483980178833, + "learning_rate": 8.437121440788357e-07, + "loss": 0.3861, + "step": 9021 + }, + { + "epoch": 4.3946086391685615, + "grad_norm": 3.36728835105896, + "learning_rate": 8.432304585267886e-07, + "loss": 0.3906, + "step": 9022 + }, + { + "epoch": 4.395095810328028, + "grad_norm": 3.487119436264038, + "learning_rate": 8.427488826195948e-07, + "loss": 0.4563, + "step": 9023 + }, + { + "epoch": 4.395582981487496, + "grad_norm": 3.3855087757110596, + "learning_rate": 8.422674163891259e-07, + "loss": 0.313, + "step": 9024 + }, + { + "epoch": 4.396070152646963, + "grad_norm": 3.8471453189849854, + "learning_rate": 8.417860598672448e-07, + "loss": 0.4041, + "step": 9025 + }, + { + "epoch": 4.396557323806431, + "grad_norm": 3.1905336380004883, + "learning_rate": 8.413048130858084e-07, + "loss": 0.337, + "step": 9026 + }, + { + "epoch": 4.397044494965898, + "grad_norm": 3.0378973484039307, + "learning_rate": 8.408236760766644e-07, + "loss": 0.4163, + "step": 9027 + }, + { + "epoch": 4.397531666125365, + "grad_norm": 3.103809118270874, + "learning_rate": 8.403426488716551e-07, + "loss": 0.377, + "step": 9028 + }, + { + "epoch": 4.398018837284833, + "grad_norm": 3.227647066116333, + "learning_rate": 8.398617315026147e-07, + "loss": 0.3828, + "step": 9029 + }, + { + "epoch": 4.3985060084443, + "grad_norm": 4.419925212860107, + "learning_rate": 8.393809240013711e-07, + "loss": 0.3865, + "step": 9030 + }, + { + "epoch": 4.398993179603767, + "grad_norm": 2.9998161792755127, + "learning_rate": 8.389002263997426e-07, + "loss": 0.3808, + "step": 9031 + }, + { + "epoch": 4.399480350763235, + "grad_norm": 3.2807183265686035, + "learning_rate": 8.384196387295424e-07, + "loss": 0.4219, + "step": 9032 + }, + { + "epoch": 4.399967521922703, + "grad_norm": 3.3912837505340576, + "learning_rate": 8.379391610225757e-07, + "loss": 0.3767, + "step": 9033 + }, + { + "epoch": 4.400454693082169, + "grad_norm": 3.1201961040496826, + "learning_rate": 8.374587933106412e-07, + "loss": 0.4139, + "step": 9034 + }, + { + "epoch": 4.400941864241637, + "grad_norm": 3.4639346599578857, + "learning_rate": 8.369785356255284e-07, + "loss": 0.3739, + "step": 9035 + }, + { + "epoch": 4.4014290354011045, + "grad_norm": 3.2851884365081787, + "learning_rate": 8.364983879990213e-07, + "loss": 0.3969, + "step": 9036 + }, + { + "epoch": 4.401916206560571, + "grad_norm": 3.5648434162139893, + "learning_rate": 8.360183504628958e-07, + "loss": 0.4457, + "step": 9037 + }, + { + "epoch": 4.402403377720039, + "grad_norm": 3.45919132232666, + "learning_rate": 8.355384230489219e-07, + "loss": 0.3546, + "step": 9038 + }, + { + "epoch": 4.4028905488795065, + "grad_norm": 3.3701984882354736, + "learning_rate": 8.350586057888591e-07, + "loss": 0.3712, + "step": 9039 + }, + { + "epoch": 4.403377720038974, + "grad_norm": 3.261354923248291, + "learning_rate": 8.345788987144629e-07, + "loss": 0.4097, + "step": 9040 + }, + { + "epoch": 4.403864891198441, + "grad_norm": 3.2018914222717285, + "learning_rate": 8.3409930185748e-07, + "loss": 0.37, + "step": 9041 + }, + { + "epoch": 4.404352062357908, + "grad_norm": 2.832754373550415, + "learning_rate": 8.33619815249651e-07, + "loss": 0.4005, + "step": 9042 + }, + { + "epoch": 4.404839233517376, + "grad_norm": 3.57568097114563, + "learning_rate": 8.331404389227068e-07, + "loss": 0.3228, + "step": 9043 + }, + { + "epoch": 4.405326404676843, + "grad_norm": 2.955160617828369, + "learning_rate": 8.32661172908373e-07, + "loss": 0.3863, + "step": 9044 + }, + { + "epoch": 4.40581357583631, + "grad_norm": 3.367920160293579, + "learning_rate": 8.321820172383677e-07, + "loss": 0.4054, + "step": 9045 + }, + { + "epoch": 4.406300746995778, + "grad_norm": 3.7152538299560547, + "learning_rate": 8.317029719444017e-07, + "loss": 0.4558, + "step": 9046 + }, + { + "epoch": 4.406787918155246, + "grad_norm": 3.231205940246582, + "learning_rate": 8.31224037058177e-07, + "loss": 0.4018, + "step": 9047 + }, + { + "epoch": 4.407275089314712, + "grad_norm": 2.9684791564941406, + "learning_rate": 8.307452126113902e-07, + "loss": 0.382, + "step": 9048 + }, + { + "epoch": 4.40776226047418, + "grad_norm": 3.376253843307495, + "learning_rate": 8.302664986357298e-07, + "loss": 0.3804, + "step": 9049 + }, + { + "epoch": 4.408249431633648, + "grad_norm": 3.3969602584838867, + "learning_rate": 8.297878951628777e-07, + "loss": 0.3306, + "step": 9050 + }, + { + "epoch": 4.408736602793114, + "grad_norm": 3.3369908332824707, + "learning_rate": 8.293094022245066e-07, + "loss": 0.4002, + "step": 9051 + }, + { + "epoch": 4.409223773952582, + "grad_norm": 3.6027674674987793, + "learning_rate": 8.288310198522834e-07, + "loss": 0.4358, + "step": 9052 + }, + { + "epoch": 4.4097109451120495, + "grad_norm": 3.3909924030303955, + "learning_rate": 8.283527480778686e-07, + "loss": 0.4012, + "step": 9053 + }, + { + "epoch": 4.410198116271516, + "grad_norm": 3.284381151199341, + "learning_rate": 8.278745869329124e-07, + "loss": 0.4098, + "step": 9054 + }, + { + "epoch": 4.410685287430984, + "grad_norm": 2.9671151638031006, + "learning_rate": 8.273965364490605e-07, + "loss": 0.3966, + "step": 9055 + }, + { + "epoch": 4.4111724585904515, + "grad_norm": 3.558401107788086, + "learning_rate": 8.269185966579499e-07, + "loss": 0.4047, + "step": 9056 + }, + { + "epoch": 4.411659629749919, + "grad_norm": 4.215176582336426, + "learning_rate": 8.264407675912117e-07, + "loss": 0.4111, + "step": 9057 + }, + { + "epoch": 4.412146800909386, + "grad_norm": 3.265225887298584, + "learning_rate": 8.259630492804668e-07, + "loss": 0.3787, + "step": 9058 + }, + { + "epoch": 4.412633972068853, + "grad_norm": 3.3693881034851074, + "learning_rate": 8.254854417573313e-07, + "loss": 0.3979, + "step": 9059 + }, + { + "epoch": 4.413121143228321, + "grad_norm": 2.9816925525665283, + "learning_rate": 8.250079450534143e-07, + "loss": 0.4837, + "step": 9060 + }, + { + "epoch": 4.413608314387789, + "grad_norm": 3.549628734588623, + "learning_rate": 8.245305592003147e-07, + "loss": 0.3837, + "step": 9061 + }, + { + "epoch": 4.414095485547255, + "grad_norm": 3.0498557090759277, + "learning_rate": 8.240532842296276e-07, + "loss": 0.3889, + "step": 9062 + }, + { + "epoch": 4.414582656706723, + "grad_norm": 3.6971566677093506, + "learning_rate": 8.235761201729369e-07, + "loss": 0.4409, + "step": 9063 + }, + { + "epoch": 4.415069827866191, + "grad_norm": 3.4877572059631348, + "learning_rate": 8.230990670618227e-07, + "loss": 0.3681, + "step": 9064 + }, + { + "epoch": 4.415556999025657, + "grad_norm": 3.2322871685028076, + "learning_rate": 8.226221249278568e-07, + "loss": 0.4175, + "step": 9065 + }, + { + "epoch": 4.416044170185125, + "grad_norm": 3.0680899620056152, + "learning_rate": 8.221452938026017e-07, + "loss": 0.3832, + "step": 9066 + }, + { + "epoch": 4.416531341344593, + "grad_norm": 3.3255081176757812, + "learning_rate": 8.21668573717615e-07, + "loss": 0.4097, + "step": 9067 + }, + { + "epoch": 4.417018512504059, + "grad_norm": 3.291119337081909, + "learning_rate": 8.211919647044456e-07, + "loss": 0.4169, + "step": 9068 + }, + { + "epoch": 4.417505683663527, + "grad_norm": 3.5981574058532715, + "learning_rate": 8.207154667946366e-07, + "loss": 0.4372, + "step": 9069 + }, + { + "epoch": 4.4179928548229945, + "grad_norm": 3.6427934169769287, + "learning_rate": 8.202390800197207e-07, + "loss": 0.4079, + "step": 9070 + }, + { + "epoch": 4.418480025982462, + "grad_norm": 3.7530345916748047, + "learning_rate": 8.197628044112261e-07, + "loss": 0.3915, + "step": 9071 + }, + { + "epoch": 4.418967197141929, + "grad_norm": 3.24320125579834, + "learning_rate": 8.192866400006727e-07, + "loss": 0.4125, + "step": 9072 + }, + { + "epoch": 4.4194543683013965, + "grad_norm": 3.3961665630340576, + "learning_rate": 8.188105868195737e-07, + "loss": 0.4825, + "step": 9073 + }, + { + "epoch": 4.419941539460864, + "grad_norm": 3.2069358825683594, + "learning_rate": 8.183346448994328e-07, + "loss": 0.3863, + "step": 9074 + }, + { + "epoch": 4.420428710620332, + "grad_norm": 3.2671825885772705, + "learning_rate": 8.178588142717483e-07, + "loss": 0.3458, + "step": 9075 + }, + { + "epoch": 4.420915881779798, + "grad_norm": 3.1230788230895996, + "learning_rate": 8.173830949680108e-07, + "loss": 0.3385, + "step": 9076 + }, + { + "epoch": 4.421403052939266, + "grad_norm": 3.6910126209259033, + "learning_rate": 8.169074870197044e-07, + "loss": 0.3925, + "step": 9077 + }, + { + "epoch": 4.421890224098734, + "grad_norm": 2.931471824645996, + "learning_rate": 8.164319904583029e-07, + "loss": 0.361, + "step": 9078 + }, + { + "epoch": 4.4223773952582, + "grad_norm": 3.646836757659912, + "learning_rate": 8.159566053152756e-07, + "loss": 0.3901, + "step": 9079 + }, + { + "epoch": 4.422864566417668, + "grad_norm": 3.2289695739746094, + "learning_rate": 8.154813316220833e-07, + "loss": 0.341, + "step": 9080 + }, + { + "epoch": 4.423351737577136, + "grad_norm": 3.251042604446411, + "learning_rate": 8.150061694101804e-07, + "loss": 0.3818, + "step": 9081 + }, + { + "epoch": 4.423838908736602, + "grad_norm": 3.549929141998291, + "learning_rate": 8.145311187110113e-07, + "loss": 0.4106, + "step": 9082 + }, + { + "epoch": 4.42432607989607, + "grad_norm": 3.59171986579895, + "learning_rate": 8.140561795560162e-07, + "loss": 0.3879, + "step": 9083 + }, + { + "epoch": 4.424813251055538, + "grad_norm": 3.226522445678711, + "learning_rate": 8.135813519766259e-07, + "loss": 0.3762, + "step": 9084 + }, + { + "epoch": 4.425300422215005, + "grad_norm": 3.272014617919922, + "learning_rate": 8.131066360042656e-07, + "loss": 0.387, + "step": 9085 + }, + { + "epoch": 4.425787593374472, + "grad_norm": 3.3295652866363525, + "learning_rate": 8.126320316703501e-07, + "loss": 0.4108, + "step": 9086 + }, + { + "epoch": 4.4262747645339395, + "grad_norm": 2.899003028869629, + "learning_rate": 8.121575390062897e-07, + "loss": 0.4331, + "step": 9087 + }, + { + "epoch": 4.426761935693407, + "grad_norm": 3.48553204536438, + "learning_rate": 8.116831580434861e-07, + "loss": 0.374, + "step": 9088 + }, + { + "epoch": 4.427249106852875, + "grad_norm": 2.925675868988037, + "learning_rate": 8.112088888133349e-07, + "loss": 0.4346, + "step": 9089 + }, + { + "epoch": 4.4277362780123415, + "grad_norm": 3.5208892822265625, + "learning_rate": 8.107347313472211e-07, + "loss": 0.4333, + "step": 9090 + }, + { + "epoch": 4.428223449171809, + "grad_norm": 3.2433927059173584, + "learning_rate": 8.102606856765255e-07, + "loss": 0.47, + "step": 9091 + }, + { + "epoch": 4.428710620331277, + "grad_norm": 2.9895925521850586, + "learning_rate": 8.097867518326205e-07, + "loss": 0.4283, + "step": 9092 + }, + { + "epoch": 4.429197791490743, + "grad_norm": 3.1444034576416016, + "learning_rate": 8.093129298468719e-07, + "loss": 0.3516, + "step": 9093 + }, + { + "epoch": 4.429684962650211, + "grad_norm": 3.463284730911255, + "learning_rate": 8.08839219750635e-07, + "loss": 0.382, + "step": 9094 + }, + { + "epoch": 4.430172133809679, + "grad_norm": 3.6056535243988037, + "learning_rate": 8.083656215752611e-07, + "loss": 0.4222, + "step": 9095 + }, + { + "epoch": 4.430659304969145, + "grad_norm": 3.33219575881958, + "learning_rate": 8.078921353520929e-07, + "loss": 0.4012, + "step": 9096 + }, + { + "epoch": 4.431146476128613, + "grad_norm": 3.1184747219085693, + "learning_rate": 8.074187611124665e-07, + "loss": 0.3996, + "step": 9097 + }, + { + "epoch": 4.431633647288081, + "grad_norm": 3.125857353210449, + "learning_rate": 8.069454988877079e-07, + "loss": 0.3328, + "step": 9098 + }, + { + "epoch": 4.432120818447548, + "grad_norm": 3.594973087310791, + "learning_rate": 8.064723487091386e-07, + "loss": 0.4191, + "step": 9099 + }, + { + "epoch": 4.432607989607015, + "grad_norm": 3.566493034362793, + "learning_rate": 8.059993106080724e-07, + "loss": 0.4128, + "step": 9100 + }, + { + "epoch": 4.433095160766483, + "grad_norm": 3.1384003162384033, + "learning_rate": 8.055263846158134e-07, + "loss": 0.3315, + "step": 9101 + }, + { + "epoch": 4.43358233192595, + "grad_norm": 3.4904394149780273, + "learning_rate": 8.050535707636603e-07, + "loss": 0.4394, + "step": 9102 + }, + { + "epoch": 4.434069503085418, + "grad_norm": 3.6016829013824463, + "learning_rate": 8.045808690829049e-07, + "loss": 0.4047, + "step": 9103 + }, + { + "epoch": 4.4345566742448845, + "grad_norm": 3.7175817489624023, + "learning_rate": 8.041082796048289e-07, + "loss": 0.3585, + "step": 9104 + }, + { + "epoch": 4.435043845404352, + "grad_norm": 3.0876123905181885, + "learning_rate": 8.036358023607091e-07, + "loss": 0.4363, + "step": 9105 + }, + { + "epoch": 4.43553101656382, + "grad_norm": 3.1851022243499756, + "learning_rate": 8.031634373818148e-07, + "loss": 0.39, + "step": 9106 + }, + { + "epoch": 4.4360181877232865, + "grad_norm": 3.2375199794769287, + "learning_rate": 8.026911846994051e-07, + "loss": 0.4207, + "step": 9107 + }, + { + "epoch": 4.436505358882754, + "grad_norm": 3.22062611579895, + "learning_rate": 8.022190443447351e-07, + "loss": 0.3711, + "step": 9108 + }, + { + "epoch": 4.436992530042222, + "grad_norm": 3.3644514083862305, + "learning_rate": 8.017470163490512e-07, + "loss": 0.4752, + "step": 9109 + }, + { + "epoch": 4.437479701201688, + "grad_norm": 3.169994354248047, + "learning_rate": 8.012751007435909e-07, + "loss": 0.4072, + "step": 9110 + }, + { + "epoch": 4.437966872361156, + "grad_norm": 2.7740602493286133, + "learning_rate": 8.008032975595861e-07, + "loss": 0.3685, + "step": 9111 + }, + { + "epoch": 4.438454043520624, + "grad_norm": 6.848184585571289, + "learning_rate": 8.003316068282618e-07, + "loss": 0.5057, + "step": 9112 + }, + { + "epoch": 4.438941214680091, + "grad_norm": 3.233919382095337, + "learning_rate": 7.998600285808327e-07, + "loss": 0.4084, + "step": 9113 + }, + { + "epoch": 4.439428385839558, + "grad_norm": 3.7693681716918945, + "learning_rate": 7.993885628485085e-07, + "loss": 0.4487, + "step": 9114 + }, + { + "epoch": 4.439915556999026, + "grad_norm": 3.074598789215088, + "learning_rate": 7.989172096624909e-07, + "loss": 0.3777, + "step": 9115 + }, + { + "epoch": 4.440402728158493, + "grad_norm": 3.2176036834716797, + "learning_rate": 7.984459690539748e-07, + "loss": 0.3439, + "step": 9116 + }, + { + "epoch": 4.44088989931796, + "grad_norm": 3.1254608631134033, + "learning_rate": 7.979748410541452e-07, + "loss": 0.3225, + "step": 9117 + }, + { + "epoch": 4.441377070477428, + "grad_norm": 3.466294765472412, + "learning_rate": 7.975038256941822e-07, + "loss": 0.3869, + "step": 9118 + }, + { + "epoch": 4.441864241636895, + "grad_norm": 2.9746670722961426, + "learning_rate": 7.970329230052573e-07, + "loss": 0.3612, + "step": 9119 + }, + { + "epoch": 4.442351412796363, + "grad_norm": 3.238593816757202, + "learning_rate": 7.965621330185358e-07, + "loss": 0.4313, + "step": 9120 + }, + { + "epoch": 4.4428385839558295, + "grad_norm": 3.081117868423462, + "learning_rate": 7.96091455765173e-07, + "loss": 0.4291, + "step": 9121 + }, + { + "epoch": 4.443325755115297, + "grad_norm": 3.4598116874694824, + "learning_rate": 7.956208912763191e-07, + "loss": 0.3987, + "step": 9122 + }, + { + "epoch": 4.443812926274765, + "grad_norm": 3.084636926651001, + "learning_rate": 7.951504395831158e-07, + "loss": 0.3829, + "step": 9123 + }, + { + "epoch": 4.4443000974342315, + "grad_norm": 3.300676107406616, + "learning_rate": 7.946801007166985e-07, + "loss": 0.3608, + "step": 9124 + }, + { + "epoch": 4.444787268593699, + "grad_norm": 3.4899983406066895, + "learning_rate": 7.942098747081923e-07, + "loss": 0.385, + "step": 9125 + }, + { + "epoch": 4.445274439753167, + "grad_norm": 3.6238784790039062, + "learning_rate": 7.93739761588718e-07, + "loss": 0.3921, + "step": 9126 + }, + { + "epoch": 4.445761610912634, + "grad_norm": 3.191589593887329, + "learning_rate": 7.932697613893872e-07, + "loss": 0.3872, + "step": 9127 + }, + { + "epoch": 4.446248782072101, + "grad_norm": 3.3970344066619873, + "learning_rate": 7.927998741413057e-07, + "loss": 0.421, + "step": 9128 + }, + { + "epoch": 4.446735953231569, + "grad_norm": 3.3396854400634766, + "learning_rate": 7.923300998755684e-07, + "loss": 0.3401, + "step": 9129 + }, + { + "epoch": 4.447223124391036, + "grad_norm": 3.2123911380767822, + "learning_rate": 7.918604386232665e-07, + "loss": 0.3917, + "step": 9130 + }, + { + "epoch": 4.447710295550503, + "grad_norm": 3.4217820167541504, + "learning_rate": 7.913908904154812e-07, + "loss": 0.4201, + "step": 9131 + }, + { + "epoch": 4.448197466709971, + "grad_norm": 3.902574062347412, + "learning_rate": 7.909214552832889e-07, + "loss": 0.3535, + "step": 9132 + }, + { + "epoch": 4.448684637869438, + "grad_norm": 3.252885341644287, + "learning_rate": 7.904521332577548e-07, + "loss": 0.3638, + "step": 9133 + }, + { + "epoch": 4.449171809028906, + "grad_norm": 3.622908115386963, + "learning_rate": 7.899829243699389e-07, + "loss": 0.382, + "step": 9134 + }, + { + "epoch": 4.449658980188373, + "grad_norm": 3.2980542182922363, + "learning_rate": 7.895138286508941e-07, + "loss": 0.376, + "step": 9135 + }, + { + "epoch": 4.45014615134784, + "grad_norm": 3.011817455291748, + "learning_rate": 7.890448461316655e-07, + "loss": 0.3696, + "step": 9136 + }, + { + "epoch": 4.450633322507308, + "grad_norm": 3.3438425064086914, + "learning_rate": 7.885759768432891e-07, + "loss": 0.432, + "step": 9137 + }, + { + "epoch": 4.4511204936667745, + "grad_norm": 3.281696319580078, + "learning_rate": 7.88107220816795e-07, + "loss": 0.3814, + "step": 9138 + }, + { + "epoch": 4.451607664826242, + "grad_norm": 3.2638509273529053, + "learning_rate": 7.876385780832058e-07, + "loss": 0.3929, + "step": 9139 + }, + { + "epoch": 4.45209483598571, + "grad_norm": 3.6856987476348877, + "learning_rate": 7.871700486735365e-07, + "loss": 0.3818, + "step": 9140 + }, + { + "epoch": 4.452582007145177, + "grad_norm": 5.116368293762207, + "learning_rate": 7.867016326187935e-07, + "loss": 0.4023, + "step": 9141 + }, + { + "epoch": 4.453069178304644, + "grad_norm": 3.3621599674224854, + "learning_rate": 7.862333299499766e-07, + "loss": 0.3329, + "step": 9142 + }, + { + "epoch": 4.453556349464112, + "grad_norm": 3.4509613513946533, + "learning_rate": 7.857651406980784e-07, + "loss": 0.4085, + "step": 9143 + }, + { + "epoch": 4.454043520623579, + "grad_norm": 3.328986644744873, + "learning_rate": 7.852970648940844e-07, + "loss": 0.4045, + "step": 9144 + }, + { + "epoch": 4.454530691783046, + "grad_norm": 2.836851119995117, + "learning_rate": 7.848291025689702e-07, + "loss": 0.3393, + "step": 9145 + }, + { + "epoch": 4.455017862942514, + "grad_norm": 3.4235994815826416, + "learning_rate": 7.843612537537062e-07, + "loss": 0.4002, + "step": 9146 + }, + { + "epoch": 4.455505034101981, + "grad_norm": 3.188162088394165, + "learning_rate": 7.838935184792554e-07, + "loss": 0.3428, + "step": 9147 + }, + { + "epoch": 4.455992205261449, + "grad_norm": 3.3151180744171143, + "learning_rate": 7.834258967765713e-07, + "loss": 0.3829, + "step": 9148 + }, + { + "epoch": 4.456479376420916, + "grad_norm": 3.5344526767730713, + "learning_rate": 7.829583886766018e-07, + "loss": 0.3783, + "step": 9149 + }, + { + "epoch": 4.456966547580383, + "grad_norm": 3.177664041519165, + "learning_rate": 7.824909942102856e-07, + "loss": 0.3864, + "step": 9150 + }, + { + "epoch": 4.457453718739851, + "grad_norm": 3.2595458030700684, + "learning_rate": 7.820237134085557e-07, + "loss": 0.4128, + "step": 9151 + }, + { + "epoch": 4.457940889899318, + "grad_norm": 4.068575859069824, + "learning_rate": 7.815565463023373e-07, + "loss": 0.404, + "step": 9152 + }, + { + "epoch": 4.458428061058785, + "grad_norm": 3.323578357696533, + "learning_rate": 7.810894929225454e-07, + "loss": 0.3662, + "step": 9153 + }, + { + "epoch": 4.458915232218253, + "grad_norm": 3.549046516418457, + "learning_rate": 7.806225533000913e-07, + "loss": 0.3915, + "step": 9154 + }, + { + "epoch": 4.45940240337772, + "grad_norm": 3.493776321411133, + "learning_rate": 7.80155727465876e-07, + "loss": 0.3799, + "step": 9155 + }, + { + "epoch": 4.459889574537187, + "grad_norm": 3.225857973098755, + "learning_rate": 7.796890154507958e-07, + "loss": 0.3865, + "step": 9156 + }, + { + "epoch": 4.460376745696655, + "grad_norm": 3.9117770195007324, + "learning_rate": 7.792224172857354e-07, + "loss": 0.4019, + "step": 9157 + }, + { + "epoch": 4.460863916856122, + "grad_norm": 3.3588290214538574, + "learning_rate": 7.78755933001575e-07, + "loss": 0.4249, + "step": 9158 + }, + { + "epoch": 4.461351088015589, + "grad_norm": 3.4798269271850586, + "learning_rate": 7.782895626291878e-07, + "loss": 0.3393, + "step": 9159 + }, + { + "epoch": 4.461838259175057, + "grad_norm": 3.4237895011901855, + "learning_rate": 7.778233061994359e-07, + "loss": 0.3747, + "step": 9160 + }, + { + "epoch": 4.462325430334524, + "grad_norm": 3.1349070072174072, + "learning_rate": 7.773571637431773e-07, + "loss": 0.388, + "step": 9161 + }, + { + "epoch": 4.462812601493992, + "grad_norm": 3.195598602294922, + "learning_rate": 7.768911352912614e-07, + "loss": 0.3467, + "step": 9162 + }, + { + "epoch": 4.463299772653459, + "grad_norm": 3.037855625152588, + "learning_rate": 7.764252208745304e-07, + "loss": 0.4445, + "step": 9163 + }, + { + "epoch": 4.463786943812926, + "grad_norm": 3.8976376056671143, + "learning_rate": 7.759594205238169e-07, + "loss": 0.407, + "step": 9164 + }, + { + "epoch": 4.464274114972394, + "grad_norm": 3.2089462280273438, + "learning_rate": 7.754937342699484e-07, + "loss": 0.4189, + "step": 9165 + }, + { + "epoch": 4.464761286131861, + "grad_norm": 3.413759708404541, + "learning_rate": 7.750281621437441e-07, + "loss": 0.399, + "step": 9166 + }, + { + "epoch": 4.465248457291328, + "grad_norm": 3.3884823322296143, + "learning_rate": 7.74562704176016e-07, + "loss": 0.377, + "step": 9167 + }, + { + "epoch": 4.465735628450796, + "grad_norm": 2.975189447402954, + "learning_rate": 7.740973603975669e-07, + "loss": 0.4446, + "step": 9168 + }, + { + "epoch": 4.4662227996102635, + "grad_norm": 3.389265298843384, + "learning_rate": 7.736321308391936e-07, + "loss": 0.3414, + "step": 9169 + }, + { + "epoch": 4.46670997076973, + "grad_norm": 3.058532476425171, + "learning_rate": 7.731670155316853e-07, + "loss": 0.3513, + "step": 9170 + }, + { + "epoch": 4.467197141929198, + "grad_norm": 3.4577383995056152, + "learning_rate": 7.727020145058236e-07, + "loss": 0.3291, + "step": 9171 + }, + { + "epoch": 4.467684313088665, + "grad_norm": 3.551706314086914, + "learning_rate": 7.722371277923812e-07, + "loss": 0.4067, + "step": 9172 + }, + { + "epoch": 4.468171484248132, + "grad_norm": 3.4198484420776367, + "learning_rate": 7.717723554221249e-07, + "loss": 0.3516, + "step": 9173 + }, + { + "epoch": 4.4686586554076, + "grad_norm": 3.2609493732452393, + "learning_rate": 7.71307697425813e-07, + "loss": 0.4165, + "step": 9174 + }, + { + "epoch": 4.469145826567067, + "grad_norm": 3.1604971885681152, + "learning_rate": 7.708431538341974e-07, + "loss": 0.3519, + "step": 9175 + }, + { + "epoch": 4.469632997726535, + "grad_norm": 3.2323784828186035, + "learning_rate": 7.703787246780201e-07, + "loss": 0.4038, + "step": 9176 + }, + { + "epoch": 4.470120168886002, + "grad_norm": 3.2577197551727295, + "learning_rate": 7.699144099880179e-07, + "loss": 0.3331, + "step": 9177 + }, + { + "epoch": 4.470607340045469, + "grad_norm": 3.2251405715942383, + "learning_rate": 7.694502097949188e-07, + "loss": 0.3545, + "step": 9178 + }, + { + "epoch": 4.471094511204937, + "grad_norm": 3.1632297039031982, + "learning_rate": 7.689861241294447e-07, + "loss": 0.4198, + "step": 9179 + }, + { + "epoch": 4.471581682364404, + "grad_norm": 3.480360269546509, + "learning_rate": 7.685221530223066e-07, + "loss": 0.3957, + "step": 9180 + }, + { + "epoch": 4.472068853523871, + "grad_norm": 3.3870465755462646, + "learning_rate": 7.680582965042113e-07, + "loss": 0.4289, + "step": 9181 + }, + { + "epoch": 4.472556024683339, + "grad_norm": 4.2456488609313965, + "learning_rate": 7.675945546058566e-07, + "loss": 0.3902, + "step": 9182 + }, + { + "epoch": 4.4730431958428065, + "grad_norm": 3.37226939201355, + "learning_rate": 7.671309273579339e-07, + "loss": 0.3519, + "step": 9183 + }, + { + "epoch": 4.473530367002273, + "grad_norm": 3.180154323577881, + "learning_rate": 7.666674147911243e-07, + "loss": 0.4447, + "step": 9184 + }, + { + "epoch": 4.474017538161741, + "grad_norm": 3.0825910568237305, + "learning_rate": 7.662040169361035e-07, + "loss": 0.441, + "step": 9185 + }, + { + "epoch": 4.4745047093212085, + "grad_norm": 3.4045379161834717, + "learning_rate": 7.657407338235398e-07, + "loss": 0.4132, + "step": 9186 + }, + { + "epoch": 4.474991880480675, + "grad_norm": 3.0901520252227783, + "learning_rate": 7.652775654840932e-07, + "loss": 0.3453, + "step": 9187 + }, + { + "epoch": 4.475479051640143, + "grad_norm": 3.430309534072876, + "learning_rate": 7.648145119484152e-07, + "loss": 0.4035, + "step": 9188 + }, + { + "epoch": 4.47596622279961, + "grad_norm": 3.753628730773926, + "learning_rate": 7.643515732471512e-07, + "loss": 0.389, + "step": 9189 + }, + { + "epoch": 4.476453393959078, + "grad_norm": 3.6277379989624023, + "learning_rate": 7.638887494109393e-07, + "loss": 0.4285, + "step": 9190 + }, + { + "epoch": 4.476940565118545, + "grad_norm": 3.5529885292053223, + "learning_rate": 7.634260404704075e-07, + "loss": 0.4296, + "step": 9191 + }, + { + "epoch": 4.477427736278012, + "grad_norm": 3.3082692623138428, + "learning_rate": 7.629634464561786e-07, + "loss": 0.3592, + "step": 9192 + }, + { + "epoch": 4.47791490743748, + "grad_norm": 3.0926902294158936, + "learning_rate": 7.625009673988678e-07, + "loss": 0.4551, + "step": 9193 + }, + { + "epoch": 4.478402078596947, + "grad_norm": 3.2551751136779785, + "learning_rate": 7.620386033290805e-07, + "loss": 0.394, + "step": 9194 + }, + { + "epoch": 4.478889249756414, + "grad_norm": 2.8902480602264404, + "learning_rate": 7.615763542774166e-07, + "loss": 0.4471, + "step": 9195 + }, + { + "epoch": 4.479376420915882, + "grad_norm": 3.7043635845184326, + "learning_rate": 7.611142202744685e-07, + "loss": 0.4313, + "step": 9196 + }, + { + "epoch": 4.479863592075349, + "grad_norm": 3.1607279777526855, + "learning_rate": 7.606522013508186e-07, + "loss": 0.3306, + "step": 9197 + }, + { + "epoch": 4.480350763234816, + "grad_norm": 3.4019923210144043, + "learning_rate": 7.60190297537044e-07, + "loss": 0.3623, + "step": 9198 + }, + { + "epoch": 4.480837934394284, + "grad_norm": 3.1789209842681885, + "learning_rate": 7.597285088637141e-07, + "loss": 0.3634, + "step": 9199 + }, + { + "epoch": 4.4813251055537515, + "grad_norm": 3.560274124145508, + "learning_rate": 7.592668353613889e-07, + "loss": 0.3654, + "step": 9200 + }, + { + "epoch": 4.481812276713218, + "grad_norm": 2.7145111560821533, + "learning_rate": 7.588052770606222e-07, + "loss": 0.3922, + "step": 9201 + }, + { + "epoch": 4.482299447872686, + "grad_norm": 3.4404256343841553, + "learning_rate": 7.583438339919602e-07, + "loss": 0.3524, + "step": 9202 + }, + { + "epoch": 4.4827866190321535, + "grad_norm": 3.5713863372802734, + "learning_rate": 7.578825061859421e-07, + "loss": 0.3795, + "step": 9203 + }, + { + "epoch": 4.483273790191621, + "grad_norm": 3.2508695125579834, + "learning_rate": 7.574212936730965e-07, + "loss": 0.4203, + "step": 9204 + }, + { + "epoch": 4.483760961351088, + "grad_norm": 3.6990628242492676, + "learning_rate": 7.569601964839473e-07, + "loss": 0.4379, + "step": 9205 + }, + { + "epoch": 4.484248132510555, + "grad_norm": 3.2513539791107178, + "learning_rate": 7.564992146490102e-07, + "loss": 0.3967, + "step": 9206 + }, + { + "epoch": 4.484735303670023, + "grad_norm": 3.4282655715942383, + "learning_rate": 7.560383481987935e-07, + "loss": 0.4165, + "step": 9207 + }, + { + "epoch": 4.48522247482949, + "grad_norm": 3.631035566329956, + "learning_rate": 7.555775971637958e-07, + "loss": 0.3913, + "step": 9208 + }, + { + "epoch": 4.485709645988957, + "grad_norm": 3.4137582778930664, + "learning_rate": 7.551169615745102e-07, + "loss": 0.3671, + "step": 9209 + }, + { + "epoch": 4.486196817148425, + "grad_norm": 3.2288196086883545, + "learning_rate": 7.546564414614222e-07, + "loss": 0.351, + "step": 9210 + }, + { + "epoch": 4.486683988307892, + "grad_norm": 3.118581533432007, + "learning_rate": 7.541960368550078e-07, + "loss": 0.4136, + "step": 9211 + }, + { + "epoch": 4.487171159467359, + "grad_norm": 3.056486129760742, + "learning_rate": 7.537357477857371e-07, + "loss": 0.3699, + "step": 9212 + }, + { + "epoch": 4.487658330626827, + "grad_norm": 2.9096388816833496, + "learning_rate": 7.532755742840719e-07, + "loss": 0.4263, + "step": 9213 + }, + { + "epoch": 4.488145501786295, + "grad_norm": 4.4192423820495605, + "learning_rate": 7.528155163804674e-07, + "loss": 0.4286, + "step": 9214 + }, + { + "epoch": 4.488632672945761, + "grad_norm": 3.1934926509857178, + "learning_rate": 7.523555741053684e-07, + "loss": 0.3343, + "step": 9215 + }, + { + "epoch": 4.489119844105229, + "grad_norm": 2.986231565475464, + "learning_rate": 7.51895747489215e-07, + "loss": 0.3452, + "step": 9216 + }, + { + "epoch": 4.4896070152646965, + "grad_norm": 3.169971227645874, + "learning_rate": 7.514360365624382e-07, + "loss": 0.3469, + "step": 9217 + }, + { + "epoch": 4.490094186424164, + "grad_norm": 3.128169059753418, + "learning_rate": 7.509764413554624e-07, + "loss": 0.3399, + "step": 9218 + }, + { + "epoch": 4.490581357583631, + "grad_norm": 4.7778730392456055, + "learning_rate": 7.505169618987021e-07, + "loss": 0.3973, + "step": 9219 + }, + { + "epoch": 4.4910685287430985, + "grad_norm": 3.3370718955993652, + "learning_rate": 7.500575982225661e-07, + "loss": 0.4003, + "step": 9220 + }, + { + "epoch": 4.491555699902566, + "grad_norm": 3.498283624649048, + "learning_rate": 7.495983503574556e-07, + "loss": 0.4117, + "step": 9221 + }, + { + "epoch": 4.492042871062033, + "grad_norm": 3.2248435020446777, + "learning_rate": 7.491392183337639e-07, + "loss": 0.4827, + "step": 9222 + }, + { + "epoch": 4.4925300422215, + "grad_norm": 4.03589391708374, + "learning_rate": 7.486802021818751e-07, + "loss": 0.3833, + "step": 9223 + }, + { + "epoch": 4.493017213380968, + "grad_norm": 3.309863328933716, + "learning_rate": 7.482213019321671e-07, + "loss": 0.3807, + "step": 9224 + }, + { + "epoch": 4.493504384540435, + "grad_norm": 3.08591890335083, + "learning_rate": 7.477625176150106e-07, + "loss": 0.3934, + "step": 9225 + }, + { + "epoch": 4.493991555699902, + "grad_norm": 3.481963872909546, + "learning_rate": 7.473038492607681e-07, + "loss": 0.4021, + "step": 9226 + }, + { + "epoch": 4.49447872685937, + "grad_norm": 3.0560343265533447, + "learning_rate": 7.468452968997927e-07, + "loss": 0.3972, + "step": 9227 + }, + { + "epoch": 4.494965898018838, + "grad_norm": 3.1781857013702393, + "learning_rate": 7.463868605624325e-07, + "loss": 0.3837, + "step": 9228 + }, + { + "epoch": 4.495453069178304, + "grad_norm": 3.146965265274048, + "learning_rate": 7.459285402790267e-07, + "loss": 0.3792, + "step": 9229 + }, + { + "epoch": 4.495940240337772, + "grad_norm": 3.047945022583008, + "learning_rate": 7.454703360799074e-07, + "loss": 0.3889, + "step": 9230 + }, + { + "epoch": 4.49642741149724, + "grad_norm": 3.1790084838867188, + "learning_rate": 7.450122479953969e-07, + "loss": 0.4093, + "step": 9231 + }, + { + "epoch": 4.496914582656707, + "grad_norm": 3.1658244132995605, + "learning_rate": 7.445542760558128e-07, + "loss": 0.4197, + "step": 9232 + }, + { + "epoch": 4.497401753816174, + "grad_norm": 2.8005611896514893, + "learning_rate": 7.44096420291463e-07, + "loss": 0.3649, + "step": 9233 + }, + { + "epoch": 4.4978889249756415, + "grad_norm": 3.198068141937256, + "learning_rate": 7.436386807326493e-07, + "loss": 0.4039, + "step": 9234 + }, + { + "epoch": 4.498376096135109, + "grad_norm": 3.38095760345459, + "learning_rate": 7.431810574096635e-07, + "loss": 0.3812, + "step": 9235 + }, + { + "epoch": 4.498863267294576, + "grad_norm": 3.0073657035827637, + "learning_rate": 7.427235503527924e-07, + "loss": 0.3874, + "step": 9236 + }, + { + "epoch": 4.4993504384540435, + "grad_norm": 4.158112049102783, + "learning_rate": 7.422661595923125e-07, + "loss": 0.4, + "step": 9237 + }, + { + "epoch": 4.499837609613511, + "grad_norm": 3.4045896530151367, + "learning_rate": 7.418088851584946e-07, + "loss": 0.4478, + "step": 9238 + }, + { + "epoch": 4.500324780772978, + "grad_norm": 3.522066116333008, + "learning_rate": 7.413517270816015e-07, + "loss": 0.3274, + "step": 9239 + }, + { + "epoch": 4.500811951932445, + "grad_norm": 3.03006911277771, + "learning_rate": 7.408946853918863e-07, + "loss": 0.4252, + "step": 9240 + }, + { + "epoch": 4.501299123091913, + "grad_norm": 3.297783851623535, + "learning_rate": 7.404377601195975e-07, + "loss": 0.4171, + "step": 9241 + }, + { + "epoch": 4.501786294251381, + "grad_norm": 3.4083001613616943, + "learning_rate": 7.399809512949743e-07, + "loss": 0.3496, + "step": 9242 + }, + { + "epoch": 4.502273465410847, + "grad_norm": 3.273369073867798, + "learning_rate": 7.395242589482473e-07, + "loss": 0.3368, + "step": 9243 + }, + { + "epoch": 4.502760636570315, + "grad_norm": 3.2598330974578857, + "learning_rate": 7.390676831096408e-07, + "loss": 0.399, + "step": 9244 + }, + { + "epoch": 4.503247807729783, + "grad_norm": 3.3701813220977783, + "learning_rate": 7.386112238093712e-07, + "loss": 0.3568, + "step": 9245 + }, + { + "epoch": 4.50373497888925, + "grad_norm": 3.4224653244018555, + "learning_rate": 7.381548810776473e-07, + "loss": 0.2884, + "step": 9246 + }, + { + "epoch": 4.504222150048717, + "grad_norm": 3.2952630519866943, + "learning_rate": 7.376986549446688e-07, + "loss": 0.4197, + "step": 9247 + }, + { + "epoch": 4.504709321208185, + "grad_norm": 3.4345550537109375, + "learning_rate": 7.37242545440629e-07, + "loss": 0.3728, + "step": 9248 + }, + { + "epoch": 4.505196492367652, + "grad_norm": 2.8943533897399902, + "learning_rate": 7.367865525957135e-07, + "loss": 0.4174, + "step": 9249 + }, + { + "epoch": 4.505683663527119, + "grad_norm": 3.409534454345703, + "learning_rate": 7.363306764401007e-07, + "loss": 0.3929, + "step": 9250 + }, + { + "epoch": 4.5061708346865865, + "grad_norm": 3.5637576580047607, + "learning_rate": 7.358749170039584e-07, + "loss": 0.3734, + "step": 9251 + }, + { + "epoch": 4.506658005846054, + "grad_norm": 3.358975887298584, + "learning_rate": 7.354192743174502e-07, + "loss": 0.3892, + "step": 9252 + }, + { + "epoch": 4.507145177005521, + "grad_norm": 3.14449143409729, + "learning_rate": 7.349637484107297e-07, + "loss": 0.4271, + "step": 9253 + }, + { + "epoch": 4.5076323481649885, + "grad_norm": 3.742091655731201, + "learning_rate": 7.345083393139449e-07, + "loss": 0.3764, + "step": 9254 + }, + { + "epoch": 4.508119519324456, + "grad_norm": 3.388246774673462, + "learning_rate": 7.34053047057233e-07, + "loss": 0.3719, + "step": 9255 + }, + { + "epoch": 4.508606690483924, + "grad_norm": 3.018998861312866, + "learning_rate": 7.335978716707262e-07, + "loss": 0.4108, + "step": 9256 + }, + { + "epoch": 4.50909386164339, + "grad_norm": 3.0447559356689453, + "learning_rate": 7.331428131845475e-07, + "loss": 0.3552, + "step": 9257 + }, + { + "epoch": 4.509581032802858, + "grad_norm": 3.451012134552002, + "learning_rate": 7.326878716288138e-07, + "loss": 0.3982, + "step": 9258 + }, + { + "epoch": 4.510068203962326, + "grad_norm": 3.0597217082977295, + "learning_rate": 7.322330470336314e-07, + "loss": 0.416, + "step": 9259 + }, + { + "epoch": 4.510555375121793, + "grad_norm": 3.166809558868408, + "learning_rate": 7.317783394291014e-07, + "loss": 0.3675, + "step": 9260 + }, + { + "epoch": 4.51104254628126, + "grad_norm": 3.0051960945129395, + "learning_rate": 7.313237488453168e-07, + "loss": 0.4134, + "step": 9261 + }, + { + "epoch": 4.511529717440728, + "grad_norm": 3.478119134902954, + "learning_rate": 7.308692753123611e-07, + "loss": 0.3611, + "step": 9262 + }, + { + "epoch": 4.512016888600195, + "grad_norm": 3.295332670211792, + "learning_rate": 7.304149188603121e-07, + "loss": 0.3904, + "step": 9263 + }, + { + "epoch": 4.512504059759662, + "grad_norm": 3.2598140239715576, + "learning_rate": 7.29960679519239e-07, + "loss": 0.3775, + "step": 9264 + }, + { + "epoch": 4.51299123091913, + "grad_norm": 3.5887949466705322, + "learning_rate": 7.295065573192042e-07, + "loss": 0.4436, + "step": 9265 + }, + { + "epoch": 4.513478402078597, + "grad_norm": 3.199266195297241, + "learning_rate": 7.290525522902597e-07, + "loss": 0.3579, + "step": 9266 + }, + { + "epoch": 4.513965573238064, + "grad_norm": 3.6405909061431885, + "learning_rate": 7.285986644624521e-07, + "loss": 0.4066, + "step": 9267 + }, + { + "epoch": 4.5144527443975315, + "grad_norm": 3.1926844120025635, + "learning_rate": 7.281448938658203e-07, + "loss": 0.4708, + "step": 9268 + }, + { + "epoch": 4.514939915556999, + "grad_norm": 2.88193678855896, + "learning_rate": 7.276912405303951e-07, + "loss": 0.4025, + "step": 9269 + }, + { + "epoch": 4.515427086716467, + "grad_norm": 3.0900678634643555, + "learning_rate": 7.272377044861978e-07, + "loss": 0.3584, + "step": 9270 + }, + { + "epoch": 4.5159142578759335, + "grad_norm": 3.543545722961426, + "learning_rate": 7.26784285763244e-07, + "loss": 0.3885, + "step": 9271 + }, + { + "epoch": 4.516401429035401, + "grad_norm": 3.2517645359039307, + "learning_rate": 7.263309843915412e-07, + "loss": 0.3705, + "step": 9272 + }, + { + "epoch": 4.516888600194869, + "grad_norm": 3.317131757736206, + "learning_rate": 7.258778004010894e-07, + "loss": 0.3748, + "step": 9273 + }, + { + "epoch": 4.517375771354336, + "grad_norm": 3.2044081687927246, + "learning_rate": 7.254247338218786e-07, + "loss": 0.4106, + "step": 9274 + }, + { + "epoch": 4.517862942513803, + "grad_norm": 3.090555429458618, + "learning_rate": 7.249717846838938e-07, + "loss": 0.3968, + "step": 9275 + }, + { + "epoch": 4.518350113673271, + "grad_norm": 3.2260420322418213, + "learning_rate": 7.245189530171113e-07, + "loss": 0.3827, + "step": 9276 + }, + { + "epoch": 4.518837284832738, + "grad_norm": 3.7410788536071777, + "learning_rate": 7.240662388514996e-07, + "loss": 0.3757, + "step": 9277 + }, + { + "epoch": 4.519324455992205, + "grad_norm": 3.088083028793335, + "learning_rate": 7.236136422170181e-07, + "loss": 0.3415, + "step": 9278 + }, + { + "epoch": 4.519811627151673, + "grad_norm": 3.2211930751800537, + "learning_rate": 7.231611631436203e-07, + "loss": 0.3834, + "step": 9279 + }, + { + "epoch": 4.52029879831114, + "grad_norm": 3.283576488494873, + "learning_rate": 7.227088016612521e-07, + "loss": 0.4068, + "step": 9280 + }, + { + "epoch": 4.520785969470607, + "grad_norm": 3.156041383743286, + "learning_rate": 7.222565577998492e-07, + "loss": 0.3697, + "step": 9281 + }, + { + "epoch": 4.521273140630075, + "grad_norm": 3.4529311656951904, + "learning_rate": 7.218044315893419e-07, + "loss": 0.4154, + "step": 9282 + }, + { + "epoch": 4.521760311789542, + "grad_norm": 3.0578718185424805, + "learning_rate": 7.213524230596522e-07, + "loss": 0.4123, + "step": 9283 + }, + { + "epoch": 4.52224748294901, + "grad_norm": 3.267512083053589, + "learning_rate": 7.209005322406929e-07, + "loss": 0.3626, + "step": 9284 + }, + { + "epoch": 4.5227346541084765, + "grad_norm": 3.0784804821014404, + "learning_rate": 7.204487591623713e-07, + "loss": 0.3839, + "step": 9285 + }, + { + "epoch": 4.523221825267944, + "grad_norm": 3.5697193145751953, + "learning_rate": 7.199971038545847e-07, + "loss": 0.4475, + "step": 9286 + }, + { + "epoch": 4.523708996427412, + "grad_norm": 3.359091281890869, + "learning_rate": 7.195455663472237e-07, + "loss": 0.3698, + "step": 9287 + }, + { + "epoch": 4.5241961675868785, + "grad_norm": 3.2359395027160645, + "learning_rate": 7.190941466701715e-07, + "loss": 0.4573, + "step": 9288 + }, + { + "epoch": 4.524683338746346, + "grad_norm": 4.741020679473877, + "learning_rate": 7.186428448533034e-07, + "loss": 0.4316, + "step": 9289 + }, + { + "epoch": 4.525170509905814, + "grad_norm": 3.2400858402252197, + "learning_rate": 7.181916609264853e-07, + "loss": 0.4249, + "step": 9290 + }, + { + "epoch": 4.525657681065281, + "grad_norm": 3.548269510269165, + "learning_rate": 7.177405949195771e-07, + "loss": 0.3914, + "step": 9291 + }, + { + "epoch": 4.526144852224748, + "grad_norm": 3.6246795654296875, + "learning_rate": 7.172896468624304e-07, + "loss": 0.4105, + "step": 9292 + }, + { + "epoch": 4.526632023384216, + "grad_norm": 3.507983684539795, + "learning_rate": 7.168388167848897e-07, + "loss": 0.4063, + "step": 9293 + }, + { + "epoch": 4.527119194543683, + "grad_norm": 3.7874467372894287, + "learning_rate": 7.163881047167892e-07, + "loss": 0.3583, + "step": 9294 + }, + { + "epoch": 4.52760636570315, + "grad_norm": 3.3181581497192383, + "learning_rate": 7.15937510687958e-07, + "loss": 0.3585, + "step": 9295 + }, + { + "epoch": 4.528093536862618, + "grad_norm": 3.023343086242676, + "learning_rate": 7.154870347282161e-07, + "loss": 0.3587, + "step": 9296 + }, + { + "epoch": 4.528580708022085, + "grad_norm": 3.244870662689209, + "learning_rate": 7.150366768673767e-07, + "loss": 0.3698, + "step": 9297 + }, + { + "epoch": 4.529067879181552, + "grad_norm": 3.4219701290130615, + "learning_rate": 7.145864371352431e-07, + "loss": 0.3843, + "step": 9298 + }, + { + "epoch": 4.52955505034102, + "grad_norm": 3.3953537940979004, + "learning_rate": 7.141363155616129e-07, + "loss": 0.445, + "step": 9299 + }, + { + "epoch": 4.530042221500487, + "grad_norm": 3.1291797161102295, + "learning_rate": 7.136863121762752e-07, + "loss": 0.3854, + "step": 9300 + }, + { + "epoch": 4.530529392659955, + "grad_norm": 3.272922992706299, + "learning_rate": 7.132364270090117e-07, + "loss": 0.4018, + "step": 9301 + }, + { + "epoch": 4.5310165638194215, + "grad_norm": 3.3973684310913086, + "learning_rate": 7.127866600895941e-07, + "loss": 0.3973, + "step": 9302 + }, + { + "epoch": 4.531503734978889, + "grad_norm": 3.3990085124969482, + "learning_rate": 7.12337011447789e-07, + "loss": 0.3562, + "step": 9303 + }, + { + "epoch": 4.531990906138357, + "grad_norm": 3.323467254638672, + "learning_rate": 7.118874811133541e-07, + "loss": 0.3649, + "step": 9304 + }, + { + "epoch": 4.532478077297824, + "grad_norm": 3.399278163909912, + "learning_rate": 7.114380691160399e-07, + "loss": 0.41, + "step": 9305 + }, + { + "epoch": 4.532965248457291, + "grad_norm": 3.17242431640625, + "learning_rate": 7.10988775485587e-07, + "loss": 0.3819, + "step": 9306 + }, + { + "epoch": 4.533452419616759, + "grad_norm": 2.988388776779175, + "learning_rate": 7.105396002517303e-07, + "loss": 0.3407, + "step": 9307 + }, + { + "epoch": 4.533939590776226, + "grad_norm": 3.2454640865325928, + "learning_rate": 7.100905434441962e-07, + "loss": 0.394, + "step": 9308 + }, + { + "epoch": 4.534426761935693, + "grad_norm": 3.4285550117492676, + "learning_rate": 7.09641605092704e-07, + "loss": 0.4526, + "step": 9309 + }, + { + "epoch": 4.534913933095161, + "grad_norm": 3.631131410598755, + "learning_rate": 7.091927852269631e-07, + "loss": 0.4155, + "step": 9310 + }, + { + "epoch": 4.535401104254628, + "grad_norm": 3.001248359680176, + "learning_rate": 7.087440838766768e-07, + "loss": 0.3446, + "step": 9311 + }, + { + "epoch": 4.535888275414095, + "grad_norm": 3.3403687477111816, + "learning_rate": 7.082955010715409e-07, + "loss": 0.3762, + "step": 9312 + }, + { + "epoch": 4.536375446573563, + "grad_norm": 3.3148767948150635, + "learning_rate": 7.078470368412413e-07, + "loss": 0.3168, + "step": 9313 + }, + { + "epoch": 4.53686261773303, + "grad_norm": 3.167118549346924, + "learning_rate": 7.073986912154579e-07, + "loss": 0.3832, + "step": 9314 + }, + { + "epoch": 4.537349788892498, + "grad_norm": 3.566016674041748, + "learning_rate": 7.06950464223862e-07, + "loss": 0.3781, + "step": 9315 + }, + { + "epoch": 4.537836960051965, + "grad_norm": 3.710780143737793, + "learning_rate": 7.065023558961182e-07, + "loss": 0.485, + "step": 9316 + }, + { + "epoch": 4.538324131211432, + "grad_norm": 3.8064074516296387, + "learning_rate": 7.060543662618807e-07, + "loss": 0.3976, + "step": 9317 + }, + { + "epoch": 4.5388113023709, + "grad_norm": 3.058011293411255, + "learning_rate": 7.056064953507985e-07, + "loss": 0.361, + "step": 9318 + }, + { + "epoch": 4.539298473530367, + "grad_norm": 2.9055581092834473, + "learning_rate": 7.051587431925111e-07, + "loss": 0.3485, + "step": 9319 + }, + { + "epoch": 4.539785644689834, + "grad_norm": 3.4038355350494385, + "learning_rate": 7.047111098166517e-07, + "loss": 0.381, + "step": 9320 + }, + { + "epoch": 4.540272815849302, + "grad_norm": 3.1795573234558105, + "learning_rate": 7.042635952528432e-07, + "loss": 0.3722, + "step": 9321 + }, + { + "epoch": 4.540759987008769, + "grad_norm": 3.2817955017089844, + "learning_rate": 7.038161995307027e-07, + "loss": 0.4139, + "step": 9322 + }, + { + "epoch": 4.541247158168236, + "grad_norm": 3.3785743713378906, + "learning_rate": 7.033689226798396e-07, + "loss": 0.3323, + "step": 9323 + }, + { + "epoch": 4.541734329327704, + "grad_norm": 3.0324161052703857, + "learning_rate": 7.029217647298531e-07, + "loss": 0.3691, + "step": 9324 + }, + { + "epoch": 4.542221500487171, + "grad_norm": 3.507370948791504, + "learning_rate": 7.024747257103367e-07, + "loss": 0.3656, + "step": 9325 + }, + { + "epoch": 4.542708671646638, + "grad_norm": 2.821471691131592, + "learning_rate": 7.020278056508767e-07, + "loss": 0.3646, + "step": 9326 + }, + { + "epoch": 4.543195842806106, + "grad_norm": 3.753357172012329, + "learning_rate": 7.015810045810481e-07, + "loss": 0.3333, + "step": 9327 + }, + { + "epoch": 4.543683013965573, + "grad_norm": 3.302145004272461, + "learning_rate": 7.011343225304213e-07, + "loss": 0.4898, + "step": 9328 + }, + { + "epoch": 4.544170185125041, + "grad_norm": 3.7613885402679443, + "learning_rate": 7.006877595285583e-07, + "loss": 0.3843, + "step": 9329 + }, + { + "epoch": 4.544657356284508, + "grad_norm": 3.0552473068237305, + "learning_rate": 7.002413156050109e-07, + "loss": 0.4029, + "step": 9330 + }, + { + "epoch": 4.545144527443975, + "grad_norm": 3.1941850185394287, + "learning_rate": 6.997949907893256e-07, + "loss": 0.3717, + "step": 9331 + }, + { + "epoch": 4.545631698603443, + "grad_norm": 3.295895576477051, + "learning_rate": 6.993487851110411e-07, + "loss": 0.4439, + "step": 9332 + }, + { + "epoch": 4.5461188697629105, + "grad_norm": 3.7417101860046387, + "learning_rate": 6.989026985996855e-07, + "loss": 0.3489, + "step": 9333 + }, + { + "epoch": 4.546606040922377, + "grad_norm": 3.2827823162078857, + "learning_rate": 6.984567312847815e-07, + "loss": 0.3671, + "step": 9334 + }, + { + "epoch": 4.547093212081845, + "grad_norm": 3.3782949447631836, + "learning_rate": 6.980108831958435e-07, + "loss": 0.4068, + "step": 9335 + }, + { + "epoch": 4.547580383241312, + "grad_norm": 3.5621094703674316, + "learning_rate": 6.97565154362378e-07, + "loss": 0.3937, + "step": 9336 + }, + { + "epoch": 4.548067554400779, + "grad_norm": 3.7077527046203613, + "learning_rate": 6.97119544813882e-07, + "loss": 0.4623, + "step": 9337 + }, + { + "epoch": 4.548554725560247, + "grad_norm": 3.1814677715301514, + "learning_rate": 6.966740545798467e-07, + "loss": 0.3643, + "step": 9338 + }, + { + "epoch": 4.549041896719714, + "grad_norm": 3.1527256965637207, + "learning_rate": 6.962286836897545e-07, + "loss": 0.388, + "step": 9339 + }, + { + "epoch": 4.549529067879181, + "grad_norm": 3.279761552810669, + "learning_rate": 6.957834321730808e-07, + "loss": 0.395, + "step": 9340 + }, + { + "epoch": 4.550016239038649, + "grad_norm": 3.2149529457092285, + "learning_rate": 6.953383000592909e-07, + "loss": 0.3869, + "step": 9341 + }, + { + "epoch": 4.550503410198116, + "grad_norm": 3.5438785552978516, + "learning_rate": 6.948932873778441e-07, + "loss": 0.4344, + "step": 9342 + }, + { + "epoch": 4.550990581357584, + "grad_norm": 3.4925272464752197, + "learning_rate": 6.944483941581914e-07, + "loss": 0.3853, + "step": 9343 + }, + { + "epoch": 4.551477752517051, + "grad_norm": 3.525930166244507, + "learning_rate": 6.940036204297767e-07, + "loss": 0.3581, + "step": 9344 + }, + { + "epoch": 4.551964923676518, + "grad_norm": 3.237793207168579, + "learning_rate": 6.935589662220337e-07, + "loss": 0.4173, + "step": 9345 + }, + { + "epoch": 4.552452094835986, + "grad_norm": 3.6175715923309326, + "learning_rate": 6.931144315643898e-07, + "loss": 0.3833, + "step": 9346 + }, + { + "epoch": 4.5529392659954535, + "grad_norm": 3.508599042892456, + "learning_rate": 6.926700164862646e-07, + "loss": 0.4212, + "step": 9347 + }, + { + "epoch": 4.55342643715492, + "grad_norm": 3.238513946533203, + "learning_rate": 6.922257210170702e-07, + "loss": 0.3946, + "step": 9348 + }, + { + "epoch": 4.553913608314388, + "grad_norm": 3.1810824871063232, + "learning_rate": 6.917815451862086e-07, + "loss": 0.4648, + "step": 9349 + }, + { + "epoch": 4.5544007794738555, + "grad_norm": 3.449483871459961, + "learning_rate": 6.913374890230759e-07, + "loss": 0.4168, + "step": 9350 + }, + { + "epoch": 4.554887950633322, + "grad_norm": 3.051408290863037, + "learning_rate": 6.908935525570598e-07, + "loss": 0.443, + "step": 9351 + }, + { + "epoch": 4.55537512179279, + "grad_norm": 3.3578410148620605, + "learning_rate": 6.904497358175405e-07, + "loss": 0.4037, + "step": 9352 + }, + { + "epoch": 4.555862292952257, + "grad_norm": 3.0225257873535156, + "learning_rate": 6.900060388338886e-07, + "loss": 0.3721, + "step": 9353 + }, + { + "epoch": 4.556349464111724, + "grad_norm": 3.324152946472168, + "learning_rate": 6.895624616354685e-07, + "loss": 0.3843, + "step": 9354 + }, + { + "epoch": 4.556836635271192, + "grad_norm": 3.518629789352417, + "learning_rate": 6.891190042516357e-07, + "loss": 0.4059, + "step": 9355 + }, + { + "epoch": 4.557323806430659, + "grad_norm": 3.2975099086761475, + "learning_rate": 6.886756667117398e-07, + "loss": 0.3338, + "step": 9356 + }, + { + "epoch": 4.557810977590127, + "grad_norm": 3.3344271183013916, + "learning_rate": 6.882324490451187e-07, + "loss": 0.3272, + "step": 9357 + }, + { + "epoch": 4.558298148749594, + "grad_norm": 3.155017852783203, + "learning_rate": 6.877893512811054e-07, + "loss": 0.4104, + "step": 9358 + }, + { + "epoch": 4.558785319909061, + "grad_norm": 4.05630350112915, + "learning_rate": 6.873463734490241e-07, + "loss": 0.436, + "step": 9359 + }, + { + "epoch": 4.559272491068529, + "grad_norm": 3.2903940677642822, + "learning_rate": 6.869035155781917e-07, + "loss": 0.4113, + "step": 9360 + }, + { + "epoch": 4.559759662227997, + "grad_norm": 3.2385165691375732, + "learning_rate": 6.864607776979152e-07, + "loss": 0.3977, + "step": 9361 + }, + { + "epoch": 4.560246833387463, + "grad_norm": 3.2934374809265137, + "learning_rate": 6.860181598374955e-07, + "loss": 0.418, + "step": 9362 + }, + { + "epoch": 4.560734004546931, + "grad_norm": 2.9157791137695312, + "learning_rate": 6.855756620262258e-07, + "loss": 0.3547, + "step": 9363 + }, + { + "epoch": 4.5612211757063985, + "grad_norm": 3.0671815872192383, + "learning_rate": 6.851332842933889e-07, + "loss": 0.4509, + "step": 9364 + }, + { + "epoch": 4.561708346865865, + "grad_norm": 3.3495054244995117, + "learning_rate": 6.846910266682624e-07, + "loss": 0.3144, + "step": 9365 + }, + { + "epoch": 4.562195518025333, + "grad_norm": 3.2055442333221436, + "learning_rate": 6.842488891801147e-07, + "loss": 0.437, + "step": 9366 + }, + { + "epoch": 4.5626826891848005, + "grad_norm": 3.4032955169677734, + "learning_rate": 6.838068718582072e-07, + "loss": 0.3666, + "step": 9367 + }, + { + "epoch": 4.563169860344267, + "grad_norm": 3.787595272064209, + "learning_rate": 6.83364974731791e-07, + "loss": 0.3683, + "step": 9368 + }, + { + "epoch": 4.563657031503735, + "grad_norm": 3.3148529529571533, + "learning_rate": 6.829231978301118e-07, + "loss": 0.3471, + "step": 9369 + }, + { + "epoch": 4.564144202663202, + "grad_norm": 3.424901008605957, + "learning_rate": 6.824815411824067e-07, + "loss": 0.4103, + "step": 9370 + }, + { + "epoch": 4.56463137382267, + "grad_norm": 3.3632867336273193, + "learning_rate": 6.820400048179032e-07, + "loss": 0.383, + "step": 9371 + }, + { + "epoch": 4.565118544982137, + "grad_norm": 3.4199378490448, + "learning_rate": 6.815985887658239e-07, + "loss": 0.3932, + "step": 9372 + }, + { + "epoch": 4.565605716141604, + "grad_norm": 4.242677211761475, + "learning_rate": 6.811572930553798e-07, + "loss": 0.4551, + "step": 9373 + }, + { + "epoch": 4.566092887301072, + "grad_norm": 3.394152879714966, + "learning_rate": 6.807161177157764e-07, + "loss": 0.3861, + "step": 9374 + }, + { + "epoch": 4.56658005846054, + "grad_norm": 3.5347702503204346, + "learning_rate": 6.802750627762119e-07, + "loss": 0.323, + "step": 9375 + }, + { + "epoch": 4.567067229620006, + "grad_norm": 3.220489263534546, + "learning_rate": 6.798341282658735e-07, + "loss": 0.3296, + "step": 9376 + }, + { + "epoch": 4.567554400779474, + "grad_norm": 3.1894471645355225, + "learning_rate": 6.793933142139431e-07, + "loss": 0.3448, + "step": 9377 + }, + { + "epoch": 4.568041571938942, + "grad_norm": 2.96846866607666, + "learning_rate": 6.789526206495936e-07, + "loss": 0.3943, + "step": 9378 + }, + { + "epoch": 4.568528743098408, + "grad_norm": 3.3191792964935303, + "learning_rate": 6.785120476019908e-07, + "loss": 0.4628, + "step": 9379 + }, + { + "epoch": 4.569015914257876, + "grad_norm": 3.7769486904144287, + "learning_rate": 6.780715951002903e-07, + "loss": 0.3859, + "step": 9380 + }, + { + "epoch": 4.5695030854173435, + "grad_norm": 3.4988415241241455, + "learning_rate": 6.776312631736417e-07, + "loss": 0.3845, + "step": 9381 + }, + { + "epoch": 4.56999025657681, + "grad_norm": 3.3114871978759766, + "learning_rate": 6.771910518511868e-07, + "loss": 0.3779, + "step": 9382 + }, + { + "epoch": 4.570477427736278, + "grad_norm": 3.089280366897583, + "learning_rate": 6.76750961162059e-07, + "loss": 0.368, + "step": 9383 + }, + { + "epoch": 4.5709645988957455, + "grad_norm": 3.277737855911255, + "learning_rate": 6.763109911353822e-07, + "loss": 0.3999, + "step": 9384 + }, + { + "epoch": 4.571451770055213, + "grad_norm": 3.2404587268829346, + "learning_rate": 6.75871141800274e-07, + "loss": 0.3679, + "step": 9385 + }, + { + "epoch": 4.57193894121468, + "grad_norm": 3.026531934738159, + "learning_rate": 6.754314131858436e-07, + "loss": 0.4001, + "step": 9386 + }, + { + "epoch": 4.572426112374147, + "grad_norm": 3.1305902004241943, + "learning_rate": 6.749918053211934e-07, + "loss": 0.3701, + "step": 9387 + }, + { + "epoch": 4.572913283533615, + "grad_norm": 3.1271257400512695, + "learning_rate": 6.745523182354147e-07, + "loss": 0.3596, + "step": 9388 + }, + { + "epoch": 4.573400454693083, + "grad_norm": 3.0253894329071045, + "learning_rate": 6.741129519575937e-07, + "loss": 0.3811, + "step": 9389 + }, + { + "epoch": 4.573887625852549, + "grad_norm": 3.598757743835449, + "learning_rate": 6.736737065168075e-07, + "loss": 0.3816, + "step": 9390 + }, + { + "epoch": 4.574374797012017, + "grad_norm": 3.308354377746582, + "learning_rate": 6.73234581942126e-07, + "loss": 0.4518, + "step": 9391 + }, + { + "epoch": 4.574861968171485, + "grad_norm": 3.491838216781616, + "learning_rate": 6.727955782626092e-07, + "loss": 0.401, + "step": 9392 + }, + { + "epoch": 4.575349139330951, + "grad_norm": 3.511936664581299, + "learning_rate": 6.723566955073105e-07, + "loss": 0.4986, + "step": 9393 + }, + { + "epoch": 4.575836310490419, + "grad_norm": 3.565380811691284, + "learning_rate": 6.719179337052759e-07, + "loss": 0.3657, + "step": 9394 + }, + { + "epoch": 4.576323481649887, + "grad_norm": 3.1003496646881104, + "learning_rate": 6.714792928855427e-07, + "loss": 0.3986, + "step": 9395 + }, + { + "epoch": 4.576810652809353, + "grad_norm": 4.764305591583252, + "learning_rate": 6.71040773077139e-07, + "loss": 0.4211, + "step": 9396 + }, + { + "epoch": 4.577297823968821, + "grad_norm": 3.1531918048858643, + "learning_rate": 6.706023743090867e-07, + "loss": 0.4038, + "step": 9397 + }, + { + "epoch": 4.5777849951282885, + "grad_norm": 3.068908452987671, + "learning_rate": 6.701640966103987e-07, + "loss": 0.4384, + "step": 9398 + }, + { + "epoch": 4.578272166287756, + "grad_norm": 3.556506633758545, + "learning_rate": 6.697259400100812e-07, + "loss": 0.3932, + "step": 9399 + }, + { + "epoch": 4.578759337447223, + "grad_norm": 3.4161293506622314, + "learning_rate": 6.692879045371298e-07, + "loss": 0.4048, + "step": 9400 + }, + { + "epoch": 4.5792465086066905, + "grad_norm": 3.135338068008423, + "learning_rate": 6.688499902205345e-07, + "loss": 0.3994, + "step": 9401 + }, + { + "epoch": 4.579733679766158, + "grad_norm": 3.0780017375946045, + "learning_rate": 6.684121970892762e-07, + "loss": 0.4453, + "step": 9402 + }, + { + "epoch": 4.580220850925626, + "grad_norm": 4.5987677574157715, + "learning_rate": 6.679745251723291e-07, + "loss": 0.4098, + "step": 9403 + }, + { + "epoch": 4.580708022085092, + "grad_norm": 4.070243835449219, + "learning_rate": 6.675369744986565e-07, + "loss": 0.4196, + "step": 9404 + }, + { + "epoch": 4.58119519324456, + "grad_norm": 3.31239652633667, + "learning_rate": 6.670995450972162e-07, + "loss": 0.3521, + "step": 9405 + }, + { + "epoch": 4.581682364404028, + "grad_norm": 3.313122272491455, + "learning_rate": 6.666622369969575e-07, + "loss": 0.3355, + "step": 9406 + }, + { + "epoch": 4.582169535563494, + "grad_norm": 2.990091323852539, + "learning_rate": 6.662250502268217e-07, + "loss": 0.397, + "step": 9407 + }, + { + "epoch": 4.582656706722962, + "grad_norm": 3.0493741035461426, + "learning_rate": 6.657879848157409e-07, + "loss": 0.3856, + "step": 9408 + }, + { + "epoch": 4.58314387788243, + "grad_norm": 3.1897192001342773, + "learning_rate": 6.653510407926403e-07, + "loss": 0.4544, + "step": 9409 + }, + { + "epoch": 4.583631049041896, + "grad_norm": 3.5427114963531494, + "learning_rate": 6.649142181864377e-07, + "loss": 0.3846, + "step": 9410 + }, + { + "epoch": 4.584118220201364, + "grad_norm": 3.361748695373535, + "learning_rate": 6.644775170260404e-07, + "loss": 0.3295, + "step": 9411 + }, + { + "epoch": 4.584605391360832, + "grad_norm": 3.447746753692627, + "learning_rate": 6.640409373403503e-07, + "loss": 0.3613, + "step": 9412 + }, + { + "epoch": 4.585092562520299, + "grad_norm": 3.4159352779388428, + "learning_rate": 6.636044791582605e-07, + "loss": 0.4114, + "step": 9413 + }, + { + "epoch": 4.585579733679766, + "grad_norm": 3.479973554611206, + "learning_rate": 6.631681425086547e-07, + "loss": 0.4007, + "step": 9414 + }, + { + "epoch": 4.5860669048392335, + "grad_norm": 4.153615474700928, + "learning_rate": 6.627319274204103e-07, + "loss": 0.4202, + "step": 9415 + }, + { + "epoch": 4.586554075998701, + "grad_norm": 3.0913126468658447, + "learning_rate": 6.622958339223965e-07, + "loss": 0.4241, + "step": 9416 + }, + { + "epoch": 4.587041247158168, + "grad_norm": 3.449582099914551, + "learning_rate": 6.618598620434724e-07, + "loss": 0.4191, + "step": 9417 + }, + { + "epoch": 4.5875284183176355, + "grad_norm": 3.471867322921753, + "learning_rate": 6.614240118124915e-07, + "loss": 0.4349, + "step": 9418 + }, + { + "epoch": 4.588015589477103, + "grad_norm": 3.273515224456787, + "learning_rate": 6.609882832582993e-07, + "loss": 0.3322, + "step": 9419 + }, + { + "epoch": 4.588502760636571, + "grad_norm": 2.9966654777526855, + "learning_rate": 6.605526764097303e-07, + "loss": 0.3804, + "step": 9420 + }, + { + "epoch": 4.588989931796037, + "grad_norm": 3.4117302894592285, + "learning_rate": 6.601171912956139e-07, + "loss": 0.3649, + "step": 9421 + }, + { + "epoch": 4.589477102955505, + "grad_norm": 3.406041383743286, + "learning_rate": 6.596818279447712e-07, + "loss": 0.4351, + "step": 9422 + }, + { + "epoch": 4.589964274114973, + "grad_norm": 3.055589437484741, + "learning_rate": 6.592465863860131e-07, + "loss": 0.404, + "step": 9423 + }, + { + "epoch": 4.590451445274439, + "grad_norm": 2.844804286956787, + "learning_rate": 6.588114666481443e-07, + "loss": 0.3705, + "step": 9424 + }, + { + "epoch": 4.590938616433907, + "grad_norm": 3.3157966136932373, + "learning_rate": 6.583764687599615e-07, + "loss": 0.377, + "step": 9425 + }, + { + "epoch": 4.591425787593375, + "grad_norm": 2.9946088790893555, + "learning_rate": 6.579415927502531e-07, + "loss": 0.3666, + "step": 9426 + }, + { + "epoch": 4.591912958752841, + "grad_norm": 3.257434129714966, + "learning_rate": 6.575068386477975e-07, + "loss": 0.4061, + "step": 9427 + }, + { + "epoch": 4.592400129912309, + "grad_norm": 3.263972759246826, + "learning_rate": 6.570722064813681e-07, + "loss": 0.3903, + "step": 9428 + }, + { + "epoch": 4.592887301071777, + "grad_norm": 3.470201015472412, + "learning_rate": 6.566376962797283e-07, + "loss": 0.4076, + "step": 9429 + }, + { + "epoch": 4.593374472231244, + "grad_norm": 2.9349076747894287, + "learning_rate": 6.562033080716352e-07, + "loss": 0.4193, + "step": 9430 + }, + { + "epoch": 4.593861643390711, + "grad_norm": 3.6021952629089355, + "learning_rate": 6.557690418858342e-07, + "loss": 0.3837, + "step": 9431 + }, + { + "epoch": 4.5943488145501785, + "grad_norm": 2.8584744930267334, + "learning_rate": 6.553348977510665e-07, + "loss": 0.3984, + "step": 9432 + }, + { + "epoch": 4.594835985709646, + "grad_norm": 3.3291566371917725, + "learning_rate": 6.549008756960637e-07, + "loss": 0.4038, + "step": 9433 + }, + { + "epoch": 4.595323156869114, + "grad_norm": 3.2698123455047607, + "learning_rate": 6.544669757495497e-07, + "loss": 0.3288, + "step": 9434 + }, + { + "epoch": 4.5958103280285805, + "grad_norm": 3.238273859024048, + "learning_rate": 6.540331979402387e-07, + "loss": 0.3216, + "step": 9435 + }, + { + "epoch": 4.596297499188048, + "grad_norm": 3.550631284713745, + "learning_rate": 6.535995422968386e-07, + "loss": 0.3899, + "step": 9436 + }, + { + "epoch": 4.596784670347516, + "grad_norm": 3.627185106277466, + "learning_rate": 6.531660088480491e-07, + "loss": 0.3986, + "step": 9437 + }, + { + "epoch": 4.597271841506982, + "grad_norm": 3.2846665382385254, + "learning_rate": 6.527325976225615e-07, + "loss": 0.5097, + "step": 9438 + }, + { + "epoch": 4.59775901266645, + "grad_norm": 4.135122299194336, + "learning_rate": 6.522993086490581e-07, + "loss": 0.3826, + "step": 9439 + }, + { + "epoch": 4.598246183825918, + "grad_norm": 3.058634042739868, + "learning_rate": 6.518661419562145e-07, + "loss": 0.3795, + "step": 9440 + }, + { + "epoch": 4.598733354985384, + "grad_norm": 2.9755003452301025, + "learning_rate": 6.514330975726973e-07, + "loss": 0.3892, + "step": 9441 + }, + { + "epoch": 4.599220526144852, + "grad_norm": 3.4691359996795654, + "learning_rate": 6.510001755271666e-07, + "loss": 0.4063, + "step": 9442 + }, + { + "epoch": 4.59970769730432, + "grad_norm": 3.2948760986328125, + "learning_rate": 6.505673758482711e-07, + "loss": 0.4491, + "step": 9443 + }, + { + "epoch": 4.600194868463787, + "grad_norm": 4.13671875, + "learning_rate": 6.501346985646545e-07, + "loss": 0.3909, + "step": 9444 + }, + { + "epoch": 4.600682039623254, + "grad_norm": 3.2190396785736084, + "learning_rate": 6.497021437049514e-07, + "loss": 0.468, + "step": 9445 + }, + { + "epoch": 4.601169210782722, + "grad_norm": 3.5386769771575928, + "learning_rate": 6.492697112977888e-07, + "loss": 0.3378, + "step": 9446 + }, + { + "epoch": 4.601656381942189, + "grad_norm": 2.9058640003204346, + "learning_rate": 6.488374013717835e-07, + "loss": 0.4126, + "step": 9447 + }, + { + "epoch": 4.602143553101657, + "grad_norm": 3.3544695377349854, + "learning_rate": 6.484052139555469e-07, + "loss": 0.4235, + "step": 9448 + }, + { + "epoch": 4.6026307242611235, + "grad_norm": 3.456570625305176, + "learning_rate": 6.479731490776805e-07, + "loss": 0.4078, + "step": 9449 + }, + { + "epoch": 4.603117895420591, + "grad_norm": 3.1971969604492188, + "learning_rate": 6.475412067667797e-07, + "loss": 0.3262, + "step": 9450 + }, + { + "epoch": 4.603605066580059, + "grad_norm": 3.2366838455200195, + "learning_rate": 6.471093870514283e-07, + "loss": 0.4391, + "step": 9451 + }, + { + "epoch": 4.6040922377395255, + "grad_norm": 3.513996124267578, + "learning_rate": 6.466776899602054e-07, + "loss": 0.3527, + "step": 9452 + }, + { + "epoch": 4.604579408898993, + "grad_norm": 3.145907402038574, + "learning_rate": 6.462461155216801e-07, + "loss": 0.3856, + "step": 9453 + }, + { + "epoch": 4.605066580058461, + "grad_norm": 3.4818215370178223, + "learning_rate": 6.45814663764415e-07, + "loss": 0.3533, + "step": 9454 + }, + { + "epoch": 4.605553751217927, + "grad_norm": 3.1325197219848633, + "learning_rate": 6.453833347169621e-07, + "loss": 0.4015, + "step": 9455 + }, + { + "epoch": 4.606040922377395, + "grad_norm": 3.4034838676452637, + "learning_rate": 6.449521284078677e-07, + "loss": 0.4135, + "step": 9456 + }, + { + "epoch": 4.606528093536863, + "grad_norm": 3.344550609588623, + "learning_rate": 6.445210448656691e-07, + "loss": 0.3968, + "step": 9457 + }, + { + "epoch": 4.60701526469633, + "grad_norm": 3.064669370651245, + "learning_rate": 6.440900841188943e-07, + "loss": 0.3831, + "step": 9458 + }, + { + "epoch": 4.607502435855797, + "grad_norm": 3.0672736167907715, + "learning_rate": 6.436592461960658e-07, + "loss": 0.4208, + "step": 9459 + }, + { + "epoch": 4.607989607015265, + "grad_norm": 3.3687002658843994, + "learning_rate": 6.432285311256947e-07, + "loss": 0.3666, + "step": 9460 + }, + { + "epoch": 4.608476778174732, + "grad_norm": 3.2280054092407227, + "learning_rate": 6.427979389362866e-07, + "loss": 0.2799, + "step": 9461 + }, + { + "epoch": 4.6089639493342, + "grad_norm": 3.19728422164917, + "learning_rate": 6.423674696563384e-07, + "loss": 0.4041, + "step": 9462 + }, + { + "epoch": 4.609451120493667, + "grad_norm": 3.1386094093322754, + "learning_rate": 6.419371233143376e-07, + "loss": 0.3651, + "step": 9463 + }, + { + "epoch": 4.609938291653134, + "grad_norm": 3.1801609992980957, + "learning_rate": 6.41506899938765e-07, + "loss": 0.3843, + "step": 9464 + }, + { + "epoch": 4.610425462812602, + "grad_norm": 3.2709643840789795, + "learning_rate": 6.410767995580924e-07, + "loss": 0.4637, + "step": 9465 + }, + { + "epoch": 4.6109126339720685, + "grad_norm": 3.705319881439209, + "learning_rate": 6.40646822200785e-07, + "loss": 0.4181, + "step": 9466 + }, + { + "epoch": 4.611399805131536, + "grad_norm": 3.8683371543884277, + "learning_rate": 6.402169678952972e-07, + "loss": 0.408, + "step": 9467 + }, + { + "epoch": 4.611886976291004, + "grad_norm": 3.329833745956421, + "learning_rate": 6.397872366700769e-07, + "loss": 0.3908, + "step": 9468 + }, + { + "epoch": 4.6123741474504705, + "grad_norm": 3.562455177307129, + "learning_rate": 6.393576285535653e-07, + "loss": 0.3356, + "step": 9469 + }, + { + "epoch": 4.612861318609938, + "grad_norm": 2.758539915084839, + "learning_rate": 6.389281435741915e-07, + "loss": 0.3509, + "step": 9470 + }, + { + "epoch": 4.613348489769406, + "grad_norm": 3.1822662353515625, + "learning_rate": 6.384987817603799e-07, + "loss": 0.4154, + "step": 9471 + }, + { + "epoch": 4.613835660928873, + "grad_norm": 3.139420509338379, + "learning_rate": 6.380695431405453e-07, + "loss": 0.3814, + "step": 9472 + }, + { + "epoch": 4.61432283208834, + "grad_norm": 3.2085354328155518, + "learning_rate": 6.376404277430961e-07, + "loss": 0.4107, + "step": 9473 + }, + { + "epoch": 4.614810003247808, + "grad_norm": 3.52968168258667, + "learning_rate": 6.372114355964292e-07, + "loss": 0.3995, + "step": 9474 + }, + { + "epoch": 4.615297174407275, + "grad_norm": 3.006653308868408, + "learning_rate": 6.36782566728936e-07, + "loss": 0.3858, + "step": 9475 + }, + { + "epoch": 4.615784345566743, + "grad_norm": 3.618652582168579, + "learning_rate": 6.36353821168999e-07, + "loss": 0.3788, + "step": 9476 + }, + { + "epoch": 4.61627151672621, + "grad_norm": 3.603085994720459, + "learning_rate": 6.359251989449933e-07, + "loss": 0.4818, + "step": 9477 + }, + { + "epoch": 4.616758687885677, + "grad_norm": 3.452052593231201, + "learning_rate": 6.354967000852835e-07, + "loss": 0.4339, + "step": 9478 + }, + { + "epoch": 4.617245859045145, + "grad_norm": 3.197599172592163, + "learning_rate": 6.350683246182285e-07, + "loss": 0.3889, + "step": 9479 + }, + { + "epoch": 4.617733030204612, + "grad_norm": 3.0512216091156006, + "learning_rate": 6.346400725721783e-07, + "loss": 0.3893, + "step": 9480 + }, + { + "epoch": 4.618220201364079, + "grad_norm": 3.538814067840576, + "learning_rate": 6.342119439754748e-07, + "loss": 0.4134, + "step": 9481 + }, + { + "epoch": 4.618707372523547, + "grad_norm": 2.984273672103882, + "learning_rate": 6.337839388564506e-07, + "loss": 0.3641, + "step": 9482 + }, + { + "epoch": 4.6191945436830135, + "grad_norm": 3.2971723079681396, + "learning_rate": 6.333560572434316e-07, + "loss": 0.3244, + "step": 9483 + }, + { + "epoch": 4.619681714842481, + "grad_norm": 3.088757276535034, + "learning_rate": 6.329282991647348e-07, + "loss": 0.4178, + "step": 9484 + }, + { + "epoch": 4.620168886001949, + "grad_norm": 3.2124948501586914, + "learning_rate": 6.325006646486701e-07, + "loss": 0.4048, + "step": 9485 + }, + { + "epoch": 4.620656057161416, + "grad_norm": 3.3456075191497803, + "learning_rate": 6.32073153723537e-07, + "loss": 0.3944, + "step": 9486 + }, + { + "epoch": 4.621143228320883, + "grad_norm": 3.1416046619415283, + "learning_rate": 6.316457664176281e-07, + "loss": 0.404, + "step": 9487 + }, + { + "epoch": 4.621630399480351, + "grad_norm": 3.1922361850738525, + "learning_rate": 6.312185027592289e-07, + "loss": 0.3143, + "step": 9488 + }, + { + "epoch": 4.622117570639818, + "grad_norm": 3.320561170578003, + "learning_rate": 6.307913627766155e-07, + "loss": 0.3725, + "step": 9489 + }, + { + "epoch": 4.622604741799286, + "grad_norm": 3.460261583328247, + "learning_rate": 6.303643464980552e-07, + "loss": 0.3983, + "step": 9490 + }, + { + "epoch": 4.623091912958753, + "grad_norm": 3.3803551197052, + "learning_rate": 6.299374539518083e-07, + "loss": 0.4097, + "step": 9491 + }, + { + "epoch": 4.62357908411822, + "grad_norm": 3.0161688327789307, + "learning_rate": 6.295106851661264e-07, + "loss": 0.3497, + "step": 9492 + }, + { + "epoch": 4.624066255277688, + "grad_norm": 3.378370761871338, + "learning_rate": 6.290840401692541e-07, + "loss": 0.366, + "step": 9493 + }, + { + "epoch": 4.624553426437155, + "grad_norm": 3.255496025085449, + "learning_rate": 6.286575189894251e-07, + "loss": 0.4616, + "step": 9494 + }, + { + "epoch": 4.625040597596622, + "grad_norm": 3.154203414916992, + "learning_rate": 6.28231121654867e-07, + "loss": 0.3617, + "step": 9495 + }, + { + "epoch": 4.62552776875609, + "grad_norm": 3.045132875442505, + "learning_rate": 6.27804848193799e-07, + "loss": 0.3517, + "step": 9496 + }, + { + "epoch": 4.626014939915557, + "grad_norm": 3.553595542907715, + "learning_rate": 6.273786986344327e-07, + "loss": 0.3562, + "step": 9497 + }, + { + "epoch": 4.626502111075024, + "grad_norm": 3.589111804962158, + "learning_rate": 6.269526730049691e-07, + "loss": 0.408, + "step": 9498 + }, + { + "epoch": 4.626989282234492, + "grad_norm": 2.813244581222534, + "learning_rate": 6.265267713336029e-07, + "loss": 0.3599, + "step": 9499 + }, + { + "epoch": 4.627476453393959, + "grad_norm": 3.293928384780884, + "learning_rate": 6.261009936485213e-07, + "loss": 0.4389, + "step": 9500 + }, + { + "epoch": 4.627963624553426, + "grad_norm": 3.7401020526885986, + "learning_rate": 6.256753399779008e-07, + "loss": 0.3925, + "step": 9501 + }, + { + "epoch": 4.628450795712894, + "grad_norm": 3.2932214736938477, + "learning_rate": 6.252498103499119e-07, + "loss": 0.4037, + "step": 9502 + }, + { + "epoch": 4.628937966872361, + "grad_norm": 3.4815375804901123, + "learning_rate": 6.248244047927166e-07, + "loss": 0.3861, + "step": 9503 + }, + { + "epoch": 4.629425138031829, + "grad_norm": 3.2303996086120605, + "learning_rate": 6.243991233344667e-07, + "loss": 0.4005, + "step": 9504 + }, + { + "epoch": 4.629912309191296, + "grad_norm": 3.5647199153900146, + "learning_rate": 6.239739660033081e-07, + "loss": 0.3963, + "step": 9505 + }, + { + "epoch": 4.630399480350763, + "grad_norm": 3.240551471710205, + "learning_rate": 6.235489328273789e-07, + "loss": 0.387, + "step": 9506 + }, + { + "epoch": 4.630886651510231, + "grad_norm": 3.366039514541626, + "learning_rate": 6.231240238348055e-07, + "loss": 0.3736, + "step": 9507 + }, + { + "epoch": 4.631373822669698, + "grad_norm": 3.0580966472625732, + "learning_rate": 6.226992390537096e-07, + "loss": 0.3594, + "step": 9508 + }, + { + "epoch": 4.631860993829165, + "grad_norm": 3.1078810691833496, + "learning_rate": 6.222745785122036e-07, + "loss": 0.4217, + "step": 9509 + }, + { + "epoch": 4.632348164988633, + "grad_norm": 3.3143599033355713, + "learning_rate": 6.218500422383908e-07, + "loss": 0.5229, + "step": 9510 + }, + { + "epoch": 4.6328353361481, + "grad_norm": 3.842603921890259, + "learning_rate": 6.214256302603671e-07, + "loss": 0.4061, + "step": 9511 + }, + { + "epoch": 4.633322507307567, + "grad_norm": 3.6567630767822266, + "learning_rate": 6.210013426062206e-07, + "loss": 0.4504, + "step": 9512 + }, + { + "epoch": 4.633809678467035, + "grad_norm": 3.567547559738159, + "learning_rate": 6.205771793040308e-07, + "loss": 0.4238, + "step": 9513 + }, + { + "epoch": 4.6342968496265025, + "grad_norm": 3.3282558917999268, + "learning_rate": 6.201531403818675e-07, + "loss": 0.3406, + "step": 9514 + }, + { + "epoch": 4.634784020785969, + "grad_norm": 3.324552536010742, + "learning_rate": 6.197292258677943e-07, + "loss": 0.3368, + "step": 9515 + }, + { + "epoch": 4.635271191945437, + "grad_norm": 2.9896066188812256, + "learning_rate": 6.193054357898659e-07, + "loss": 0.3931, + "step": 9516 + }, + { + "epoch": 4.635758363104904, + "grad_norm": 3.293771982192993, + "learning_rate": 6.188817701761293e-07, + "loss": 0.3471, + "step": 9517 + }, + { + "epoch": 4.636245534264372, + "grad_norm": 2.8924427032470703, + "learning_rate": 6.184582290546212e-07, + "loss": 0.3693, + "step": 9518 + }, + { + "epoch": 4.636732705423839, + "grad_norm": 3.1460890769958496, + "learning_rate": 6.180348124533722e-07, + "loss": 0.4438, + "step": 9519 + }, + { + "epoch": 4.637219876583306, + "grad_norm": 3.075309991836548, + "learning_rate": 6.176115204004052e-07, + "loss": 0.3871, + "step": 9520 + }, + { + "epoch": 4.637707047742774, + "grad_norm": 3.2750191688537598, + "learning_rate": 6.171883529237313e-07, + "loss": 0.3793, + "step": 9521 + }, + { + "epoch": 4.638194218902241, + "grad_norm": 3.109687566757202, + "learning_rate": 6.167653100513571e-07, + "loss": 0.3539, + "step": 9522 + }, + { + "epoch": 4.638681390061708, + "grad_norm": 3.053410530090332, + "learning_rate": 6.163423918112793e-07, + "loss": 0.4358, + "step": 9523 + }, + { + "epoch": 4.639168561221176, + "grad_norm": 3.4255714416503906, + "learning_rate": 6.159195982314875e-07, + "loss": 0.4127, + "step": 9524 + }, + { + "epoch": 4.639655732380643, + "grad_norm": 3.5721640586853027, + "learning_rate": 6.154969293399604e-07, + "loss": 0.4019, + "step": 9525 + }, + { + "epoch": 4.64014290354011, + "grad_norm": 3.3006670475006104, + "learning_rate": 6.150743851646709e-07, + "loss": 0.3403, + "step": 9526 + }, + { + "epoch": 4.640630074699578, + "grad_norm": 2.887650966644287, + "learning_rate": 6.146519657335831e-07, + "loss": 0.3605, + "step": 9527 + }, + { + "epoch": 4.6411172458590455, + "grad_norm": 3.2961432933807373, + "learning_rate": 6.142296710746534e-07, + "loss": 0.4154, + "step": 9528 + }, + { + "epoch": 4.641604417018512, + "grad_norm": 3.2068192958831787, + "learning_rate": 6.138075012158279e-07, + "loss": 0.3918, + "step": 9529 + }, + { + "epoch": 4.64209158817798, + "grad_norm": 3.526991128921509, + "learning_rate": 6.133854561850461e-07, + "loss": 0.4749, + "step": 9530 + }, + { + "epoch": 4.6425787593374475, + "grad_norm": 3.558271646499634, + "learning_rate": 6.129635360102393e-07, + "loss": 0.399, + "step": 9531 + }, + { + "epoch": 4.643065930496915, + "grad_norm": 3.154938220977783, + "learning_rate": 6.125417407193307e-07, + "loss": 0.3653, + "step": 9532 + }, + { + "epoch": 4.643553101656382, + "grad_norm": 2.9758388996124268, + "learning_rate": 6.12120070340233e-07, + "loss": 0.3854, + "step": 9533 + }, + { + "epoch": 4.644040272815849, + "grad_norm": 3.518714666366577, + "learning_rate": 6.116985249008534e-07, + "loss": 0.4172, + "step": 9534 + }, + { + "epoch": 4.644527443975317, + "grad_norm": 3.6052262783050537, + "learning_rate": 6.112771044290893e-07, + "loss": 0.3846, + "step": 9535 + }, + { + "epoch": 4.645014615134784, + "grad_norm": 3.6977484226226807, + "learning_rate": 6.108558089528319e-07, + "loss": 0.3731, + "step": 9536 + }, + { + "epoch": 4.645501786294251, + "grad_norm": 3.20381498336792, + "learning_rate": 6.104346384999599e-07, + "loss": 0.4182, + "step": 9537 + }, + { + "epoch": 4.645988957453719, + "grad_norm": 3.6371445655822754, + "learning_rate": 6.100135930983478e-07, + "loss": 0.389, + "step": 9538 + }, + { + "epoch": 4.646476128613186, + "grad_norm": 3.327512502670288, + "learning_rate": 6.0959267277586e-07, + "loss": 0.3387, + "step": 9539 + }, + { + "epoch": 4.646963299772653, + "grad_norm": 3.405946731567383, + "learning_rate": 6.091718775603539e-07, + "loss": 0.3613, + "step": 9540 + }, + { + "epoch": 4.647450470932121, + "grad_norm": 3.354546308517456, + "learning_rate": 6.087512074796758e-07, + "loss": 0.3648, + "step": 9541 + }, + { + "epoch": 4.6479376420915886, + "grad_norm": 3.2328386306762695, + "learning_rate": 6.083306625616672e-07, + "loss": 0.3707, + "step": 9542 + }, + { + "epoch": 4.648424813251055, + "grad_norm": 3.5602622032165527, + "learning_rate": 6.079102428341588e-07, + "loss": 0.3907, + "step": 9543 + }, + { + "epoch": 4.648911984410523, + "grad_norm": 3.0340754985809326, + "learning_rate": 6.074899483249752e-07, + "loss": 0.4072, + "step": 9544 + }, + { + "epoch": 4.6493991555699905, + "grad_norm": 3.700562000274658, + "learning_rate": 6.070697790619301e-07, + "loss": 0.4965, + "step": 9545 + }, + { + "epoch": 4.649886326729457, + "grad_norm": 3.1947433948516846, + "learning_rate": 6.066497350728312e-07, + "loss": 0.3433, + "step": 9546 + }, + { + "epoch": 4.650373497888925, + "grad_norm": 3.413384437561035, + "learning_rate": 6.062298163854758e-07, + "loss": 0.4437, + "step": 9547 + }, + { + "epoch": 4.6508606690483925, + "grad_norm": 3.775195837020874, + "learning_rate": 6.058100230276548e-07, + "loss": 0.3998, + "step": 9548 + }, + { + "epoch": 4.65134784020786, + "grad_norm": 3.1481120586395264, + "learning_rate": 6.053903550271509e-07, + "loss": 0.3714, + "step": 9549 + }, + { + "epoch": 4.651835011367327, + "grad_norm": 3.24336314201355, + "learning_rate": 6.049708124117362e-07, + "loss": 0.4122, + "step": 9550 + }, + { + "epoch": 4.652322182526794, + "grad_norm": 3.182558298110962, + "learning_rate": 6.045513952091767e-07, + "loss": 0.3394, + "step": 9551 + }, + { + "epoch": 4.652809353686262, + "grad_norm": 2.792454957962036, + "learning_rate": 6.041321034472297e-07, + "loss": 0.4157, + "step": 9552 + }, + { + "epoch": 4.653296524845729, + "grad_norm": 3.2509348392486572, + "learning_rate": 6.037129371536432e-07, + "loss": 0.3989, + "step": 9553 + }, + { + "epoch": 4.653783696005196, + "grad_norm": 3.390979051589966, + "learning_rate": 6.03293896356158e-07, + "loss": 0.376, + "step": 9554 + }, + { + "epoch": 4.654270867164664, + "grad_norm": 3.1845877170562744, + "learning_rate": 6.028749810825057e-07, + "loss": 0.3843, + "step": 9555 + }, + { + "epoch": 4.654758038324132, + "grad_norm": 3.408921241760254, + "learning_rate": 6.024561913604115e-07, + "loss": 0.412, + "step": 9556 + }, + { + "epoch": 4.655245209483598, + "grad_norm": 3.7860920429229736, + "learning_rate": 6.020375272175893e-07, + "loss": 0.4043, + "step": 9557 + }, + { + "epoch": 4.655732380643066, + "grad_norm": 3.0438103675842285, + "learning_rate": 6.016189886817467e-07, + "loss": 0.3462, + "step": 9558 + }, + { + "epoch": 4.6562195518025336, + "grad_norm": 3.3097057342529297, + "learning_rate": 6.012005757805825e-07, + "loss": 0.3887, + "step": 9559 + }, + { + "epoch": 4.656706722962, + "grad_norm": 3.4522645473480225, + "learning_rate": 6.007822885417883e-07, + "loss": 0.3777, + "step": 9560 + }, + { + "epoch": 4.657193894121468, + "grad_norm": 3.234011650085449, + "learning_rate": 6.003641269930444e-07, + "loss": 0.4363, + "step": 9561 + }, + { + "epoch": 4.6576810652809355, + "grad_norm": 3.324212074279785, + "learning_rate": 5.999460911620258e-07, + "loss": 0.4381, + "step": 9562 + }, + { + "epoch": 4.658168236440403, + "grad_norm": 3.275714159011841, + "learning_rate": 5.995281810763981e-07, + "loss": 0.3637, + "step": 9563 + }, + { + "epoch": 4.65865540759987, + "grad_norm": 3.1675832271575928, + "learning_rate": 5.99110396763819e-07, + "loss": 0.3661, + "step": 9564 + }, + { + "epoch": 4.6591425787593375, + "grad_norm": 3.3630211353302, + "learning_rate": 5.986927382519361e-07, + "loss": 0.3783, + "step": 9565 + }, + { + "epoch": 4.659629749918805, + "grad_norm": 3.586212158203125, + "learning_rate": 5.982752055683908e-07, + "loss": 0.3405, + "step": 9566 + }, + { + "epoch": 4.660116921078272, + "grad_norm": 2.947505474090576, + "learning_rate": 5.978577987408152e-07, + "loss": 0.324, + "step": 9567 + }, + { + "epoch": 4.660604092237739, + "grad_norm": 3.1072781085968018, + "learning_rate": 5.97440517796834e-07, + "loss": 0.3699, + "step": 9568 + }, + { + "epoch": 4.661091263397207, + "grad_norm": 3.15159010887146, + "learning_rate": 5.970233627640617e-07, + "loss": 0.414, + "step": 9569 + }, + { + "epoch": 4.661578434556674, + "grad_norm": 3.315847635269165, + "learning_rate": 5.966063336701058e-07, + "loss": 0.3616, + "step": 9570 + }, + { + "epoch": 4.662065605716141, + "grad_norm": 3.6669321060180664, + "learning_rate": 5.961894305425664e-07, + "loss": 0.4387, + "step": 9571 + }, + { + "epoch": 4.662552776875609, + "grad_norm": 3.5094659328460693, + "learning_rate": 5.957726534090324e-07, + "loss": 0.3391, + "step": 9572 + }, + { + "epoch": 4.663039948035077, + "grad_norm": 3.2431273460388184, + "learning_rate": 5.95356002297087e-07, + "loss": 0.438, + "step": 9573 + }, + { + "epoch": 4.663527119194543, + "grad_norm": 3.4833173751831055, + "learning_rate": 5.94939477234304e-07, + "loss": 0.4421, + "step": 9574 + }, + { + "epoch": 4.664014290354011, + "grad_norm": 3.742114543914795, + "learning_rate": 5.945230782482494e-07, + "loss": 0.3422, + "step": 9575 + }, + { + "epoch": 4.6645014615134786, + "grad_norm": 2.6793265342712402, + "learning_rate": 5.941068053664795e-07, + "loss": 0.3729, + "step": 9576 + }, + { + "epoch": 4.664988632672946, + "grad_norm": 5.454046249389648, + "learning_rate": 5.936906586165439e-07, + "loss": 0.3555, + "step": 9577 + }, + { + "epoch": 4.665475803832413, + "grad_norm": 3.1976318359375, + "learning_rate": 5.93274638025983e-07, + "loss": 0.4196, + "step": 9578 + }, + { + "epoch": 4.6659629749918805, + "grad_norm": 3.3269643783569336, + "learning_rate": 5.928587436223296e-07, + "loss": 0.3506, + "step": 9579 + }, + { + "epoch": 4.666450146151348, + "grad_norm": 3.523283004760742, + "learning_rate": 5.92442975433106e-07, + "loss": 0.4217, + "step": 9580 + }, + { + "epoch": 4.666937317310815, + "grad_norm": 3.407646894454956, + "learning_rate": 5.920273334858289e-07, + "loss": 0.4435, + "step": 9581 + }, + { + "epoch": 4.6674244884702825, + "grad_norm": 3.4723241329193115, + "learning_rate": 5.916118178080052e-07, + "loss": 0.3554, + "step": 9582 + }, + { + "epoch": 4.66791165962975, + "grad_norm": 3.0486092567443848, + "learning_rate": 5.911964284271346e-07, + "loss": 0.4057, + "step": 9583 + }, + { + "epoch": 4.668398830789217, + "grad_norm": 3.2235045433044434, + "learning_rate": 5.907811653707054e-07, + "loss": 0.3619, + "step": 9584 + }, + { + "epoch": 4.668886001948684, + "grad_norm": 3.08862566947937, + "learning_rate": 5.903660286662014e-07, + "loss": 0.4141, + "step": 9585 + }, + { + "epoch": 4.669373173108152, + "grad_norm": 3.377394437789917, + "learning_rate": 5.899510183410956e-07, + "loss": 0.3038, + "step": 9586 + }, + { + "epoch": 4.66986034426762, + "grad_norm": 3.0764477252960205, + "learning_rate": 5.895361344228542e-07, + "loss": 0.3412, + "step": 9587 + }, + { + "epoch": 4.670347515427086, + "grad_norm": 3.852018356323242, + "learning_rate": 5.891213769389329e-07, + "loss": 0.3884, + "step": 9588 + }, + { + "epoch": 4.670834686586554, + "grad_norm": 3.379067897796631, + "learning_rate": 5.887067459167808e-07, + "loss": 0.3778, + "step": 9589 + }, + { + "epoch": 4.671321857746022, + "grad_norm": 3.3856570720672607, + "learning_rate": 5.882922413838391e-07, + "loss": 0.3669, + "step": 9590 + }, + { + "epoch": 4.671809028905489, + "grad_norm": 2.9240715503692627, + "learning_rate": 5.87877863367538e-07, + "loss": 0.3546, + "step": 9591 + }, + { + "epoch": 4.672296200064956, + "grad_norm": 3.3501505851745605, + "learning_rate": 5.87463611895302e-07, + "loss": 0.4304, + "step": 9592 + }, + { + "epoch": 4.6727833712244236, + "grad_norm": 3.187577724456787, + "learning_rate": 5.870494869945467e-07, + "loss": 0.3541, + "step": 9593 + }, + { + "epoch": 4.673270542383891, + "grad_norm": 3.429391384124756, + "learning_rate": 5.866354886926775e-07, + "loss": 0.4247, + "step": 9594 + }, + { + "epoch": 4.673757713543358, + "grad_norm": 3.370992660522461, + "learning_rate": 5.862216170170946e-07, + "loss": 0.3677, + "step": 9595 + }, + { + "epoch": 4.6742448847028255, + "grad_norm": 4.192935943603516, + "learning_rate": 5.858078719951857e-07, + "loss": 0.3841, + "step": 9596 + }, + { + "epoch": 4.674732055862293, + "grad_norm": 3.3056657314300537, + "learning_rate": 5.853942536543336e-07, + "loss": 0.3663, + "step": 9597 + }, + { + "epoch": 4.67521922702176, + "grad_norm": 3.487110137939453, + "learning_rate": 5.849807620219117e-07, + "loss": 0.4318, + "step": 9598 + }, + { + "epoch": 4.6757063981812275, + "grad_norm": 3.591764450073242, + "learning_rate": 5.845673971252853e-07, + "loss": 0.3488, + "step": 9599 + }, + { + "epoch": 4.676193569340695, + "grad_norm": 3.4190802574157715, + "learning_rate": 5.841541589918093e-07, + "loss": 0.3553, + "step": 9600 + }, + { + "epoch": 4.676680740500163, + "grad_norm": 3.384471893310547, + "learning_rate": 5.837410476488328e-07, + "loss": 0.3981, + "step": 9601 + }, + { + "epoch": 4.677167911659629, + "grad_norm": 3.2604527473449707, + "learning_rate": 5.833280631236951e-07, + "loss": 0.3909, + "step": 9602 + }, + { + "epoch": 4.677655082819097, + "grad_norm": 3.221432685852051, + "learning_rate": 5.829152054437287e-07, + "loss": 0.4327, + "step": 9603 + }, + { + "epoch": 4.678142253978565, + "grad_norm": 3.1766910552978516, + "learning_rate": 5.825024746362548e-07, + "loss": 0.378, + "step": 9604 + }, + { + "epoch": 4.678629425138032, + "grad_norm": 3.145298719406128, + "learning_rate": 5.820898707285885e-07, + "loss": 0.4171, + "step": 9605 + }, + { + "epoch": 4.679116596297499, + "grad_norm": 3.4563815593719482, + "learning_rate": 5.816773937480358e-07, + "loss": 0.3673, + "step": 9606 + }, + { + "epoch": 4.679603767456967, + "grad_norm": 3.1811845302581787, + "learning_rate": 5.812650437218952e-07, + "loss": 0.4018, + "step": 9607 + }, + { + "epoch": 4.680090938616434, + "grad_norm": 3.250403881072998, + "learning_rate": 5.808528206774549e-07, + "loss": 0.4183, + "step": 9608 + }, + { + "epoch": 4.680578109775901, + "grad_norm": 2.8326079845428467, + "learning_rate": 5.804407246419963e-07, + "loss": 0.3122, + "step": 9609 + }, + { + "epoch": 4.6810652809353686, + "grad_norm": 3.1660754680633545, + "learning_rate": 5.800287556427917e-07, + "loss": 0.3695, + "step": 9610 + }, + { + "epoch": 4.681552452094836, + "grad_norm": 3.220918893814087, + "learning_rate": 5.796169137071056e-07, + "loss": 0.412, + "step": 9611 + }, + { + "epoch": 4.682039623254303, + "grad_norm": 3.32559871673584, + "learning_rate": 5.79205198862193e-07, + "loss": 0.3483, + "step": 9612 + }, + { + "epoch": 4.6825267944137705, + "grad_norm": 3.243060350418091, + "learning_rate": 5.787936111353015e-07, + "loss": 0.351, + "step": 9613 + }, + { + "epoch": 4.683013965573238, + "grad_norm": 2.9771909713745117, + "learning_rate": 5.783821505536696e-07, + "loss": 0.3503, + "step": 9614 + }, + { + "epoch": 4.683501136732706, + "grad_norm": 3.0070629119873047, + "learning_rate": 5.779708171445289e-07, + "loss": 0.4469, + "step": 9615 + }, + { + "epoch": 4.6839883078921725, + "grad_norm": 3.413972854614258, + "learning_rate": 5.775596109351e-07, + "loss": 0.4271, + "step": 9616 + }, + { + "epoch": 4.68447547905164, + "grad_norm": 3.4439644813537598, + "learning_rate": 5.771485319525969e-07, + "loss": 0.3604, + "step": 9617 + }, + { + "epoch": 4.684962650211108, + "grad_norm": 3.002631187438965, + "learning_rate": 5.767375802242247e-07, + "loss": 0.4552, + "step": 9618 + }, + { + "epoch": 4.685449821370575, + "grad_norm": 3.8804872035980225, + "learning_rate": 5.763267557771815e-07, + "loss": 0.3726, + "step": 9619 + }, + { + "epoch": 4.685936992530042, + "grad_norm": 2.974942922592163, + "learning_rate": 5.759160586386534e-07, + "loss": 0.3914, + "step": 9620 + }, + { + "epoch": 4.68642416368951, + "grad_norm": 3.1289992332458496, + "learning_rate": 5.755054888358217e-07, + "loss": 0.3599, + "step": 9621 + }, + { + "epoch": 4.686911334848977, + "grad_norm": 3.050412893295288, + "learning_rate": 5.750950463958579e-07, + "loss": 0.3576, + "step": 9622 + }, + { + "epoch": 4.687398506008444, + "grad_norm": 3.184046983718872, + "learning_rate": 5.746847313459242e-07, + "loss": 0.3404, + "step": 9623 + }, + { + "epoch": 4.687885677167912, + "grad_norm": 3.1628878116607666, + "learning_rate": 5.742745437131755e-07, + "loss": 0.4283, + "step": 9624 + }, + { + "epoch": 4.688372848327379, + "grad_norm": 3.1370301246643066, + "learning_rate": 5.738644835247581e-07, + "loss": 0.4024, + "step": 9625 + }, + { + "epoch": 4.688860019486846, + "grad_norm": 3.700324773788452, + "learning_rate": 5.734545508078107e-07, + "loss": 0.3771, + "step": 9626 + }, + { + "epoch": 4.689347190646314, + "grad_norm": 3.4805047512054443, + "learning_rate": 5.730447455894611e-07, + "loss": 0.337, + "step": 9627 + }, + { + "epoch": 4.689834361805781, + "grad_norm": 3.215604305267334, + "learning_rate": 5.726350678968306e-07, + "loss": 0.3911, + "step": 9628 + }, + { + "epoch": 4.690321532965249, + "grad_norm": 3.308684825897217, + "learning_rate": 5.722255177570321e-07, + "loss": 0.3925, + "step": 9629 + }, + { + "epoch": 4.6908087041247155, + "grad_norm": 3.2639572620391846, + "learning_rate": 5.718160951971699e-07, + "loss": 0.4081, + "step": 9630 + }, + { + "epoch": 4.691295875284183, + "grad_norm": 3.2012534141540527, + "learning_rate": 5.714068002443382e-07, + "loss": 0.3377, + "step": 9631 + }, + { + "epoch": 4.691783046443651, + "grad_norm": 2.9802565574645996, + "learning_rate": 5.709976329256248e-07, + "loss": 0.3917, + "step": 9632 + }, + { + "epoch": 4.692270217603118, + "grad_norm": 3.7735445499420166, + "learning_rate": 5.705885932681093e-07, + "loss": 0.4339, + "step": 9633 + }, + { + "epoch": 4.692757388762585, + "grad_norm": 3.4921135902404785, + "learning_rate": 5.701796812988603e-07, + "loss": 0.3804, + "step": 9634 + }, + { + "epoch": 4.693244559922053, + "grad_norm": 3.42142653465271, + "learning_rate": 5.697708970449403e-07, + "loss": 0.3734, + "step": 9635 + }, + { + "epoch": 4.69373173108152, + "grad_norm": 3.5534965991973877, + "learning_rate": 5.693622405334032e-07, + "loss": 0.3954, + "step": 9636 + }, + { + "epoch": 4.694218902240987, + "grad_norm": 3.2047629356384277, + "learning_rate": 5.689537117912924e-07, + "loss": 0.3178, + "step": 9637 + }, + { + "epoch": 4.694706073400455, + "grad_norm": 3.293560028076172, + "learning_rate": 5.685453108456454e-07, + "loss": 0.3805, + "step": 9638 + }, + { + "epoch": 4.695193244559922, + "grad_norm": 3.3139712810516357, + "learning_rate": 5.681370377234904e-07, + "loss": 0.4028, + "step": 9639 + }, + { + "epoch": 4.695680415719389, + "grad_norm": 3.2861173152923584, + "learning_rate": 5.677288924518459e-07, + "loss": 0.3614, + "step": 9640 + }, + { + "epoch": 4.696167586878857, + "grad_norm": 3.12557315826416, + "learning_rate": 5.673208750577234e-07, + "loss": 0.4335, + "step": 9641 + }, + { + "epoch": 4.696654758038324, + "grad_norm": 3.6578292846679688, + "learning_rate": 5.669129855681258e-07, + "loss": 0.391, + "step": 9642 + }, + { + "epoch": 4.697141929197792, + "grad_norm": 3.470086097717285, + "learning_rate": 5.66505224010046e-07, + "loss": 0.4223, + "step": 9643 + }, + { + "epoch": 4.697629100357259, + "grad_norm": 3.4170119762420654, + "learning_rate": 5.660975904104707e-07, + "loss": 0.4335, + "step": 9644 + }, + { + "epoch": 4.698116271516726, + "grad_norm": 3.3991780281066895, + "learning_rate": 5.656900847963767e-07, + "loss": 0.4468, + "step": 9645 + }, + { + "epoch": 4.698603442676194, + "grad_norm": 3.2595150470733643, + "learning_rate": 5.652827071947337e-07, + "loss": 0.3501, + "step": 9646 + }, + { + "epoch": 4.699090613835661, + "grad_norm": 4.004204273223877, + "learning_rate": 5.648754576324999e-07, + "loss": 0.4259, + "step": 9647 + }, + { + "epoch": 4.699577784995128, + "grad_norm": 3.0141072273254395, + "learning_rate": 5.644683361366285e-07, + "loss": 0.4275, + "step": 9648 + }, + { + "epoch": 4.700064956154596, + "grad_norm": 3.3043136596679688, + "learning_rate": 5.640613427340621e-07, + "loss": 0.3748, + "step": 9649 + }, + { + "epoch": 4.700552127314063, + "grad_norm": 3.556379556655884, + "learning_rate": 5.636544774517364e-07, + "loss": 0.4443, + "step": 9650 + }, + { + "epoch": 4.70103929847353, + "grad_norm": 3.188762903213501, + "learning_rate": 5.632477403165765e-07, + "loss": 0.3295, + "step": 9651 + }, + { + "epoch": 4.701526469632998, + "grad_norm": 3.3014371395111084, + "learning_rate": 5.628411313555007e-07, + "loss": 0.4481, + "step": 9652 + }, + { + "epoch": 4.702013640792465, + "grad_norm": 3.21240234375, + "learning_rate": 5.624346505954187e-07, + "loss": 0.3602, + "step": 9653 + }, + { + "epoch": 4.702500811951932, + "grad_norm": 3.0388410091400146, + "learning_rate": 5.620282980632316e-07, + "loss": 0.3837, + "step": 9654 + }, + { + "epoch": 4.7029879831114, + "grad_norm": 3.6381638050079346, + "learning_rate": 5.616220737858306e-07, + "loss": 0.3429, + "step": 9655 + }, + { + "epoch": 4.703475154270867, + "grad_norm": 3.2202234268188477, + "learning_rate": 5.612159777901002e-07, + "loss": 0.3489, + "step": 9656 + }, + { + "epoch": 4.703962325430335, + "grad_norm": 2.934170961380005, + "learning_rate": 5.60810010102916e-07, + "loss": 0.3996, + "step": 9657 + }, + { + "epoch": 4.704449496589802, + "grad_norm": 3.5540339946746826, + "learning_rate": 5.604041707511454e-07, + "loss": 0.4357, + "step": 9658 + }, + { + "epoch": 4.704936667749269, + "grad_norm": 3.6629178524017334, + "learning_rate": 5.599984597616453e-07, + "loss": 0.366, + "step": 9659 + }, + { + "epoch": 4.705423838908737, + "grad_norm": 3.224332094192505, + "learning_rate": 5.595928771612666e-07, + "loss": 0.3912, + "step": 9660 + }, + { + "epoch": 4.7059110100682044, + "grad_norm": 3.250507116317749, + "learning_rate": 5.591874229768507e-07, + "loss": 0.375, + "step": 9661 + }, + { + "epoch": 4.706398181227671, + "grad_norm": 3.181975841522217, + "learning_rate": 5.587820972352309e-07, + "loss": 0.346, + "step": 9662 + }, + { + "epoch": 4.706885352387139, + "grad_norm": 3.245623826980591, + "learning_rate": 5.583768999632305e-07, + "loss": 0.4319, + "step": 9663 + }, + { + "epoch": 4.707372523546606, + "grad_norm": 3.571580410003662, + "learning_rate": 5.579718311876662e-07, + "loss": 0.3723, + "step": 9664 + }, + { + "epoch": 4.707859694706073, + "grad_norm": 3.108231782913208, + "learning_rate": 5.575668909353452e-07, + "loss": 0.4091, + "step": 9665 + }, + { + "epoch": 4.708346865865541, + "grad_norm": 3.2403934001922607, + "learning_rate": 5.571620792330673e-07, + "loss": 0.3745, + "step": 9666 + }, + { + "epoch": 4.708834037025008, + "grad_norm": 3.5352931022644043, + "learning_rate": 5.567573961076211e-07, + "loss": 0.3962, + "step": 9667 + }, + { + "epoch": 4.709321208184475, + "grad_norm": 3.310458183288574, + "learning_rate": 5.563528415857897e-07, + "loss": 0.3718, + "step": 9668 + }, + { + "epoch": 4.709808379343943, + "grad_norm": 3.0854523181915283, + "learning_rate": 5.559484156943462e-07, + "loss": 0.4316, + "step": 9669 + }, + { + "epoch": 4.71029555050341, + "grad_norm": 3.554762125015259, + "learning_rate": 5.555441184600561e-07, + "loss": 0.4404, + "step": 9670 + }, + { + "epoch": 4.710782721662878, + "grad_norm": 3.2500505447387695, + "learning_rate": 5.551399499096746e-07, + "loss": 0.3565, + "step": 9671 + }, + { + "epoch": 4.711269892822345, + "grad_norm": 3.3709309101104736, + "learning_rate": 5.547359100699501e-07, + "loss": 0.3434, + "step": 9672 + }, + { + "epoch": 4.711757063981812, + "grad_norm": 3.355191230773926, + "learning_rate": 5.543319989676224e-07, + "loss": 0.4833, + "step": 9673 + }, + { + "epoch": 4.71224423514128, + "grad_norm": 3.571333646774292, + "learning_rate": 5.539282166294216e-07, + "loss": 0.402, + "step": 9674 + }, + { + "epoch": 4.7127314063007475, + "grad_norm": 3.279965877532959, + "learning_rate": 5.535245630820699e-07, + "loss": 0.3579, + "step": 9675 + }, + { + "epoch": 4.713218577460214, + "grad_norm": 3.930610418319702, + "learning_rate": 5.531210383522815e-07, + "loss": 0.4102, + "step": 9676 + }, + { + "epoch": 4.713705748619682, + "grad_norm": 3.1247849464416504, + "learning_rate": 5.527176424667618e-07, + "loss": 0.3517, + "step": 9677 + }, + { + "epoch": 4.7141929197791494, + "grad_norm": 3.3335142135620117, + "learning_rate": 5.523143754522067e-07, + "loss": 0.3595, + "step": 9678 + }, + { + "epoch": 4.714680090938616, + "grad_norm": 3.212890148162842, + "learning_rate": 5.51911237335305e-07, + "loss": 0.343, + "step": 9679 + }, + { + "epoch": 4.715167262098084, + "grad_norm": 3.1413626670837402, + "learning_rate": 5.515082281427369e-07, + "loss": 0.415, + "step": 9680 + }, + { + "epoch": 4.715654433257551, + "grad_norm": 3.377686023712158, + "learning_rate": 5.511053479011721e-07, + "loss": 0.4111, + "step": 9681 + }, + { + "epoch": 4.716141604417018, + "grad_norm": 3.461421251296997, + "learning_rate": 5.507025966372745e-07, + "loss": 0.4643, + "step": 9682 + }, + { + "epoch": 4.716628775576486, + "grad_norm": 3.800630569458008, + "learning_rate": 5.502999743776968e-07, + "loss": 0.3816, + "step": 9683 + }, + { + "epoch": 4.717115946735953, + "grad_norm": 3.1216344833374023, + "learning_rate": 5.498974811490854e-07, + "loss": 0.3845, + "step": 9684 + }, + { + "epoch": 4.717603117895421, + "grad_norm": 3.5383853912353516, + "learning_rate": 5.494951169780777e-07, + "loss": 0.3555, + "step": 9685 + }, + { + "epoch": 4.718090289054888, + "grad_norm": 3.2168490886688232, + "learning_rate": 5.490928818913011e-07, + "loss": 0.4495, + "step": 9686 + }, + { + "epoch": 4.718577460214355, + "grad_norm": 2.944284677505493, + "learning_rate": 5.486907759153756e-07, + "loss": 0.3964, + "step": 9687 + }, + { + "epoch": 4.719064631373823, + "grad_norm": 3.7915751934051514, + "learning_rate": 5.482887990769128e-07, + "loss": 0.4102, + "step": 9688 + }, + { + "epoch": 4.71955180253329, + "grad_norm": 3.436971664428711, + "learning_rate": 5.478869514025165e-07, + "loss": 0.4244, + "step": 9689 + }, + { + "epoch": 4.720038973692757, + "grad_norm": 3.2270078659057617, + "learning_rate": 5.474852329187791e-07, + "loss": 0.4442, + "step": 9690 + }, + { + "epoch": 4.720526144852225, + "grad_norm": 3.235873222351074, + "learning_rate": 5.470836436522869e-07, + "loss": 0.3922, + "step": 9691 + }, + { + "epoch": 4.7210133160116925, + "grad_norm": 3.523439884185791, + "learning_rate": 5.466821836296176e-07, + "loss": 0.3521, + "step": 9692 + }, + { + "epoch": 4.721500487171159, + "grad_norm": 3.090641736984253, + "learning_rate": 5.462808528773403e-07, + "loss": 0.3929, + "step": 9693 + }, + { + "epoch": 4.721987658330627, + "grad_norm": 2.925198793411255, + "learning_rate": 5.458796514220135e-07, + "loss": 0.3753, + "step": 9694 + }, + { + "epoch": 4.7224748294900945, + "grad_norm": 3.611301898956299, + "learning_rate": 5.454785792901893e-07, + "loss": 0.3712, + "step": 9695 + }, + { + "epoch": 4.722962000649561, + "grad_norm": 3.451233386993408, + "learning_rate": 5.450776365084106e-07, + "loss": 0.3725, + "step": 9696 + }, + { + "epoch": 4.723449171809029, + "grad_norm": 3.1788575649261475, + "learning_rate": 5.446768231032129e-07, + "loss": 0.3565, + "step": 9697 + }, + { + "epoch": 4.723936342968496, + "grad_norm": 3.265467405319214, + "learning_rate": 5.4427613910112e-07, + "loss": 0.3665, + "step": 9698 + }, + { + "epoch": 4.724423514127963, + "grad_norm": 3.037245750427246, + "learning_rate": 5.4387558452865e-07, + "loss": 0.3743, + "step": 9699 + }, + { + "epoch": 4.724910685287431, + "grad_norm": 3.3367483615875244, + "learning_rate": 5.43475159412312e-07, + "loss": 0.4054, + "step": 9700 + }, + { + "epoch": 4.725397856446898, + "grad_norm": 3.217067241668701, + "learning_rate": 5.430748637786063e-07, + "loss": 0.3811, + "step": 9701 + }, + { + "epoch": 4.725885027606366, + "grad_norm": 2.9965810775756836, + "learning_rate": 5.426746976540232e-07, + "loss": 0.387, + "step": 9702 + }, + { + "epoch": 4.726372198765833, + "grad_norm": 3.355410575866699, + "learning_rate": 5.422746610650462e-07, + "loss": 0.4139, + "step": 9703 + }, + { + "epoch": 4.7268593699253, + "grad_norm": 3.60123610496521, + "learning_rate": 5.4187475403815e-07, + "loss": 0.3411, + "step": 9704 + }, + { + "epoch": 4.727346541084768, + "grad_norm": 2.9612975120544434, + "learning_rate": 5.41474976599801e-07, + "loss": 0.3656, + "step": 9705 + }, + { + "epoch": 4.7278337122442355, + "grad_norm": 3.1087992191314697, + "learning_rate": 5.410753287764547e-07, + "loss": 0.432, + "step": 9706 + }, + { + "epoch": 4.728320883403702, + "grad_norm": 3.6199586391448975, + "learning_rate": 5.406758105945609e-07, + "loss": 0.4284, + "step": 9707 + }, + { + "epoch": 4.72880805456317, + "grad_norm": 3.3416829109191895, + "learning_rate": 5.402764220805596e-07, + "loss": 0.3662, + "step": 9708 + }, + { + "epoch": 4.7292952257226375, + "grad_norm": 3.165611982345581, + "learning_rate": 5.398771632608826e-07, + "loss": 0.385, + "step": 9709 + }, + { + "epoch": 4.729782396882104, + "grad_norm": 3.5276246070861816, + "learning_rate": 5.394780341619521e-07, + "loss": 0.4222, + "step": 9710 + }, + { + "epoch": 4.730269568041572, + "grad_norm": 3.731396436691284, + "learning_rate": 5.390790348101824e-07, + "loss": 0.4136, + "step": 9711 + }, + { + "epoch": 4.7307567392010395, + "grad_norm": 3.0813071727752686, + "learning_rate": 5.386801652319798e-07, + "loss": 0.4506, + "step": 9712 + }, + { + "epoch": 4.731243910360506, + "grad_norm": 3.447711706161499, + "learning_rate": 5.38281425453742e-07, + "loss": 0.3671, + "step": 9713 + }, + { + "epoch": 4.731731081519974, + "grad_norm": 3.367751121520996, + "learning_rate": 5.378828155018556e-07, + "loss": 0.4377, + "step": 9714 + }, + { + "epoch": 4.732218252679441, + "grad_norm": 3.4286305904388428, + "learning_rate": 5.374843354027021e-07, + "loss": 0.3802, + "step": 9715 + }, + { + "epoch": 4.732705423838909, + "grad_norm": 3.3812942504882812, + "learning_rate": 5.370859851826527e-07, + "loss": 0.3188, + "step": 9716 + }, + { + "epoch": 4.733192594998376, + "grad_norm": 3.1859898567199707, + "learning_rate": 5.366877648680705e-07, + "loss": 0.4321, + "step": 9717 + }, + { + "epoch": 4.733679766157843, + "grad_norm": 3.515263080596924, + "learning_rate": 5.362896744853086e-07, + "loss": 0.3611, + "step": 9718 + }, + { + "epoch": 4.734166937317311, + "grad_norm": 2.9148590564727783, + "learning_rate": 5.358917140607128e-07, + "loss": 0.3935, + "step": 9719 + }, + { + "epoch": 4.734654108476779, + "grad_norm": 3.495398998260498, + "learning_rate": 5.354938836206216e-07, + "loss": 0.4199, + "step": 9720 + }, + { + "epoch": 4.735141279636245, + "grad_norm": 3.5875132083892822, + "learning_rate": 5.350961831913615e-07, + "loss": 0.3516, + "step": 9721 + }, + { + "epoch": 4.735628450795713, + "grad_norm": 2.877427577972412, + "learning_rate": 5.346986127992529e-07, + "loss": 0.3744, + "step": 9722 + }, + { + "epoch": 4.7361156219551805, + "grad_norm": 3.048243284225464, + "learning_rate": 5.343011724706079e-07, + "loss": 0.4078, + "step": 9723 + }, + { + "epoch": 4.736602793114647, + "grad_norm": 3.4793362617492676, + "learning_rate": 5.339038622317275e-07, + "loss": 0.3913, + "step": 9724 + }, + { + "epoch": 4.737089964274115, + "grad_norm": 3.2782132625579834, + "learning_rate": 5.335066821089064e-07, + "loss": 0.3326, + "step": 9725 + }, + { + "epoch": 4.7375771354335825, + "grad_norm": 2.8391692638397217, + "learning_rate": 5.331096321284304e-07, + "loss": 0.3687, + "step": 9726 + }, + { + "epoch": 4.738064306593049, + "grad_norm": 3.109340190887451, + "learning_rate": 5.327127123165754e-07, + "loss": 0.4016, + "step": 9727 + }, + { + "epoch": 4.738551477752517, + "grad_norm": 3.417191982269287, + "learning_rate": 5.3231592269961e-07, + "loss": 0.4175, + "step": 9728 + }, + { + "epoch": 4.7390386489119845, + "grad_norm": 3.6676347255706787, + "learning_rate": 5.319192633037942e-07, + "loss": 0.372, + "step": 9729 + }, + { + "epoch": 4.739525820071452, + "grad_norm": 3.231825590133667, + "learning_rate": 5.315227341553775e-07, + "loss": 0.3128, + "step": 9730 + }, + { + "epoch": 4.740012991230919, + "grad_norm": 3.1245927810668945, + "learning_rate": 5.311263352806029e-07, + "loss": 0.373, + "step": 9731 + }, + { + "epoch": 4.740500162390386, + "grad_norm": 3.5256710052490234, + "learning_rate": 5.307300667057049e-07, + "loss": 0.348, + "step": 9732 + }, + { + "epoch": 4.740987333549854, + "grad_norm": 3.222262382507324, + "learning_rate": 5.30333928456907e-07, + "loss": 0.4541, + "step": 9733 + }, + { + "epoch": 4.741474504709322, + "grad_norm": 3.679448366165161, + "learning_rate": 5.299379205604263e-07, + "loss": 0.3862, + "step": 9734 + }, + { + "epoch": 4.741961675868788, + "grad_norm": 3.362823247909546, + "learning_rate": 5.295420430424706e-07, + "loss": 0.4562, + "step": 9735 + }, + { + "epoch": 4.742448847028256, + "grad_norm": 3.4993152618408203, + "learning_rate": 5.291462959292399e-07, + "loss": 0.5002, + "step": 9736 + }, + { + "epoch": 4.742936018187724, + "grad_norm": 3.5037472248077393, + "learning_rate": 5.287506792469226e-07, + "loss": 0.3915, + "step": 9737 + }, + { + "epoch": 4.74342318934719, + "grad_norm": 3.2247626781463623, + "learning_rate": 5.28355193021702e-07, + "loss": 0.3679, + "step": 9738 + }, + { + "epoch": 4.743910360506658, + "grad_norm": 3.5544350147247314, + "learning_rate": 5.279598372797514e-07, + "loss": 0.3773, + "step": 9739 + }, + { + "epoch": 4.7443975316661255, + "grad_norm": 3.199897050857544, + "learning_rate": 5.275646120472355e-07, + "loss": 0.3775, + "step": 9740 + }, + { + "epoch": 4.744884702825592, + "grad_norm": 3.2597100734710693, + "learning_rate": 5.271695173503094e-07, + "loss": 0.3871, + "step": 9741 + }, + { + "epoch": 4.74537187398506, + "grad_norm": 2.961203098297119, + "learning_rate": 5.267745532151208e-07, + "loss": 0.3691, + "step": 9742 + }, + { + "epoch": 4.7458590451445275, + "grad_norm": 3.131758451461792, + "learning_rate": 5.263797196678086e-07, + "loss": 0.3659, + "step": 9743 + }, + { + "epoch": 4.746346216303995, + "grad_norm": 3.7902262210845947, + "learning_rate": 5.259850167345034e-07, + "loss": 0.3399, + "step": 9744 + }, + { + "epoch": 4.746833387463462, + "grad_norm": 3.2809383869171143, + "learning_rate": 5.255904444413254e-07, + "loss": 0.3953, + "step": 9745 + }, + { + "epoch": 4.7473205586229295, + "grad_norm": 3.141740322113037, + "learning_rate": 5.25196002814388e-07, + "loss": 0.3716, + "step": 9746 + }, + { + "epoch": 4.747807729782397, + "grad_norm": 3.2201030254364014, + "learning_rate": 5.248016918797949e-07, + "loss": 0.3626, + "step": 9747 + }, + { + "epoch": 4.748294900941865, + "grad_norm": 2.9177486896514893, + "learning_rate": 5.244075116636426e-07, + "loss": 0.3219, + "step": 9748 + }, + { + "epoch": 4.748782072101331, + "grad_norm": 3.0707433223724365, + "learning_rate": 5.240134621920168e-07, + "loss": 0.3902, + "step": 9749 + }, + { + "epoch": 4.749269243260799, + "grad_norm": 3.351688861846924, + "learning_rate": 5.23619543490996e-07, + "loss": 0.4405, + "step": 9750 + }, + { + "epoch": 4.749756414420267, + "grad_norm": 3.252725839614868, + "learning_rate": 5.232257555866499e-07, + "loss": 0.3681, + "step": 9751 + }, + { + "epoch": 4.750243585579733, + "grad_norm": 3.0284690856933594, + "learning_rate": 5.228320985050398e-07, + "loss": 0.3505, + "step": 9752 + }, + { + "epoch": 4.750730756739201, + "grad_norm": 3.3251922130584717, + "learning_rate": 5.224385722722164e-07, + "loss": 0.3754, + "step": 9753 + }, + { + "epoch": 4.751217927898669, + "grad_norm": 3.5023481845855713, + "learning_rate": 5.220451769142243e-07, + "loss": 0.454, + "step": 9754 + }, + { + "epoch": 4.751705099058135, + "grad_norm": 3.6349775791168213, + "learning_rate": 5.216519124570982e-07, + "loss": 0.3901, + "step": 9755 + }, + { + "epoch": 4.752192270217603, + "grad_norm": 3.3957765102386475, + "learning_rate": 5.21258778926865e-07, + "loss": 0.3581, + "step": 9756 + }, + { + "epoch": 4.7526794413770705, + "grad_norm": 3.1364071369171143, + "learning_rate": 5.208657763495409e-07, + "loss": 0.3724, + "step": 9757 + }, + { + "epoch": 4.753166612536538, + "grad_norm": 3.0609054565429688, + "learning_rate": 5.204729047511354e-07, + "loss": 0.3717, + "step": 9758 + }, + { + "epoch": 4.753653783696005, + "grad_norm": 2.785193920135498, + "learning_rate": 5.200801641576487e-07, + "loss": 0.4346, + "step": 9759 + }, + { + "epoch": 4.7541409548554725, + "grad_norm": 3.076343536376953, + "learning_rate": 5.196875545950731e-07, + "loss": 0.3596, + "step": 9760 + }, + { + "epoch": 4.75462812601494, + "grad_norm": 3.373867988586426, + "learning_rate": 5.1929507608939e-07, + "loss": 0.371, + "step": 9761 + }, + { + "epoch": 4.755115297174408, + "grad_norm": 3.3097081184387207, + "learning_rate": 5.189027286665743e-07, + "loss": 0.3662, + "step": 9762 + }, + { + "epoch": 4.7556024683338745, + "grad_norm": 3.0205423831939697, + "learning_rate": 5.185105123525916e-07, + "loss": 0.3505, + "step": 9763 + }, + { + "epoch": 4.756089639493342, + "grad_norm": 2.8696165084838867, + "learning_rate": 5.181184271733994e-07, + "loss": 0.3505, + "step": 9764 + }, + { + "epoch": 4.75657681065281, + "grad_norm": 3.956977605819702, + "learning_rate": 5.177264731549445e-07, + "loss": 0.4661, + "step": 9765 + }, + { + "epoch": 4.757063981812276, + "grad_norm": 3.556151866912842, + "learning_rate": 5.173346503231674e-07, + "loss": 0.3825, + "step": 9766 + }, + { + "epoch": 4.757551152971744, + "grad_norm": 3.058652639389038, + "learning_rate": 5.169429587039982e-07, + "loss": 0.3938, + "step": 9767 + }, + { + "epoch": 4.758038324131212, + "grad_norm": 3.065732479095459, + "learning_rate": 5.165513983233589e-07, + "loss": 0.3367, + "step": 9768 + }, + { + "epoch": 4.758525495290678, + "grad_norm": 3.446322202682495, + "learning_rate": 5.161599692071645e-07, + "loss": 0.3464, + "step": 9769 + }, + { + "epoch": 4.759012666450146, + "grad_norm": 3.1983323097229004, + "learning_rate": 5.157686713813176e-07, + "loss": 0.3531, + "step": 9770 + }, + { + "epoch": 4.759499837609614, + "grad_norm": 3.1316938400268555, + "learning_rate": 5.153775048717153e-07, + "loss": 0.3976, + "step": 9771 + }, + { + "epoch": 4.759987008769081, + "grad_norm": 2.809549331665039, + "learning_rate": 5.149864697042456e-07, + "loss": 0.4167, + "step": 9772 + }, + { + "epoch": 4.760474179928548, + "grad_norm": 3.4164879322052, + "learning_rate": 5.14595565904786e-07, + "loss": 0.3263, + "step": 9773 + }, + { + "epoch": 4.7609613510880155, + "grad_norm": 3.2319252490997314, + "learning_rate": 5.14204793499207e-07, + "loss": 0.4179, + "step": 9774 + }, + { + "epoch": 4.761448522247483, + "grad_norm": 3.198887586593628, + "learning_rate": 5.138141525133697e-07, + "loss": 0.3963, + "step": 9775 + }, + { + "epoch": 4.761935693406951, + "grad_norm": 2.994976282119751, + "learning_rate": 5.134236429731274e-07, + "loss": 0.3385, + "step": 9776 + }, + { + "epoch": 4.7624228645664175, + "grad_norm": 3.752957344055176, + "learning_rate": 5.130332649043229e-07, + "loss": 0.3553, + "step": 9777 + }, + { + "epoch": 4.762910035725885, + "grad_norm": 3.143383026123047, + "learning_rate": 5.126430183327919e-07, + "loss": 0.3794, + "step": 9778 + }, + { + "epoch": 4.763397206885353, + "grad_norm": 3.4767799377441406, + "learning_rate": 5.122529032843616e-07, + "loss": 0.4008, + "step": 9779 + }, + { + "epoch": 4.7638843780448195, + "grad_norm": 3.4102742671966553, + "learning_rate": 5.118629197848482e-07, + "loss": 0.3836, + "step": 9780 + }, + { + "epoch": 4.764371549204287, + "grad_norm": 3.208573579788208, + "learning_rate": 5.114730678600616e-07, + "loss": 0.419, + "step": 9781 + }, + { + "epoch": 4.764858720363755, + "grad_norm": 3.340191125869751, + "learning_rate": 5.110833475358024e-07, + "loss": 0.4152, + "step": 9782 + }, + { + "epoch": 4.765345891523221, + "grad_norm": 3.3081865310668945, + "learning_rate": 5.106937588378627e-07, + "loss": 0.4078, + "step": 9783 + }, + { + "epoch": 4.765833062682689, + "grad_norm": 3.084787368774414, + "learning_rate": 5.103043017920239e-07, + "loss": 0.4067, + "step": 9784 + }, + { + "epoch": 4.766320233842157, + "grad_norm": 3.6828529834747314, + "learning_rate": 5.099149764240613e-07, + "loss": 0.3773, + "step": 9785 + }, + { + "epoch": 4.766807405001624, + "grad_norm": 3.660478353500366, + "learning_rate": 5.095257827597403e-07, + "loss": 0.4145, + "step": 9786 + }, + { + "epoch": 4.767294576161091, + "grad_norm": 3.4387903213500977, + "learning_rate": 5.091367208248183e-07, + "loss": 0.412, + "step": 9787 + }, + { + "epoch": 4.767781747320559, + "grad_norm": 3.2884507179260254, + "learning_rate": 5.087477906450419e-07, + "loss": 0.3882, + "step": 9788 + }, + { + "epoch": 4.768268918480026, + "grad_norm": 3.6343178749084473, + "learning_rate": 5.083589922461516e-07, + "loss": 0.4388, + "step": 9789 + }, + { + "epoch": 4.768756089639494, + "grad_norm": 3.8518311977386475, + "learning_rate": 5.079703256538776e-07, + "loss": 0.3814, + "step": 9790 + }, + { + "epoch": 4.7692432607989605, + "grad_norm": 3.9007019996643066, + "learning_rate": 5.075817908939423e-07, + "loss": 0.4471, + "step": 9791 + }, + { + "epoch": 4.769730431958428, + "grad_norm": 3.5459976196289062, + "learning_rate": 5.071933879920582e-07, + "loss": 0.435, + "step": 9792 + }, + { + "epoch": 4.770217603117896, + "grad_norm": 3.646829605102539, + "learning_rate": 5.068051169739302e-07, + "loss": 0.4239, + "step": 9793 + }, + { + "epoch": 4.7707047742773625, + "grad_norm": 3.6532986164093018, + "learning_rate": 5.064169778652536e-07, + "loss": 0.4086, + "step": 9794 + }, + { + "epoch": 4.77119194543683, + "grad_norm": 3.599179744720459, + "learning_rate": 5.060289706917168e-07, + "loss": 0.3936, + "step": 9795 + }, + { + "epoch": 4.771679116596298, + "grad_norm": 3.3889544010162354, + "learning_rate": 5.056410954789961e-07, + "loss": 0.3618, + "step": 9796 + }, + { + "epoch": 4.7721662877557645, + "grad_norm": 3.261216878890991, + "learning_rate": 5.052533522527622e-07, + "loss": 0.3402, + "step": 9797 + }, + { + "epoch": 4.772653458915232, + "grad_norm": 3.1236071586608887, + "learning_rate": 5.048657410386757e-07, + "loss": 0.4261, + "step": 9798 + }, + { + "epoch": 4.7731406300747, + "grad_norm": 3.470961332321167, + "learning_rate": 5.044782618623891e-07, + "loss": 0.4095, + "step": 9799 + }, + { + "epoch": 4.773627801234167, + "grad_norm": 3.1823513507843018, + "learning_rate": 5.040909147495449e-07, + "loss": 0.3832, + "step": 9800 + }, + { + "epoch": 4.774114972393634, + "grad_norm": 3.4252569675445557, + "learning_rate": 5.037036997257777e-07, + "loss": 0.4094, + "step": 9801 + }, + { + "epoch": 4.774602143553102, + "grad_norm": 3.574781656265259, + "learning_rate": 5.033166168167142e-07, + "loss": 0.4184, + "step": 9802 + }, + { + "epoch": 4.775089314712569, + "grad_norm": 4.0113654136657715, + "learning_rate": 5.029296660479715e-07, + "loss": 0.3631, + "step": 9803 + }, + { + "epoch": 4.775576485872037, + "grad_norm": 3.3075053691864014, + "learning_rate": 5.025428474451568e-07, + "loss": 0.362, + "step": 9804 + }, + { + "epoch": 4.776063657031504, + "grad_norm": 3.034764289855957, + "learning_rate": 5.021561610338707e-07, + "loss": 0.3551, + "step": 9805 + }, + { + "epoch": 4.776550828190971, + "grad_norm": 3.39894700050354, + "learning_rate": 5.017696068397038e-07, + "loss": 0.3871, + "step": 9806 + }, + { + "epoch": 4.777037999350439, + "grad_norm": 3.3172669410705566, + "learning_rate": 5.013831848882386e-07, + "loss": 0.333, + "step": 9807 + }, + { + "epoch": 4.7775251705099055, + "grad_norm": 3.3487460613250732, + "learning_rate": 5.009968952050478e-07, + "loss": 0.428, + "step": 9808 + }, + { + "epoch": 4.778012341669373, + "grad_norm": 3.214329481124878, + "learning_rate": 5.006107378156963e-07, + "loss": 0.4227, + "step": 9809 + }, + { + "epoch": 4.778499512828841, + "grad_norm": 3.331599712371826, + "learning_rate": 5.002247127457405e-07, + "loss": 0.3703, + "step": 9810 + }, + { + "epoch": 4.7789866839883075, + "grad_norm": 3.239291191101074, + "learning_rate": 4.998388200207263e-07, + "loss": 0.4318, + "step": 9811 + }, + { + "epoch": 4.779473855147775, + "grad_norm": 3.1696736812591553, + "learning_rate": 4.994530596661928e-07, + "loss": 0.3625, + "step": 9812 + }, + { + "epoch": 4.779961026307243, + "grad_norm": 3.2083442211151123, + "learning_rate": 4.990674317076702e-07, + "loss": 0.4044, + "step": 9813 + }, + { + "epoch": 4.78044819746671, + "grad_norm": 3.4105775356292725, + "learning_rate": 4.98681936170678e-07, + "loss": 0.3617, + "step": 9814 + }, + { + "epoch": 4.780935368626177, + "grad_norm": 3.133089303970337, + "learning_rate": 4.982965730807287e-07, + "loss": 0.3656, + "step": 9815 + }, + { + "epoch": 4.781422539785645, + "grad_norm": 3.2487597465515137, + "learning_rate": 4.979113424633267e-07, + "loss": 0.4745, + "step": 9816 + }, + { + "epoch": 4.781909710945112, + "grad_norm": 3.9448697566986084, + "learning_rate": 4.97526244343965e-07, + "loss": 0.4103, + "step": 9817 + }, + { + "epoch": 4.782396882104579, + "grad_norm": 3.0906076431274414, + "learning_rate": 4.971412787481298e-07, + "loss": 0.359, + "step": 9818 + }, + { + "epoch": 4.782884053264047, + "grad_norm": 3.075141668319702, + "learning_rate": 4.967564457012991e-07, + "loss": 0.4324, + "step": 9819 + }, + { + "epoch": 4.783371224423514, + "grad_norm": 3.0938892364501953, + "learning_rate": 4.963717452289394e-07, + "loss": 0.4001, + "step": 9820 + }, + { + "epoch": 4.783858395582982, + "grad_norm": 3.510716438293457, + "learning_rate": 4.959871773565112e-07, + "loss": 0.4314, + "step": 9821 + }, + { + "epoch": 4.784345566742449, + "grad_norm": 5.891648769378662, + "learning_rate": 4.95602742109465e-07, + "loss": 0.3434, + "step": 9822 + }, + { + "epoch": 4.784832737901916, + "grad_norm": 3.1327579021453857, + "learning_rate": 4.952184395132431e-07, + "loss": 0.4196, + "step": 9823 + }, + { + "epoch": 4.785319909061384, + "grad_norm": 3.370820999145508, + "learning_rate": 4.948342695932779e-07, + "loss": 0.4105, + "step": 9824 + }, + { + "epoch": 4.7858070802208506, + "grad_norm": 3.1077256202697754, + "learning_rate": 4.944502323749936e-07, + "loss": 0.4219, + "step": 9825 + }, + { + "epoch": 4.786294251380318, + "grad_norm": 3.4878430366516113, + "learning_rate": 4.940663278838065e-07, + "loss": 0.3463, + "step": 9826 + }, + { + "epoch": 4.786781422539786, + "grad_norm": 2.7748770713806152, + "learning_rate": 4.936825561451236e-07, + "loss": 0.4046, + "step": 9827 + }, + { + "epoch": 4.7872685936992525, + "grad_norm": 3.0671331882476807, + "learning_rate": 4.932989171843414e-07, + "loss": 0.3789, + "step": 9828 + }, + { + "epoch": 4.78775576485872, + "grad_norm": 3.168712615966797, + "learning_rate": 4.929154110268503e-07, + "loss": 0.4271, + "step": 9829 + }, + { + "epoch": 4.788242936018188, + "grad_norm": 3.3920938968658447, + "learning_rate": 4.925320376980311e-07, + "loss": 0.38, + "step": 9830 + }, + { + "epoch": 4.788730107177655, + "grad_norm": 3.3619842529296875, + "learning_rate": 4.921487972232539e-07, + "loss": 0.3228, + "step": 9831 + }, + { + "epoch": 4.789217278337122, + "grad_norm": 3.2176687717437744, + "learning_rate": 4.917656896278822e-07, + "loss": 0.3045, + "step": 9832 + }, + { + "epoch": 4.78970444949659, + "grad_norm": 2.9389376640319824, + "learning_rate": 4.913827149372702e-07, + "loss": 0.3892, + "step": 9833 + }, + { + "epoch": 4.790191620656057, + "grad_norm": 3.226236581802368, + "learning_rate": 4.90999873176764e-07, + "loss": 0.3826, + "step": 9834 + }, + { + "epoch": 4.790678791815525, + "grad_norm": 3.4434728622436523, + "learning_rate": 4.906171643716984e-07, + "loss": 0.3966, + "step": 9835 + }, + { + "epoch": 4.791165962974992, + "grad_norm": 3.1981329917907715, + "learning_rate": 4.902345885474014e-07, + "loss": 0.3887, + "step": 9836 + }, + { + "epoch": 4.791653134134459, + "grad_norm": 3.2823472023010254, + "learning_rate": 4.898521457291925e-07, + "loss": 0.371, + "step": 9837 + }, + { + "epoch": 4.792140305293927, + "grad_norm": 3.590693473815918, + "learning_rate": 4.89469835942382e-07, + "loss": 0.3674, + "step": 9838 + }, + { + "epoch": 4.792627476453394, + "grad_norm": 3.1257545948028564, + "learning_rate": 4.890876592122698e-07, + "loss": 0.3756, + "step": 9839 + }, + { + "epoch": 4.793114647612861, + "grad_norm": 3.213252067565918, + "learning_rate": 4.887056155641493e-07, + "loss": 0.36, + "step": 9840 + }, + { + "epoch": 4.793601818772329, + "grad_norm": 3.281205415725708, + "learning_rate": 4.883237050233036e-07, + "loss": 0.3301, + "step": 9841 + }, + { + "epoch": 4.7940889899317956, + "grad_norm": 3.5105271339416504, + "learning_rate": 4.879419276150086e-07, + "loss": 0.3805, + "step": 9842 + }, + { + "epoch": 4.794576161091263, + "grad_norm": 4.5899858474731445, + "learning_rate": 4.875602833645288e-07, + "loss": 0.3733, + "step": 9843 + }, + { + "epoch": 4.795063332250731, + "grad_norm": 3.4117627143859863, + "learning_rate": 4.871787722971219e-07, + "loss": 0.4205, + "step": 9844 + }, + { + "epoch": 4.795550503410198, + "grad_norm": 3.392493724822998, + "learning_rate": 4.867973944380363e-07, + "loss": 0.3467, + "step": 9845 + }, + { + "epoch": 4.796037674569665, + "grad_norm": 3.1439290046691895, + "learning_rate": 4.864161498125128e-07, + "loss": 0.3761, + "step": 9846 + }, + { + "epoch": 4.796524845729133, + "grad_norm": 3.1896512508392334, + "learning_rate": 4.860350384457799e-07, + "loss": 0.3155, + "step": 9847 + }, + { + "epoch": 4.7970120168886, + "grad_norm": 3.4594109058380127, + "learning_rate": 4.856540603630607e-07, + "loss": 0.3871, + "step": 9848 + }, + { + "epoch": 4.797499188048068, + "grad_norm": 3.489539623260498, + "learning_rate": 4.852732155895685e-07, + "loss": 0.4453, + "step": 9849 + }, + { + "epoch": 4.797986359207535, + "grad_norm": 3.3645050525665283, + "learning_rate": 4.848925041505076e-07, + "loss": 0.4003, + "step": 9850 + }, + { + "epoch": 4.798473530367002, + "grad_norm": 3.4581830501556396, + "learning_rate": 4.845119260710726e-07, + "loss": 0.3268, + "step": 9851 + }, + { + "epoch": 4.79896070152647, + "grad_norm": 3.220756769180298, + "learning_rate": 4.841314813764508e-07, + "loss": 0.3785, + "step": 9852 + }, + { + "epoch": 4.799447872685937, + "grad_norm": 3.621619701385498, + "learning_rate": 4.837511700918202e-07, + "loss": 0.4326, + "step": 9853 + }, + { + "epoch": 4.799935043845404, + "grad_norm": 3.457657814025879, + "learning_rate": 4.833709922423491e-07, + "loss": 0.3941, + "step": 9854 + }, + { + "epoch": 4.800422215004872, + "grad_norm": 3.2653443813323975, + "learning_rate": 4.829909478531977e-07, + "loss": 0.397, + "step": 9855 + }, + { + "epoch": 4.800909386164339, + "grad_norm": 3.130918502807617, + "learning_rate": 4.826110369495182e-07, + "loss": 0.3989, + "step": 9856 + }, + { + "epoch": 4.801396557323806, + "grad_norm": 3.4457738399505615, + "learning_rate": 4.822312595564518e-07, + "loss": 0.3446, + "step": 9857 + }, + { + "epoch": 4.801883728483274, + "grad_norm": 3.1791927814483643, + "learning_rate": 4.818516156991329e-07, + "loss": 0.4218, + "step": 9858 + }, + { + "epoch": 4.802370899642741, + "grad_norm": 3.689246654510498, + "learning_rate": 4.814721054026867e-07, + "loss": 0.4084, + "step": 9859 + }, + { + "epoch": 4.802858070802208, + "grad_norm": 4.728793621063232, + "learning_rate": 4.810927286922279e-07, + "loss": 0.4365, + "step": 9860 + }, + { + "epoch": 4.803345241961676, + "grad_norm": 3.6183371543884277, + "learning_rate": 4.807134855928641e-07, + "loss": 0.4409, + "step": 9861 + }, + { + "epoch": 4.803832413121143, + "grad_norm": 3.462851047515869, + "learning_rate": 4.803343761296947e-07, + "loss": 0.3291, + "step": 9862 + }, + { + "epoch": 4.804319584280611, + "grad_norm": 3.4030134677886963, + "learning_rate": 4.799554003278075e-07, + "loss": 0.3888, + "step": 9863 + }, + { + "epoch": 4.804806755440078, + "grad_norm": 3.1406922340393066, + "learning_rate": 4.795765582122838e-07, + "loss": 0.3664, + "step": 9864 + }, + { + "epoch": 4.805293926599545, + "grad_norm": 3.1468610763549805, + "learning_rate": 4.791978498081954e-07, + "loss": 0.3992, + "step": 9865 + }, + { + "epoch": 4.805781097759013, + "grad_norm": 3.1793785095214844, + "learning_rate": 4.788192751406057e-07, + "loss": 0.3692, + "step": 9866 + }, + { + "epoch": 4.80626826891848, + "grad_norm": 3.194528102874756, + "learning_rate": 4.784408342345675e-07, + "loss": 0.3827, + "step": 9867 + }, + { + "epoch": 4.806755440077947, + "grad_norm": 3.1485235691070557, + "learning_rate": 4.780625271151267e-07, + "loss": 0.3617, + "step": 9868 + }, + { + "epoch": 4.807242611237415, + "grad_norm": 2.8314623832702637, + "learning_rate": 4.776843538073198e-07, + "loss": 0.3649, + "step": 9869 + }, + { + "epoch": 4.807729782396882, + "grad_norm": 3.212327003479004, + "learning_rate": 4.773063143361745e-07, + "loss": 0.4525, + "step": 9870 + }, + { + "epoch": 4.808216953556349, + "grad_norm": 3.4868831634521484, + "learning_rate": 4.769284087267082e-07, + "loss": 0.3484, + "step": 9871 + }, + { + "epoch": 4.808704124715817, + "grad_norm": 3.261428117752075, + "learning_rate": 4.7655063700393134e-07, + "loss": 0.4595, + "step": 9872 + }, + { + "epoch": 4.8091912958752845, + "grad_norm": 3.4507834911346436, + "learning_rate": 4.7617299919284514e-07, + "loss": 0.4113, + "step": 9873 + }, + { + "epoch": 4.809678467034751, + "grad_norm": 3.085026741027832, + "learning_rate": 4.757954953184418e-07, + "loss": 0.3713, + "step": 9874 + }, + { + "epoch": 4.810165638194219, + "grad_norm": 3.677276849746704, + "learning_rate": 4.754181254057036e-07, + "loss": 0.4062, + "step": 9875 + }, + { + "epoch": 4.810652809353686, + "grad_norm": 3.7944188117980957, + "learning_rate": 4.7504088947960545e-07, + "loss": 0.3979, + "step": 9876 + }, + { + "epoch": 4.811139980513154, + "grad_norm": 3.3794302940368652, + "learning_rate": 4.746637875651125e-07, + "loss": 0.4637, + "step": 9877 + }, + { + "epoch": 4.811627151672621, + "grad_norm": 3.5953385829925537, + "learning_rate": 4.742868196871822e-07, + "loss": 0.4471, + "step": 9878 + }, + { + "epoch": 4.812114322832088, + "grad_norm": 3.388112783432007, + "learning_rate": 4.73909985870761e-07, + "loss": 0.4012, + "step": 9879 + }, + { + "epoch": 4.812601493991556, + "grad_norm": 3.2826755046844482, + "learning_rate": 4.7353328614078793e-07, + "loss": 0.4054, + "step": 9880 + }, + { + "epoch": 4.813088665151023, + "grad_norm": 3.5850086212158203, + "learning_rate": 4.731567205221943e-07, + "loss": 0.3503, + "step": 9881 + }, + { + "epoch": 4.81357583631049, + "grad_norm": 3.702685832977295, + "learning_rate": 4.727802890398994e-07, + "loss": 0.4021, + "step": 9882 + }, + { + "epoch": 4.814063007469958, + "grad_norm": 3.6628222465515137, + "learning_rate": 4.7240399171881613e-07, + "loss": 0.4063, + "step": 9883 + }, + { + "epoch": 4.814550178629425, + "grad_norm": 3.7417259216308594, + "learning_rate": 4.7202782858384783e-07, + "loss": 0.3247, + "step": 9884 + }, + { + "epoch": 4.815037349788892, + "grad_norm": 3.263554334640503, + "learning_rate": 4.716517996598896e-07, + "loss": 0.3522, + "step": 9885 + }, + { + "epoch": 4.81552452094836, + "grad_norm": 3.460442066192627, + "learning_rate": 4.7127590497182603e-07, + "loss": 0.4165, + "step": 9886 + }, + { + "epoch": 4.8160116921078275, + "grad_norm": 3.0354576110839844, + "learning_rate": 4.709001445445338e-07, + "loss": 0.4288, + "step": 9887 + }, + { + "epoch": 4.816498863267294, + "grad_norm": 3.0785367488861084, + "learning_rate": 4.705245184028814e-07, + "loss": 0.3307, + "step": 9888 + }, + { + "epoch": 4.816986034426762, + "grad_norm": 3.109117269515991, + "learning_rate": 4.701490265717279e-07, + "loss": 0.4194, + "step": 9889 + }, + { + "epoch": 4.8174732055862295, + "grad_norm": 3.468914031982422, + "learning_rate": 4.697736690759219e-07, + "loss": 0.3747, + "step": 9890 + }, + { + "epoch": 4.817960376745697, + "grad_norm": 3.708535671234131, + "learning_rate": 4.693984459403056e-07, + "loss": 0.364, + "step": 9891 + }, + { + "epoch": 4.818447547905164, + "grad_norm": 3.170893669128418, + "learning_rate": 4.6902335718971124e-07, + "loss": 0.2946, + "step": 9892 + }, + { + "epoch": 4.8189347190646314, + "grad_norm": 3.2286252975463867, + "learning_rate": 4.686484028489624e-07, + "loss": 0.3938, + "step": 9893 + }, + { + "epoch": 4.819421890224099, + "grad_norm": 3.019465923309326, + "learning_rate": 4.682735829428725e-07, + "loss": 0.3819, + "step": 9894 + }, + { + "epoch": 4.819909061383566, + "grad_norm": 3.416487216949463, + "learning_rate": 4.6789889749624783e-07, + "loss": 0.3873, + "step": 9895 + }, + { + "epoch": 4.820396232543033, + "grad_norm": 3.3013253211975098, + "learning_rate": 4.675243465338847e-07, + "loss": 0.382, + "step": 9896 + }, + { + "epoch": 4.820883403702501, + "grad_norm": 2.8832874298095703, + "learning_rate": 4.671499300805718e-07, + "loss": 0.3504, + "step": 9897 + }, + { + "epoch": 4.821370574861968, + "grad_norm": 3.3231732845306396, + "learning_rate": 4.667756481610866e-07, + "loss": 0.3759, + "step": 9898 + }, + { + "epoch": 4.821857746021435, + "grad_norm": 2.8712713718414307, + "learning_rate": 4.664015008001996e-07, + "loss": 0.3962, + "step": 9899 + }, + { + "epoch": 4.822344917180903, + "grad_norm": 3.532135009765625, + "learning_rate": 4.660274880226726e-07, + "loss": 0.3353, + "step": 9900 + }, + { + "epoch": 4.822832088340371, + "grad_norm": 3.2993998527526855, + "learning_rate": 4.656536098532563e-07, + "loss": 0.4093, + "step": 9901 + }, + { + "epoch": 4.823319259499837, + "grad_norm": 3.3941593170166016, + "learning_rate": 4.652798663166955e-07, + "loss": 0.4074, + "step": 9902 + }, + { + "epoch": 4.823806430659305, + "grad_norm": 3.0987648963928223, + "learning_rate": 4.6490625743772304e-07, + "loss": 0.3928, + "step": 9903 + }, + { + "epoch": 4.8242936018187725, + "grad_norm": 3.5105507373809814, + "learning_rate": 4.645327832410648e-07, + "loss": 0.3532, + "step": 9904 + }, + { + "epoch": 4.82478077297824, + "grad_norm": 3.2779593467712402, + "learning_rate": 4.6415944375143796e-07, + "loss": 0.413, + "step": 9905 + }, + { + "epoch": 4.825267944137707, + "grad_norm": 3.2675392627716064, + "learning_rate": 4.6378623899354906e-07, + "loss": 0.386, + "step": 9906 + }, + { + "epoch": 4.8257551152971745, + "grad_norm": 3.618717670440674, + "learning_rate": 4.6341316899209734e-07, + "loss": 0.4228, + "step": 9907 + }, + { + "epoch": 4.826242286456642, + "grad_norm": 3.7297167778015137, + "learning_rate": 4.6304023377177237e-07, + "loss": 0.4666, + "step": 9908 + }, + { + "epoch": 4.826729457616109, + "grad_norm": 3.2011866569519043, + "learning_rate": 4.6266743335725563e-07, + "loss": 0.4308, + "step": 9909 + }, + { + "epoch": 4.8272166287755764, + "grad_norm": 3.3831636905670166, + "learning_rate": 4.6229476777321786e-07, + "loss": 0.4181, + "step": 9910 + }, + { + "epoch": 4.827703799935044, + "grad_norm": 3.5211873054504395, + "learning_rate": 4.6192223704432244e-07, + "loss": 0.3948, + "step": 9911 + }, + { + "epoch": 4.828190971094511, + "grad_norm": 3.01678204536438, + "learning_rate": 4.615498411952238e-07, + "loss": 0.3879, + "step": 9912 + }, + { + "epoch": 4.828678142253978, + "grad_norm": 3.354964017868042, + "learning_rate": 4.6117758025056745e-07, + "loss": 0.3863, + "step": 9913 + }, + { + "epoch": 4.829165313413446, + "grad_norm": 3.4329378604888916, + "learning_rate": 4.6080545423498826e-07, + "loss": 0.3788, + "step": 9914 + }, + { + "epoch": 4.829652484572914, + "grad_norm": 3.6997249126434326, + "learning_rate": 4.6043346317311444e-07, + "loss": 0.4044, + "step": 9915 + }, + { + "epoch": 4.83013965573238, + "grad_norm": 3.490741014480591, + "learning_rate": 4.600616070895639e-07, + "loss": 0.3645, + "step": 9916 + }, + { + "epoch": 4.830626826891848, + "grad_norm": 3.2663276195526123, + "learning_rate": 4.5968988600894686e-07, + "loss": 0.3261, + "step": 9917 + }, + { + "epoch": 4.831113998051316, + "grad_norm": 2.9457848072052, + "learning_rate": 4.5931829995586264e-07, + "loss": 0.3576, + "step": 9918 + }, + { + "epoch": 4.831601169210783, + "grad_norm": 3.4583868980407715, + "learning_rate": 4.5894684895490314e-07, + "loss": 0.4008, + "step": 9919 + }, + { + "epoch": 4.83208834037025, + "grad_norm": 3.666630268096924, + "learning_rate": 4.585755330306513e-07, + "loss": 0.3573, + "step": 9920 + }, + { + "epoch": 4.8325755115297175, + "grad_norm": 3.323965311050415, + "learning_rate": 4.5820435220768123e-07, + "loss": 0.435, + "step": 9921 + }, + { + "epoch": 4.833062682689185, + "grad_norm": 3.256824016571045, + "learning_rate": 4.5783330651055614e-07, + "loss": 0.3606, + "step": 9922 + }, + { + "epoch": 4.833549853848652, + "grad_norm": 3.270474433898926, + "learning_rate": 4.5746239596383273e-07, + "loss": 0.4713, + "step": 9923 + }, + { + "epoch": 4.8340370250081195, + "grad_norm": 3.3183038234710693, + "learning_rate": 4.570916205920578e-07, + "loss": 0.3969, + "step": 9924 + }, + { + "epoch": 4.834524196167587, + "grad_norm": 3.603848695755005, + "learning_rate": 4.5672098041977003e-07, + "loss": 0.3477, + "step": 9925 + }, + { + "epoch": 4.835011367327054, + "grad_norm": 3.549386501312256, + "learning_rate": 4.5635047547149677e-07, + "loss": 0.3541, + "step": 9926 + }, + { + "epoch": 4.8354985384865214, + "grad_norm": 3.393944501876831, + "learning_rate": 4.559801057717586e-07, + "loss": 0.4663, + "step": 9927 + }, + { + "epoch": 4.835985709645989, + "grad_norm": 3.464670419692993, + "learning_rate": 4.5560987134506656e-07, + "loss": 0.3287, + "step": 9928 + }, + { + "epoch": 4.836472880805457, + "grad_norm": 3.420396566390991, + "learning_rate": 4.552397722159238e-07, + "loss": 0.344, + "step": 9929 + }, + { + "epoch": 4.836960051964923, + "grad_norm": 3.3037514686584473, + "learning_rate": 4.548698084088218e-07, + "loss": 0.4639, + "step": 9930 + }, + { + "epoch": 4.837447223124391, + "grad_norm": 3.2809853553771973, + "learning_rate": 4.5449997994824526e-07, + "loss": 0.3919, + "step": 9931 + }, + { + "epoch": 4.837934394283859, + "grad_norm": 2.757483720779419, + "learning_rate": 4.5413028685867026e-07, + "loss": 0.3225, + "step": 9932 + }, + { + "epoch": 4.838421565443326, + "grad_norm": 3.0977272987365723, + "learning_rate": 4.5376072916456183e-07, + "loss": 0.3653, + "step": 9933 + }, + { + "epoch": 4.838908736602793, + "grad_norm": 3.383732795715332, + "learning_rate": 4.533913068903778e-07, + "loss": 0.4096, + "step": 9934 + }, + { + "epoch": 4.839395907762261, + "grad_norm": 3.578065872192383, + "learning_rate": 4.5302202006056644e-07, + "loss": 0.3675, + "step": 9935 + }, + { + "epoch": 4.839883078921728, + "grad_norm": 3.372694492340088, + "learning_rate": 4.5265286869956795e-07, + "loss": 0.3754, + "step": 9936 + }, + { + "epoch": 4.840370250081195, + "grad_norm": 3.2377851009368896, + "learning_rate": 4.5228385283181104e-07, + "loss": 0.4114, + "step": 9937 + }, + { + "epoch": 4.8408574212406625, + "grad_norm": 3.6814961433410645, + "learning_rate": 4.5191497248171845e-07, + "loss": 0.4117, + "step": 9938 + }, + { + "epoch": 4.84134459240013, + "grad_norm": 3.124680757522583, + "learning_rate": 4.51546227673702e-07, + "loss": 0.3738, + "step": 9939 + }, + { + "epoch": 4.841831763559597, + "grad_norm": 3.155869960784912, + "learning_rate": 4.511776184321662e-07, + "loss": 0.3796, + "step": 9940 + }, + { + "epoch": 4.8423189347190645, + "grad_norm": 3.516664505004883, + "learning_rate": 4.508091447815041e-07, + "loss": 0.3023, + "step": 9941 + }, + { + "epoch": 4.842806105878532, + "grad_norm": 3.043579339981079, + "learning_rate": 4.504408067461019e-07, + "loss": 0.356, + "step": 9942 + }, + { + "epoch": 4.843293277038, + "grad_norm": 3.105591058731079, + "learning_rate": 4.50072604350337e-07, + "loss": 0.3795, + "step": 9943 + }, + { + "epoch": 4.8437804481974664, + "grad_norm": 3.4178826808929443, + "learning_rate": 4.497045376185757e-07, + "loss": 0.4081, + "step": 9944 + }, + { + "epoch": 4.844267619356934, + "grad_norm": 3.7194104194641113, + "learning_rate": 4.49336606575177e-07, + "loss": 0.373, + "step": 9945 + }, + { + "epoch": 4.844754790516402, + "grad_norm": 3.3099067211151123, + "learning_rate": 4.489688112444915e-07, + "loss": 0.444, + "step": 9946 + }, + { + "epoch": 4.845241961675868, + "grad_norm": 3.241661310195923, + "learning_rate": 4.4860115165085845e-07, + "loss": 0.4086, + "step": 9947 + }, + { + "epoch": 4.845729132835336, + "grad_norm": 3.2593865394592285, + "learning_rate": 4.4823362781861003e-07, + "loss": 0.4003, + "step": 9948 + }, + { + "epoch": 4.846216303994804, + "grad_norm": 3.3348565101623535, + "learning_rate": 4.4786623977206986e-07, + "loss": 0.4434, + "step": 9949 + }, + { + "epoch": 4.846703475154271, + "grad_norm": 3.3908796310424805, + "learning_rate": 4.474989875355501e-07, + "loss": 0.4693, + "step": 9950 + }, + { + "epoch": 4.847190646313738, + "grad_norm": 3.421233892440796, + "learning_rate": 4.471318711333561e-07, + "loss": 0.3669, + "step": 9951 + }, + { + "epoch": 4.847677817473206, + "grad_norm": 2.9506380558013916, + "learning_rate": 4.467648905897845e-07, + "loss": 0.373, + "step": 9952 + }, + { + "epoch": 4.848164988632673, + "grad_norm": 3.335359811782837, + "learning_rate": 4.4639804592912063e-07, + "loss": 0.3879, + "step": 9953 + }, + { + "epoch": 4.84865215979214, + "grad_norm": 3.3406593799591064, + "learning_rate": 4.460313371756428e-07, + "loss": 0.402, + "step": 9954 + }, + { + "epoch": 4.8491393309516075, + "grad_norm": 3.545161724090576, + "learning_rate": 4.4566476435361995e-07, + "loss": 0.3826, + "step": 9955 + }, + { + "epoch": 4.849626502111075, + "grad_norm": 3.329986095428467, + "learning_rate": 4.452983274873121e-07, + "loss": 0.3986, + "step": 9956 + }, + { + "epoch": 4.850113673270543, + "grad_norm": 3.283034324645996, + "learning_rate": 4.449320266009691e-07, + "loss": 0.3974, + "step": 9957 + }, + { + "epoch": 4.8506008444300095, + "grad_norm": 3.264761447906494, + "learning_rate": 4.4456586171883307e-07, + "loss": 0.4068, + "step": 9958 + }, + { + "epoch": 4.851088015589477, + "grad_norm": 3.44930362701416, + "learning_rate": 4.44199832865137e-07, + "loss": 0.3908, + "step": 9959 + }, + { + "epoch": 4.851575186748945, + "grad_norm": 4.0269551277160645, + "learning_rate": 4.4383394006410533e-07, + "loss": 0.5011, + "step": 9960 + }, + { + "epoch": 4.8520623579084114, + "grad_norm": 3.5411581993103027, + "learning_rate": 4.4346818333995123e-07, + "loss": 0.4499, + "step": 9961 + }, + { + "epoch": 4.852549529067879, + "grad_norm": 3.122832775115967, + "learning_rate": 4.431025627168811e-07, + "loss": 0.3775, + "step": 9962 + }, + { + "epoch": 4.853036700227347, + "grad_norm": 3.831589937210083, + "learning_rate": 4.4273707821909205e-07, + "loss": 0.4531, + "step": 9963 + }, + { + "epoch": 4.853523871386814, + "grad_norm": 3.627208948135376, + "learning_rate": 4.423717298707722e-07, + "loss": 0.3342, + "step": 9964 + }, + { + "epoch": 4.854011042546281, + "grad_norm": 3.3856492042541504, + "learning_rate": 4.4200651769609913e-07, + "loss": 0.339, + "step": 9965 + }, + { + "epoch": 4.854498213705749, + "grad_norm": 2.995469570159912, + "learning_rate": 4.41641441719243e-07, + "loss": 0.4309, + "step": 9966 + }, + { + "epoch": 4.854985384865216, + "grad_norm": 3.205169677734375, + "learning_rate": 4.4127650196436443e-07, + "loss": 0.3949, + "step": 9967 + }, + { + "epoch": 4.855472556024683, + "grad_norm": 3.5217273235321045, + "learning_rate": 4.409116984556161e-07, + "loss": 0.4094, + "step": 9968 + }, + { + "epoch": 4.855959727184151, + "grad_norm": 3.1085848808288574, + "learning_rate": 4.405470312171392e-07, + "loss": 0.3699, + "step": 9969 + }, + { + "epoch": 4.856446898343618, + "grad_norm": 3.3820836544036865, + "learning_rate": 4.401825002730678e-07, + "loss": 0.3553, + "step": 9970 + }, + { + "epoch": 4.856934069503085, + "grad_norm": 3.8839683532714844, + "learning_rate": 4.3981810564752703e-07, + "loss": 0.367, + "step": 9971 + }, + { + "epoch": 4.8574212406625525, + "grad_norm": 3.2161576747894287, + "learning_rate": 4.394538473646326e-07, + "loss": 0.3696, + "step": 9972 + }, + { + "epoch": 4.85790841182202, + "grad_norm": 3.2478649616241455, + "learning_rate": 4.390897254484899e-07, + "loss": 0.3756, + "step": 9973 + }, + { + "epoch": 4.858395582981488, + "grad_norm": 3.5064499378204346, + "learning_rate": 4.387257399231976e-07, + "loss": 0.3977, + "step": 9974 + }, + { + "epoch": 4.8588827541409545, + "grad_norm": 3.109142303466797, + "learning_rate": 4.383618908128434e-07, + "loss": 0.3878, + "step": 9975 + }, + { + "epoch": 4.859369925300422, + "grad_norm": 3.1555216312408447, + "learning_rate": 4.379981781415082e-07, + "loss": 0.3602, + "step": 9976 + }, + { + "epoch": 4.85985709645989, + "grad_norm": 3.2426888942718506, + "learning_rate": 4.376346019332611e-07, + "loss": 0.3556, + "step": 9977 + }, + { + "epoch": 4.860344267619357, + "grad_norm": 3.3063089847564697, + "learning_rate": 4.3727116221216394e-07, + "loss": 0.384, + "step": 9978 + }, + { + "epoch": 4.860831438778824, + "grad_norm": 3.155931234359741, + "learning_rate": 4.3690785900226907e-07, + "loss": 0.3914, + "step": 9979 + }, + { + "epoch": 4.861318609938292, + "grad_norm": 2.969925880432129, + "learning_rate": 4.3654469232762063e-07, + "loss": 0.4666, + "step": 9980 + }, + { + "epoch": 4.861805781097759, + "grad_norm": 3.832793712615967, + "learning_rate": 4.3618166221225166e-07, + "loss": 0.4321, + "step": 9981 + }, + { + "epoch": 4.862292952257226, + "grad_norm": 3.2944092750549316, + "learning_rate": 4.358187686801879e-07, + "loss": 0.4422, + "step": 9982 + }, + { + "epoch": 4.862780123416694, + "grad_norm": 3.2953941822052, + "learning_rate": 4.35456011755446e-07, + "loss": 0.3691, + "step": 9983 + }, + { + "epoch": 4.863267294576161, + "grad_norm": 3.2505910396575928, + "learning_rate": 4.350933914620337e-07, + "loss": 0.3947, + "step": 9984 + }, + { + "epoch": 4.863754465735628, + "grad_norm": 3.0658092498779297, + "learning_rate": 4.3473090782394784e-07, + "loss": 0.3521, + "step": 9985 + }, + { + "epoch": 4.864241636895096, + "grad_norm": 3.313920736312866, + "learning_rate": 4.343685608651782e-07, + "loss": 0.3735, + "step": 9986 + }, + { + "epoch": 4.864728808054563, + "grad_norm": 3.2342357635498047, + "learning_rate": 4.3400635060970533e-07, + "loss": 0.4285, + "step": 9987 + }, + { + "epoch": 4.865215979214031, + "grad_norm": 3.588536024093628, + "learning_rate": 4.3364427708149914e-07, + "loss": 0.3734, + "step": 9988 + }, + { + "epoch": 4.8657031503734975, + "grad_norm": 3.2876152992248535, + "learning_rate": 4.3328234030452305e-07, + "loss": 0.3454, + "step": 9989 + }, + { + "epoch": 4.866190321532965, + "grad_norm": 3.509770631790161, + "learning_rate": 4.329205403027284e-07, + "loss": 0.4318, + "step": 9990 + }, + { + "epoch": 4.866677492692433, + "grad_norm": 3.2661256790161133, + "learning_rate": 4.325588771000602e-07, + "loss": 0.3885, + "step": 9991 + }, + { + "epoch": 4.8671646638519, + "grad_norm": 3.3657917976379395, + "learning_rate": 4.3219735072045355e-07, + "loss": 0.3912, + "step": 9992 + }, + { + "epoch": 4.867651835011367, + "grad_norm": 3.1158950328826904, + "learning_rate": 4.3183596118783283e-07, + "loss": 0.3838, + "step": 9993 + }, + { + "epoch": 4.868139006170835, + "grad_norm": 3.415377378463745, + "learning_rate": 4.314747085261159e-07, + "loss": 0.4021, + "step": 9994 + }, + { + "epoch": 4.868626177330302, + "grad_norm": 3.3464291095733643, + "learning_rate": 4.3111359275921057e-07, + "loss": 0.3094, + "step": 9995 + }, + { + "epoch": 4.869113348489769, + "grad_norm": 3.3509328365325928, + "learning_rate": 4.3075261391101434e-07, + "loss": 0.3967, + "step": 9996 + }, + { + "epoch": 4.869600519649237, + "grad_norm": 3.181056261062622, + "learning_rate": 4.303917720054177e-07, + "loss": 0.3859, + "step": 9997 + }, + { + "epoch": 4.870087690808704, + "grad_norm": 3.2010598182678223, + "learning_rate": 4.300310670663005e-07, + "loss": 0.4032, + "step": 9998 + }, + { + "epoch": 4.870574861968171, + "grad_norm": 3.287963628768921, + "learning_rate": 4.296704991175354e-07, + "loss": 0.3416, + "step": 9999 + }, + { + "epoch": 4.871062033127639, + "grad_norm": 2.95465087890625, + "learning_rate": 4.2931006818298316e-07, + "loss": 0.4027, + "step": 10000 + }, + { + "epoch": 4.871549204287106, + "grad_norm": 3.4032065868377686, + "learning_rate": 4.2894977428649784e-07, + "loss": 0.4343, + "step": 10001 + }, + { + "epoch": 4.872036375446574, + "grad_norm": 3.4756381511688232, + "learning_rate": 4.2858961745192357e-07, + "loss": 0.3411, + "step": 10002 + }, + { + "epoch": 4.872523546606041, + "grad_norm": 2.970468044281006, + "learning_rate": 4.282295977030962e-07, + "loss": 0.4164, + "step": 10003 + }, + { + "epoch": 4.873010717765508, + "grad_norm": 3.3297247886657715, + "learning_rate": 4.278697150638403e-07, + "loss": 0.3904, + "step": 10004 + }, + { + "epoch": 4.873497888924976, + "grad_norm": 3.3028695583343506, + "learning_rate": 4.2750996955797405e-07, + "loss": 0.3636, + "step": 10005 + }, + { + "epoch": 4.873985060084443, + "grad_norm": 3.9307258129119873, + "learning_rate": 4.2715036120930475e-07, + "loss": 0.3822, + "step": 10006 + }, + { + "epoch": 4.87447223124391, + "grad_norm": 3.2360641956329346, + "learning_rate": 4.2679089004163233e-07, + "loss": 0.3973, + "step": 10007 + }, + { + "epoch": 4.874959402403378, + "grad_norm": 3.2773566246032715, + "learning_rate": 4.264315560787449e-07, + "loss": 0.3864, + "step": 10008 + }, + { + "epoch": 4.875446573562845, + "grad_norm": 3.2554383277893066, + "learning_rate": 4.2607235934442407e-07, + "loss": 0.4372, + "step": 10009 + }, + { + "epoch": 4.875933744722312, + "grad_norm": 3.3728456497192383, + "learning_rate": 4.2571329986244166e-07, + "loss": 0.3576, + "step": 10010 + }, + { + "epoch": 4.87642091588178, + "grad_norm": 3.152798652648926, + "learning_rate": 4.2535437765656025e-07, + "loss": 0.3383, + "step": 10011 + }, + { + "epoch": 4.876908087041247, + "grad_norm": 3.074917793273926, + "learning_rate": 4.2499559275053224e-07, + "loss": 0.3925, + "step": 10012 + }, + { + "epoch": 4.877395258200714, + "grad_norm": 3.122281789779663, + "learning_rate": 4.246369451681029e-07, + "loss": 0.3767, + "step": 10013 + }, + { + "epoch": 4.877882429360182, + "grad_norm": 3.18408465385437, + "learning_rate": 4.242784349330073e-07, + "loss": 0.3903, + "step": 10014 + }, + { + "epoch": 4.878369600519649, + "grad_norm": 3.239426612854004, + "learning_rate": 4.2392006206897205e-07, + "loss": 0.357, + "step": 10015 + }, + { + "epoch": 4.878856771679117, + "grad_norm": 3.4946770668029785, + "learning_rate": 4.235618265997132e-07, + "loss": 0.3871, + "step": 10016 + }, + { + "epoch": 4.879343942838584, + "grad_norm": 3.753643035888672, + "learning_rate": 4.2320372854893934e-07, + "loss": 0.3986, + "step": 10017 + }, + { + "epoch": 4.879831113998051, + "grad_norm": 3.3503055572509766, + "learning_rate": 4.2284576794034914e-07, + "loss": 0.4197, + "step": 10018 + }, + { + "epoch": 4.880318285157519, + "grad_norm": 3.4842019081115723, + "learning_rate": 4.224879447976335e-07, + "loss": 0.3555, + "step": 10019 + }, + { + "epoch": 4.8808054563169865, + "grad_norm": 3.135910987854004, + "learning_rate": 4.221302591444712e-07, + "loss": 0.3851, + "step": 10020 + }, + { + "epoch": 4.881292627476453, + "grad_norm": 3.3326334953308105, + "learning_rate": 4.2177271100453526e-07, + "loss": 0.4367, + "step": 10021 + }, + { + "epoch": 4.881779798635921, + "grad_norm": 3.690305233001709, + "learning_rate": 4.214153004014876e-07, + "loss": 0.5001, + "step": 10022 + }, + { + "epoch": 4.882266969795388, + "grad_norm": 4.301881790161133, + "learning_rate": 4.210580273589826e-07, + "loss": 0.3885, + "step": 10023 + }, + { + "epoch": 4.882754140954855, + "grad_norm": 3.0569827556610107, + "learning_rate": 4.207008919006628e-07, + "loss": 0.3404, + "step": 10024 + }, + { + "epoch": 4.883241312114323, + "grad_norm": 4.188118934631348, + "learning_rate": 4.2034389405016474e-07, + "loss": 0.3519, + "step": 10025 + }, + { + "epoch": 4.88372848327379, + "grad_norm": 3.0602781772613525, + "learning_rate": 4.199870338311138e-07, + "loss": 0.3799, + "step": 10026 + }, + { + "epoch": 4.884215654433257, + "grad_norm": 3.4705514907836914, + "learning_rate": 4.1963031126712796e-07, + "loss": 0.4034, + "step": 10027 + }, + { + "epoch": 4.884702825592725, + "grad_norm": 3.2312676906585693, + "learning_rate": 4.192737263818139e-07, + "loss": 0.3775, + "step": 10028 + }, + { + "epoch": 4.885189996752192, + "grad_norm": 3.119318723678589, + "learning_rate": 4.1891727919877086e-07, + "loss": 0.3569, + "step": 10029 + }, + { + "epoch": 4.88567716791166, + "grad_norm": 3.0817782878875732, + "learning_rate": 4.185609697415891e-07, + "loss": 0.3096, + "step": 10030 + }, + { + "epoch": 4.886164339071127, + "grad_norm": 3.1785027980804443, + "learning_rate": 4.182047980338477e-07, + "loss": 0.3342, + "step": 10031 + }, + { + "epoch": 4.886651510230594, + "grad_norm": 2.7669925689697266, + "learning_rate": 4.178487640991191e-07, + "loss": 0.4064, + "step": 10032 + }, + { + "epoch": 4.887138681390062, + "grad_norm": 3.3145599365234375, + "learning_rate": 4.1749286796096606e-07, + "loss": 0.3776, + "step": 10033 + }, + { + "epoch": 4.8876258525495295, + "grad_norm": 3.2354624271392822, + "learning_rate": 4.1713710964294024e-07, + "loss": 0.3845, + "step": 10034 + }, + { + "epoch": 4.888113023708996, + "grad_norm": 3.171180486679077, + "learning_rate": 4.167814891685867e-07, + "loss": 0.3647, + "step": 10035 + }, + { + "epoch": 4.888600194868464, + "grad_norm": 3.4626824855804443, + "learning_rate": 4.164260065614409e-07, + "loss": 0.3673, + "step": 10036 + }, + { + "epoch": 4.8890873660279315, + "grad_norm": 3.150240182876587, + "learning_rate": 4.160706618450275e-07, + "loss": 0.3987, + "step": 10037 + }, + { + "epoch": 4.889574537187398, + "grad_norm": 3.1917171478271484, + "learning_rate": 4.1571545504286383e-07, + "loss": 0.3882, + "step": 10038 + }, + { + "epoch": 4.890061708346866, + "grad_norm": 3.3337090015411377, + "learning_rate": 4.1536038617845693e-07, + "loss": 0.361, + "step": 10039 + }, + { + "epoch": 4.890548879506333, + "grad_norm": 3.223870277404785, + "learning_rate": 4.150054552753055e-07, + "loss": 0.4178, + "step": 10040 + }, + { + "epoch": 4.8910360506658, + "grad_norm": 3.1508147716522217, + "learning_rate": 4.146506623568988e-07, + "loss": 0.3191, + "step": 10041 + }, + { + "epoch": 4.891523221825268, + "grad_norm": 3.11840558052063, + "learning_rate": 4.1429600744671784e-07, + "loss": 0.4515, + "step": 10042 + }, + { + "epoch": 4.892010392984735, + "grad_norm": 3.8104138374328613, + "learning_rate": 4.1394149056823234e-07, + "loss": 0.3984, + "step": 10043 + }, + { + "epoch": 4.892497564144203, + "grad_norm": 3.399273157119751, + "learning_rate": 4.135871117449045e-07, + "loss": 0.399, + "step": 10044 + }, + { + "epoch": 4.89298473530367, + "grad_norm": 3.3132164478302, + "learning_rate": 4.1323287100018734e-07, + "loss": 0.3434, + "step": 10045 + }, + { + "epoch": 4.893471906463137, + "grad_norm": 3.817878246307373, + "learning_rate": 4.1287876835752525e-07, + "loss": 0.3757, + "step": 10046 + }, + { + "epoch": 4.893959077622605, + "grad_norm": 3.2685348987579346, + "learning_rate": 4.1252480384035123e-07, + "loss": 0.4068, + "step": 10047 + }, + { + "epoch": 4.894446248782073, + "grad_norm": 2.786827564239502, + "learning_rate": 4.1217097747209113e-07, + "loss": 0.3729, + "step": 10048 + }, + { + "epoch": 4.894933419941539, + "grad_norm": 3.3151347637176514, + "learning_rate": 4.1181728927616154e-07, + "loss": 0.3215, + "step": 10049 + }, + { + "epoch": 4.895420591101007, + "grad_norm": 3.525519609451294, + "learning_rate": 4.1146373927596986e-07, + "loss": 0.4378, + "step": 10050 + }, + { + "epoch": 4.8959077622604745, + "grad_norm": 3.4844322204589844, + "learning_rate": 4.111103274949127e-07, + "loss": 0.4401, + "step": 10051 + }, + { + "epoch": 4.896394933419941, + "grad_norm": 2.9236598014831543, + "learning_rate": 4.107570539563799e-07, + "loss": 0.3662, + "step": 10052 + }, + { + "epoch": 4.896882104579409, + "grad_norm": 3.0566961765289307, + "learning_rate": 4.1040391868375024e-07, + "loss": 0.3726, + "step": 10053 + }, + { + "epoch": 4.8973692757388765, + "grad_norm": 3.266998767852783, + "learning_rate": 4.100509217003956e-07, + "loss": 0.3894, + "step": 10054 + }, + { + "epoch": 4.897856446898343, + "grad_norm": 3.399836778640747, + "learning_rate": 4.096980630296757e-07, + "loss": 0.3833, + "step": 10055 + }, + { + "epoch": 4.898343618057811, + "grad_norm": 3.2498316764831543, + "learning_rate": 4.0934534269494314e-07, + "loss": 0.3672, + "step": 10056 + }, + { + "epoch": 4.898830789217278, + "grad_norm": 3.144442319869995, + "learning_rate": 4.0899276071954127e-07, + "loss": 0.3922, + "step": 10057 + }, + { + "epoch": 4.899317960376746, + "grad_norm": 3.1341640949249268, + "learning_rate": 4.086403171268044e-07, + "loss": 0.345, + "step": 10058 + }, + { + "epoch": 4.899805131536213, + "grad_norm": 3.3419554233551025, + "learning_rate": 4.082880119400559e-07, + "loss": 0.4401, + "step": 10059 + }, + { + "epoch": 4.90029230269568, + "grad_norm": 3.6894397735595703, + "learning_rate": 4.079358451826121e-07, + "loss": 0.435, + "step": 10060 + }, + { + "epoch": 4.900779473855148, + "grad_norm": 3.3188064098358154, + "learning_rate": 4.075838168777793e-07, + "loss": 0.3669, + "step": 10061 + }, + { + "epoch": 4.901266645014616, + "grad_norm": 3.423459768295288, + "learning_rate": 4.0723192704885504e-07, + "loss": 0.3772, + "step": 10062 + }, + { + "epoch": 4.901753816174082, + "grad_norm": 3.294912576675415, + "learning_rate": 4.068801757191265e-07, + "loss": 0.3638, + "step": 10063 + }, + { + "epoch": 4.90224098733355, + "grad_norm": 3.2064309120178223, + "learning_rate": 4.065285629118729e-07, + "loss": 0.3982, + "step": 10064 + }, + { + "epoch": 4.902728158493018, + "grad_norm": 3.35512375831604, + "learning_rate": 4.0617708865036436e-07, + "loss": 0.3568, + "step": 10065 + }, + { + "epoch": 4.903215329652484, + "grad_norm": 3.6729183197021484, + "learning_rate": 4.058257529578613e-07, + "loss": 0.3861, + "step": 10066 + }, + { + "epoch": 4.903702500811952, + "grad_norm": 3.2631373405456543, + "learning_rate": 4.054745558576145e-07, + "loss": 0.3083, + "step": 10067 + }, + { + "epoch": 4.9041896719714195, + "grad_norm": 3.2031829357147217, + "learning_rate": 4.051234973728668e-07, + "loss": 0.3908, + "step": 10068 + }, + { + "epoch": 4.904676843130886, + "grad_norm": 3.0298995971679688, + "learning_rate": 4.0477257752685076e-07, + "loss": 0.3789, + "step": 10069 + }, + { + "epoch": 4.905164014290354, + "grad_norm": 3.110804557800293, + "learning_rate": 4.0442179634279097e-07, + "loss": 0.3398, + "step": 10070 + }, + { + "epoch": 4.9056511854498215, + "grad_norm": 6.145146369934082, + "learning_rate": 4.04071153843901e-07, + "loss": 0.4249, + "step": 10071 + }, + { + "epoch": 4.906138356609289, + "grad_norm": 3.381484031677246, + "learning_rate": 4.037206500533869e-07, + "loss": 0.3947, + "step": 10072 + }, + { + "epoch": 4.906625527768756, + "grad_norm": 3.016502857208252, + "learning_rate": 4.0337028499444486e-07, + "loss": 0.4242, + "step": 10073 + }, + { + "epoch": 4.907112698928223, + "grad_norm": 3.2299227714538574, + "learning_rate": 4.03020058690263e-07, + "loss": 0.4069, + "step": 10074 + }, + { + "epoch": 4.907599870087691, + "grad_norm": 3.138312339782715, + "learning_rate": 4.026699711640175e-07, + "loss": 0.3675, + "step": 10075 + }, + { + "epoch": 4.908087041247159, + "grad_norm": 2.8356878757476807, + "learning_rate": 4.0232002243887873e-07, + "loss": 0.399, + "step": 10076 + }, + { + "epoch": 4.908574212406625, + "grad_norm": 3.5497281551361084, + "learning_rate": 4.0197021253800486e-07, + "loss": 0.3842, + "step": 10077 + }, + { + "epoch": 4.909061383566093, + "grad_norm": 3.088012218475342, + "learning_rate": 4.0162054148454697e-07, + "loss": 0.3665, + "step": 10078 + }, + { + "epoch": 4.909548554725561, + "grad_norm": 3.2019505500793457, + "learning_rate": 4.0127100930164696e-07, + "loss": 0.4473, + "step": 10079 + }, + { + "epoch": 4.910035725885027, + "grad_norm": 3.547269344329834, + "learning_rate": 4.0092161601243545e-07, + "loss": 0.3708, + "step": 10080 + }, + { + "epoch": 4.910522897044495, + "grad_norm": 3.1232175827026367, + "learning_rate": 4.0057236164003586e-07, + "loss": 0.3828, + "step": 10081 + }, + { + "epoch": 4.911010068203963, + "grad_norm": 3.3397104740142822, + "learning_rate": 4.002232462075628e-07, + "loss": 0.3323, + "step": 10082 + }, + { + "epoch": 4.911497239363429, + "grad_norm": 3.5396840572357178, + "learning_rate": 3.998742697381189e-07, + "loss": 0.4013, + "step": 10083 + }, + { + "epoch": 4.911984410522897, + "grad_norm": 3.3976917266845703, + "learning_rate": 3.9952543225480035e-07, + "loss": 0.3837, + "step": 10084 + }, + { + "epoch": 4.9124715816823645, + "grad_norm": 3.3385610580444336, + "learning_rate": 3.9917673378069286e-07, + "loss": 0.3953, + "step": 10085 + }, + { + "epoch": 4.912958752841832, + "grad_norm": 3.025542736053467, + "learning_rate": 3.9882817433887446e-07, + "loss": 0.4015, + "step": 10086 + }, + { + "epoch": 4.913445924001299, + "grad_norm": 3.33964204788208, + "learning_rate": 3.98479753952411e-07, + "loss": 0.3502, + "step": 10087 + }, + { + "epoch": 4.9139330951607665, + "grad_norm": 3.285836696624756, + "learning_rate": 3.9813147264436165e-07, + "loss": 0.3626, + "step": 10088 + }, + { + "epoch": 4.914420266320234, + "grad_norm": 3.320009708404541, + "learning_rate": 3.977833304377765e-07, + "loss": 0.3368, + "step": 10089 + }, + { + "epoch": 4.914907437479701, + "grad_norm": 3.073678970336914, + "learning_rate": 3.974353273556941e-07, + "loss": 0.3839, + "step": 10090 + }, + { + "epoch": 4.915394608639168, + "grad_norm": 3.379608154296875, + "learning_rate": 3.970874634211461e-07, + "loss": 0.373, + "step": 10091 + }, + { + "epoch": 4.915881779798636, + "grad_norm": 3.699065923690796, + "learning_rate": 3.9673973865715367e-07, + "loss": 0.3653, + "step": 10092 + }, + { + "epoch": 4.916368950958104, + "grad_norm": 3.4549288749694824, + "learning_rate": 3.9639215308673013e-07, + "loss": 0.3715, + "step": 10093 + }, + { + "epoch": 4.91685612211757, + "grad_norm": 3.1288483142852783, + "learning_rate": 3.9604470673287764e-07, + "loss": 0.3829, + "step": 10094 + }, + { + "epoch": 4.917343293277038, + "grad_norm": 3.327178478240967, + "learning_rate": 3.9569739961859035e-07, + "loss": 0.3947, + "step": 10095 + }, + { + "epoch": 4.917830464436506, + "grad_norm": 3.1882853507995605, + "learning_rate": 3.953502317668531e-07, + "loss": 0.3587, + "step": 10096 + }, + { + "epoch": 4.918317635595972, + "grad_norm": 3.7040491104125977, + "learning_rate": 3.9500320320064236e-07, + "loss": 0.3937, + "step": 10097 + }, + { + "epoch": 4.91880480675544, + "grad_norm": 2.9816534519195557, + "learning_rate": 3.94656313942923e-07, + "loss": 0.4049, + "step": 10098 + }, + { + "epoch": 4.919291977914908, + "grad_norm": 3.515799045562744, + "learning_rate": 3.9430956401665277e-07, + "loss": 0.3973, + "step": 10099 + }, + { + "epoch": 4.919779149074374, + "grad_norm": 3.300346851348877, + "learning_rate": 3.939629534447792e-07, + "loss": 0.3864, + "step": 10100 + }, + { + "epoch": 4.920266320233842, + "grad_norm": 3.3697714805603027, + "learning_rate": 3.936164822502422e-07, + "loss": 0.3988, + "step": 10101 + }, + { + "epoch": 4.9207534913933095, + "grad_norm": 3.7369027137756348, + "learning_rate": 3.9327015045596936e-07, + "loss": 0.4202, + "step": 10102 + }, + { + "epoch": 4.921240662552777, + "grad_norm": 4.399448871612549, + "learning_rate": 3.9292395808488203e-07, + "loss": 0.37, + "step": 10103 + }, + { + "epoch": 4.921727833712244, + "grad_norm": 3.2152163982391357, + "learning_rate": 3.9257790515989103e-07, + "loss": 0.3676, + "step": 10104 + }, + { + "epoch": 4.9222150048717115, + "grad_norm": 3.511927604675293, + "learning_rate": 3.922319917038983e-07, + "loss": 0.429, + "step": 10105 + }, + { + "epoch": 4.922702176031179, + "grad_norm": 3.7070257663726807, + "learning_rate": 3.918862177397956e-07, + "loss": 0.3282, + "step": 10106 + }, + { + "epoch": 4.923189347190647, + "grad_norm": 3.371443510055542, + "learning_rate": 3.9154058329046673e-07, + "loss": 0.3859, + "step": 10107 + }, + { + "epoch": 4.923676518350113, + "grad_norm": 3.466451644897461, + "learning_rate": 3.911950883787857e-07, + "loss": 0.3865, + "step": 10108 + }, + { + "epoch": 4.924163689509581, + "grad_norm": 3.312446117401123, + "learning_rate": 3.9084973302761816e-07, + "loss": 0.3958, + "step": 10109 + }, + { + "epoch": 4.924650860669049, + "grad_norm": 3.4719221591949463, + "learning_rate": 3.905045172598179e-07, + "loss": 0.45, + "step": 10110 + }, + { + "epoch": 4.925138031828515, + "grad_norm": 3.071488618850708, + "learning_rate": 3.901594410982326e-07, + "loss": 0.3904, + "step": 10111 + }, + { + "epoch": 4.925625202987983, + "grad_norm": 3.061051845550537, + "learning_rate": 3.8981450456569895e-07, + "loss": 0.3475, + "step": 10112 + }, + { + "epoch": 4.926112374147451, + "grad_norm": 3.069196939468384, + "learning_rate": 3.8946970768504536e-07, + "loss": 0.3208, + "step": 10113 + }, + { + "epoch": 4.926599545306917, + "grad_norm": 3.1969192028045654, + "learning_rate": 3.8912505047908944e-07, + "loss": 0.3908, + "step": 10114 + }, + { + "epoch": 4.927086716466385, + "grad_norm": 3.5708889961242676, + "learning_rate": 3.8878053297064093e-07, + "loss": 0.3638, + "step": 10115 + }, + { + "epoch": 4.927573887625853, + "grad_norm": 3.4882113933563232, + "learning_rate": 3.8843615518250053e-07, + "loss": 0.364, + "step": 10116 + }, + { + "epoch": 4.92806105878532, + "grad_norm": 3.339003324508667, + "learning_rate": 3.8809191713745894e-07, + "loss": 0.3287, + "step": 10117 + }, + { + "epoch": 4.928548229944787, + "grad_norm": 2.9319546222686768, + "learning_rate": 3.8774781885829726e-07, + "loss": 0.3929, + "step": 10118 + }, + { + "epoch": 4.9290354011042545, + "grad_norm": 3.452974796295166, + "learning_rate": 3.8740386036778823e-07, + "loss": 0.3935, + "step": 10119 + }, + { + "epoch": 4.929522572263722, + "grad_norm": 3.2983453273773193, + "learning_rate": 3.870600416886955e-07, + "loss": 0.3847, + "step": 10120 + }, + { + "epoch": 4.93000974342319, + "grad_norm": 3.3904032707214355, + "learning_rate": 3.86716362843772e-07, + "loss": 0.3396, + "step": 10121 + }, + { + "epoch": 4.9304969145826565, + "grad_norm": 3.011613607406616, + "learning_rate": 3.8637282385576264e-07, + "loss": 0.4486, + "step": 10122 + }, + { + "epoch": 4.930984085742124, + "grad_norm": 3.6343305110931396, + "learning_rate": 3.860294247474039e-07, + "loss": 0.4373, + "step": 10123 + }, + { + "epoch": 4.931471256901592, + "grad_norm": 3.2411601543426514, + "learning_rate": 3.8568616554142014e-07, + "loss": 0.4085, + "step": 10124 + }, + { + "epoch": 4.931958428061058, + "grad_norm": 3.4203908443450928, + "learning_rate": 3.8534304626052957e-07, + "loss": 0.3701, + "step": 10125 + }, + { + "epoch": 4.932445599220526, + "grad_norm": 3.3577535152435303, + "learning_rate": 3.850000669274387e-07, + "loss": 0.4263, + "step": 10126 + }, + { + "epoch": 4.932932770379994, + "grad_norm": 3.450948476791382, + "learning_rate": 3.846572275648466e-07, + "loss": 0.3878, + "step": 10127 + }, + { + "epoch": 4.93341994153946, + "grad_norm": 3.2205567359924316, + "learning_rate": 3.843145281954422e-07, + "loss": 0.3787, + "step": 10128 + }, + { + "epoch": 4.933907112698928, + "grad_norm": 3.192321538925171, + "learning_rate": 3.839719688419058e-07, + "loss": 0.3777, + "step": 10129 + }, + { + "epoch": 4.934394283858396, + "grad_norm": 3.293138027191162, + "learning_rate": 3.8362954952690686e-07, + "loss": 0.4076, + "step": 10130 + }, + { + "epoch": 4.934881455017863, + "grad_norm": 3.501756429672241, + "learning_rate": 3.832872702731072e-07, + "loss": 0.3952, + "step": 10131 + }, + { + "epoch": 4.93536862617733, + "grad_norm": 3.1152713298797607, + "learning_rate": 3.82945131103159e-07, + "loss": 0.3655, + "step": 10132 + }, + { + "epoch": 4.935855797336798, + "grad_norm": 3.5720391273498535, + "learning_rate": 3.8260313203970555e-07, + "loss": 0.3915, + "step": 10133 + }, + { + "epoch": 4.936342968496265, + "grad_norm": 3.2194433212280273, + "learning_rate": 3.8226127310537905e-07, + "loss": 0.3638, + "step": 10134 + }, + { + "epoch": 4.936830139655733, + "grad_norm": 4.064489841461182, + "learning_rate": 3.819195543228041e-07, + "loss": 0.3948, + "step": 10135 + }, + { + "epoch": 4.9373173108151995, + "grad_norm": 3.184390068054199, + "learning_rate": 3.81577975714596e-07, + "loss": 0.3441, + "step": 10136 + }, + { + "epoch": 4.937804481974667, + "grad_norm": 3.202822208404541, + "learning_rate": 3.8123653730336054e-07, + "loss": 0.378, + "step": 10137 + }, + { + "epoch": 4.938291653134135, + "grad_norm": 3.6628880500793457, + "learning_rate": 3.8089523911169354e-07, + "loss": 0.3806, + "step": 10138 + }, + { + "epoch": 4.9387788242936015, + "grad_norm": 3.7189574241638184, + "learning_rate": 3.805540811621819e-07, + "loss": 0.3985, + "step": 10139 + }, + { + "epoch": 4.939265995453069, + "grad_norm": 3.450815200805664, + "learning_rate": 3.8021306347740453e-07, + "loss": 0.3943, + "step": 10140 + }, + { + "epoch": 4.939753166612537, + "grad_norm": 3.130599021911621, + "learning_rate": 3.798721860799287e-07, + "loss": 0.3659, + "step": 10141 + }, + { + "epoch": 4.940240337772003, + "grad_norm": 3.4316201210021973, + "learning_rate": 3.795314489923141e-07, + "loss": 0.3598, + "step": 10142 + }, + { + "epoch": 4.940727508931471, + "grad_norm": 3.7835209369659424, + "learning_rate": 3.7919085223711074e-07, + "loss": 0.3784, + "step": 10143 + }, + { + "epoch": 4.941214680090939, + "grad_norm": 3.4346485137939453, + "learning_rate": 3.7885039583686e-07, + "loss": 0.3911, + "step": 10144 + }, + { + "epoch": 4.941701851250406, + "grad_norm": 2.8888111114501953, + "learning_rate": 3.78510079814092e-07, + "loss": 0.3875, + "step": 10145 + }, + { + "epoch": 4.942189022409873, + "grad_norm": 3.5812110900878906, + "learning_rate": 3.781699041913292e-07, + "loss": 0.4412, + "step": 10146 + }, + { + "epoch": 4.942676193569341, + "grad_norm": 3.2378015518188477, + "learning_rate": 3.778298689910845e-07, + "loss": 0.3525, + "step": 10147 + }, + { + "epoch": 4.943163364728808, + "grad_norm": 3.440610885620117, + "learning_rate": 3.7748997423586203e-07, + "loss": 0.4201, + "step": 10148 + }, + { + "epoch": 4.943650535888276, + "grad_norm": 3.3810484409332275, + "learning_rate": 3.77150219948155e-07, + "loss": 0.375, + "step": 10149 + }, + { + "epoch": 4.944137707047743, + "grad_norm": 3.03560733795166, + "learning_rate": 3.768106061504487e-07, + "loss": 0.3817, + "step": 10150 + }, + { + "epoch": 4.94462487820721, + "grad_norm": 2.89755916595459, + "learning_rate": 3.7647113286521876e-07, + "loss": 0.3296, + "step": 10151 + }, + { + "epoch": 4.945112049366678, + "grad_norm": 3.729677200317383, + "learning_rate": 3.7613180011493224e-07, + "loss": 0.3555, + "step": 10152 + }, + { + "epoch": 4.9455992205261445, + "grad_norm": 3.6285502910614014, + "learning_rate": 3.757926079220445e-07, + "loss": 0.3686, + "step": 10153 + }, + { + "epoch": 4.946086391685612, + "grad_norm": 4.057083606719971, + "learning_rate": 3.754535563090042e-07, + "loss": 0.3474, + "step": 10154 + }, + { + "epoch": 4.94657356284508, + "grad_norm": 3.6389408111572266, + "learning_rate": 3.7511464529824987e-07, + "loss": 0.3403, + "step": 10155 + }, + { + "epoch": 4.9470607340045465, + "grad_norm": 3.0146636962890625, + "learning_rate": 3.74775874912211e-07, + "loss": 0.4007, + "step": 10156 + }, + { + "epoch": 4.947547905164014, + "grad_norm": 2.9553234577178955, + "learning_rate": 3.7443724517330635e-07, + "loss": 0.3967, + "step": 10157 + }, + { + "epoch": 4.948035076323482, + "grad_norm": 3.2601888179779053, + "learning_rate": 3.7409875610394665e-07, + "loss": 0.3486, + "step": 10158 + }, + { + "epoch": 4.948522247482949, + "grad_norm": 3.1675124168395996, + "learning_rate": 3.737604077265336e-07, + "loss": 0.3066, + "step": 10159 + }, + { + "epoch": 4.949009418642416, + "grad_norm": 3.4658257961273193, + "learning_rate": 3.734222000634591e-07, + "loss": 0.3897, + "step": 10160 + }, + { + "epoch": 4.949496589801884, + "grad_norm": 2.8657548427581787, + "learning_rate": 3.730841331371049e-07, + "loss": 0.4013, + "step": 10161 + }, + { + "epoch": 4.949983760961351, + "grad_norm": 3.2318718433380127, + "learning_rate": 3.727462069698445e-07, + "loss": 0.3392, + "step": 10162 + }, + { + "epoch": 4.950470932120819, + "grad_norm": 3.1690638065338135, + "learning_rate": 3.724084215840429e-07, + "loss": 0.3789, + "step": 10163 + }, + { + "epoch": 4.950958103280286, + "grad_norm": 3.2202978134155273, + "learning_rate": 3.720707770020532e-07, + "loss": 0.358, + "step": 10164 + }, + { + "epoch": 4.951445274439753, + "grad_norm": 3.556974411010742, + "learning_rate": 3.7173327324622117e-07, + "loss": 0.4478, + "step": 10165 + }, + { + "epoch": 4.951932445599221, + "grad_norm": 3.317241907119751, + "learning_rate": 3.713959103388837e-07, + "loss": 0.4243, + "step": 10166 + }, + { + "epoch": 4.952419616758688, + "grad_norm": 3.6211609840393066, + "learning_rate": 3.7105868830236593e-07, + "loss": 0.3475, + "step": 10167 + }, + { + "epoch": 4.952906787918155, + "grad_norm": 3.374544858932495, + "learning_rate": 3.7072160715898584e-07, + "loss": 0.4456, + "step": 10168 + }, + { + "epoch": 4.953393959077623, + "grad_norm": 3.308798313140869, + "learning_rate": 3.7038466693105237e-07, + "loss": 0.3717, + "step": 10169 + }, + { + "epoch": 4.9538811302370895, + "grad_norm": 3.121805191040039, + "learning_rate": 3.7004786764086253e-07, + "loss": 0.3989, + "step": 10170 + }, + { + "epoch": 4.954368301396557, + "grad_norm": 3.777115821838379, + "learning_rate": 3.6971120931070665e-07, + "loss": 0.3986, + "step": 10171 + }, + { + "epoch": 4.954855472556025, + "grad_norm": 3.2363784313201904, + "learning_rate": 3.693746919628652e-07, + "loss": 0.3579, + "step": 10172 + }, + { + "epoch": 4.955342643715492, + "grad_norm": 3.1721014976501465, + "learning_rate": 3.690383156196073e-07, + "loss": 0.3664, + "step": 10173 + }, + { + "epoch": 4.955829814874959, + "grad_norm": 3.0866177082061768, + "learning_rate": 3.6870208030319543e-07, + "loss": 0.4034, + "step": 10174 + }, + { + "epoch": 4.956316986034427, + "grad_norm": 3.59139347076416, + "learning_rate": 3.6836598603588154e-07, + "loss": 0.3927, + "step": 10175 + }, + { + "epoch": 4.956804157193894, + "grad_norm": 3.91290545463562, + "learning_rate": 3.680300328399086e-07, + "loss": 0.3484, + "step": 10176 + }, + { + "epoch": 4.957291328353362, + "grad_norm": 3.29648756980896, + "learning_rate": 3.676942207375092e-07, + "loss": 0.4022, + "step": 10177 + }, + { + "epoch": 4.957778499512829, + "grad_norm": 3.368079900741577, + "learning_rate": 3.6735854975090747e-07, + "loss": 0.3623, + "step": 10178 + }, + { + "epoch": 4.958265670672296, + "grad_norm": 3.2548928260803223, + "learning_rate": 3.6702301990231837e-07, + "loss": 0.338, + "step": 10179 + }, + { + "epoch": 4.958752841831764, + "grad_norm": 3.3847038745880127, + "learning_rate": 3.666876312139478e-07, + "loss": 0.3463, + "step": 10180 + }, + { + "epoch": 4.959240012991231, + "grad_norm": 3.384859800338745, + "learning_rate": 3.6635238370799075e-07, + "loss": 0.3684, + "step": 10181 + }, + { + "epoch": 4.959727184150698, + "grad_norm": 3.2735137939453125, + "learning_rate": 3.6601727740663396e-07, + "loss": 0.3454, + "step": 10182 + }, + { + "epoch": 4.960214355310166, + "grad_norm": 3.2128124237060547, + "learning_rate": 3.656823123320552e-07, + "loss": 0.3255, + "step": 10183 + }, + { + "epoch": 4.960701526469633, + "grad_norm": 3.0897774696350098, + "learning_rate": 3.6534748850642286e-07, + "loss": 0.3586, + "step": 10184 + }, + { + "epoch": 4.9611886976291, + "grad_norm": 3.2088069915771484, + "learning_rate": 3.6501280595189427e-07, + "loss": 0.3679, + "step": 10185 + }, + { + "epoch": 4.961675868788568, + "grad_norm": 3.026634931564331, + "learning_rate": 3.646782646906194e-07, + "loss": 0.3286, + "step": 10186 + }, + { + "epoch": 4.962163039948035, + "grad_norm": 3.3800907135009766, + "learning_rate": 3.643438647447381e-07, + "loss": 0.3644, + "step": 10187 + }, + { + "epoch": 4.962650211107502, + "grad_norm": 2.8461251258850098, + "learning_rate": 3.6400960613638156e-07, + "loss": 0.3597, + "step": 10188 + }, + { + "epoch": 4.96313738226697, + "grad_norm": 3.0107531547546387, + "learning_rate": 3.6367548888767007e-07, + "loss": 0.345, + "step": 10189 + }, + { + "epoch": 4.963624553426437, + "grad_norm": 3.264554262161255, + "learning_rate": 3.6334151302071547e-07, + "loss": 0.3694, + "step": 10190 + }, + { + "epoch": 4.964111724585905, + "grad_norm": 3.3900065422058105, + "learning_rate": 3.630076785576211e-07, + "loss": 0.3865, + "step": 10191 + }, + { + "epoch": 4.964598895745372, + "grad_norm": 3.0797462463378906, + "learning_rate": 3.6267398552047915e-07, + "loss": 0.3931, + "step": 10192 + }, + { + "epoch": 4.965086066904839, + "grad_norm": 3.4856367111206055, + "learning_rate": 3.623404339313738e-07, + "loss": 0.3585, + "step": 10193 + }, + { + "epoch": 4.965573238064307, + "grad_norm": 3.4463913440704346, + "learning_rate": 3.620070238123796e-07, + "loss": 0.3898, + "step": 10194 + }, + { + "epoch": 4.966060409223774, + "grad_norm": 3.3054966926574707, + "learning_rate": 3.6167375518556173e-07, + "loss": 0.3946, + "step": 10195 + }, + { + "epoch": 4.966547580383241, + "grad_norm": 3.128445863723755, + "learning_rate": 3.613406280729753e-07, + "loss": 0.3848, + "step": 10196 + }, + { + "epoch": 4.967034751542709, + "grad_norm": 3.1904215812683105, + "learning_rate": 3.6100764249666687e-07, + "loss": 0.3373, + "step": 10197 + }, + { + "epoch": 4.967521922702176, + "grad_norm": 2.731126546859741, + "learning_rate": 3.6067479847867346e-07, + "loss": 0.3837, + "step": 10198 + }, + { + "epoch": 4.968009093861643, + "grad_norm": 3.1139910221099854, + "learning_rate": 3.603420960410234e-07, + "loss": 0.3737, + "step": 10199 + }, + { + "epoch": 4.968496265021111, + "grad_norm": 3.2097504138946533, + "learning_rate": 3.600095352057334e-07, + "loss": 0.397, + "step": 10200 + }, + { + "epoch": 4.9689834361805785, + "grad_norm": 3.081949472427368, + "learning_rate": 3.5967711599481316e-07, + "loss": 0.3337, + "step": 10201 + }, + { + "epoch": 4.969470607340045, + "grad_norm": 3.317779302597046, + "learning_rate": 3.5934483843026194e-07, + "loss": 0.3599, + "step": 10202 + }, + { + "epoch": 4.969957778499513, + "grad_norm": 3.0074427127838135, + "learning_rate": 3.590127025340709e-07, + "loss": 0.3797, + "step": 10203 + }, + { + "epoch": 4.97044494965898, + "grad_norm": 3.607574939727783, + "learning_rate": 3.58680708328219e-07, + "loss": 0.3878, + "step": 10204 + }, + { + "epoch": 4.970932120818448, + "grad_norm": 3.877084493637085, + "learning_rate": 3.5834885583467845e-07, + "loss": 0.372, + "step": 10205 + }, + { + "epoch": 4.971419291977915, + "grad_norm": 3.344362497329712, + "learning_rate": 3.5801714507541107e-07, + "loss": 0.4147, + "step": 10206 + }, + { + "epoch": 4.971906463137382, + "grad_norm": 3.5462379455566406, + "learning_rate": 3.5768557607237046e-07, + "loss": 0.3753, + "step": 10207 + }, + { + "epoch": 4.97239363429685, + "grad_norm": 3.6123533248901367, + "learning_rate": 3.5735414884749825e-07, + "loss": 0.3887, + "step": 10208 + }, + { + "epoch": 4.972880805456317, + "grad_norm": 3.2552409172058105, + "learning_rate": 3.570228634227288e-07, + "loss": 0.403, + "step": 10209 + }, + { + "epoch": 4.973367976615784, + "grad_norm": 3.5421323776245117, + "learning_rate": 3.5669171981998733e-07, + "loss": 0.3545, + "step": 10210 + }, + { + "epoch": 4.973855147775252, + "grad_norm": 3.8252792358398438, + "learning_rate": 3.5636071806118776e-07, + "loss": 0.4448, + "step": 10211 + }, + { + "epoch": 4.974342318934719, + "grad_norm": 3.3224799633026123, + "learning_rate": 3.560298581682367e-07, + "loss": 0.3573, + "step": 10212 + }, + { + "epoch": 4.974829490094186, + "grad_norm": 3.346096992492676, + "learning_rate": 3.5569914016302943e-07, + "loss": 0.4192, + "step": 10213 + }, + { + "epoch": 4.975316661253654, + "grad_norm": 3.3196520805358887, + "learning_rate": 3.553685640674534e-07, + "loss": 0.3658, + "step": 10214 + }, + { + "epoch": 4.9758038324131215, + "grad_norm": 3.0232038497924805, + "learning_rate": 3.550381299033867e-07, + "loss": 0.3879, + "step": 10215 + }, + { + "epoch": 4.976291003572588, + "grad_norm": 3.756385564804077, + "learning_rate": 3.547078376926963e-07, + "loss": 0.3414, + "step": 10216 + }, + { + "epoch": 4.976778174732056, + "grad_norm": 3.2052717208862305, + "learning_rate": 3.543776874572413e-07, + "loss": 0.4072, + "step": 10217 + }, + { + "epoch": 4.9772653458915235, + "grad_norm": 3.4817686080932617, + "learning_rate": 3.5404767921887127e-07, + "loss": 0.3249, + "step": 10218 + }, + { + "epoch": 4.97775251705099, + "grad_norm": 3.3325672149658203, + "learning_rate": 3.537178129994265e-07, + "loss": 0.4004, + "step": 10219 + }, + { + "epoch": 4.978239688210458, + "grad_norm": 3.6894850730895996, + "learning_rate": 3.533880888207364e-07, + "loss": 0.3928, + "step": 10220 + }, + { + "epoch": 4.978726859369925, + "grad_norm": 3.518065929412842, + "learning_rate": 3.530585067046227e-07, + "loss": 0.3697, + "step": 10221 + }, + { + "epoch": 4.979214030529393, + "grad_norm": 3.8638625144958496, + "learning_rate": 3.5272906667289717e-07, + "loss": 0.4556, + "step": 10222 + }, + { + "epoch": 4.97970120168886, + "grad_norm": 3.481776237487793, + "learning_rate": 3.5239976874736264e-07, + "loss": 0.3895, + "step": 10223 + }, + { + "epoch": 4.980188372848327, + "grad_norm": 3.041919469833374, + "learning_rate": 3.520706129498108e-07, + "loss": 0.3886, + "step": 10224 + }, + { + "epoch": 4.980675544007795, + "grad_norm": 3.260793447494507, + "learning_rate": 3.5174159930202593e-07, + "loss": 0.3496, + "step": 10225 + }, + { + "epoch": 4.981162715167262, + "grad_norm": 3.3561806678771973, + "learning_rate": 3.514127278257817e-07, + "loss": 0.4077, + "step": 10226 + }, + { + "epoch": 4.981649886326729, + "grad_norm": 3.3647918701171875, + "learning_rate": 3.510839985428438e-07, + "loss": 0.3924, + "step": 10227 + }, + { + "epoch": 4.982137057486197, + "grad_norm": 3.5711164474487305, + "learning_rate": 3.507554114749662e-07, + "loss": 0.3652, + "step": 10228 + }, + { + "epoch": 4.982624228645664, + "grad_norm": 3.3314208984375, + "learning_rate": 3.504269666438953e-07, + "loss": 0.3901, + "step": 10229 + }, + { + "epoch": 4.983111399805131, + "grad_norm": 3.044003963470459, + "learning_rate": 3.5009866407136773e-07, + "loss": 0.3353, + "step": 10230 + }, + { + "epoch": 4.983598570964599, + "grad_norm": 3.1698427200317383, + "learning_rate": 3.4977050377911076e-07, + "loss": 0.4013, + "step": 10231 + }, + { + "epoch": 4.9840857421240665, + "grad_norm": 3.0124449729919434, + "learning_rate": 3.4944248578884094e-07, + "loss": 0.3612, + "step": 10232 + }, + { + "epoch": 4.984572913283533, + "grad_norm": 3.1462488174438477, + "learning_rate": 3.4911461012226754e-07, + "loss": 0.3982, + "step": 10233 + }, + { + "epoch": 4.985060084443001, + "grad_norm": 3.7311365604400635, + "learning_rate": 3.4878687680108854e-07, + "loss": 0.3861, + "step": 10234 + }, + { + "epoch": 4.9855472556024685, + "grad_norm": 2.8472094535827637, + "learning_rate": 3.4845928584699455e-07, + "loss": 0.3749, + "step": 10235 + }, + { + "epoch": 4.986034426761936, + "grad_norm": 3.1922738552093506, + "learning_rate": 3.481318372816639e-07, + "loss": 0.3804, + "step": 10236 + }, + { + "epoch": 4.986521597921403, + "grad_norm": 3.1603786945343018, + "learning_rate": 3.4780453112676807e-07, + "loss": 0.3953, + "step": 10237 + }, + { + "epoch": 4.98700876908087, + "grad_norm": 3.2363851070404053, + "learning_rate": 3.4747736740396776e-07, + "loss": 0.3942, + "step": 10238 + }, + { + "epoch": 4.987495940240338, + "grad_norm": 3.5516786575317383, + "learning_rate": 3.471503461349157e-07, + "loss": 0.3718, + "step": 10239 + }, + { + "epoch": 4.987983111399805, + "grad_norm": 3.5967960357666016, + "learning_rate": 3.468234673412524e-07, + "loss": 0.4042, + "step": 10240 + }, + { + "epoch": 4.988470282559272, + "grad_norm": 3.2771878242492676, + "learning_rate": 3.4649673104461157e-07, + "loss": 0.389, + "step": 10241 + }, + { + "epoch": 4.98895745371874, + "grad_norm": 3.6579177379608154, + "learning_rate": 3.461701372666168e-07, + "loss": 0.4021, + "step": 10242 + }, + { + "epoch": 4.989444624878207, + "grad_norm": 3.5953152179718018, + "learning_rate": 3.4584368602888193e-07, + "loss": 0.3829, + "step": 10243 + }, + { + "epoch": 4.989931796037674, + "grad_norm": 3.1883511543273926, + "learning_rate": 3.45517377353011e-07, + "loss": 0.4013, + "step": 10244 + }, + { + "epoch": 4.990418967197142, + "grad_norm": 3.508007526397705, + "learning_rate": 3.4519121126059957e-07, + "loss": 0.3678, + "step": 10245 + }, + { + "epoch": 4.99090613835661, + "grad_norm": 3.403113603591919, + "learning_rate": 3.448651877732334e-07, + "loss": 0.446, + "step": 10246 + }, + { + "epoch": 4.991393309516076, + "grad_norm": 3.9490294456481934, + "learning_rate": 3.4453930691248806e-07, + "loss": 0.38, + "step": 10247 + }, + { + "epoch": 4.991880480675544, + "grad_norm": 3.1328959465026855, + "learning_rate": 3.442135686999304e-07, + "loss": 0.4004, + "step": 10248 + }, + { + "epoch": 4.9923676518350115, + "grad_norm": 4.0235915184021, + "learning_rate": 3.438879731571182e-07, + "loss": 0.4167, + "step": 10249 + }, + { + "epoch": 4.992854822994479, + "grad_norm": 3.3743743896484375, + "learning_rate": 3.4356252030559977e-07, + "loss": 0.3393, + "step": 10250 + }, + { + "epoch": 4.993341994153946, + "grad_norm": 2.9085583686828613, + "learning_rate": 3.432372101669124e-07, + "loss": 0.3959, + "step": 10251 + }, + { + "epoch": 4.9938291653134135, + "grad_norm": 3.3012726306915283, + "learning_rate": 3.429120427625854e-07, + "loss": 0.3957, + "step": 10252 + }, + { + "epoch": 4.994316336472881, + "grad_norm": 3.2740681171417236, + "learning_rate": 3.425870181141394e-07, + "loss": 0.3602, + "step": 10253 + }, + { + "epoch": 4.994803507632348, + "grad_norm": 3.1725122928619385, + "learning_rate": 3.422621362430828e-07, + "loss": 0.3787, + "step": 10254 + }, + { + "epoch": 4.995290678791815, + "grad_norm": 3.109503984451294, + "learning_rate": 3.419373971709172e-07, + "loss": 0.3908, + "step": 10255 + }, + { + "epoch": 4.995777849951283, + "grad_norm": 3.3040120601654053, + "learning_rate": 3.416128009191344e-07, + "loss": 0.3779, + "step": 10256 + }, + { + "epoch": 4.99626502111075, + "grad_norm": 3.374885320663452, + "learning_rate": 3.4128834750921465e-07, + "loss": 0.4359, + "step": 10257 + }, + { + "epoch": 4.996752192270217, + "grad_norm": 3.6470556259155273, + "learning_rate": 3.409640369626313e-07, + "loss": 0.4469, + "step": 10258 + }, + { + "epoch": 4.997239363429685, + "grad_norm": 4.4094390869140625, + "learning_rate": 3.4063986930084727e-07, + "loss": 0.3133, + "step": 10259 + }, + { + "epoch": 4.997726534589153, + "grad_norm": 3.842092752456665, + "learning_rate": 3.40315844545315e-07, + "loss": 0.4142, + "step": 10260 + } + ], + "logging_steps": 1, + "max_steps": 12312, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.591376845983187e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}