{ "best_global_step": 3689, "best_metric": 62.08277152527448, "best_model_checkpoint": "whisper-medium-quantized-lora/checkpoints/checkpoint-3689", "epoch": 0.24995130048868902, "eval_steps": 3689, "global_step": 3689, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.775584182398726e-05, "grad_norm": 11.2294282913208, "learning_rate": 0.0, "loss": 3.7623, "step": 1 }, { "epoch": 0.00013551168364797452, "grad_norm": 27.403789520263672, "learning_rate": 5.333333333333333e-07, "loss": 3.7509, "step": 2 }, { "epoch": 0.0002032675254719618, "grad_norm": 29.29781150817871, "learning_rate": 1.0666666666666667e-06, "loss": 4.1779, "step": 3 }, { "epoch": 0.00027102336729594904, "grad_norm": 5.965636730194092, "learning_rate": 1.6000000000000001e-06, "loss": 3.0093, "step": 4 }, { "epoch": 0.0003387792091199363, "grad_norm": 7.786504745483398, "learning_rate": 2.1333333333333334e-06, "loss": 3.0575, "step": 5 }, { "epoch": 0.0004065350509439236, "grad_norm": 21.127445220947266, "learning_rate": 2.666666666666667e-06, "loss": 2.7851, "step": 6 }, { "epoch": 0.00047429089276791086, "grad_norm": 5.899219989776611, "learning_rate": 3.2000000000000003e-06, "loss": 2.6554, "step": 7 }, { "epoch": 0.0005420467345918981, "grad_norm": 29.54319953918457, "learning_rate": 3.7333333333333337e-06, "loss": 2.561, "step": 8 }, { "epoch": 0.0006098025764158854, "grad_norm": 16.01921844482422, "learning_rate": 4.266666666666667e-06, "loss": 3.1085, "step": 9 }, { "epoch": 0.0006775584182398726, "grad_norm": 6.624917030334473, "learning_rate": 4.800000000000001e-06, "loss": 2.8822, "step": 10 }, { "epoch": 0.0007453142600638598, "grad_norm": 8.62118148803711, "learning_rate": 5.333333333333334e-06, "loss": 3.8258, "step": 11 }, { "epoch": 0.0008130701018878472, "grad_norm": 6.586535453796387, "learning_rate": 5.866666666666667e-06, "loss": 3.0811, "step": 12 }, { "epoch": 0.0008808259437118344, "grad_norm": 22.17951011657715, "learning_rate": 6.4000000000000006e-06, "loss": 3.4985, "step": 13 }, { "epoch": 0.0009485817855358217, "grad_norm": 14.850955963134766, "learning_rate": 6.933333333333334e-06, "loss": 4.1996, "step": 14 }, { "epoch": 0.0010163376273598088, "grad_norm": 62.20494842529297, "learning_rate": 7.4666666666666675e-06, "loss": 3.4069, "step": 15 }, { "epoch": 0.0010840934691837961, "grad_norm": 6.534540176391602, "learning_rate": 8.000000000000001e-06, "loss": 2.9703, "step": 16 }, { "epoch": 0.0011518493110077835, "grad_norm": 8.012496948242188, "learning_rate": 8.533333333333334e-06, "loss": 2.8364, "step": 17 }, { "epoch": 0.0012196051528317708, "grad_norm": 9.2630615234375, "learning_rate": 9.066666666666667e-06, "loss": 2.004, "step": 18 }, { "epoch": 0.001287360994655758, "grad_norm": 3.8082871437072754, "learning_rate": 9.600000000000001e-06, "loss": 2.1368, "step": 19 }, { "epoch": 0.0013551168364797452, "grad_norm": 5.680924415588379, "learning_rate": 1.0133333333333333e-05, "loss": 2.0163, "step": 20 }, { "epoch": 0.0014228726783037326, "grad_norm": 34.47060012817383, "learning_rate": 1.0666666666666667e-05, "loss": 3.1532, "step": 21 }, { "epoch": 0.0014906285201277197, "grad_norm": 6.08674430847168, "learning_rate": 1.1200000000000001e-05, "loss": 2.9393, "step": 22 }, { "epoch": 0.001558384361951707, "grad_norm": 8.805831909179688, "learning_rate": 1.1733333333333333e-05, "loss": 2.7824, "step": 23 }, { "epoch": 0.0016261402037756943, "grad_norm": 10.128340721130371, "learning_rate": 1.2266666666666667e-05, "loss": 2.9302, "step": 24 }, { "epoch": 0.0016938960455996814, "grad_norm": 5.021735668182373, "learning_rate": 1.2800000000000001e-05, "loss": 2.8988, "step": 25 }, { "epoch": 0.0017616518874236688, "grad_norm": 27.6428165435791, "learning_rate": 1.3333333333333333e-05, "loss": 4.0013, "step": 26 }, { "epoch": 0.001829407729247656, "grad_norm": 5.653388500213623, "learning_rate": 1.3866666666666667e-05, "loss": 3.0088, "step": 27 }, { "epoch": 0.0018971635710716434, "grad_norm": 8.216667175292969, "learning_rate": 1.44e-05, "loss": 2.8235, "step": 28 }, { "epoch": 0.0019649194128956307, "grad_norm": 5.633044719696045, "learning_rate": 1.4933333333333335e-05, "loss": 2.5359, "step": 29 }, { "epoch": 0.0020326752547196176, "grad_norm": 6.593887805938721, "learning_rate": 1.546666666666667e-05, "loss": 2.307, "step": 30 }, { "epoch": 0.002100431096543605, "grad_norm": 16.731855392456055, "learning_rate": 1.6000000000000003e-05, "loss": 2.3399, "step": 31 }, { "epoch": 0.0021681869383675923, "grad_norm": 15.881052017211914, "learning_rate": 1.6533333333333333e-05, "loss": 2.6155, "step": 32 }, { "epoch": 0.0022359427801915796, "grad_norm": 14.086976051330566, "learning_rate": 1.7066666666666667e-05, "loss": 2.3971, "step": 33 }, { "epoch": 0.002303698622015567, "grad_norm": 36.32572937011719, "learning_rate": 1.76e-05, "loss": 3.217, "step": 34 }, { "epoch": 0.0023714544638395543, "grad_norm": 8.181658744812012, "learning_rate": 1.8133333333333335e-05, "loss": 2.0689, "step": 35 }, { "epoch": 0.0024392103056635416, "grad_norm": 5.850778579711914, "learning_rate": 1.866666666666667e-05, "loss": 3.0411, "step": 36 }, { "epoch": 0.0025069661474875285, "grad_norm": 4.756629943847656, "learning_rate": 1.9200000000000003e-05, "loss": 2.7895, "step": 37 }, { "epoch": 0.002574721989311516, "grad_norm": 4.575255870819092, "learning_rate": 1.9733333333333333e-05, "loss": 2.6805, "step": 38 }, { "epoch": 0.002642477831135503, "grad_norm": 4.238344669342041, "learning_rate": 2.0266666666666667e-05, "loss": 2.2225, "step": 39 }, { "epoch": 0.0027102336729594905, "grad_norm": 5.039498805999756, "learning_rate": 2.08e-05, "loss": 2.9887, "step": 40 }, { "epoch": 0.002777989514783478, "grad_norm": 3.856889247894287, "learning_rate": 2.1333333333333335e-05, "loss": 2.4299, "step": 41 }, { "epoch": 0.002845745356607465, "grad_norm": 3.5066351890563965, "learning_rate": 2.186666666666667e-05, "loss": 2.237, "step": 42 }, { "epoch": 0.0029135011984314525, "grad_norm": 4.3462677001953125, "learning_rate": 2.2400000000000002e-05, "loss": 2.5201, "step": 43 }, { "epoch": 0.0029812570402554394, "grad_norm": 4.263307094573975, "learning_rate": 2.2933333333333333e-05, "loss": 2.5842, "step": 44 }, { "epoch": 0.0030490128820794267, "grad_norm": 4.542484760284424, "learning_rate": 2.3466666666666667e-05, "loss": 2.3541, "step": 45 }, { "epoch": 0.003116768723903414, "grad_norm": 3.7715518474578857, "learning_rate": 2.4e-05, "loss": 2.206, "step": 46 }, { "epoch": 0.0031845245657274013, "grad_norm": 4.391230583190918, "learning_rate": 2.4533333333333334e-05, "loss": 2.4971, "step": 47 }, { "epoch": 0.0032522804075513887, "grad_norm": 4.592225551605225, "learning_rate": 2.5066666666666665e-05, "loss": 2.7705, "step": 48 }, { "epoch": 0.003320036249375376, "grad_norm": 4.211614608764648, "learning_rate": 2.5600000000000002e-05, "loss": 2.6184, "step": 49 }, { "epoch": 0.003387792091199363, "grad_norm": 3.9807486534118652, "learning_rate": 2.6133333333333333e-05, "loss": 2.3636, "step": 50 }, { "epoch": 0.00345554793302335, "grad_norm": 3.6588141918182373, "learning_rate": 2.6666666666666667e-05, "loss": 1.8357, "step": 51 }, { "epoch": 0.0035233037748473375, "grad_norm": 6.977685928344727, "learning_rate": 2.7200000000000004e-05, "loss": 2.9069, "step": 52 }, { "epoch": 0.003591059616671325, "grad_norm": 3.890204906463623, "learning_rate": 2.7733333333333334e-05, "loss": 2.2357, "step": 53 }, { "epoch": 0.003658815458495312, "grad_norm": 3.3455967903137207, "learning_rate": 2.8266666666666668e-05, "loss": 1.7488, "step": 54 }, { "epoch": 0.0037265713003192995, "grad_norm": 3.685267686843872, "learning_rate": 2.88e-05, "loss": 1.9956, "step": 55 }, { "epoch": 0.003794327142143287, "grad_norm": 4.967723846435547, "learning_rate": 2.9333333333333336e-05, "loss": 2.4638, "step": 56 }, { "epoch": 0.0038620829839672737, "grad_norm": 3.6209311485290527, "learning_rate": 2.986666666666667e-05, "loss": 1.8269, "step": 57 }, { "epoch": 0.0039298388257912615, "grad_norm": 4.622175216674805, "learning_rate": 3.04e-05, "loss": 2.6827, "step": 58 }, { "epoch": 0.003997594667615249, "grad_norm": 4.11090612411499, "learning_rate": 3.093333333333334e-05, "loss": 2.2083, "step": 59 }, { "epoch": 0.004065350509439235, "grad_norm": 3.5581538677215576, "learning_rate": 3.146666666666667e-05, "loss": 1.9052, "step": 60 }, { "epoch": 0.004133106351263223, "grad_norm": 3.498354196548462, "learning_rate": 3.2000000000000005e-05, "loss": 1.7929, "step": 61 }, { "epoch": 0.00420086219308721, "grad_norm": 5.344475746154785, "learning_rate": 3.253333333333333e-05, "loss": 2.0024, "step": 62 }, { "epoch": 0.004268618034911197, "grad_norm": 4.66567325592041, "learning_rate": 3.3066666666666666e-05, "loss": 2.7599, "step": 63 }, { "epoch": 0.004336373876735185, "grad_norm": 3.411456346511841, "learning_rate": 3.3600000000000004e-05, "loss": 1.9109, "step": 64 }, { "epoch": 0.004404129718559172, "grad_norm": 4.811199188232422, "learning_rate": 3.4133333333333334e-05, "loss": 2.5342, "step": 65 }, { "epoch": 0.004471885560383159, "grad_norm": 3.51120662689209, "learning_rate": 3.466666666666667e-05, "loss": 1.8567, "step": 66 }, { "epoch": 0.004539641402207147, "grad_norm": 5.19026517868042, "learning_rate": 3.52e-05, "loss": 2.4372, "step": 67 }, { "epoch": 0.004607397244031134, "grad_norm": 3.779278516769409, "learning_rate": 3.573333333333333e-05, "loss": 1.8131, "step": 68 }, { "epoch": 0.004675153085855121, "grad_norm": 2.883894443511963, "learning_rate": 3.626666666666667e-05, "loss": 1.4007, "step": 69 }, { "epoch": 0.0047429089276791086, "grad_norm": 4.038563251495361, "learning_rate": 3.68e-05, "loss": 1.7251, "step": 70 }, { "epoch": 0.004810664769503096, "grad_norm": 4.436848163604736, "learning_rate": 3.733333333333334e-05, "loss": 2.0543, "step": 71 }, { "epoch": 0.004878420611327083, "grad_norm": 3.919203996658325, "learning_rate": 3.786666666666667e-05, "loss": 1.8566, "step": 72 }, { "epoch": 0.0049461764531510705, "grad_norm": 3.197636604309082, "learning_rate": 3.8400000000000005e-05, "loss": 1.7718, "step": 73 }, { "epoch": 0.005013932294975057, "grad_norm": 4.205629348754883, "learning_rate": 3.8933333333333336e-05, "loss": 2.2516, "step": 74 }, { "epoch": 0.005081688136799044, "grad_norm": 4.737552642822266, "learning_rate": 3.9466666666666666e-05, "loss": 2.277, "step": 75 }, { "epoch": 0.005149443978623032, "grad_norm": 4.187109470367432, "learning_rate": 4e-05, "loss": 2.0053, "step": 76 }, { "epoch": 0.005217199820447019, "grad_norm": 4.559085369110107, "learning_rate": 4.0533333333333334e-05, "loss": 1.9662, "step": 77 }, { "epoch": 0.005284955662271006, "grad_norm": 4.184353351593018, "learning_rate": 4.106666666666667e-05, "loss": 2.2256, "step": 78 }, { "epoch": 0.005352711504094994, "grad_norm": 3.842942714691162, "learning_rate": 4.16e-05, "loss": 1.7252, "step": 79 }, { "epoch": 0.005420467345918981, "grad_norm": 4.1036763191223145, "learning_rate": 4.213333333333334e-05, "loss": 2.1751, "step": 80 }, { "epoch": 0.005488223187742968, "grad_norm": 2.9933395385742188, "learning_rate": 4.266666666666667e-05, "loss": 1.3973, "step": 81 }, { "epoch": 0.005555979029566956, "grad_norm": 4.0848565101623535, "learning_rate": 4.32e-05, "loss": 1.7059, "step": 82 }, { "epoch": 0.005623734871390943, "grad_norm": 3.6175732612609863, "learning_rate": 4.373333333333334e-05, "loss": 1.7883, "step": 83 }, { "epoch": 0.00569149071321493, "grad_norm": 5.00525426864624, "learning_rate": 4.426666666666667e-05, "loss": 2.4767, "step": 84 }, { "epoch": 0.005759246555038918, "grad_norm": 3.891401767730713, "learning_rate": 4.4800000000000005e-05, "loss": 1.8256, "step": 85 }, { "epoch": 0.005827002396862905, "grad_norm": 3.9124996662139893, "learning_rate": 4.5333333333333335e-05, "loss": 2.0405, "step": 86 }, { "epoch": 0.005894758238686891, "grad_norm": 3.7935447692871094, "learning_rate": 4.5866666666666666e-05, "loss": 1.9211, "step": 87 }, { "epoch": 0.005962514080510879, "grad_norm": 3.410778045654297, "learning_rate": 4.64e-05, "loss": 1.6252, "step": 88 }, { "epoch": 0.006030269922334866, "grad_norm": 3.507871389389038, "learning_rate": 4.6933333333333333e-05, "loss": 1.7026, "step": 89 }, { "epoch": 0.006098025764158853, "grad_norm": 3.449367046356201, "learning_rate": 4.746666666666667e-05, "loss": 1.4691, "step": 90 }, { "epoch": 0.006165781605982841, "grad_norm": 4.297393798828125, "learning_rate": 4.8e-05, "loss": 2.1289, "step": 91 }, { "epoch": 0.006233537447806828, "grad_norm": 3.728422164916992, "learning_rate": 4.853333333333334e-05, "loss": 1.6801, "step": 92 }, { "epoch": 0.006301293289630815, "grad_norm": 4.284979343414307, "learning_rate": 4.906666666666667e-05, "loss": 1.8669, "step": 93 }, { "epoch": 0.006369049131454803, "grad_norm": 2.7867817878723145, "learning_rate": 4.96e-05, "loss": 1.2686, "step": 94 }, { "epoch": 0.00643680497327879, "grad_norm": 4.663074016571045, "learning_rate": 5.013333333333333e-05, "loss": 1.5725, "step": 95 }, { "epoch": 0.006504560815102777, "grad_norm": 3.4489972591400146, "learning_rate": 5.0666666666666674e-05, "loss": 1.5573, "step": 96 }, { "epoch": 0.006572316656926765, "grad_norm": 3.902392864227295, "learning_rate": 5.1200000000000004e-05, "loss": 1.1236, "step": 97 }, { "epoch": 0.006640072498750752, "grad_norm": 3.595935821533203, "learning_rate": 5.1733333333333335e-05, "loss": 1.4469, "step": 98 }, { "epoch": 0.006707828340574739, "grad_norm": 3.480823040008545, "learning_rate": 5.2266666666666665e-05, "loss": 1.6294, "step": 99 }, { "epoch": 0.006775584182398726, "grad_norm": 5.669904708862305, "learning_rate": 5.28e-05, "loss": 2.3132, "step": 100 }, { "epoch": 0.006843340024222713, "grad_norm": 3.375321626663208, "learning_rate": 5.333333333333333e-05, "loss": 1.5394, "step": 101 }, { "epoch": 0.0069110958660467, "grad_norm": 4.202518463134766, "learning_rate": 5.3866666666666664e-05, "loss": 1.9353, "step": 102 }, { "epoch": 0.006978851707870688, "grad_norm": 5.418217182159424, "learning_rate": 5.440000000000001e-05, "loss": 1.8818, "step": 103 }, { "epoch": 0.007046607549694675, "grad_norm": 3.229679584503174, "learning_rate": 5.493333333333334e-05, "loss": 1.5784, "step": 104 }, { "epoch": 0.007114363391518662, "grad_norm": 5.178295612335205, "learning_rate": 5.546666666666667e-05, "loss": 2.0211, "step": 105 }, { "epoch": 0.00718211923334265, "grad_norm": 3.9106807708740234, "learning_rate": 5.6000000000000006e-05, "loss": 1.7349, "step": 106 }, { "epoch": 0.007249875075166637, "grad_norm": 694.2337646484375, "learning_rate": 5.6533333333333336e-05, "loss": 1.3922, "step": 107 }, { "epoch": 0.007317630916990624, "grad_norm": 3.4870429039001465, "learning_rate": 5.706666666666667e-05, "loss": 1.5259, "step": 108 }, { "epoch": 0.007385386758814612, "grad_norm": 3.41497802734375, "learning_rate": 5.76e-05, "loss": 1.262, "step": 109 }, { "epoch": 0.007453142600638599, "grad_norm": 4.137330532073975, "learning_rate": 5.813333333333334e-05, "loss": 1.6849, "step": 110 }, { "epoch": 0.007520898442462586, "grad_norm": 3.933605909347534, "learning_rate": 5.866666666666667e-05, "loss": 1.604, "step": 111 }, { "epoch": 0.007588654284286574, "grad_norm": 4.345171928405762, "learning_rate": 5.92e-05, "loss": 1.7677, "step": 112 }, { "epoch": 0.007656410126110561, "grad_norm": 4.059954643249512, "learning_rate": 5.973333333333334e-05, "loss": 1.5128, "step": 113 }, { "epoch": 0.0077241659679345475, "grad_norm": 4.287632465362549, "learning_rate": 6.026666666666667e-05, "loss": 1.7742, "step": 114 }, { "epoch": 0.007791921809758535, "grad_norm": 4.052312850952148, "learning_rate": 6.08e-05, "loss": 1.3687, "step": 115 }, { "epoch": 0.007859677651582523, "grad_norm": 3.244309425354004, "learning_rate": 6.133333333333334e-05, "loss": 1.3108, "step": 116 }, { "epoch": 0.00792743349340651, "grad_norm": 3.454943895339966, "learning_rate": 6.186666666666668e-05, "loss": 1.4112, "step": 117 }, { "epoch": 0.007995189335230498, "grad_norm": 5.388648509979248, "learning_rate": 6.24e-05, "loss": 2.002, "step": 118 }, { "epoch": 0.008062945177054484, "grad_norm": 4.050853252410889, "learning_rate": 6.293333333333334e-05, "loss": 1.454, "step": 119 }, { "epoch": 0.00813070101887847, "grad_norm": 4.374539375305176, "learning_rate": 6.346666666666667e-05, "loss": 1.5503, "step": 120 }, { "epoch": 0.008198456860702459, "grad_norm": 5.6923933029174805, "learning_rate": 6.400000000000001e-05, "loss": 1.9256, "step": 121 }, { "epoch": 0.008266212702526445, "grad_norm": 4.457003593444824, "learning_rate": 6.453333333333333e-05, "loss": 1.6966, "step": 122 }, { "epoch": 0.008333968544350433, "grad_norm": 5.176385879516602, "learning_rate": 6.506666666666666e-05, "loss": 1.7484, "step": 123 }, { "epoch": 0.00840172438617442, "grad_norm": 3.6228065490722656, "learning_rate": 6.560000000000001e-05, "loss": 1.1885, "step": 124 }, { "epoch": 0.008469480227998408, "grad_norm": 4.48412561416626, "learning_rate": 6.613333333333333e-05, "loss": 1.2613, "step": 125 }, { "epoch": 0.008537236069822395, "grad_norm": 4.5856194496154785, "learning_rate": 6.666666666666667e-05, "loss": 1.3579, "step": 126 }, { "epoch": 0.008604991911646383, "grad_norm": 4.485749244689941, "learning_rate": 6.720000000000001e-05, "loss": 1.2942, "step": 127 }, { "epoch": 0.00867274775347037, "grad_norm": 4.471734046936035, "learning_rate": 6.773333333333333e-05, "loss": 1.2102, "step": 128 }, { "epoch": 0.008740503595294357, "grad_norm": 3.6487972736358643, "learning_rate": 6.826666666666667e-05, "loss": 1.0166, "step": 129 }, { "epoch": 0.008808259437118344, "grad_norm": 4.2145304679870605, "learning_rate": 6.879999999999999e-05, "loss": 1.2544, "step": 130 }, { "epoch": 0.008876015278942332, "grad_norm": 3.7923190593719482, "learning_rate": 6.933333333333334e-05, "loss": 0.9969, "step": 131 }, { "epoch": 0.008943771120766318, "grad_norm": 5.3202080726623535, "learning_rate": 6.986666666666667e-05, "loss": 1.3898, "step": 132 }, { "epoch": 0.009011526962590307, "grad_norm": 3.911545515060425, "learning_rate": 7.04e-05, "loss": 1.0362, "step": 133 }, { "epoch": 0.009079282804414293, "grad_norm": 4.085530757904053, "learning_rate": 7.093333333333334e-05, "loss": 1.0703, "step": 134 }, { "epoch": 0.00914703864623828, "grad_norm": 3.54276180267334, "learning_rate": 7.146666666666666e-05, "loss": 0.7889, "step": 135 }, { "epoch": 0.009214794488062268, "grad_norm": 3.69317364692688, "learning_rate": 7.2e-05, "loss": 0.9805, "step": 136 }, { "epoch": 0.009282550329886254, "grad_norm": 4.455929756164551, "learning_rate": 7.253333333333334e-05, "loss": 1.0272, "step": 137 }, { "epoch": 0.009350306171710242, "grad_norm": 3.0266013145446777, "learning_rate": 7.306666666666668e-05, "loss": 0.6603, "step": 138 }, { "epoch": 0.009418062013534229, "grad_norm": 3.714937925338745, "learning_rate": 7.36e-05, "loss": 0.8242, "step": 139 }, { "epoch": 0.009485817855358217, "grad_norm": 4.581395626068115, "learning_rate": 7.413333333333334e-05, "loss": 1.1374, "step": 140 }, { "epoch": 0.009553573697182204, "grad_norm": 3.572059154510498, "learning_rate": 7.466666666666667e-05, "loss": 0.8379, "step": 141 }, { "epoch": 0.009621329539006192, "grad_norm": 4.542558193206787, "learning_rate": 7.52e-05, "loss": 0.8601, "step": 142 }, { "epoch": 0.009689085380830178, "grad_norm": 4.202064037322998, "learning_rate": 7.573333333333334e-05, "loss": 1.0706, "step": 143 }, { "epoch": 0.009756841222654166, "grad_norm": 4.403738498687744, "learning_rate": 7.626666666666667e-05, "loss": 1.0262, "step": 144 }, { "epoch": 0.009824597064478153, "grad_norm": 4.19655704498291, "learning_rate": 7.680000000000001e-05, "loss": 0.926, "step": 145 }, { "epoch": 0.009892352906302141, "grad_norm": 3.5180933475494385, "learning_rate": 7.733333333333333e-05, "loss": 0.8757, "step": 146 }, { "epoch": 0.009960108748126128, "grad_norm": 22.808645248413086, "learning_rate": 7.786666666666667e-05, "loss": 1.27, "step": 147 }, { "epoch": 0.010027864589950114, "grad_norm": 3.7410507202148438, "learning_rate": 7.840000000000001e-05, "loss": 1.0631, "step": 148 }, { "epoch": 0.010095620431774102, "grad_norm": 3.299713611602783, "learning_rate": 7.893333333333333e-05, "loss": 0.8575, "step": 149 }, { "epoch": 0.010163376273598089, "grad_norm": 2.661968231201172, "learning_rate": 7.946666666666667e-05, "loss": 0.7436, "step": 150 }, { "epoch": 0.010231132115422077, "grad_norm": 4.543766021728516, "learning_rate": 8e-05, "loss": 1.0868, "step": 151 }, { "epoch": 0.010298887957246063, "grad_norm": 5.307805061340332, "learning_rate": 8.053333333333334e-05, "loss": 1.112, "step": 152 }, { "epoch": 0.010366643799070051, "grad_norm": 4.216804504394531, "learning_rate": 8.106666666666667e-05, "loss": 0.9197, "step": 153 }, { "epoch": 0.010434399640894038, "grad_norm": 4.234525203704834, "learning_rate": 8.16e-05, "loss": 0.9975, "step": 154 }, { "epoch": 0.010502155482718026, "grad_norm": 3.536555051803589, "learning_rate": 8.213333333333334e-05, "loss": 0.8271, "step": 155 }, { "epoch": 0.010569911324542013, "grad_norm": 4.309999465942383, "learning_rate": 8.266666666666667e-05, "loss": 0.9115, "step": 156 }, { "epoch": 0.010637667166366, "grad_norm": 2.9323556423187256, "learning_rate": 8.32e-05, "loss": 0.8719, "step": 157 }, { "epoch": 0.010705423008189987, "grad_norm": 3.687777519226074, "learning_rate": 8.373333333333334e-05, "loss": 1.0331, "step": 158 }, { "epoch": 0.010773178850013975, "grad_norm": 3.153407573699951, "learning_rate": 8.426666666666668e-05, "loss": 0.8971, "step": 159 }, { "epoch": 0.010840934691837962, "grad_norm": 4.415497779846191, "learning_rate": 8.48e-05, "loss": 1.2505, "step": 160 }, { "epoch": 0.010908690533661948, "grad_norm": 3.849696636199951, "learning_rate": 8.533333333333334e-05, "loss": 1.0184, "step": 161 }, { "epoch": 0.010976446375485937, "grad_norm": 3.315385580062866, "learning_rate": 8.586666666666668e-05, "loss": 0.7916, "step": 162 }, { "epoch": 0.011044202217309923, "grad_norm": 3.611583948135376, "learning_rate": 8.64e-05, "loss": 1.0153, "step": 163 }, { "epoch": 0.011111958059133911, "grad_norm": 4.0692138671875, "learning_rate": 8.693333333333334e-05, "loss": 1.1156, "step": 164 }, { "epoch": 0.011179713900957898, "grad_norm": 3.334744691848755, "learning_rate": 8.746666666666667e-05, "loss": 0.7543, "step": 165 }, { "epoch": 0.011247469742781886, "grad_norm": 3.654917001724243, "learning_rate": 8.800000000000001e-05, "loss": 0.8977, "step": 166 }, { "epoch": 0.011315225584605872, "grad_norm": 4.702078342437744, "learning_rate": 8.853333333333333e-05, "loss": 1.1895, "step": 167 }, { "epoch": 0.01138298142642986, "grad_norm": 4.290605545043945, "learning_rate": 8.906666666666667e-05, "loss": 1.0402, "step": 168 }, { "epoch": 0.011450737268253847, "grad_norm": 3.28179931640625, "learning_rate": 8.960000000000001e-05, "loss": 0.7626, "step": 169 }, { "epoch": 0.011518493110077835, "grad_norm": 3.1806724071502686, "learning_rate": 9.013333333333333e-05, "loss": 0.7357, "step": 170 }, { "epoch": 0.011586248951901822, "grad_norm": 3.482248544692993, "learning_rate": 9.066666666666667e-05, "loss": 0.98, "step": 171 }, { "epoch": 0.01165400479372581, "grad_norm": 3.5067379474639893, "learning_rate": 9.120000000000001e-05, "loss": 0.9183, "step": 172 }, { "epoch": 0.011721760635549796, "grad_norm": 4.362545013427734, "learning_rate": 9.173333333333333e-05, "loss": 0.9597, "step": 173 }, { "epoch": 0.011789516477373783, "grad_norm": 3.65118145942688, "learning_rate": 9.226666666666667e-05, "loss": 0.866, "step": 174 }, { "epoch": 0.011857272319197771, "grad_norm": 7.4912896156311035, "learning_rate": 9.28e-05, "loss": 1.2576, "step": 175 }, { "epoch": 0.011925028161021757, "grad_norm": 2.8435049057006836, "learning_rate": 9.333333333333334e-05, "loss": 0.8427, "step": 176 }, { "epoch": 0.011992784002845746, "grad_norm": 3.598757266998291, "learning_rate": 9.386666666666667e-05, "loss": 1.0179, "step": 177 }, { "epoch": 0.012060539844669732, "grad_norm": 3.3011691570281982, "learning_rate": 9.44e-05, "loss": 0.8488, "step": 178 }, { "epoch": 0.01212829568649372, "grad_norm": 3.7547154426574707, "learning_rate": 9.493333333333334e-05, "loss": 0.9933, "step": 179 }, { "epoch": 0.012196051528317707, "grad_norm": 4.0833611488342285, "learning_rate": 9.546666666666667e-05, "loss": 1.1849, "step": 180 }, { "epoch": 0.012263807370141695, "grad_norm": 3.6523630619049072, "learning_rate": 9.6e-05, "loss": 0.9923, "step": 181 }, { "epoch": 0.012331563211965681, "grad_norm": 3.123002052307129, "learning_rate": 9.653333333333334e-05, "loss": 0.8659, "step": 182 }, { "epoch": 0.01239931905378967, "grad_norm": 3.65999436378479, "learning_rate": 9.706666666666668e-05, "loss": 0.9637, "step": 183 }, { "epoch": 0.012467074895613656, "grad_norm": 4.547555446624756, "learning_rate": 9.76e-05, "loss": 1.2165, "step": 184 }, { "epoch": 0.012534830737437644, "grad_norm": 3.4823532104492188, "learning_rate": 9.813333333333334e-05, "loss": 0.9079, "step": 185 }, { "epoch": 0.01260258657926163, "grad_norm": 4.131546974182129, "learning_rate": 9.866666666666668e-05, "loss": 0.9782, "step": 186 }, { "epoch": 0.012670342421085617, "grad_norm": 3.025775909423828, "learning_rate": 9.92e-05, "loss": 0.7689, "step": 187 }, { "epoch": 0.012738098262909605, "grad_norm": 4.011147975921631, "learning_rate": 9.973333333333334e-05, "loss": 1.0396, "step": 188 }, { "epoch": 0.012805854104733592, "grad_norm": 3.103466749191284, "learning_rate": 0.00010026666666666666, "loss": 0.795, "step": 189 }, { "epoch": 0.01287360994655758, "grad_norm": 2.866642713546753, "learning_rate": 0.00010080000000000001, "loss": 0.8959, "step": 190 }, { "epoch": 0.012941365788381566, "grad_norm": 2.8990366458892822, "learning_rate": 0.00010133333333333335, "loss": 0.9935, "step": 191 }, { "epoch": 0.013009121630205555, "grad_norm": 2.6250760555267334, "learning_rate": 0.00010186666666666667, "loss": 0.9082, "step": 192 }, { "epoch": 0.013076877472029541, "grad_norm": 3.451451301574707, "learning_rate": 0.00010240000000000001, "loss": 0.8814, "step": 193 }, { "epoch": 0.01314463331385353, "grad_norm": 3.2968547344207764, "learning_rate": 0.00010293333333333335, "loss": 0.7823, "step": 194 }, { "epoch": 0.013212389155677516, "grad_norm": 2.873727798461914, "learning_rate": 0.00010346666666666667, "loss": 0.8378, "step": 195 }, { "epoch": 0.013280144997501504, "grad_norm": 3.4928581714630127, "learning_rate": 0.00010400000000000001, "loss": 1.0139, "step": 196 }, { "epoch": 0.01334790083932549, "grad_norm": 3.464414358139038, "learning_rate": 0.00010453333333333333, "loss": 0.9106, "step": 197 }, { "epoch": 0.013415656681149479, "grad_norm": 3.058838129043579, "learning_rate": 0.00010506666666666667, "loss": 0.6724, "step": 198 }, { "epoch": 0.013483412522973465, "grad_norm": 3.876274824142456, "learning_rate": 0.0001056, "loss": 0.872, "step": 199 }, { "epoch": 0.013551168364797452, "grad_norm": 3.2966196537017822, "learning_rate": 0.00010613333333333333, "loss": 0.9288, "step": 200 }, { "epoch": 0.01361892420662144, "grad_norm": 3.082512617111206, "learning_rate": 0.00010666666666666667, "loss": 0.833, "step": 201 }, { "epoch": 0.013686680048445426, "grad_norm": 4.060554504394531, "learning_rate": 0.00010720000000000002, "loss": 1.0788, "step": 202 }, { "epoch": 0.013754435890269414, "grad_norm": 3.3564369678497314, "learning_rate": 0.00010773333333333333, "loss": 1.19, "step": 203 }, { "epoch": 0.0138221917320934, "grad_norm": 3.56966495513916, "learning_rate": 0.00010826666666666668, "loss": 0.8799, "step": 204 }, { "epoch": 0.013889947573917389, "grad_norm": 3.1727395057678223, "learning_rate": 0.00010880000000000002, "loss": 0.8247, "step": 205 }, { "epoch": 0.013957703415741375, "grad_norm": 4.123342514038086, "learning_rate": 0.00010933333333333333, "loss": 1.3223, "step": 206 }, { "epoch": 0.014025459257565364, "grad_norm": 3.971982002258301, "learning_rate": 0.00010986666666666668, "loss": 1.1013, "step": 207 }, { "epoch": 0.01409321509938935, "grad_norm": 3.719574451446533, "learning_rate": 0.00011040000000000001, "loss": 1.17, "step": 208 }, { "epoch": 0.014160970941213338, "grad_norm": 4.213026523590088, "learning_rate": 0.00011093333333333334, "loss": 0.7647, "step": 209 }, { "epoch": 0.014228726783037325, "grad_norm": 3.341592311859131, "learning_rate": 0.00011146666666666667, "loss": 1.0674, "step": 210 }, { "epoch": 0.014296482624861313, "grad_norm": 3.7537477016448975, "learning_rate": 0.00011200000000000001, "loss": 0.9634, "step": 211 }, { "epoch": 0.0143642384666853, "grad_norm": 3.766063928604126, "learning_rate": 0.00011253333333333334, "loss": 0.9494, "step": 212 }, { "epoch": 0.014431994308509288, "grad_norm": 2.777581214904785, "learning_rate": 0.00011306666666666667, "loss": 0.6205, "step": 213 }, { "epoch": 0.014499750150333274, "grad_norm": 3.964770793914795, "learning_rate": 0.0001136, "loss": 0.9115, "step": 214 }, { "epoch": 0.01456750599215726, "grad_norm": 3.22684645652771, "learning_rate": 0.00011413333333333333, "loss": 0.923, "step": 215 }, { "epoch": 0.014635261833981249, "grad_norm": 3.009641408920288, "learning_rate": 0.00011466666666666667, "loss": 0.8137, "step": 216 }, { "epoch": 0.014703017675805235, "grad_norm": 4.65781307220459, "learning_rate": 0.0001152, "loss": 1.1172, "step": 217 }, { "epoch": 0.014770773517629223, "grad_norm": 3.9027650356292725, "learning_rate": 0.00011573333333333333, "loss": 1.4727, "step": 218 }, { "epoch": 0.01483852935945321, "grad_norm": 3.0178661346435547, "learning_rate": 0.00011626666666666668, "loss": 0.7875, "step": 219 }, { "epoch": 0.014906285201277198, "grad_norm": 3.878479480743408, "learning_rate": 0.00011679999999999999, "loss": 1.3101, "step": 220 }, { "epoch": 0.014974041043101185, "grad_norm": 4.002731800079346, "learning_rate": 0.00011733333333333334, "loss": 0.9453, "step": 221 }, { "epoch": 0.015041796884925173, "grad_norm": 2.576362133026123, "learning_rate": 0.00011786666666666668, "loss": 0.7215, "step": 222 }, { "epoch": 0.01510955272674916, "grad_norm": 3.33084774017334, "learning_rate": 0.0001184, "loss": 0.9659, "step": 223 }, { "epoch": 0.015177308568573147, "grad_norm": 3.3597161769866943, "learning_rate": 0.00011893333333333334, "loss": 0.9257, "step": 224 }, { "epoch": 0.015245064410397134, "grad_norm": 3.4632585048675537, "learning_rate": 0.00011946666666666668, "loss": 0.8929, "step": 225 }, { "epoch": 0.015312820252221122, "grad_norm": 4.075017929077148, "learning_rate": 0.00012, "loss": 0.958, "step": 226 }, { "epoch": 0.015380576094045108, "grad_norm": 3.3880038261413574, "learning_rate": 0.00012053333333333334, "loss": 1.0581, "step": 227 }, { "epoch": 0.015448331935869095, "grad_norm": 4.207815647125244, "learning_rate": 0.00012106666666666666, "loss": 1.1511, "step": 228 }, { "epoch": 0.015516087777693083, "grad_norm": 3.4052436351776123, "learning_rate": 0.0001216, "loss": 1.0359, "step": 229 }, { "epoch": 0.01558384361951707, "grad_norm": 3.804454803466797, "learning_rate": 0.00012213333333333334, "loss": 1.0629, "step": 230 }, { "epoch": 0.015651599461341058, "grad_norm": 3.846328020095825, "learning_rate": 0.00012266666666666668, "loss": 0.993, "step": 231 }, { "epoch": 0.015719355303165046, "grad_norm": 3.96236515045166, "learning_rate": 0.0001232, "loss": 1.1515, "step": 232 }, { "epoch": 0.01578711114498903, "grad_norm": 3.798048496246338, "learning_rate": 0.00012373333333333335, "loss": 0.7662, "step": 233 }, { "epoch": 0.01585486698681302, "grad_norm": 3.0351669788360596, "learning_rate": 0.00012426666666666666, "loss": 0.9648, "step": 234 }, { "epoch": 0.015922622828637007, "grad_norm": 3.380739212036133, "learning_rate": 0.0001248, "loss": 0.8763, "step": 235 }, { "epoch": 0.015990378670460995, "grad_norm": 3.049619674682617, "learning_rate": 0.00012533333333333334, "loss": 0.8025, "step": 236 }, { "epoch": 0.01605813451228498, "grad_norm": 3.167330265045166, "learning_rate": 0.00012586666666666667, "loss": 1.2007, "step": 237 }, { "epoch": 0.016125890354108968, "grad_norm": 3.3723909854888916, "learning_rate": 0.0001264, "loss": 1.0588, "step": 238 }, { "epoch": 0.016193646195932956, "grad_norm": 2.830580234527588, "learning_rate": 0.00012693333333333335, "loss": 0.6549, "step": 239 }, { "epoch": 0.01626140203775694, "grad_norm": 3.2318930625915527, "learning_rate": 0.00012746666666666666, "loss": 1.1047, "step": 240 }, { "epoch": 0.01632915787958093, "grad_norm": 2.931008815765381, "learning_rate": 0.00012800000000000002, "loss": 0.8645, "step": 241 }, { "epoch": 0.016396913721404918, "grad_norm": 4.3279829025268555, "learning_rate": 0.00012853333333333336, "loss": 1.0481, "step": 242 }, { "epoch": 0.016464669563228906, "grad_norm": 2.757809638977051, "learning_rate": 0.00012906666666666667, "loss": 0.7434, "step": 243 }, { "epoch": 0.01653242540505289, "grad_norm": 2.969024658203125, "learning_rate": 0.0001296, "loss": 0.8965, "step": 244 }, { "epoch": 0.01660018124687688, "grad_norm": 2.9457848072052, "learning_rate": 0.00013013333333333332, "loss": 0.938, "step": 245 }, { "epoch": 0.016667937088700867, "grad_norm": 3.3430593013763428, "learning_rate": 0.00013066666666666668, "loss": 1.0461, "step": 246 }, { "epoch": 0.016735692930524855, "grad_norm": 3.8152661323547363, "learning_rate": 0.00013120000000000002, "loss": 1.1667, "step": 247 }, { "epoch": 0.01680344877234884, "grad_norm": 3.352856397628784, "learning_rate": 0.00013173333333333333, "loss": 0.9266, "step": 248 }, { "epoch": 0.016871204614172828, "grad_norm": 2.7780158519744873, "learning_rate": 0.00013226666666666667, "loss": 0.8136, "step": 249 }, { "epoch": 0.016938960455996816, "grad_norm": 3.764047145843506, "learning_rate": 0.0001328, "loss": 0.8565, "step": 250 }, { "epoch": 0.017006716297820804, "grad_norm": 3.134251832962036, "learning_rate": 0.00013333333333333334, "loss": 0.9564, "step": 251 }, { "epoch": 0.01707447213964479, "grad_norm": 3.4929568767547607, "learning_rate": 0.00013386666666666668, "loss": 1.1402, "step": 252 }, { "epoch": 0.017142227981468777, "grad_norm": 2.6721930503845215, "learning_rate": 0.00013440000000000001, "loss": 0.9342, "step": 253 }, { "epoch": 0.017209983823292765, "grad_norm": 2.8653156757354736, "learning_rate": 0.00013493333333333332, "loss": 0.7382, "step": 254 }, { "epoch": 0.01727773966511675, "grad_norm": 2.549006938934326, "learning_rate": 0.00013546666666666666, "loss": 0.7793, "step": 255 }, { "epoch": 0.01734549550694074, "grad_norm": 2.753033399581909, "learning_rate": 0.00013600000000000003, "loss": 0.8672, "step": 256 }, { "epoch": 0.017413251348764727, "grad_norm": 2.9388537406921387, "learning_rate": 0.00013653333333333334, "loss": 0.916, "step": 257 }, { "epoch": 0.017481007190588715, "grad_norm": 2.5541250705718994, "learning_rate": 0.00013706666666666667, "loss": 0.8142, "step": 258 }, { "epoch": 0.0175487630324127, "grad_norm": 3.18033504486084, "learning_rate": 0.00013759999999999998, "loss": 0.8763, "step": 259 }, { "epoch": 0.017616518874236688, "grad_norm": 3.3773415088653564, "learning_rate": 0.00013813333333333335, "loss": 1.0218, "step": 260 }, { "epoch": 0.017684274716060676, "grad_norm": 3.6449365615844727, "learning_rate": 0.00013866666666666669, "loss": 0.9939, "step": 261 }, { "epoch": 0.017752030557884664, "grad_norm": 3.41015887260437, "learning_rate": 0.0001392, "loss": 1.06, "step": 262 }, { "epoch": 0.01781978639970865, "grad_norm": 2.5204412937164307, "learning_rate": 0.00013973333333333333, "loss": 0.8231, "step": 263 }, { "epoch": 0.017887542241532637, "grad_norm": 3.4601964950561523, "learning_rate": 0.00014026666666666667, "loss": 1.0674, "step": 264 }, { "epoch": 0.017955298083356625, "grad_norm": 3.367053747177124, "learning_rate": 0.0001408, "loss": 1.0056, "step": 265 }, { "epoch": 0.018023053925180613, "grad_norm": 2.6200222969055176, "learning_rate": 0.00014133333333333334, "loss": 0.7781, "step": 266 }, { "epoch": 0.018090809767004598, "grad_norm": 2.720961809158325, "learning_rate": 0.00014186666666666668, "loss": 0.8741, "step": 267 }, { "epoch": 0.018158565608828586, "grad_norm": 4.248485088348389, "learning_rate": 0.0001424, "loss": 1.1909, "step": 268 }, { "epoch": 0.018226321450652574, "grad_norm": 2.551419258117676, "learning_rate": 0.00014293333333333333, "loss": 0.8267, "step": 269 }, { "epoch": 0.01829407729247656, "grad_norm": 2.759575366973877, "learning_rate": 0.0001434666666666667, "loss": 0.8734, "step": 270 }, { "epoch": 0.018361833134300547, "grad_norm": 2.9063026905059814, "learning_rate": 0.000144, "loss": 0.8042, "step": 271 }, { "epoch": 0.018429588976124536, "grad_norm": 10.986420631408691, "learning_rate": 0.00014453333333333334, "loss": 1.1958, "step": 272 }, { "epoch": 0.018497344817948524, "grad_norm": 2.7618868350982666, "learning_rate": 0.00014506666666666668, "loss": 0.7861, "step": 273 }, { "epoch": 0.01856510065977251, "grad_norm": 3.453500509262085, "learning_rate": 0.00014560000000000002, "loss": 1.1442, "step": 274 }, { "epoch": 0.018632856501596497, "grad_norm": 3.5590434074401855, "learning_rate": 0.00014613333333333335, "loss": 0.8994, "step": 275 }, { "epoch": 0.018700612343420485, "grad_norm": 2.4055211544036865, "learning_rate": 0.00014666666666666666, "loss": 0.7386, "step": 276 }, { "epoch": 0.018768368185244473, "grad_norm": 2.8150551319122314, "learning_rate": 0.0001472, "loss": 0.8823, "step": 277 }, { "epoch": 0.018836124027068458, "grad_norm": 2.630995035171509, "learning_rate": 0.00014773333333333334, "loss": 0.7175, "step": 278 }, { "epoch": 0.018903879868892446, "grad_norm": 3.557161569595337, "learning_rate": 0.00014826666666666667, "loss": 0.8712, "step": 279 }, { "epoch": 0.018971635710716434, "grad_norm": 2.6256723403930664, "learning_rate": 0.0001488, "loss": 0.7671, "step": 280 }, { "epoch": 0.01903939155254042, "grad_norm": 3.0437567234039307, "learning_rate": 0.00014933333333333335, "loss": 1.0063, "step": 281 }, { "epoch": 0.019107147394364407, "grad_norm": 3.4554121494293213, "learning_rate": 0.00014986666666666666, "loss": 1.129, "step": 282 }, { "epoch": 0.019174903236188395, "grad_norm": 2.7510783672332764, "learning_rate": 0.0001504, "loss": 0.8512, "step": 283 }, { "epoch": 0.019242659078012384, "grad_norm": 3.6570682525634766, "learning_rate": 0.00015093333333333336, "loss": 1.0906, "step": 284 }, { "epoch": 0.019310414919836368, "grad_norm": 2.8303823471069336, "learning_rate": 0.00015146666666666667, "loss": 0.9401, "step": 285 }, { "epoch": 0.019378170761660356, "grad_norm": 4.669590950012207, "learning_rate": 0.000152, "loss": 1.2186, "step": 286 }, { "epoch": 0.019445926603484345, "grad_norm": 3.1037960052490234, "learning_rate": 0.00015253333333333335, "loss": 1.0955, "step": 287 }, { "epoch": 0.019513682445308333, "grad_norm": 2.164179563522339, "learning_rate": 0.00015306666666666666, "loss": 0.5372, "step": 288 }, { "epoch": 0.019581438287132318, "grad_norm": 3.3186652660369873, "learning_rate": 0.00015360000000000002, "loss": 0.9419, "step": 289 }, { "epoch": 0.019649194128956306, "grad_norm": 2.662095785140991, "learning_rate": 0.00015413333333333336, "loss": 0.9512, "step": 290 }, { "epoch": 0.019716949970780294, "grad_norm": 2.8470234870910645, "learning_rate": 0.00015466666666666667, "loss": 0.8353, "step": 291 }, { "epoch": 0.019784705812604282, "grad_norm": 2.5618224143981934, "learning_rate": 0.0001552, "loss": 0.7797, "step": 292 }, { "epoch": 0.019852461654428267, "grad_norm": 3.2155261039733887, "learning_rate": 0.00015573333333333334, "loss": 0.8396, "step": 293 }, { "epoch": 0.019920217496252255, "grad_norm": 2.711542844772339, "learning_rate": 0.00015626666666666668, "loss": 1.0394, "step": 294 }, { "epoch": 0.019987973338076243, "grad_norm": 2.3177831172943115, "learning_rate": 0.00015680000000000002, "loss": 0.7734, "step": 295 }, { "epoch": 0.020055729179900228, "grad_norm": 2.9781432151794434, "learning_rate": 0.00015733333333333333, "loss": 0.9441, "step": 296 }, { "epoch": 0.020123485021724216, "grad_norm": 3.547036647796631, "learning_rate": 0.00015786666666666666, "loss": 0.9722, "step": 297 }, { "epoch": 0.020191240863548204, "grad_norm": 2.6772027015686035, "learning_rate": 0.00015840000000000003, "loss": 0.9965, "step": 298 }, { "epoch": 0.020258996705372193, "grad_norm": 2.731415033340454, "learning_rate": 0.00015893333333333334, "loss": 0.88, "step": 299 }, { "epoch": 0.020326752547196177, "grad_norm": 3.2088096141815186, "learning_rate": 0.00015946666666666668, "loss": 0.8667, "step": 300 }, { "epoch": 0.020394508389020165, "grad_norm": 2.8343393802642822, "learning_rate": 0.00016, "loss": 0.8519, "step": 301 }, { "epoch": 0.020462264230844154, "grad_norm": 2.5601022243499756, "learning_rate": 0.00016053333333333332, "loss": 0.7365, "step": 302 }, { "epoch": 0.020530020072668142, "grad_norm": 2.4892749786376953, "learning_rate": 0.0001610666666666667, "loss": 0.7574, "step": 303 }, { "epoch": 0.020597775914492127, "grad_norm": 2.4366605281829834, "learning_rate": 0.00016160000000000002, "loss": 0.804, "step": 304 }, { "epoch": 0.020665531756316115, "grad_norm": 3.2073616981506348, "learning_rate": 0.00016213333333333334, "loss": 1.0517, "step": 305 }, { "epoch": 0.020733287598140103, "grad_norm": 2.8581137657165527, "learning_rate": 0.00016266666666666667, "loss": 0.76, "step": 306 }, { "epoch": 0.020801043439964088, "grad_norm": 2.4472806453704834, "learning_rate": 0.0001632, "loss": 0.7537, "step": 307 }, { "epoch": 0.020868799281788076, "grad_norm": 3.2488083839416504, "learning_rate": 0.00016373333333333335, "loss": 0.9871, "step": 308 }, { "epoch": 0.020936555123612064, "grad_norm": 2.7125866413116455, "learning_rate": 0.00016426666666666668, "loss": 0.9442, "step": 309 }, { "epoch": 0.021004310965436052, "grad_norm": 3.6005852222442627, "learning_rate": 0.0001648, "loss": 1.1218, "step": 310 }, { "epoch": 0.021072066807260037, "grad_norm": 2.3172333240509033, "learning_rate": 0.00016533333333333333, "loss": 0.7338, "step": 311 }, { "epoch": 0.021139822649084025, "grad_norm": 2.5886096954345703, "learning_rate": 0.00016586666666666667, "loss": 0.9067, "step": 312 }, { "epoch": 0.021207578490908013, "grad_norm": 2.5885937213897705, "learning_rate": 0.0001664, "loss": 0.8025, "step": 313 }, { "epoch": 0.021275334332732, "grad_norm": 2.9777655601501465, "learning_rate": 0.00016693333333333334, "loss": 0.9416, "step": 314 }, { "epoch": 0.021343090174555986, "grad_norm": 2.4649710655212402, "learning_rate": 0.00016746666666666668, "loss": 0.7932, "step": 315 }, { "epoch": 0.021410846016379975, "grad_norm": 3.091679096221924, "learning_rate": 0.000168, "loss": 0.7981, "step": 316 }, { "epoch": 0.021478601858203963, "grad_norm": 3.334155797958374, "learning_rate": 0.00016853333333333336, "loss": 0.7285, "step": 317 }, { "epoch": 0.02154635770002795, "grad_norm": 3.0908756256103516, "learning_rate": 0.0001690666666666667, "loss": 1.0706, "step": 318 }, { "epoch": 0.021614113541851936, "grad_norm": 2.8760488033294678, "learning_rate": 0.0001696, "loss": 0.8715, "step": 319 }, { "epoch": 0.021681869383675924, "grad_norm": 2.9952962398529053, "learning_rate": 0.00017013333333333334, "loss": 0.9492, "step": 320 }, { "epoch": 0.021749625225499912, "grad_norm": 3.413982391357422, "learning_rate": 0.00017066666666666668, "loss": 1.0295, "step": 321 }, { "epoch": 0.021817381067323897, "grad_norm": 2.950277805328369, "learning_rate": 0.00017120000000000001, "loss": 0.9315, "step": 322 }, { "epoch": 0.021885136909147885, "grad_norm": 2.929976224899292, "learning_rate": 0.00017173333333333335, "loss": 0.8389, "step": 323 }, { "epoch": 0.021952892750971873, "grad_norm": 2.878108263015747, "learning_rate": 0.00017226666666666666, "loss": 0.9949, "step": 324 }, { "epoch": 0.02202064859279586, "grad_norm": 2.654740571975708, "learning_rate": 0.0001728, "loss": 1.1421, "step": 325 }, { "epoch": 0.022088404434619846, "grad_norm": 2.782033681869507, "learning_rate": 0.00017333333333333334, "loss": 0.8863, "step": 326 }, { "epoch": 0.022156160276443834, "grad_norm": 3.639472246170044, "learning_rate": 0.00017386666666666667, "loss": 1.1587, "step": 327 }, { "epoch": 0.022223916118267822, "grad_norm": 3.403918981552124, "learning_rate": 0.0001744, "loss": 1.0138, "step": 328 }, { "epoch": 0.02229167196009181, "grad_norm": 2.8108558654785156, "learning_rate": 0.00017493333333333335, "loss": 0.8243, "step": 329 }, { "epoch": 0.022359427801915795, "grad_norm": 3.242318868637085, "learning_rate": 0.00017546666666666666, "loss": 1.0326, "step": 330 }, { "epoch": 0.022427183643739784, "grad_norm": 2.594991683959961, "learning_rate": 0.00017600000000000002, "loss": 0.8135, "step": 331 }, { "epoch": 0.022494939485563772, "grad_norm": 2.890305519104004, "learning_rate": 0.00017653333333333336, "loss": 0.9373, "step": 332 }, { "epoch": 0.02256269532738776, "grad_norm": 2.6706676483154297, "learning_rate": 0.00017706666666666667, "loss": 0.8682, "step": 333 }, { "epoch": 0.022630451169211745, "grad_norm": 2.7954084873199463, "learning_rate": 0.0001776, "loss": 0.8013, "step": 334 }, { "epoch": 0.022698207011035733, "grad_norm": 3.222986936569214, "learning_rate": 0.00017813333333333334, "loss": 1.136, "step": 335 }, { "epoch": 0.02276596285285972, "grad_norm": 3.1042490005493164, "learning_rate": 0.00017866666666666668, "loss": 1.1317, "step": 336 }, { "epoch": 0.022833718694683706, "grad_norm": 3.553807497024536, "learning_rate": 0.00017920000000000002, "loss": 1.0309, "step": 337 }, { "epoch": 0.022901474536507694, "grad_norm": 3.4068756103515625, "learning_rate": 0.00017973333333333333, "loss": 0.8812, "step": 338 }, { "epoch": 0.022969230378331682, "grad_norm": 2.8250908851623535, "learning_rate": 0.00018026666666666667, "loss": 0.8691, "step": 339 }, { "epoch": 0.02303698622015567, "grad_norm": 3.1037330627441406, "learning_rate": 0.0001808, "loss": 0.8692, "step": 340 }, { "epoch": 0.023104742061979655, "grad_norm": 2.989802598953247, "learning_rate": 0.00018133333333333334, "loss": 1.0568, "step": 341 }, { "epoch": 0.023172497903803643, "grad_norm": 4.6037068367004395, "learning_rate": 0.00018186666666666668, "loss": 1.1491, "step": 342 }, { "epoch": 0.02324025374562763, "grad_norm": 3.0545082092285156, "learning_rate": 0.00018240000000000002, "loss": 0.9532, "step": 343 }, { "epoch": 0.02330800958745162, "grad_norm": 2.2162246704101562, "learning_rate": 0.00018293333333333333, "loss": 0.8658, "step": 344 }, { "epoch": 0.023375765429275604, "grad_norm": 2.557610273361206, "learning_rate": 0.00018346666666666666, "loss": 0.8813, "step": 345 }, { "epoch": 0.023443521271099593, "grad_norm": 2.785144805908203, "learning_rate": 0.00018400000000000003, "loss": 0.8998, "step": 346 }, { "epoch": 0.02351127711292358, "grad_norm": 2.822430372238159, "learning_rate": 0.00018453333333333334, "loss": 0.9069, "step": 347 }, { "epoch": 0.023579032954747565, "grad_norm": 3.2252402305603027, "learning_rate": 0.00018506666666666667, "loss": 0.8276, "step": 348 }, { "epoch": 0.023646788796571554, "grad_norm": 3.279068946838379, "learning_rate": 0.0001856, "loss": 1.2341, "step": 349 }, { "epoch": 0.023714544638395542, "grad_norm": 2.407787799835205, "learning_rate": 0.00018613333333333335, "loss": 0.848, "step": 350 }, { "epoch": 0.02378230048021953, "grad_norm": 2.13181209564209, "learning_rate": 0.0001866666666666667, "loss": 0.7186, "step": 351 }, { "epoch": 0.023850056322043515, "grad_norm": 3.5988128185272217, "learning_rate": 0.00018720000000000002, "loss": 0.9629, "step": 352 }, { "epoch": 0.023917812163867503, "grad_norm": 2.6874983310699463, "learning_rate": 0.00018773333333333333, "loss": 0.8074, "step": 353 }, { "epoch": 0.02398556800569149, "grad_norm": 3.2776784896850586, "learning_rate": 0.00018826666666666667, "loss": 1.0154, "step": 354 }, { "epoch": 0.02405332384751548, "grad_norm": 2.798130750656128, "learning_rate": 0.0001888, "loss": 1.1858, "step": 355 }, { "epoch": 0.024121079689339464, "grad_norm": 2.985553503036499, "learning_rate": 0.00018933333333333335, "loss": 1.0351, "step": 356 }, { "epoch": 0.024188835531163452, "grad_norm": 2.8096508979797363, "learning_rate": 0.00018986666666666668, "loss": 0.8279, "step": 357 }, { "epoch": 0.02425659137298744, "grad_norm": 2.5704684257507324, "learning_rate": 0.0001904, "loss": 0.7203, "step": 358 }, { "epoch": 0.02432434721481143, "grad_norm": 4.470186710357666, "learning_rate": 0.00019093333333333333, "loss": 1.0528, "step": 359 }, { "epoch": 0.024392103056635413, "grad_norm": 2.5878007411956787, "learning_rate": 0.0001914666666666667, "loss": 0.7575, "step": 360 }, { "epoch": 0.0244598588984594, "grad_norm": 3.5642030239105225, "learning_rate": 0.000192, "loss": 1.0158, "step": 361 }, { "epoch": 0.02452761474028339, "grad_norm": 3.050518751144409, "learning_rate": 0.00019253333333333334, "loss": 0.7788, "step": 362 }, { "epoch": 0.024595370582107375, "grad_norm": 2.7089364528656006, "learning_rate": 0.00019306666666666668, "loss": 1.0027, "step": 363 }, { "epoch": 0.024663126423931363, "grad_norm": 2.6781551837921143, "learning_rate": 0.00019360000000000002, "loss": 0.9534, "step": 364 }, { "epoch": 0.02473088226575535, "grad_norm": 2.707778215408325, "learning_rate": 0.00019413333333333335, "loss": 0.741, "step": 365 }, { "epoch": 0.02479863810757934, "grad_norm": 2.9382736682891846, "learning_rate": 0.0001946666666666667, "loss": 0.8631, "step": 366 }, { "epoch": 0.024866393949403324, "grad_norm": 2.2730894088745117, "learning_rate": 0.0001952, "loss": 0.8481, "step": 367 }, { "epoch": 0.024934149791227312, "grad_norm": 2.788771867752075, "learning_rate": 0.00019573333333333334, "loss": 0.8739, "step": 368 }, { "epoch": 0.0250019056330513, "grad_norm": 2.500476598739624, "learning_rate": 0.00019626666666666668, "loss": 0.761, "step": 369 }, { "epoch": 0.02506966147487529, "grad_norm": 2.9384407997131348, "learning_rate": 0.0001968, "loss": 0.8118, "step": 370 }, { "epoch": 0.025137417316699273, "grad_norm": 3.3813512325286865, "learning_rate": 0.00019733333333333335, "loss": 0.9742, "step": 371 }, { "epoch": 0.02520517315852326, "grad_norm": 3.193253993988037, "learning_rate": 0.00019786666666666666, "loss": 1.1012, "step": 372 }, { "epoch": 0.02527292900034725, "grad_norm": 2.9866743087768555, "learning_rate": 0.0001984, "loss": 1.0315, "step": 373 }, { "epoch": 0.025340684842171234, "grad_norm": 3.231721878051758, "learning_rate": 0.00019893333333333336, "loss": 1.2791, "step": 374 }, { "epoch": 0.025408440683995222, "grad_norm": 2.749004364013672, "learning_rate": 0.00019946666666666667, "loss": 0.6948, "step": 375 }, { "epoch": 0.02547619652581921, "grad_norm": 2.88962459564209, "learning_rate": 0.0002, "loss": 0.7165, "step": 376 }, { "epoch": 0.0255439523676432, "grad_norm": 2.8554301261901855, "learning_rate": 0.00020053333333333332, "loss": 1.064, "step": 377 }, { "epoch": 0.025611708209467184, "grad_norm": 4.560920238494873, "learning_rate": 0.00020106666666666668, "loss": 1.0572, "step": 378 }, { "epoch": 0.025679464051291172, "grad_norm": 3.338428497314453, "learning_rate": 0.00020160000000000002, "loss": 1.1139, "step": 379 }, { "epoch": 0.02574721989311516, "grad_norm": 3.9975242614746094, "learning_rate": 0.00020213333333333333, "loss": 0.9781, "step": 380 }, { "epoch": 0.025814975734939148, "grad_norm": 2.463844060897827, "learning_rate": 0.0002026666666666667, "loss": 0.8245, "step": 381 }, { "epoch": 0.025882731576763133, "grad_norm": 2.7112913131713867, "learning_rate": 0.0002032, "loss": 0.8705, "step": 382 }, { "epoch": 0.02595048741858712, "grad_norm": 3.175318956375122, "learning_rate": 0.00020373333333333334, "loss": 0.8363, "step": 383 }, { "epoch": 0.02601824326041111, "grad_norm": 3.4241058826446533, "learning_rate": 0.0002042666666666667, "loss": 0.8909, "step": 384 }, { "epoch": 0.026085999102235097, "grad_norm": 2.8540198802948, "learning_rate": 0.00020480000000000002, "loss": 0.7986, "step": 385 }, { "epoch": 0.026153754944059082, "grad_norm": 3.551433801651001, "learning_rate": 0.00020533333333333333, "loss": 1.0388, "step": 386 }, { "epoch": 0.02622151078588307, "grad_norm": 2.895348310470581, "learning_rate": 0.0002058666666666667, "loss": 1.0517, "step": 387 }, { "epoch": 0.02628926662770706, "grad_norm": 2.378032684326172, "learning_rate": 0.0002064, "loss": 1.0098, "step": 388 }, { "epoch": 0.026357022469531043, "grad_norm": 2.6091766357421875, "learning_rate": 0.00020693333333333334, "loss": 0.9787, "step": 389 }, { "epoch": 0.02642477831135503, "grad_norm": 3.1191773414611816, "learning_rate": 0.0002074666666666667, "loss": 0.9477, "step": 390 }, { "epoch": 0.02649253415317902, "grad_norm": 2.812795400619507, "learning_rate": 0.00020800000000000001, "loss": 0.8659, "step": 391 }, { "epoch": 0.026560289995003008, "grad_norm": 2.7146248817443848, "learning_rate": 0.00020853333333333332, "loss": 0.9959, "step": 392 }, { "epoch": 0.026628045836826993, "grad_norm": 3.1946487426757812, "learning_rate": 0.00020906666666666666, "loss": 0.902, "step": 393 }, { "epoch": 0.02669580167865098, "grad_norm": 3.826266050338745, "learning_rate": 0.00020960000000000003, "loss": 1.0937, "step": 394 }, { "epoch": 0.02676355752047497, "grad_norm": 2.771653175354004, "learning_rate": 0.00021013333333333334, "loss": 0.9665, "step": 395 }, { "epoch": 0.026831313362298957, "grad_norm": 3.2525782585144043, "learning_rate": 0.00021066666666666665, "loss": 1.1349, "step": 396 }, { "epoch": 0.026899069204122942, "grad_norm": 3.4051384925842285, "learning_rate": 0.0002112, "loss": 1.2167, "step": 397 }, { "epoch": 0.02696682504594693, "grad_norm": 3.4748735427856445, "learning_rate": 0.00021173333333333335, "loss": 0.848, "step": 398 }, { "epoch": 0.02703458088777092, "grad_norm": 2.094017267227173, "learning_rate": 0.00021226666666666666, "loss": 0.6179, "step": 399 }, { "epoch": 0.027102336729594903, "grad_norm": 2.9348931312561035, "learning_rate": 0.00021280000000000002, "loss": 0.9316, "step": 400 }, { "epoch": 0.02717009257141889, "grad_norm": 2.874321222305298, "learning_rate": 0.00021333333333333333, "loss": 0.9912, "step": 401 }, { "epoch": 0.02723784841324288, "grad_norm": 3.2894723415374756, "learning_rate": 0.00021386666666666667, "loss": 1.1963, "step": 402 }, { "epoch": 0.027305604255066868, "grad_norm": 3.2671563625335693, "learning_rate": 0.00021440000000000003, "loss": 0.9469, "step": 403 }, { "epoch": 0.027373360096890852, "grad_norm": 2.426436424255371, "learning_rate": 0.00021493333333333334, "loss": 0.7675, "step": 404 }, { "epoch": 0.02744111593871484, "grad_norm": 2.979081869125366, "learning_rate": 0.00021546666666666665, "loss": 1.0344, "step": 405 }, { "epoch": 0.02750887178053883, "grad_norm": 2.872697114944458, "learning_rate": 0.00021600000000000002, "loss": 0.7642, "step": 406 }, { "epoch": 0.027576627622362817, "grad_norm": 2.9084696769714355, "learning_rate": 0.00021653333333333336, "loss": 0.87, "step": 407 }, { "epoch": 0.0276443834641868, "grad_norm": 2.231675386428833, "learning_rate": 0.00021706666666666667, "loss": 0.7374, "step": 408 }, { "epoch": 0.02771213930601079, "grad_norm": 2.4713785648345947, "learning_rate": 0.00021760000000000003, "loss": 0.7137, "step": 409 }, { "epoch": 0.027779895147834778, "grad_norm": 3.2178897857666016, "learning_rate": 0.00021813333333333334, "loss": 0.8865, "step": 410 }, { "epoch": 0.027847650989658766, "grad_norm": 2.7771530151367188, "learning_rate": 0.00021866666666666665, "loss": 0.8657, "step": 411 }, { "epoch": 0.02791540683148275, "grad_norm": 2.7896981239318848, "learning_rate": 0.00021920000000000002, "loss": 0.8779, "step": 412 }, { "epoch": 0.02798316267330674, "grad_norm": 3.215338706970215, "learning_rate": 0.00021973333333333335, "loss": 0.971, "step": 413 }, { "epoch": 0.028050918515130727, "grad_norm": 3.515970468521118, "learning_rate": 0.00022026666666666666, "loss": 0.9283, "step": 414 }, { "epoch": 0.028118674356954712, "grad_norm": 2.9704067707061768, "learning_rate": 0.00022080000000000003, "loss": 0.9245, "step": 415 }, { "epoch": 0.0281864301987787, "grad_norm": 2.9705166816711426, "learning_rate": 0.00022133333333333334, "loss": 0.9025, "step": 416 }, { "epoch": 0.02825418604060269, "grad_norm": 2.9910025596618652, "learning_rate": 0.00022186666666666667, "loss": 1.0317, "step": 417 }, { "epoch": 0.028321941882426677, "grad_norm": 3.0812675952911377, "learning_rate": 0.00022240000000000004, "loss": 0.9516, "step": 418 }, { "epoch": 0.02838969772425066, "grad_norm": 4.055918216705322, "learning_rate": 0.00022293333333333335, "loss": 0.9599, "step": 419 }, { "epoch": 0.02845745356607465, "grad_norm": 2.581022024154663, "learning_rate": 0.00022346666666666666, "loss": 0.7968, "step": 420 }, { "epoch": 0.028525209407898638, "grad_norm": 2.069627523422241, "learning_rate": 0.00022400000000000002, "loss": 0.6994, "step": 421 }, { "epoch": 0.028592965249722626, "grad_norm": 3.5436763763427734, "learning_rate": 0.00022453333333333336, "loss": 0.9731, "step": 422 }, { "epoch": 0.02866072109154661, "grad_norm": 2.5492074489593506, "learning_rate": 0.00022506666666666667, "loss": 0.9319, "step": 423 }, { "epoch": 0.0287284769333706, "grad_norm": 3.4567394256591797, "learning_rate": 0.00022559999999999998, "loss": 0.9793, "step": 424 }, { "epoch": 0.028796232775194587, "grad_norm": 3.0208230018615723, "learning_rate": 0.00022613333333333335, "loss": 0.9211, "step": 425 }, { "epoch": 0.028863988617018575, "grad_norm": 3.6448960304260254, "learning_rate": 0.00022666666666666668, "loss": 1.0795, "step": 426 }, { "epoch": 0.02893174445884256, "grad_norm": 2.785637140274048, "learning_rate": 0.0002272, "loss": 1.0245, "step": 427 }, { "epoch": 0.028999500300666548, "grad_norm": 2.842777967453003, "learning_rate": 0.00022773333333333336, "loss": 0.8712, "step": 428 }, { "epoch": 0.029067256142490536, "grad_norm": 3.13641095161438, "learning_rate": 0.00022826666666666667, "loss": 0.9302, "step": 429 }, { "epoch": 0.02913501198431452, "grad_norm": 3.7317042350769043, "learning_rate": 0.0002288, "loss": 1.0933, "step": 430 }, { "epoch": 0.02920276782613851, "grad_norm": 2.997500419616699, "learning_rate": 0.00022933333333333334, "loss": 0.887, "step": 431 }, { "epoch": 0.029270523667962497, "grad_norm": 3.0842528343200684, "learning_rate": 0.00022986666666666668, "loss": 1.1222, "step": 432 }, { "epoch": 0.029338279509786486, "grad_norm": 2.915990114212036, "learning_rate": 0.0002304, "loss": 0.9592, "step": 433 }, { "epoch": 0.02940603535161047, "grad_norm": 3.1716978549957275, "learning_rate": 0.00023093333333333335, "loss": 1.0654, "step": 434 }, { "epoch": 0.02947379119343446, "grad_norm": 2.9748449325561523, "learning_rate": 0.00023146666666666666, "loss": 0.9074, "step": 435 }, { "epoch": 0.029541547035258447, "grad_norm": 2.785900115966797, "learning_rate": 0.000232, "loss": 0.8457, "step": 436 }, { "epoch": 0.029609302877082435, "grad_norm": 2.569838523864746, "learning_rate": 0.00023253333333333337, "loss": 0.8915, "step": 437 }, { "epoch": 0.02967705871890642, "grad_norm": 2.8905043601989746, "learning_rate": 0.00023306666666666668, "loss": 0.8502, "step": 438 }, { "epoch": 0.029744814560730408, "grad_norm": 2.7213053703308105, "learning_rate": 0.00023359999999999999, "loss": 0.8998, "step": 439 }, { "epoch": 0.029812570402554396, "grad_norm": 3.745122194290161, "learning_rate": 0.00023413333333333335, "loss": 1.1459, "step": 440 }, { "epoch": 0.02988032624437838, "grad_norm": 2.9431862831115723, "learning_rate": 0.0002346666666666667, "loss": 0.9016, "step": 441 }, { "epoch": 0.02994808208620237, "grad_norm": 2.872504711151123, "learning_rate": 0.0002352, "loss": 0.9417, "step": 442 }, { "epoch": 0.030015837928026357, "grad_norm": 2.563321828842163, "learning_rate": 0.00023573333333333336, "loss": 0.8268, "step": 443 }, { "epoch": 0.030083593769850345, "grad_norm": 3.125023603439331, "learning_rate": 0.00023626666666666667, "loss": 0.8371, "step": 444 }, { "epoch": 0.03015134961167433, "grad_norm": 3.140502452850342, "learning_rate": 0.0002368, "loss": 1.0886, "step": 445 }, { "epoch": 0.03021910545349832, "grad_norm": 2.709343433380127, "learning_rate": 0.00023733333333333337, "loss": 0.8819, "step": 446 }, { "epoch": 0.030286861295322307, "grad_norm": 2.917924165725708, "learning_rate": 0.00023786666666666668, "loss": 1.0793, "step": 447 }, { "epoch": 0.030354617137146295, "grad_norm": 2.741274833679199, "learning_rate": 0.0002384, "loss": 0.9807, "step": 448 }, { "epoch": 0.03042237297897028, "grad_norm": 2.7455129623413086, "learning_rate": 0.00023893333333333336, "loss": 0.9961, "step": 449 }, { "epoch": 0.030490128820794268, "grad_norm": 3.417269229888916, "learning_rate": 0.0002394666666666667, "loss": 0.8002, "step": 450 }, { "epoch": 0.030557884662618256, "grad_norm": 2.652172088623047, "learning_rate": 0.00024, "loss": 0.7414, "step": 451 }, { "epoch": 0.030625640504442244, "grad_norm": 3.259737968444824, "learning_rate": 0.00024053333333333337, "loss": 1.0514, "step": 452 }, { "epoch": 0.03069339634626623, "grad_norm": 4.5452070236206055, "learning_rate": 0.00024106666666666668, "loss": 1.2329, "step": 453 }, { "epoch": 0.030761152188090217, "grad_norm": 3.586223602294922, "learning_rate": 0.0002416, "loss": 1.0274, "step": 454 }, { "epoch": 0.030828908029914205, "grad_norm": 2.549325704574585, "learning_rate": 0.00024213333333333333, "loss": 0.8767, "step": 455 }, { "epoch": 0.03089666387173819, "grad_norm": 2.952594757080078, "learning_rate": 0.0002426666666666667, "loss": 0.9265, "step": 456 }, { "epoch": 0.030964419713562178, "grad_norm": 2.737058639526367, "learning_rate": 0.0002432, "loss": 0.9006, "step": 457 }, { "epoch": 0.031032175555386166, "grad_norm": 2.8463776111602783, "learning_rate": 0.0002437333333333333, "loss": 1.0022, "step": 458 }, { "epoch": 0.031099931397210154, "grad_norm": 2.8198354244232178, "learning_rate": 0.0002442666666666667, "loss": 0.8579, "step": 459 }, { "epoch": 0.03116768723903414, "grad_norm": 2.6514904499053955, "learning_rate": 0.0002448, "loss": 1.0371, "step": 460 }, { "epoch": 0.031235443080858127, "grad_norm": 3.080641031265259, "learning_rate": 0.00024533333333333335, "loss": 0.9005, "step": 461 }, { "epoch": 0.031303198922682116, "grad_norm": 3.199880361557007, "learning_rate": 0.0002458666666666667, "loss": 0.9525, "step": 462 }, { "epoch": 0.031370954764506104, "grad_norm": 2.3403611183166504, "learning_rate": 0.0002464, "loss": 0.8538, "step": 463 }, { "epoch": 0.03143871060633009, "grad_norm": 2.694025754928589, "learning_rate": 0.00024693333333333334, "loss": 0.8852, "step": 464 }, { "epoch": 0.03150646644815408, "grad_norm": 2.6202657222747803, "learning_rate": 0.0002474666666666667, "loss": 0.7338, "step": 465 }, { "epoch": 0.03157422228997806, "grad_norm": 2.4244868755340576, "learning_rate": 0.000248, "loss": 0.8328, "step": 466 }, { "epoch": 0.03164197813180205, "grad_norm": 2.990705966949463, "learning_rate": 0.0002485333333333333, "loss": 0.9602, "step": 467 }, { "epoch": 0.03170973397362604, "grad_norm": 2.595392942428589, "learning_rate": 0.0002490666666666667, "loss": 0.7387, "step": 468 }, { "epoch": 0.031777489815450026, "grad_norm": 2.7632737159729004, "learning_rate": 0.0002496, "loss": 0.8419, "step": 469 }, { "epoch": 0.031845245657274014, "grad_norm": 3.8289895057678223, "learning_rate": 0.0002501333333333333, "loss": 1.0181, "step": 470 }, { "epoch": 0.031913001499098, "grad_norm": 4.551751136779785, "learning_rate": 0.00025066666666666667, "loss": 0.901, "step": 471 }, { "epoch": 0.03198075734092199, "grad_norm": 2.899014949798584, "learning_rate": 0.00025120000000000003, "loss": 0.8226, "step": 472 }, { "epoch": 0.03204851318274597, "grad_norm": 2.805997610092163, "learning_rate": 0.00025173333333333334, "loss": 0.7902, "step": 473 }, { "epoch": 0.03211626902456996, "grad_norm": 2.6749446392059326, "learning_rate": 0.0002522666666666667, "loss": 0.9136, "step": 474 }, { "epoch": 0.03218402486639395, "grad_norm": 3.000572681427002, "learning_rate": 0.0002528, "loss": 0.977, "step": 475 }, { "epoch": 0.032251780708217936, "grad_norm": 3.4974238872528076, "learning_rate": 0.00025333333333333333, "loss": 1.0862, "step": 476 }, { "epoch": 0.032319536550041925, "grad_norm": 2.88631272315979, "learning_rate": 0.0002538666666666667, "loss": 1.1025, "step": 477 }, { "epoch": 0.03238729239186591, "grad_norm": 3.2899982929229736, "learning_rate": 0.0002544, "loss": 0.9626, "step": 478 }, { "epoch": 0.0324550482336899, "grad_norm": 2.4934239387512207, "learning_rate": 0.0002549333333333333, "loss": 0.857, "step": 479 }, { "epoch": 0.03252280407551388, "grad_norm": 3.5751945972442627, "learning_rate": 0.0002554666666666667, "loss": 1.039, "step": 480 }, { "epoch": 0.03259055991733787, "grad_norm": 2.848538637161255, "learning_rate": 0.00025600000000000004, "loss": 0.8887, "step": 481 }, { "epoch": 0.03265831575916186, "grad_norm": 3.5423502922058105, "learning_rate": 0.00025653333333333335, "loss": 0.9348, "step": 482 }, { "epoch": 0.03272607160098585, "grad_norm": 2.3428285121917725, "learning_rate": 0.0002570666666666667, "loss": 0.6604, "step": 483 }, { "epoch": 0.032793827442809835, "grad_norm": 2.862464427947998, "learning_rate": 0.00025760000000000003, "loss": 0.7328, "step": 484 }, { "epoch": 0.03286158328463382, "grad_norm": 2.3965930938720703, "learning_rate": 0.00025813333333333334, "loss": 0.8123, "step": 485 }, { "epoch": 0.03292933912645781, "grad_norm": 3.907257556915283, "learning_rate": 0.00025866666666666665, "loss": 1.1166, "step": 486 }, { "epoch": 0.0329970949682818, "grad_norm": 6.006860733032227, "learning_rate": 0.0002592, "loss": 0.9687, "step": 487 }, { "epoch": 0.03306485081010578, "grad_norm": 3.908031702041626, "learning_rate": 0.0002597333333333333, "loss": 0.8756, "step": 488 }, { "epoch": 0.03313260665192977, "grad_norm": 2.7148728370666504, "learning_rate": 0.00026026666666666663, "loss": 0.7544, "step": 489 }, { "epoch": 0.03320036249375376, "grad_norm": 3.107423782348633, "learning_rate": 0.0002608, "loss": 0.8232, "step": 490 }, { "epoch": 0.033268118335577745, "grad_norm": 2.9716622829437256, "learning_rate": 0.00026133333333333336, "loss": 0.7154, "step": 491 }, { "epoch": 0.033335874177401734, "grad_norm": 2.700042963027954, "learning_rate": 0.00026186666666666667, "loss": 0.7477, "step": 492 }, { "epoch": 0.03340363001922572, "grad_norm": 3.002424478530884, "learning_rate": 0.00026240000000000004, "loss": 0.7439, "step": 493 }, { "epoch": 0.03347138586104971, "grad_norm": 2.69582200050354, "learning_rate": 0.00026293333333333335, "loss": 0.8381, "step": 494 }, { "epoch": 0.03353914170287369, "grad_norm": 3.2823925018310547, "learning_rate": 0.00026346666666666666, "loss": 0.8639, "step": 495 }, { "epoch": 0.03360689754469768, "grad_norm": 2.939606189727783, "learning_rate": 0.000264, "loss": 0.9284, "step": 496 }, { "epoch": 0.03367465338652167, "grad_norm": 2.807971954345703, "learning_rate": 0.00026453333333333333, "loss": 0.8527, "step": 497 }, { "epoch": 0.033742409228345656, "grad_norm": 2.946587562561035, "learning_rate": 0.00026506666666666664, "loss": 0.986, "step": 498 }, { "epoch": 0.033810165070169644, "grad_norm": 3.3178935050964355, "learning_rate": 0.0002656, "loss": 1.0663, "step": 499 }, { "epoch": 0.03387792091199363, "grad_norm": 3.8597490787506104, "learning_rate": 0.00026613333333333337, "loss": 1.1641, "step": 500 }, { "epoch": 0.03394567675381762, "grad_norm": 2.5785224437713623, "learning_rate": 0.0002666666666666667, "loss": 0.9571, "step": 501 }, { "epoch": 0.03401343259564161, "grad_norm": 3.227065324783325, "learning_rate": 0.00026720000000000004, "loss": 1.0555, "step": 502 }, { "epoch": 0.03408118843746559, "grad_norm": 2.8175675868988037, "learning_rate": 0.00026773333333333335, "loss": 0.892, "step": 503 }, { "epoch": 0.03414894427928958, "grad_norm": 2.33778977394104, "learning_rate": 0.00026826666666666666, "loss": 0.7826, "step": 504 }, { "epoch": 0.034216700121113566, "grad_norm": 3.1155123710632324, "learning_rate": 0.00026880000000000003, "loss": 1.1612, "step": 505 }, { "epoch": 0.034284455962937554, "grad_norm": 2.7025229930877686, "learning_rate": 0.00026933333333333334, "loss": 0.9171, "step": 506 }, { "epoch": 0.03435221180476154, "grad_norm": 3.002000093460083, "learning_rate": 0.00026986666666666665, "loss": 0.9522, "step": 507 }, { "epoch": 0.03441996764658553, "grad_norm": 2.819808006286621, "learning_rate": 0.0002704, "loss": 0.9387, "step": 508 }, { "epoch": 0.03448772348840952, "grad_norm": 2.7500219345092773, "learning_rate": 0.0002709333333333333, "loss": 1.0149, "step": 509 }, { "epoch": 0.0345554793302335, "grad_norm": 3.1415674686431885, "learning_rate": 0.0002714666666666667, "loss": 0.7711, "step": 510 }, { "epoch": 0.03462323517205749, "grad_norm": 4.178181171417236, "learning_rate": 0.00027200000000000005, "loss": 1.1125, "step": 511 }, { "epoch": 0.03469099101388148, "grad_norm": 3.3421645164489746, "learning_rate": 0.00027253333333333336, "loss": 1.0714, "step": 512 }, { "epoch": 0.034758746855705465, "grad_norm": 3.297851085662842, "learning_rate": 0.00027306666666666667, "loss": 0.9904, "step": 513 }, { "epoch": 0.03482650269752945, "grad_norm": 2.8368825912475586, "learning_rate": 0.00027360000000000004, "loss": 0.7776, "step": 514 }, { "epoch": 0.03489425853935344, "grad_norm": 2.7940316200256348, "learning_rate": 0.00027413333333333335, "loss": 0.8111, "step": 515 }, { "epoch": 0.03496201438117743, "grad_norm": 2.5380403995513916, "learning_rate": 0.00027466666666666666, "loss": 0.7809, "step": 516 }, { "epoch": 0.03502977022300142, "grad_norm": 3.905104398727417, "learning_rate": 0.00027519999999999997, "loss": 1.0114, "step": 517 }, { "epoch": 0.0350975260648254, "grad_norm": 3.1693973541259766, "learning_rate": 0.00027573333333333333, "loss": 0.8502, "step": 518 }, { "epoch": 0.03516528190664939, "grad_norm": 2.3691413402557373, "learning_rate": 0.0002762666666666667, "loss": 0.6939, "step": 519 }, { "epoch": 0.035233037748473375, "grad_norm": 2.954050064086914, "learning_rate": 0.0002768, "loss": 0.8949, "step": 520 }, { "epoch": 0.035300793590297364, "grad_norm": 3.937025547027588, "learning_rate": 0.00027733333333333337, "loss": 1.2839, "step": 521 }, { "epoch": 0.03536854943212135, "grad_norm": 2.931206703186035, "learning_rate": 0.0002778666666666667, "loss": 0.9143, "step": 522 }, { "epoch": 0.03543630527394534, "grad_norm": 4.082540512084961, "learning_rate": 0.0002784, "loss": 0.8048, "step": 523 }, { "epoch": 0.03550406111576933, "grad_norm": 2.4357848167419434, "learning_rate": 0.00027893333333333336, "loss": 0.716, "step": 524 }, { "epoch": 0.03557181695759331, "grad_norm": 2.947275400161743, "learning_rate": 0.00027946666666666667, "loss": 0.897, "step": 525 }, { "epoch": 0.0356395727994173, "grad_norm": 3.021286725997925, "learning_rate": 0.00028, "loss": 1.0273, "step": 526 }, { "epoch": 0.035707328641241286, "grad_norm": 3.121258020401001, "learning_rate": 0.00028053333333333334, "loss": 0.8244, "step": 527 }, { "epoch": 0.035775084483065274, "grad_norm": 2.824312210083008, "learning_rate": 0.0002810666666666667, "loss": 1.058, "step": 528 }, { "epoch": 0.03584284032488926, "grad_norm": 3.4016153812408447, "learning_rate": 0.0002816, "loss": 0.772, "step": 529 }, { "epoch": 0.03591059616671325, "grad_norm": 3.031737804412842, "learning_rate": 0.0002821333333333334, "loss": 0.9401, "step": 530 }, { "epoch": 0.03597835200853724, "grad_norm": 3.0860114097595215, "learning_rate": 0.0002826666666666667, "loss": 0.8193, "step": 531 }, { "epoch": 0.03604610785036123, "grad_norm": 3.2134311199188232, "learning_rate": 0.0002832, "loss": 0.9726, "step": 532 }, { "epoch": 0.03611386369218521, "grad_norm": 2.6056156158447266, "learning_rate": 0.00028373333333333336, "loss": 0.7144, "step": 533 }, { "epoch": 0.036181619534009196, "grad_norm": 3.4816110134124756, "learning_rate": 0.0002842666666666667, "loss": 0.9486, "step": 534 }, { "epoch": 0.036249375375833184, "grad_norm": 2.573964834213257, "learning_rate": 0.0002848, "loss": 0.8565, "step": 535 }, { "epoch": 0.03631713121765717, "grad_norm": 2.908560276031494, "learning_rate": 0.00028533333333333335, "loss": 0.8386, "step": 536 }, { "epoch": 0.03638488705948116, "grad_norm": 2.734622001647949, "learning_rate": 0.00028586666666666666, "loss": 0.8293, "step": 537 }, { "epoch": 0.03645264290130515, "grad_norm": 2.6459169387817383, "learning_rate": 0.0002864, "loss": 0.8053, "step": 538 }, { "epoch": 0.03652039874312914, "grad_norm": 2.2266340255737305, "learning_rate": 0.0002869333333333334, "loss": 0.6746, "step": 539 }, { "epoch": 0.03658815458495312, "grad_norm": 2.785195827484131, "learning_rate": 0.0002874666666666667, "loss": 0.8155, "step": 540 }, { "epoch": 0.03665591042677711, "grad_norm": 3.1835994720458984, "learning_rate": 0.000288, "loss": 0.753, "step": 541 }, { "epoch": 0.036723666268601095, "grad_norm": 2.3835973739624023, "learning_rate": 0.00028853333333333337, "loss": 0.7402, "step": 542 }, { "epoch": 0.03679142211042508, "grad_norm": 3.2223799228668213, "learning_rate": 0.0002890666666666667, "loss": 0.9505, "step": 543 }, { "epoch": 0.03685917795224907, "grad_norm": 3.0398600101470947, "learning_rate": 0.0002896, "loss": 0.8357, "step": 544 }, { "epoch": 0.03692693379407306, "grad_norm": 3.1473746299743652, "learning_rate": 0.00029013333333333336, "loss": 0.8665, "step": 545 }, { "epoch": 0.03699468963589705, "grad_norm": 3.3487610816955566, "learning_rate": 0.00029066666666666667, "loss": 1.2443, "step": 546 }, { "epoch": 0.03706244547772103, "grad_norm": 3.5685651302337646, "learning_rate": 0.00029120000000000003, "loss": 0.8331, "step": 547 }, { "epoch": 0.03713020131954502, "grad_norm": 2.2936601638793945, "learning_rate": 0.00029173333333333334, "loss": 0.7988, "step": 548 }, { "epoch": 0.037197957161369005, "grad_norm": 2.2276930809020996, "learning_rate": 0.0002922666666666667, "loss": 0.7061, "step": 549 }, { "epoch": 0.03726571300319299, "grad_norm": 2.878218650817871, "learning_rate": 0.0002928, "loss": 0.8431, "step": 550 }, { "epoch": 0.03733346884501698, "grad_norm": 2.5694830417633057, "learning_rate": 0.0002933333333333333, "loss": 0.785, "step": 551 }, { "epoch": 0.03740122468684097, "grad_norm": 2.700960397720337, "learning_rate": 0.0002938666666666667, "loss": 0.8431, "step": 552 }, { "epoch": 0.03746898052866496, "grad_norm": 2.5335254669189453, "learning_rate": 0.0002944, "loss": 0.7384, "step": 553 }, { "epoch": 0.037536736370488946, "grad_norm": 3.4916114807128906, "learning_rate": 0.0002949333333333333, "loss": 1.1884, "step": 554 }, { "epoch": 0.03760449221231293, "grad_norm": 2.864866018295288, "learning_rate": 0.0002954666666666667, "loss": 0.7412, "step": 555 }, { "epoch": 0.037672248054136916, "grad_norm": 2.990323305130005, "learning_rate": 0.000296, "loss": 0.9573, "step": 556 }, { "epoch": 0.037740003895960904, "grad_norm": 3.1998960971832275, "learning_rate": 0.00029653333333333335, "loss": 0.6946, "step": 557 }, { "epoch": 0.03780775973778489, "grad_norm": 4.070067405700684, "learning_rate": 0.0002970666666666667, "loss": 1.1749, "step": 558 }, { "epoch": 0.03787551557960888, "grad_norm": 3.5025758743286133, "learning_rate": 0.0002976, "loss": 1.0348, "step": 559 }, { "epoch": 0.03794327142143287, "grad_norm": 3.336334705352783, "learning_rate": 0.00029813333333333333, "loss": 0.8743, "step": 560 }, { "epoch": 0.03801102726325686, "grad_norm": 2.947784662246704, "learning_rate": 0.0002986666666666667, "loss": 0.8129, "step": 561 }, { "epoch": 0.03807878310508084, "grad_norm": 3.5019314289093018, "learning_rate": 0.0002992, "loss": 0.8994, "step": 562 }, { "epoch": 0.038146538946904826, "grad_norm": 5.031900882720947, "learning_rate": 0.0002997333333333333, "loss": 0.8815, "step": 563 }, { "epoch": 0.038214294788728814, "grad_norm": 2.7715864181518555, "learning_rate": 0.0003002666666666667, "loss": 0.9663, "step": 564 }, { "epoch": 0.0382820506305528, "grad_norm": 3.0140392780303955, "learning_rate": 0.0003008, "loss": 1.3123, "step": 565 }, { "epoch": 0.03834980647237679, "grad_norm": 2.718428134918213, "learning_rate": 0.00030133333333333336, "loss": 0.8085, "step": 566 }, { "epoch": 0.03841756231420078, "grad_norm": 3.546281099319458, "learning_rate": 0.0003018666666666667, "loss": 1.1422, "step": 567 }, { "epoch": 0.03848531815602477, "grad_norm": 2.809030771255493, "learning_rate": 0.00030240000000000003, "loss": 0.8929, "step": 568 }, { "epoch": 0.038553073997848755, "grad_norm": 4.255012512207031, "learning_rate": 0.00030293333333333334, "loss": 0.9192, "step": 569 }, { "epoch": 0.038620829839672736, "grad_norm": 4.0748419761657715, "learning_rate": 0.0003034666666666667, "loss": 1.1949, "step": 570 }, { "epoch": 0.038688585681496725, "grad_norm": 3.522765636444092, "learning_rate": 0.000304, "loss": 0.9937, "step": 571 }, { "epoch": 0.03875634152332071, "grad_norm": 3.249178886413574, "learning_rate": 0.0003045333333333333, "loss": 0.9696, "step": 572 }, { "epoch": 0.0388240973651447, "grad_norm": 2.560654401779175, "learning_rate": 0.0003050666666666667, "loss": 0.7643, "step": 573 }, { "epoch": 0.03889185320696869, "grad_norm": 3.2656116485595703, "learning_rate": 0.0003056, "loss": 0.9903, "step": 574 }, { "epoch": 0.03895960904879268, "grad_norm": 3.4029576778411865, "learning_rate": 0.0003061333333333333, "loss": 0.9068, "step": 575 }, { "epoch": 0.039027364890616666, "grad_norm": 2.3622934818267822, "learning_rate": 0.0003066666666666667, "loss": 0.846, "step": 576 }, { "epoch": 0.03909512073244065, "grad_norm": 2.6016721725463867, "learning_rate": 0.00030720000000000004, "loss": 0.8258, "step": 577 }, { "epoch": 0.039162876574264635, "grad_norm": 2.965467929840088, "learning_rate": 0.00030773333333333335, "loss": 0.8572, "step": 578 }, { "epoch": 0.03923063241608862, "grad_norm": 3.1143174171447754, "learning_rate": 0.0003082666666666667, "loss": 1.0713, "step": 579 }, { "epoch": 0.03929838825791261, "grad_norm": 3.061610221862793, "learning_rate": 0.0003088, "loss": 1.0639, "step": 580 }, { "epoch": 0.0393661440997366, "grad_norm": 2.217522382736206, "learning_rate": 0.00030933333333333334, "loss": 0.7877, "step": 581 }, { "epoch": 0.03943389994156059, "grad_norm": 2.861865282058716, "learning_rate": 0.00030986666666666665, "loss": 0.9587, "step": 582 }, { "epoch": 0.039501655783384576, "grad_norm": 2.6877834796905518, "learning_rate": 0.0003104, "loss": 0.8095, "step": 583 }, { "epoch": 0.039569411625208564, "grad_norm": 2.6415841579437256, "learning_rate": 0.0003109333333333333, "loss": 0.829, "step": 584 }, { "epoch": 0.039637167467032546, "grad_norm": 3.292417049407959, "learning_rate": 0.0003114666666666667, "loss": 1.1176, "step": 585 }, { "epoch": 0.039704923308856534, "grad_norm": 3.163455009460449, "learning_rate": 0.00031200000000000005, "loss": 1.0602, "step": 586 }, { "epoch": 0.03977267915068052, "grad_norm": 2.049830675125122, "learning_rate": 0.00031253333333333336, "loss": 0.7819, "step": 587 }, { "epoch": 0.03984043499250451, "grad_norm": 3.6346240043640137, "learning_rate": 0.00031306666666666667, "loss": 0.9563, "step": 588 }, { "epoch": 0.0399081908343285, "grad_norm": 3.3403074741363525, "learning_rate": 0.00031360000000000003, "loss": 0.9137, "step": 589 }, { "epoch": 0.039975946676152486, "grad_norm": 3.304730176925659, "learning_rate": 0.00031413333333333334, "loss": 0.7727, "step": 590 }, { "epoch": 0.040043702517976475, "grad_norm": 2.6178839206695557, "learning_rate": 0.00031466666666666665, "loss": 0.8185, "step": 591 }, { "epoch": 0.040111458359800456, "grad_norm": 2.526662826538086, "learning_rate": 0.0003152, "loss": 0.9307, "step": 592 }, { "epoch": 0.040179214201624444, "grad_norm": 2.9365689754486084, "learning_rate": 0.00031573333333333333, "loss": 1.0056, "step": 593 }, { "epoch": 0.04024697004344843, "grad_norm": 30.42789649963379, "learning_rate": 0.0003162666666666667, "loss": 1.089, "step": 594 }, { "epoch": 0.04031472588527242, "grad_norm": 2.970416784286499, "learning_rate": 0.00031680000000000006, "loss": 0.9169, "step": 595 }, { "epoch": 0.04038248172709641, "grad_norm": 3.2144784927368164, "learning_rate": 0.00031733333333333337, "loss": 1.0329, "step": 596 }, { "epoch": 0.0404502375689204, "grad_norm": 3.707195281982422, "learning_rate": 0.0003178666666666667, "loss": 0.9338, "step": 597 }, { "epoch": 0.040517993410744385, "grad_norm": 3.0473079681396484, "learning_rate": 0.00031840000000000004, "loss": 0.9199, "step": 598 }, { "epoch": 0.04058574925256837, "grad_norm": 2.6992573738098145, "learning_rate": 0.00031893333333333335, "loss": 0.7692, "step": 599 }, { "epoch": 0.040653505094392355, "grad_norm": 3.3280906677246094, "learning_rate": 0.00031946666666666666, "loss": 1.0642, "step": 600 }, { "epoch": 0.04072126093621634, "grad_norm": 3.3712239265441895, "learning_rate": 0.00032, "loss": 1.0029, "step": 601 }, { "epoch": 0.04078901677804033, "grad_norm": 3.070742130279541, "learning_rate": 0.00032053333333333334, "loss": 0.9725, "step": 602 }, { "epoch": 0.04085677261986432, "grad_norm": 2.912259578704834, "learning_rate": 0.00032106666666666665, "loss": 1.0389, "step": 603 }, { "epoch": 0.04092452846168831, "grad_norm": 2.375462532043457, "learning_rate": 0.0003216, "loss": 0.8011, "step": 604 }, { "epoch": 0.040992284303512296, "grad_norm": 3.4855964183807373, "learning_rate": 0.0003221333333333334, "loss": 0.9889, "step": 605 }, { "epoch": 0.041060040145336284, "grad_norm": 2.7019526958465576, "learning_rate": 0.0003226666666666667, "loss": 0.818, "step": 606 }, { "epoch": 0.041127795987160265, "grad_norm": 10.730135917663574, "learning_rate": 0.00032320000000000005, "loss": 0.7862, "step": 607 }, { "epoch": 0.04119555182898425, "grad_norm": 2.9470598697662354, "learning_rate": 0.00032373333333333336, "loss": 0.841, "step": 608 }, { "epoch": 0.04126330767080824, "grad_norm": 3.629456043243408, "learning_rate": 0.00032426666666666667, "loss": 0.8935, "step": 609 }, { "epoch": 0.04133106351263223, "grad_norm": 2.7942190170288086, "learning_rate": 0.00032480000000000003, "loss": 0.8737, "step": 610 }, { "epoch": 0.04139881935445622, "grad_norm": 3.5281028747558594, "learning_rate": 0.00032533333333333334, "loss": 0.7731, "step": 611 }, { "epoch": 0.041466575196280206, "grad_norm": 2.6879961490631104, "learning_rate": 0.00032586666666666665, "loss": 0.7488, "step": 612 }, { "epoch": 0.041534331038104194, "grad_norm": 2.6562204360961914, "learning_rate": 0.0003264, "loss": 0.8302, "step": 613 }, { "epoch": 0.041602086879928175, "grad_norm": 2.436709403991699, "learning_rate": 0.0003269333333333334, "loss": 0.6787, "step": 614 }, { "epoch": 0.041669842721752164, "grad_norm": 3.0573251247406006, "learning_rate": 0.0003274666666666667, "loss": 0.897, "step": 615 }, { "epoch": 0.04173759856357615, "grad_norm": 3.079270839691162, "learning_rate": 0.000328, "loss": 0.7824, "step": 616 }, { "epoch": 0.04180535440540014, "grad_norm": 2.557379961013794, "learning_rate": 0.00032853333333333337, "loss": 0.9262, "step": 617 }, { "epoch": 0.04187311024722413, "grad_norm": 2.3489010334014893, "learning_rate": 0.0003290666666666667, "loss": 0.7791, "step": 618 }, { "epoch": 0.041940866089048116, "grad_norm": 3.7144384384155273, "learning_rate": 0.0003296, "loss": 1.1106, "step": 619 }, { "epoch": 0.042008621930872105, "grad_norm": 3.401601552963257, "learning_rate": 0.00033013333333333335, "loss": 0.8669, "step": 620 }, { "epoch": 0.04207637777269609, "grad_norm": 2.546100616455078, "learning_rate": 0.00033066666666666666, "loss": 0.8754, "step": 621 }, { "epoch": 0.042144133614520074, "grad_norm": 2.6193525791168213, "learning_rate": 0.0003312, "loss": 0.7717, "step": 622 }, { "epoch": 0.04221188945634406, "grad_norm": 4.521993160247803, "learning_rate": 0.00033173333333333334, "loss": 0.9104, "step": 623 }, { "epoch": 0.04227964529816805, "grad_norm": 2.9595706462860107, "learning_rate": 0.0003322666666666667, "loss": 0.957, "step": 624 }, { "epoch": 0.04234740113999204, "grad_norm": 4.649613857269287, "learning_rate": 0.0003328, "loss": 1.0525, "step": 625 }, { "epoch": 0.04241515698181603, "grad_norm": 3.1371946334838867, "learning_rate": 0.0003333333333333334, "loss": 0.7659, "step": 626 }, { "epoch": 0.042482912823640015, "grad_norm": 2.692748785018921, "learning_rate": 0.0003338666666666667, "loss": 0.8026, "step": 627 }, { "epoch": 0.042550668665464, "grad_norm": 2.395095109939575, "learning_rate": 0.0003344, "loss": 0.78, "step": 628 }, { "epoch": 0.042618424507287984, "grad_norm": 2.9649155139923096, "learning_rate": 0.00033493333333333336, "loss": 0.8616, "step": 629 }, { "epoch": 0.04268618034911197, "grad_norm": 2.880617618560791, "learning_rate": 0.00033546666666666667, "loss": 0.8346, "step": 630 }, { "epoch": 0.04275393619093596, "grad_norm": 3.5783746242523193, "learning_rate": 0.000336, "loss": 0.9194, "step": 631 }, { "epoch": 0.04282169203275995, "grad_norm": 3.871279239654541, "learning_rate": 0.00033653333333333335, "loss": 1.0733, "step": 632 }, { "epoch": 0.04288944787458394, "grad_norm": 4.584722995758057, "learning_rate": 0.0003370666666666667, "loss": 1.0333, "step": 633 }, { "epoch": 0.042957203716407925, "grad_norm": 2.96329402923584, "learning_rate": 0.0003376, "loss": 0.9121, "step": 634 }, { "epoch": 0.043024959558231914, "grad_norm": 3.54400634765625, "learning_rate": 0.0003381333333333334, "loss": 1.0999, "step": 635 }, { "epoch": 0.0430927154000559, "grad_norm": 3.631817579269409, "learning_rate": 0.0003386666666666667, "loss": 0.8852, "step": 636 }, { "epoch": 0.04316047124187988, "grad_norm": 2.812607765197754, "learning_rate": 0.0003392, "loss": 1.0069, "step": 637 }, { "epoch": 0.04322822708370387, "grad_norm": 2.5128774642944336, "learning_rate": 0.00033973333333333337, "loss": 0.8595, "step": 638 }, { "epoch": 0.04329598292552786, "grad_norm": 3.2381293773651123, "learning_rate": 0.0003402666666666667, "loss": 0.8818, "step": 639 }, { "epoch": 0.04336373876735185, "grad_norm": 3.136335611343384, "learning_rate": 0.0003408, "loss": 1.0274, "step": 640 }, { "epoch": 0.043431494609175836, "grad_norm": 3.9792425632476807, "learning_rate": 0.00034133333333333335, "loss": 1.0939, "step": 641 }, { "epoch": 0.043499250450999824, "grad_norm": 3.2825241088867188, "learning_rate": 0.00034186666666666666, "loss": 1.1231, "step": 642 }, { "epoch": 0.04356700629282381, "grad_norm": 3.5767383575439453, "learning_rate": 0.00034240000000000003, "loss": 0.9208, "step": 643 }, { "epoch": 0.04363476213464779, "grad_norm": 3.1160783767700195, "learning_rate": 0.00034293333333333334, "loss": 1.0849, "step": 644 }, { "epoch": 0.04370251797647178, "grad_norm": 3.11011004447937, "learning_rate": 0.0003434666666666667, "loss": 0.9525, "step": 645 }, { "epoch": 0.04377027381829577, "grad_norm": 3.019231081008911, "learning_rate": 0.000344, "loss": 0.8811, "step": 646 }, { "epoch": 0.04383802966011976, "grad_norm": 4.965334892272949, "learning_rate": 0.0003445333333333333, "loss": 1.012, "step": 647 }, { "epoch": 0.043905785501943746, "grad_norm": 3.4868407249450684, "learning_rate": 0.0003450666666666667, "loss": 0.9993, "step": 648 }, { "epoch": 0.043973541343767734, "grad_norm": 3.9563772678375244, "learning_rate": 0.0003456, "loss": 1.0098, "step": 649 }, { "epoch": 0.04404129718559172, "grad_norm": 2.7158477306365967, "learning_rate": 0.0003461333333333333, "loss": 0.7081, "step": 650 }, { "epoch": 0.04410905302741571, "grad_norm": 4.0581512451171875, "learning_rate": 0.00034666666666666667, "loss": 1.1142, "step": 651 }, { "epoch": 0.04417680886923969, "grad_norm": 2.7889671325683594, "learning_rate": 0.00034720000000000004, "loss": 0.7659, "step": 652 }, { "epoch": 0.04424456471106368, "grad_norm": 3.3274965286254883, "learning_rate": 0.00034773333333333335, "loss": 0.9495, "step": 653 }, { "epoch": 0.04431232055288767, "grad_norm": 3.3760342597961426, "learning_rate": 0.0003482666666666667, "loss": 0.9421, "step": 654 }, { "epoch": 0.04438007639471166, "grad_norm": 2.580782651901245, "learning_rate": 0.0003488, "loss": 0.7386, "step": 655 }, { "epoch": 0.044447832236535645, "grad_norm": 2.6320087909698486, "learning_rate": 0.00034933333333333333, "loss": 0.838, "step": 656 }, { "epoch": 0.04451558807835963, "grad_norm": 3.1689820289611816, "learning_rate": 0.0003498666666666667, "loss": 0.9038, "step": 657 }, { "epoch": 0.04458334392018362, "grad_norm": 3.744007110595703, "learning_rate": 0.0003504, "loss": 1.0123, "step": 658 }, { "epoch": 0.0446510997620076, "grad_norm": 3.1133198738098145, "learning_rate": 0.0003509333333333333, "loss": 0.8086, "step": 659 }, { "epoch": 0.04471885560383159, "grad_norm": 3.892974376678467, "learning_rate": 0.0003514666666666667, "loss": 1.0866, "step": 660 }, { "epoch": 0.04478661144565558, "grad_norm": 3.008099317550659, "learning_rate": 0.00035200000000000005, "loss": 0.9656, "step": 661 }, { "epoch": 0.04485436728747957, "grad_norm": 2.807969808578491, "learning_rate": 0.00035253333333333336, "loss": 0.7714, "step": 662 }, { "epoch": 0.044922123129303555, "grad_norm": 3.3305890560150146, "learning_rate": 0.0003530666666666667, "loss": 1.0984, "step": 663 }, { "epoch": 0.044989878971127543, "grad_norm": 3.3619790077209473, "learning_rate": 0.00035360000000000003, "loss": 0.9861, "step": 664 }, { "epoch": 0.04505763481295153, "grad_norm": 3.570796251296997, "learning_rate": 0.00035413333333333334, "loss": 1.1391, "step": 665 }, { "epoch": 0.04512539065477552, "grad_norm": 3.096438407897949, "learning_rate": 0.0003546666666666667, "loss": 1.2258, "step": 666 }, { "epoch": 0.0451931464965995, "grad_norm": 2.5740082263946533, "learning_rate": 0.0003552, "loss": 0.8846, "step": 667 }, { "epoch": 0.04526090233842349, "grad_norm": 3.066537380218506, "learning_rate": 0.0003557333333333333, "loss": 0.855, "step": 668 }, { "epoch": 0.04532865818024748, "grad_norm": 3.297196388244629, "learning_rate": 0.0003562666666666667, "loss": 1.1191, "step": 669 }, { "epoch": 0.045396414022071466, "grad_norm": 3.025925874710083, "learning_rate": 0.0003568, "loss": 0.8708, "step": 670 }, { "epoch": 0.045464169863895454, "grad_norm": 2.9280247688293457, "learning_rate": 0.00035733333333333336, "loss": 0.965, "step": 671 }, { "epoch": 0.04553192570571944, "grad_norm": 3.341310739517212, "learning_rate": 0.00035786666666666673, "loss": 1.0948, "step": 672 }, { "epoch": 0.04559968154754343, "grad_norm": 3.2387642860412598, "learning_rate": 0.00035840000000000004, "loss": 0.9062, "step": 673 }, { "epoch": 0.04566743738936741, "grad_norm": 3.0945987701416016, "learning_rate": 0.00035893333333333335, "loss": 0.9888, "step": 674 }, { "epoch": 0.0457351932311914, "grad_norm": 3.4746453762054443, "learning_rate": 0.00035946666666666666, "loss": 0.9801, "step": 675 }, { "epoch": 0.04580294907301539, "grad_norm": 3.2266781330108643, "learning_rate": 0.00036, "loss": 0.9089, "step": 676 }, { "epoch": 0.045870704914839376, "grad_norm": 3.4057743549346924, "learning_rate": 0.00036053333333333333, "loss": 0.9321, "step": 677 }, { "epoch": 0.045938460756663364, "grad_norm": 5.1262617111206055, "learning_rate": 0.00036106666666666664, "loss": 1.0046, "step": 678 }, { "epoch": 0.04600621659848735, "grad_norm": 3.4456918239593506, "learning_rate": 0.0003616, "loss": 0.9374, "step": 679 }, { "epoch": 0.04607397244031134, "grad_norm": 4.373721122741699, "learning_rate": 0.00036213333333333337, "loss": 1.0569, "step": 680 }, { "epoch": 0.04614172828213532, "grad_norm": 4.362050533294678, "learning_rate": 0.0003626666666666667, "loss": 0.9526, "step": 681 }, { "epoch": 0.04620948412395931, "grad_norm": 3.5073964595794678, "learning_rate": 0.00036320000000000005, "loss": 1.0211, "step": 682 }, { "epoch": 0.0462772399657833, "grad_norm": 3.259704351425171, "learning_rate": 0.00036373333333333336, "loss": 0.7831, "step": 683 }, { "epoch": 0.04634499580760729, "grad_norm": 2.681959867477417, "learning_rate": 0.00036426666666666667, "loss": 0.8421, "step": 684 }, { "epoch": 0.046412751649431275, "grad_norm": 4.008633136749268, "learning_rate": 0.00036480000000000003, "loss": 1.0095, "step": 685 }, { "epoch": 0.04648050749125526, "grad_norm": 3.215111494064331, "learning_rate": 0.00036533333333333334, "loss": 0.7915, "step": 686 }, { "epoch": 0.04654826333307925, "grad_norm": 3.1472980976104736, "learning_rate": 0.00036586666666666665, "loss": 0.8681, "step": 687 }, { "epoch": 0.04661601917490324, "grad_norm": 3.7159383296966553, "learning_rate": 0.0003664, "loss": 1.2919, "step": 688 }, { "epoch": 0.04668377501672722, "grad_norm": 2.898164749145508, "learning_rate": 0.0003669333333333333, "loss": 0.6529, "step": 689 }, { "epoch": 0.04675153085855121, "grad_norm": 3.1191375255584717, "learning_rate": 0.0003674666666666667, "loss": 1.119, "step": 690 }, { "epoch": 0.0468192867003752, "grad_norm": 3.1812002658843994, "learning_rate": 0.00036800000000000005, "loss": 1.1914, "step": 691 }, { "epoch": 0.046887042542199185, "grad_norm": 4.197123050689697, "learning_rate": 0.00036853333333333336, "loss": 1.2257, "step": 692 }, { "epoch": 0.04695479838402317, "grad_norm": 2.880222797393799, "learning_rate": 0.0003690666666666667, "loss": 0.7928, "step": 693 }, { "epoch": 0.04702255422584716, "grad_norm": 3.1386241912841797, "learning_rate": 0.00036960000000000004, "loss": 0.9633, "step": 694 }, { "epoch": 0.04709031006767115, "grad_norm": 2.7785093784332275, "learning_rate": 0.00037013333333333335, "loss": 0.7293, "step": 695 }, { "epoch": 0.04715806590949513, "grad_norm": 3.5515894889831543, "learning_rate": 0.00037066666666666666, "loss": 0.8559, "step": 696 }, { "epoch": 0.04722582175131912, "grad_norm": 3.512742519378662, "learning_rate": 0.0003712, "loss": 0.8807, "step": 697 }, { "epoch": 0.04729357759314311, "grad_norm": 3.525923728942871, "learning_rate": 0.00037173333333333333, "loss": 0.8417, "step": 698 }, { "epoch": 0.047361333434967096, "grad_norm": 4.317751884460449, "learning_rate": 0.0003722666666666667, "loss": 1.134, "step": 699 }, { "epoch": 0.047429089276791084, "grad_norm": 3.909864664077759, "learning_rate": 0.00037280000000000006, "loss": 1.0501, "step": 700 }, { "epoch": 0.04749684511861507, "grad_norm": 3.746986150741577, "learning_rate": 0.0003733333333333334, "loss": 0.8394, "step": 701 }, { "epoch": 0.04756460096043906, "grad_norm": 2.4710848331451416, "learning_rate": 0.0003738666666666667, "loss": 0.7526, "step": 702 }, { "epoch": 0.04763235680226305, "grad_norm": 4.529815673828125, "learning_rate": 0.00037440000000000005, "loss": 1.1123, "step": 703 }, { "epoch": 0.04770011264408703, "grad_norm": 3.030824899673462, "learning_rate": 0.00037493333333333336, "loss": 0.8696, "step": 704 }, { "epoch": 0.04776786848591102, "grad_norm": 5.741715431213379, "learning_rate": 0.00037546666666666667, "loss": 0.7314, "step": 705 }, { "epoch": 0.047835624327735006, "grad_norm": 3.3223133087158203, "learning_rate": 0.000376, "loss": 0.8842, "step": 706 }, { "epoch": 0.047903380169558994, "grad_norm": 3.0566413402557373, "learning_rate": 0.00037653333333333334, "loss": 0.7958, "step": 707 }, { "epoch": 0.04797113601138298, "grad_norm": 6.268533706665039, "learning_rate": 0.00037706666666666665, "loss": 0.9317, "step": 708 }, { "epoch": 0.04803889185320697, "grad_norm": 3.068249464035034, "learning_rate": 0.0003776, "loss": 0.7673, "step": 709 }, { "epoch": 0.04810664769503096, "grad_norm": 2.9452309608459473, "learning_rate": 0.0003781333333333334, "loss": 0.8689, "step": 710 }, { "epoch": 0.04817440353685494, "grad_norm": 2.7499923706054688, "learning_rate": 0.0003786666666666667, "loss": 0.8073, "step": 711 }, { "epoch": 0.04824215937867893, "grad_norm": 3.541820764541626, "learning_rate": 0.0003792, "loss": 1.0404, "step": 712 }, { "epoch": 0.048309915220502916, "grad_norm": 2.7944793701171875, "learning_rate": 0.00037973333333333337, "loss": 0.8429, "step": 713 }, { "epoch": 0.048377671062326905, "grad_norm": 3.4795570373535156, "learning_rate": 0.0003802666666666667, "loss": 1.1089, "step": 714 }, { "epoch": 0.04844542690415089, "grad_norm": 4.470477104187012, "learning_rate": 0.0003808, "loss": 1.1987, "step": 715 }, { "epoch": 0.04851318274597488, "grad_norm": 3.7686710357666016, "learning_rate": 0.00038133333333333335, "loss": 0.9905, "step": 716 }, { "epoch": 0.04858093858779887, "grad_norm": 3.4814260005950928, "learning_rate": 0.00038186666666666666, "loss": 1.0501, "step": 717 }, { "epoch": 0.04864869442962286, "grad_norm": 2.9690535068511963, "learning_rate": 0.0003824, "loss": 0.8655, "step": 718 }, { "epoch": 0.04871645027144684, "grad_norm": 3.0018696784973145, "learning_rate": 0.0003829333333333334, "loss": 0.9617, "step": 719 }, { "epoch": 0.04878420611327083, "grad_norm": 2.861053466796875, "learning_rate": 0.0003834666666666667, "loss": 0.8903, "step": 720 }, { "epoch": 0.048851961955094815, "grad_norm": 3.7595620155334473, "learning_rate": 0.000384, "loss": 0.9875, "step": 721 }, { "epoch": 0.0489197177969188, "grad_norm": 2.907060146331787, "learning_rate": 0.0003845333333333334, "loss": 0.7723, "step": 722 }, { "epoch": 0.04898747363874279, "grad_norm": 4.519503593444824, "learning_rate": 0.0003850666666666667, "loss": 0.8526, "step": 723 }, { "epoch": 0.04905522948056678, "grad_norm": 3.334078788757324, "learning_rate": 0.0003856, "loss": 0.9351, "step": 724 }, { "epoch": 0.04912298532239077, "grad_norm": 3.5368268489837646, "learning_rate": 0.00038613333333333336, "loss": 0.8089, "step": 725 }, { "epoch": 0.04919074116421475, "grad_norm": 4.235016345977783, "learning_rate": 0.00038666666666666667, "loss": 0.8744, "step": 726 }, { "epoch": 0.04925849700603874, "grad_norm": 2.5189549922943115, "learning_rate": 0.00038720000000000003, "loss": 0.6836, "step": 727 }, { "epoch": 0.049326252847862725, "grad_norm": 3.3870596885681152, "learning_rate": 0.0003877333333333334, "loss": 0.9274, "step": 728 }, { "epoch": 0.049394008689686714, "grad_norm": 4.001610279083252, "learning_rate": 0.0003882666666666667, "loss": 1.0687, "step": 729 }, { "epoch": 0.0494617645315107, "grad_norm": 3.3768110275268555, "learning_rate": 0.0003888, "loss": 0.9495, "step": 730 }, { "epoch": 0.04952952037333469, "grad_norm": 2.8878121376037598, "learning_rate": 0.0003893333333333334, "loss": 0.8092, "step": 731 }, { "epoch": 0.04959727621515868, "grad_norm": 3.6729800701141357, "learning_rate": 0.0003898666666666667, "loss": 0.9209, "step": 732 }, { "epoch": 0.049665032056982666, "grad_norm": 2.899528980255127, "learning_rate": 0.0003904, "loss": 0.896, "step": 733 }, { "epoch": 0.04973278789880665, "grad_norm": 2.9886481761932373, "learning_rate": 0.00039093333333333337, "loss": 0.9277, "step": 734 }, { "epoch": 0.049800543740630636, "grad_norm": 3.924001693725586, "learning_rate": 0.0003914666666666667, "loss": 1.131, "step": 735 }, { "epoch": 0.049868299582454624, "grad_norm": 2.929126262664795, "learning_rate": 0.000392, "loss": 0.8613, "step": 736 }, { "epoch": 0.04993605542427861, "grad_norm": 3.5196645259857178, "learning_rate": 0.00039253333333333335, "loss": 1.0517, "step": 737 }, { "epoch": 0.0500038112661026, "grad_norm": 5.8890485763549805, "learning_rate": 0.0003930666666666667, "loss": 0.8777, "step": 738 }, { "epoch": 0.05007156710792659, "grad_norm": 11.812308311462402, "learning_rate": 0.0003936, "loss": 3.0905, "step": 739 }, { "epoch": 0.05013932294975058, "grad_norm": 17.958906173706055, "learning_rate": 0.00039413333333333334, "loss": 2.6268, "step": 740 }, { "epoch": 0.05020707879157456, "grad_norm": 600.3106079101562, "learning_rate": 0.0003946666666666667, "loss": 7.0879, "step": 741 }, { "epoch": 0.050274834633398546, "grad_norm": 84.66808319091797, "learning_rate": 0.0003952, "loss": 12.546, "step": 742 }, { "epoch": 0.050342590475222534, "grad_norm": 16.59645652770996, "learning_rate": 0.0003957333333333333, "loss": 10.5624, "step": 743 }, { "epoch": 0.05041034631704652, "grad_norm": 8.208555221557617, "learning_rate": 0.0003962666666666667, "loss": 10.2705, "step": 744 }, { "epoch": 0.05047810215887051, "grad_norm": 6.307867527008057, "learning_rate": 0.0003968, "loss": 9.8993, "step": 745 }, { "epoch": 0.0505458580006945, "grad_norm": 8.820687294006348, "learning_rate": 0.00039733333333333336, "loss": 9.9412, "step": 746 }, { "epoch": 0.05061361384251849, "grad_norm": 2.205470085144043, "learning_rate": 0.0003978666666666667, "loss": 9.9789, "step": 747 }, { "epoch": 0.05068136968434247, "grad_norm": 4.151752471923828, "learning_rate": 0.00039840000000000003, "loss": 9.6283, "step": 748 }, { "epoch": 0.05074912552616646, "grad_norm": 3.4692091941833496, "learning_rate": 0.00039893333333333334, "loss": 9.2068, "step": 749 }, { "epoch": 0.050816881367990445, "grad_norm": 6.643427848815918, "learning_rate": 0.0003994666666666667, "loss": 9.306, "step": 750 }, { "epoch": 0.05088463720981443, "grad_norm": 2.035916805267334, "learning_rate": 0.0004, "loss": 9.2433, "step": 751 }, { "epoch": 0.05095239305163842, "grad_norm": 3.0338926315307617, "learning_rate": 0.0003999945235487404, "loss": 8.8081, "step": 752 }, { "epoch": 0.05102014889346241, "grad_norm": 4.1774983406066895, "learning_rate": 0.0003999890470974808, "loss": 7.834, "step": 753 }, { "epoch": 0.0510879047352864, "grad_norm": 10.138450622558594, "learning_rate": 0.00039998357064622127, "loss": 8.4914, "step": 754 }, { "epoch": 0.051155660577110386, "grad_norm": 6.8651442527771, "learning_rate": 0.0003999780941949617, "loss": 8.3133, "step": 755 }, { "epoch": 0.05122341641893437, "grad_norm": 6.002278804779053, "learning_rate": 0.0003999726177437021, "loss": 7.6195, "step": 756 }, { "epoch": 0.051291172260758355, "grad_norm": 8.576353073120117, "learning_rate": 0.0003999671412924425, "loss": 6.9921, "step": 757 }, { "epoch": 0.051358928102582344, "grad_norm": 4.290818214416504, "learning_rate": 0.0003999616648411829, "loss": 6.7199, "step": 758 }, { "epoch": 0.05142668394440633, "grad_norm": 4.6634602546691895, "learning_rate": 0.0003999561883899233, "loss": 7.2129, "step": 759 }, { "epoch": 0.05149443978623032, "grad_norm": 2.2393553256988525, "learning_rate": 0.0003999507119386638, "loss": 6.3264, "step": 760 }, { "epoch": 0.05156219562805431, "grad_norm": 8.81263256072998, "learning_rate": 0.0003999452354874042, "loss": 6.212, "step": 761 }, { "epoch": 0.051629951469878296, "grad_norm": 4.736633777618408, "learning_rate": 0.0003999397590361446, "loss": 6.3428, "step": 762 }, { "epoch": 0.05169770731170228, "grad_norm": 4.971992015838623, "learning_rate": 0.000399934282584885, "loss": 5.8014, "step": 763 }, { "epoch": 0.051765463153526266, "grad_norm": 38.19525909423828, "learning_rate": 0.0003999288061336254, "loss": 6.3546, "step": 764 }, { "epoch": 0.051833218995350254, "grad_norm": 23.310277938842773, "learning_rate": 0.0003999233296823659, "loss": 6.5051, "step": 765 }, { "epoch": 0.05190097483717424, "grad_norm": 13.49642562866211, "learning_rate": 0.0003999178532311063, "loss": 5.7665, "step": 766 }, { "epoch": 0.05196873067899823, "grad_norm": 2.8793632984161377, "learning_rate": 0.0003999123767798467, "loss": 6.0075, "step": 767 }, { "epoch": 0.05203648652082222, "grad_norm": 3.381035327911377, "learning_rate": 0.0003999069003285871, "loss": 5.3249, "step": 768 }, { "epoch": 0.05210424236264621, "grad_norm": 6.088931560516357, "learning_rate": 0.0003999014238773275, "loss": 5.4656, "step": 769 }, { "epoch": 0.052171998204470195, "grad_norm": 3.5593619346618652, "learning_rate": 0.00039989594742606793, "loss": 5.5078, "step": 770 }, { "epoch": 0.052239754046294176, "grad_norm": 2.1775152683258057, "learning_rate": 0.0003998904709748084, "loss": 5.5093, "step": 771 }, { "epoch": 0.052307509888118164, "grad_norm": 7.67357063293457, "learning_rate": 0.0003998849945235488, "loss": 5.1376, "step": 772 }, { "epoch": 0.05237526572994215, "grad_norm": 3.736935615539551, "learning_rate": 0.0003998795180722892, "loss": 5.2221, "step": 773 }, { "epoch": 0.05244302157176614, "grad_norm": 1.1449586153030396, "learning_rate": 0.0003998740416210296, "loss": 5.4248, "step": 774 }, { "epoch": 0.05251077741359013, "grad_norm": 1.7828917503356934, "learning_rate": 0.00039986856516977, "loss": 4.9743, "step": 775 }, { "epoch": 0.05257853325541412, "grad_norm": 1.9232449531555176, "learning_rate": 0.00039986308871851043, "loss": 4.9526, "step": 776 }, { "epoch": 0.052646289097238105, "grad_norm": 0.8622642159461975, "learning_rate": 0.00039985761226725083, "loss": 5.24, "step": 777 }, { "epoch": 0.05271404493906209, "grad_norm": 1.9289792776107788, "learning_rate": 0.00039985213581599123, "loss": 4.9224, "step": 778 }, { "epoch": 0.052781800780886075, "grad_norm": 1.022391438484192, "learning_rate": 0.0003998466593647317, "loss": 4.7525, "step": 779 }, { "epoch": 0.05284955662271006, "grad_norm": 1.4984608888626099, "learning_rate": 0.0003998411829134721, "loss": 4.667, "step": 780 }, { "epoch": 0.05291731246453405, "grad_norm": 4.3209662437438965, "learning_rate": 0.00039983570646221254, "loss": 4.692, "step": 781 }, { "epoch": 0.05298506830635804, "grad_norm": 1.600207805633545, "learning_rate": 0.00039983023001095294, "loss": 4.6204, "step": 782 }, { "epoch": 0.05305282414818203, "grad_norm": 2.554429531097412, "learning_rate": 0.00039982475355969334, "loss": 4.5914, "step": 783 }, { "epoch": 0.053120579990006016, "grad_norm": 2.0773489475250244, "learning_rate": 0.00039981927710843374, "loss": 4.6497, "step": 784 }, { "epoch": 0.053188335831830004, "grad_norm": 2.5732383728027344, "learning_rate": 0.00039981380065717413, "loss": 4.8684, "step": 785 }, { "epoch": 0.053256091673653985, "grad_norm": 1.5930901765823364, "learning_rate": 0.0003998083242059146, "loss": 4.7589, "step": 786 }, { "epoch": 0.05332384751547797, "grad_norm": 1.8856751918792725, "learning_rate": 0.00039980284775465504, "loss": 4.7605, "step": 787 }, { "epoch": 0.05339160335730196, "grad_norm": 19.68305778503418, "learning_rate": 0.00039979737130339544, "loss": 4.9804, "step": 788 }, { "epoch": 0.05345935919912595, "grad_norm": 2.6940083503723145, "learning_rate": 0.00039979189485213584, "loss": 4.4454, "step": 789 }, { "epoch": 0.05352711504094994, "grad_norm": 45.97073745727539, "learning_rate": 0.00039978641840087624, "loss": 4.7939, "step": 790 }, { "epoch": 0.053594870882773926, "grad_norm": 10.410804748535156, "learning_rate": 0.00039978094194961664, "loss": 4.5857, "step": 791 }, { "epoch": 0.053662626724597914, "grad_norm": 23.36691665649414, "learning_rate": 0.0003997754654983571, "loss": 4.3369, "step": 792 }, { "epoch": 0.053730382566421896, "grad_norm": 1.963236927986145, "learning_rate": 0.0003997699890470975, "loss": 4.6738, "step": 793 }, { "epoch": 0.053798138408245884, "grad_norm": 1.1950116157531738, "learning_rate": 0.00039976451259583794, "loss": 4.8778, "step": 794 }, { "epoch": 0.05386589425006987, "grad_norm": 81.23480987548828, "learning_rate": 0.00039975903614457834, "loss": 4.7837, "step": 795 }, { "epoch": 0.05393365009189386, "grad_norm": 5.290766716003418, "learning_rate": 0.00039975355969331874, "loss": 4.506, "step": 796 }, { "epoch": 0.05400140593371785, "grad_norm": 3.2260262966156006, "learning_rate": 0.00039974808324205914, "loss": 4.3442, "step": 797 }, { "epoch": 0.05406916177554184, "grad_norm": 1.7140204906463623, "learning_rate": 0.0003997426067907996, "loss": 4.2865, "step": 798 }, { "epoch": 0.054136917617365825, "grad_norm": 3.7086181640625, "learning_rate": 0.00039973713033954, "loss": 4.4939, "step": 799 }, { "epoch": 0.054204673459189806, "grad_norm": 2.6034536361694336, "learning_rate": 0.0003997316538882804, "loss": 4.5271, "step": 800 }, { "epoch": 0.054272429301013794, "grad_norm": 1.5339417457580566, "learning_rate": 0.0003997261774370208, "loss": 4.5487, "step": 801 }, { "epoch": 0.05434018514283778, "grad_norm": 0.9662337303161621, "learning_rate": 0.00039972070098576125, "loss": 4.4613, "step": 802 }, { "epoch": 0.05440794098466177, "grad_norm": 4.066468715667725, "learning_rate": 0.0003997152245345017, "loss": 4.5914, "step": 803 }, { "epoch": 0.05447569682648576, "grad_norm": 1.0601027011871338, "learning_rate": 0.0003997097480832421, "loss": 4.7598, "step": 804 }, { "epoch": 0.05454345266830975, "grad_norm": 0.9377830624580383, "learning_rate": 0.0003997042716319825, "loss": 4.8, "step": 805 }, { "epoch": 0.054611208510133735, "grad_norm": 1.1871048212051392, "learning_rate": 0.0003996987951807229, "loss": 4.7096, "step": 806 }, { "epoch": 0.05467896435195772, "grad_norm": 21.85776138305664, "learning_rate": 0.0003996933187294633, "loss": 5.3304, "step": 807 }, { "epoch": 0.054746720193781705, "grad_norm": 1.4815789461135864, "learning_rate": 0.00039968784227820375, "loss": 4.552, "step": 808 }, { "epoch": 0.05481447603560569, "grad_norm": 2.6075069904327393, "learning_rate": 0.00039968236582694415, "loss": 4.4726, "step": 809 }, { "epoch": 0.05488223187742968, "grad_norm": 2.40385365486145, "learning_rate": 0.0003996768893756846, "loss": 4.5902, "step": 810 }, { "epoch": 0.05494998771925367, "grad_norm": 0.9601931571960449, "learning_rate": 0.000399671412924425, "loss": 4.6827, "step": 811 }, { "epoch": 0.05501774356107766, "grad_norm": 2.3693535327911377, "learning_rate": 0.0003996659364731654, "loss": 4.5815, "step": 812 }, { "epoch": 0.055085499402901646, "grad_norm": 11.339011192321777, "learning_rate": 0.0003996604600219058, "loss": 4.8419, "step": 813 }, { "epoch": 0.055153255244725634, "grad_norm": 0.9337912201881409, "learning_rate": 0.00039965498357064625, "loss": 4.3496, "step": 814 }, { "epoch": 0.055221011086549615, "grad_norm": 6.3379950523376465, "learning_rate": 0.00039964950711938665, "loss": 4.911, "step": 815 }, { "epoch": 0.0552887669283736, "grad_norm": 1.7116317749023438, "learning_rate": 0.00039964403066812705, "loss": 4.9699, "step": 816 }, { "epoch": 0.05535652277019759, "grad_norm": 2.0963220596313477, "learning_rate": 0.0003996385542168675, "loss": 4.507, "step": 817 }, { "epoch": 0.05542427861202158, "grad_norm": 4.710665702819824, "learning_rate": 0.0003996330777656079, "loss": 4.5403, "step": 818 }, { "epoch": 0.05549203445384557, "grad_norm": 6.667975425720215, "learning_rate": 0.00039962760131434836, "loss": 4.4518, "step": 819 }, { "epoch": 0.055559790295669556, "grad_norm": 2.572284698486328, "learning_rate": 0.00039962212486308876, "loss": 5.0368, "step": 820 }, { "epoch": 0.055627546137493544, "grad_norm": 1.625669002532959, "learning_rate": 0.00039961664841182916, "loss": 4.6799, "step": 821 }, { "epoch": 0.05569530197931753, "grad_norm": 2.4382598400115967, "learning_rate": 0.00039961117196056956, "loss": 4.568, "step": 822 }, { "epoch": 0.055763057821141514, "grad_norm": 1.5718884468078613, "learning_rate": 0.00039960569550930996, "loss": 4.4828, "step": 823 }, { "epoch": 0.0558308136629655, "grad_norm": 1.398815631866455, "learning_rate": 0.0003996002190580504, "loss": 4.7335, "step": 824 }, { "epoch": 0.05589856950478949, "grad_norm": 2.414658784866333, "learning_rate": 0.00039959474260679086, "loss": 4.8948, "step": 825 }, { "epoch": 0.05596632534661348, "grad_norm": 0.6470986008644104, "learning_rate": 0.00039958926615553126, "loss": 4.7307, "step": 826 }, { "epoch": 0.056034081188437466, "grad_norm": 0.9816928505897522, "learning_rate": 0.00039958378970427166, "loss": 4.6925, "step": 827 }, { "epoch": 0.056101837030261455, "grad_norm": 0.7028578519821167, "learning_rate": 0.00039957831325301206, "loss": 3.8737, "step": 828 }, { "epoch": 0.05616959287208544, "grad_norm": 1.4886821508407593, "learning_rate": 0.00039957283680175246, "loss": 4.2847, "step": 829 }, { "epoch": 0.056237348713909424, "grad_norm": 0.9010695815086365, "learning_rate": 0.0003995673603504929, "loss": 4.5775, "step": 830 }, { "epoch": 0.05630510455573341, "grad_norm": 1.010182499885559, "learning_rate": 0.0003995618838992333, "loss": 4.201, "step": 831 }, { "epoch": 0.0563728603975574, "grad_norm": 0.8768413662910461, "learning_rate": 0.0003995564074479737, "loss": 5.0419, "step": 832 }, { "epoch": 0.05644061623938139, "grad_norm": 0.5664284229278564, "learning_rate": 0.00039955093099671416, "loss": 4.8007, "step": 833 }, { "epoch": 0.05650837208120538, "grad_norm": 0.9429548382759094, "learning_rate": 0.00039954545454545456, "loss": 4.2521, "step": 834 }, { "epoch": 0.056576127923029365, "grad_norm": 1.328769564628601, "learning_rate": 0.00039953997809419496, "loss": 4.9852, "step": 835 }, { "epoch": 0.05664388376485335, "grad_norm": 0.9594823122024536, "learning_rate": 0.0003995345016429354, "loss": 4.6721, "step": 836 }, { "epoch": 0.05671163960667734, "grad_norm": 1.1371556520462036, "learning_rate": 0.0003995290251916758, "loss": 4.6596, "step": 837 }, { "epoch": 0.05677939544850132, "grad_norm": 0.9706981778144836, "learning_rate": 0.0003995235487404162, "loss": 4.8398, "step": 838 }, { "epoch": 0.05684715129032531, "grad_norm": 3.1281120777130127, "learning_rate": 0.0003995180722891566, "loss": 4.2573, "step": 839 }, { "epoch": 0.0569149071321493, "grad_norm": 3.9823930263519287, "learning_rate": 0.00039951259583789707, "loss": 4.5177, "step": 840 }, { "epoch": 0.05698266297397329, "grad_norm": 3.1667728424072266, "learning_rate": 0.0003995071193866375, "loss": 4.362, "step": 841 }, { "epoch": 0.057050418815797276, "grad_norm": 1.2826701402664185, "learning_rate": 0.0003995016429353779, "loss": 4.4142, "step": 842 }, { "epoch": 0.057118174657621264, "grad_norm": 1.3967714309692383, "learning_rate": 0.0003994961664841183, "loss": 4.6293, "step": 843 }, { "epoch": 0.05718593049944525, "grad_norm": 1.2823281288146973, "learning_rate": 0.0003994906900328587, "loss": 4.5802, "step": 844 }, { "epoch": 0.05725368634126923, "grad_norm": 1.9164766073226929, "learning_rate": 0.0003994852135815991, "loss": 4.5611, "step": 845 }, { "epoch": 0.05732144218309322, "grad_norm": 3.7186031341552734, "learning_rate": 0.00039947973713033957, "loss": 4.6504, "step": 846 }, { "epoch": 0.05738919802491721, "grad_norm": 0.5601058006286621, "learning_rate": 0.00039947426067907997, "loss": 4.3909, "step": 847 }, { "epoch": 0.0574569538667412, "grad_norm": 1.2436442375183105, "learning_rate": 0.00039946878422782037, "loss": 4.4965, "step": 848 }, { "epoch": 0.057524709708565186, "grad_norm": 0.584384024143219, "learning_rate": 0.0003994633077765608, "loss": 4.5081, "step": 849 }, { "epoch": 0.057592465550389174, "grad_norm": 0.7431687116622925, "learning_rate": 0.0003994578313253012, "loss": 4.5399, "step": 850 }, { "epoch": 0.05766022139221316, "grad_norm": 1.0575697422027588, "learning_rate": 0.0003994523548740416, "loss": 4.4467, "step": 851 }, { "epoch": 0.05772797723403715, "grad_norm": 1.2355396747589111, "learning_rate": 0.0003994468784227821, "loss": 4.1727, "step": 852 }, { "epoch": 0.05779573307586113, "grad_norm": 0.7289327383041382, "learning_rate": 0.0003994414019715225, "loss": 4.4649, "step": 853 }, { "epoch": 0.05786348891768512, "grad_norm": 0.6486912369728088, "learning_rate": 0.0003994359255202629, "loss": 4.4857, "step": 854 }, { "epoch": 0.05793124475950911, "grad_norm": 3.4088003635406494, "learning_rate": 0.00039943044906900327, "loss": 4.3327, "step": 855 }, { "epoch": 0.057999000601333096, "grad_norm": 0.862855851650238, "learning_rate": 0.0003994249726177437, "loss": 4.3531, "step": 856 }, { "epoch": 0.058066756443157085, "grad_norm": 1.7253564596176147, "learning_rate": 0.0003994194961664842, "loss": 4.5792, "step": 857 }, { "epoch": 0.05813451228498107, "grad_norm": 1.3579140901565552, "learning_rate": 0.0003994140197152246, "loss": 4.6806, "step": 858 }, { "epoch": 0.05820226812680506, "grad_norm": 0.5810004472732544, "learning_rate": 0.000399408543263965, "loss": 4.5432, "step": 859 }, { "epoch": 0.05827002396862904, "grad_norm": 1.7456716299057007, "learning_rate": 0.0003994030668127054, "loss": 4.9803, "step": 860 }, { "epoch": 0.05833777981045303, "grad_norm": 3.1904361248016357, "learning_rate": 0.0003993975903614458, "loss": 4.3816, "step": 861 }, { "epoch": 0.05840553565227702, "grad_norm": 0.6206144094467163, "learning_rate": 0.00039939211391018623, "loss": 4.3105, "step": 862 }, { "epoch": 0.05847329149410101, "grad_norm": 1.7596697807312012, "learning_rate": 0.00039938663745892663, "loss": 4.621, "step": 863 }, { "epoch": 0.058541047335924995, "grad_norm": 2.8368983268737793, "learning_rate": 0.0003993811610076671, "loss": 4.152, "step": 864 }, { "epoch": 0.05860880317774898, "grad_norm": 1.091814398765564, "learning_rate": 0.0003993756845564075, "loss": 4.5723, "step": 865 }, { "epoch": 0.05867655901957297, "grad_norm": 1.0570927858352661, "learning_rate": 0.0003993702081051479, "loss": 4.1766, "step": 866 }, { "epoch": 0.05874431486139695, "grad_norm": 3.649738073348999, "learning_rate": 0.0003993647316538883, "loss": 4.7552, "step": 867 }, { "epoch": 0.05881207070322094, "grad_norm": 2.5619242191314697, "learning_rate": 0.00039935925520262873, "loss": 4.3709, "step": 868 }, { "epoch": 0.05887982654504493, "grad_norm": 3.267697334289551, "learning_rate": 0.00039935377875136913, "loss": 3.8417, "step": 869 }, { "epoch": 0.05894758238686892, "grad_norm": 1.2656768560409546, "learning_rate": 0.00039934830230010953, "loss": 4.7309, "step": 870 }, { "epoch": 0.059015338228692905, "grad_norm": 1.1866697072982788, "learning_rate": 0.00039934282584884993, "loss": 4.5168, "step": 871 }, { "epoch": 0.059083094070516894, "grad_norm": 2.16274356842041, "learning_rate": 0.0003993373493975904, "loss": 4.3978, "step": 872 }, { "epoch": 0.05915084991234088, "grad_norm": 3.568162441253662, "learning_rate": 0.0003993318729463308, "loss": 4.5905, "step": 873 }, { "epoch": 0.05921860575416487, "grad_norm": 0.8236764669418335, "learning_rate": 0.00039932639649507124, "loss": 4.4475, "step": 874 }, { "epoch": 0.05928636159598885, "grad_norm": 0.4662351608276367, "learning_rate": 0.00039932092004381164, "loss": 4.6467, "step": 875 }, { "epoch": 0.05935411743781284, "grad_norm": 0.882517397403717, "learning_rate": 0.00039931544359255204, "loss": 4.7026, "step": 876 }, { "epoch": 0.05942187327963683, "grad_norm": 0.5317507386207581, "learning_rate": 0.00039930996714129243, "loss": 4.3514, "step": 877 }, { "epoch": 0.059489629121460816, "grad_norm": 0.6500367522239685, "learning_rate": 0.0003993044906900329, "loss": 4.533, "step": 878 }, { "epoch": 0.059557384963284804, "grad_norm": 0.4958527982234955, "learning_rate": 0.0003992990142387733, "loss": 4.241, "step": 879 }, { "epoch": 0.05962514080510879, "grad_norm": 0.5385131239891052, "learning_rate": 0.00039929353778751374, "loss": 4.5951, "step": 880 }, { "epoch": 0.05969289664693278, "grad_norm": 0.9459277987480164, "learning_rate": 0.00039928806133625414, "loss": 4.6937, "step": 881 }, { "epoch": 0.05976065248875676, "grad_norm": 7.695199489593506, "learning_rate": 0.00039928258488499454, "loss": 4.8578, "step": 882 }, { "epoch": 0.05982840833058075, "grad_norm": 1.0553722381591797, "learning_rate": 0.00039927710843373494, "loss": 4.2417, "step": 883 }, { "epoch": 0.05989616417240474, "grad_norm": 1.0332300662994385, "learning_rate": 0.0003992716319824754, "loss": 4.2995, "step": 884 }, { "epoch": 0.059963920014228726, "grad_norm": 2.336469888687134, "learning_rate": 0.0003992661555312158, "loss": 4.4714, "step": 885 }, { "epoch": 0.060031675856052714, "grad_norm": 1.1845463514328003, "learning_rate": 0.0003992606790799562, "loss": 4.1134, "step": 886 }, { "epoch": 0.0600994316978767, "grad_norm": 4.708810806274414, "learning_rate": 0.00039925520262869664, "loss": 4.3074, "step": 887 }, { "epoch": 0.06016718753970069, "grad_norm": 1.973006248474121, "learning_rate": 0.00039924972617743704, "loss": 4.173, "step": 888 }, { "epoch": 0.06023494338152468, "grad_norm": 1.1969362497329712, "learning_rate": 0.00039924424972617744, "loss": 4.4708, "step": 889 }, { "epoch": 0.06030269922334866, "grad_norm": 1.9653278589248657, "learning_rate": 0.0003992387732749179, "loss": 4.9491, "step": 890 }, { "epoch": 0.06037045506517265, "grad_norm": 4.1152472496032715, "learning_rate": 0.0003992332968236583, "loss": 4.098, "step": 891 }, { "epoch": 0.06043821090699664, "grad_norm": 1.6702840328216553, "learning_rate": 0.0003992278203723987, "loss": 4.49, "step": 892 }, { "epoch": 0.060505966748820625, "grad_norm": 2.285109758377075, "learning_rate": 0.0003992223439211391, "loss": 4.2792, "step": 893 }, { "epoch": 0.06057372259064461, "grad_norm": 1.6747833490371704, "learning_rate": 0.0003992168674698795, "loss": 4.5042, "step": 894 }, { "epoch": 0.0606414784324686, "grad_norm": 1.3493297100067139, "learning_rate": 0.00039921139101862, "loss": 4.5992, "step": 895 }, { "epoch": 0.06070923427429259, "grad_norm": 0.7767273187637329, "learning_rate": 0.0003992059145673604, "loss": 4.4913, "step": 896 }, { "epoch": 0.06077699011611657, "grad_norm": 3.5427534580230713, "learning_rate": 0.0003992004381161008, "loss": 4.6355, "step": 897 }, { "epoch": 0.06084474595794056, "grad_norm": 6.20245361328125, "learning_rate": 0.0003991949616648412, "loss": 4.264, "step": 898 }, { "epoch": 0.06091250179976455, "grad_norm": 2.2082314491271973, "learning_rate": 0.0003991894852135816, "loss": 4.5164, "step": 899 }, { "epoch": 0.060980257641588535, "grad_norm": 2.7530717849731445, "learning_rate": 0.00039918400876232205, "loss": 4.6049, "step": 900 }, { "epoch": 0.061048013483412523, "grad_norm": 1.00613534450531, "learning_rate": 0.00039917853231106245, "loss": 4.6851, "step": 901 }, { "epoch": 0.06111576932523651, "grad_norm": 1.1841747760772705, "learning_rate": 0.00039917305585980285, "loss": 4.2413, "step": 902 }, { "epoch": 0.0611835251670605, "grad_norm": 1.3521040678024292, "learning_rate": 0.0003991675794085433, "loss": 4.2449, "step": 903 }, { "epoch": 0.06125128100888449, "grad_norm": 1.6031383275985718, "learning_rate": 0.0003991621029572837, "loss": 4.5417, "step": 904 }, { "epoch": 0.06131903685070847, "grad_norm": 0.4420834183692932, "learning_rate": 0.0003991566265060241, "loss": 4.4557, "step": 905 }, { "epoch": 0.06138679269253246, "grad_norm": 0.8626751899719238, "learning_rate": 0.00039915115005476455, "loss": 4.4907, "step": 906 }, { "epoch": 0.061454548534356446, "grad_norm": 0.6176138520240784, "learning_rate": 0.00039914567360350495, "loss": 4.9553, "step": 907 }, { "epoch": 0.061522304376180434, "grad_norm": 1.3151031732559204, "learning_rate": 0.00039914019715224535, "loss": 4.2128, "step": 908 }, { "epoch": 0.06159006021800442, "grad_norm": 1.351062536239624, "learning_rate": 0.00039913472070098575, "loss": 4.557, "step": 909 }, { "epoch": 0.06165781605982841, "grad_norm": 0.9055910110473633, "learning_rate": 0.00039912924424972615, "loss": 4.4782, "step": 910 }, { "epoch": 0.0617255719016524, "grad_norm": 0.6445788145065308, "learning_rate": 0.0003991237677984666, "loss": 4.5884, "step": 911 }, { "epoch": 0.06179332774347638, "grad_norm": 7.6627421379089355, "learning_rate": 0.00039911829134720706, "loss": 4.4845, "step": 912 }, { "epoch": 0.06186108358530037, "grad_norm": 1.9889767169952393, "learning_rate": 0.00039911281489594746, "loss": 4.3548, "step": 913 }, { "epoch": 0.061928839427124356, "grad_norm": 0.8727238774299622, "learning_rate": 0.00039910733844468786, "loss": 4.8449, "step": 914 }, { "epoch": 0.061996595268948344, "grad_norm": 0.5041613578796387, "learning_rate": 0.00039910186199342826, "loss": 4.8343, "step": 915 }, { "epoch": 0.06206435111077233, "grad_norm": 1.674844741821289, "learning_rate": 0.0003990963855421687, "loss": 4.8545, "step": 916 }, { "epoch": 0.06213210695259632, "grad_norm": 0.914272665977478, "learning_rate": 0.0003990909090909091, "loss": 4.7193, "step": 917 }, { "epoch": 0.06219986279442031, "grad_norm": 0.46128419041633606, "learning_rate": 0.00039908543263964956, "loss": 4.8599, "step": 918 }, { "epoch": 0.0622676186362443, "grad_norm": 0.9554855823516846, "learning_rate": 0.00039907995618838996, "loss": 4.6282, "step": 919 }, { "epoch": 0.06233537447806828, "grad_norm": 0.6503332257270813, "learning_rate": 0.00039907447973713036, "loss": 4.73, "step": 920 }, { "epoch": 0.06240313031989227, "grad_norm": 0.6322983503341675, "learning_rate": 0.00039906900328587076, "loss": 4.5616, "step": 921 }, { "epoch": 0.062470886161716255, "grad_norm": 1.0539222955703735, "learning_rate": 0.0003990635268346112, "loss": 4.561, "step": 922 }, { "epoch": 0.06253864200354024, "grad_norm": 0.5341288447380066, "learning_rate": 0.0003990580503833516, "loss": 4.5272, "step": 923 }, { "epoch": 0.06260639784536423, "grad_norm": 0.5188624262809753, "learning_rate": 0.000399052573932092, "loss": 3.9657, "step": 924 }, { "epoch": 0.06267415368718822, "grad_norm": 1.161488652229309, "learning_rate": 0.0003990470974808324, "loss": 4.8578, "step": 925 }, { "epoch": 0.06274190952901221, "grad_norm": 0.901706337928772, "learning_rate": 0.00039904162102957286, "loss": 4.1258, "step": 926 }, { "epoch": 0.0628096653708362, "grad_norm": 0.7442399263381958, "learning_rate": 0.00039903614457831326, "loss": 4.3266, "step": 927 }, { "epoch": 0.06287742121266018, "grad_norm": 0.933000385761261, "learning_rate": 0.0003990306681270537, "loss": 4.3659, "step": 928 }, { "epoch": 0.06294517705448417, "grad_norm": 0.8820518851280212, "learning_rate": 0.0003990251916757941, "loss": 4.4208, "step": 929 }, { "epoch": 0.06301293289630816, "grad_norm": 0.5613499283790588, "learning_rate": 0.0003990197152245345, "loss": 4.3645, "step": 930 }, { "epoch": 0.06308068873813213, "grad_norm": 0.5223057270050049, "learning_rate": 0.0003990142387732749, "loss": 4.2303, "step": 931 }, { "epoch": 0.06314844457995612, "grad_norm": 1.0660102367401123, "learning_rate": 0.0003990087623220153, "loss": 4.7075, "step": 932 }, { "epoch": 0.06321620042178011, "grad_norm": 0.6657586693763733, "learning_rate": 0.00039900328587075577, "loss": 4.4128, "step": 933 }, { "epoch": 0.0632839562636041, "grad_norm": 1.7147760391235352, "learning_rate": 0.0003989978094194962, "loss": 4.2895, "step": 934 }, { "epoch": 0.06335171210542809, "grad_norm": 1.1054401397705078, "learning_rate": 0.0003989923329682366, "loss": 4.2386, "step": 935 }, { "epoch": 0.06341946794725208, "grad_norm": 1.0966932773590088, "learning_rate": 0.000398986856516977, "loss": 4.8277, "step": 936 }, { "epoch": 0.06348722378907606, "grad_norm": 3.9974160194396973, "learning_rate": 0.0003989813800657174, "loss": 4.5172, "step": 937 }, { "epoch": 0.06355497963090005, "grad_norm": 1.267701268196106, "learning_rate": 0.00039897590361445787, "loss": 4.3277, "step": 938 }, { "epoch": 0.06362273547272404, "grad_norm": 1.3108429908752441, "learning_rate": 0.00039897042716319827, "loss": 4.7102, "step": 939 }, { "epoch": 0.06369049131454803, "grad_norm": 1.343170166015625, "learning_rate": 0.00039896495071193867, "loss": 4.6718, "step": 940 }, { "epoch": 0.06375824715637202, "grad_norm": 0.9056527614593506, "learning_rate": 0.00039895947426067907, "loss": 4.539, "step": 941 }, { "epoch": 0.063826002998196, "grad_norm": 0.9318846464157104, "learning_rate": 0.0003989539978094195, "loss": 4.5815, "step": 942 }, { "epoch": 0.06389375884001999, "grad_norm": 3.7652578353881836, "learning_rate": 0.0003989485213581599, "loss": 4.3776, "step": 943 }, { "epoch": 0.06396151468184398, "grad_norm": 3.356126546859741, "learning_rate": 0.0003989430449069004, "loss": 4.4733, "step": 944 }, { "epoch": 0.06402927052366797, "grad_norm": 0.5597432851791382, "learning_rate": 0.0003989375684556408, "loss": 4.2956, "step": 945 }, { "epoch": 0.06409702636549194, "grad_norm": 1.158571720123291, "learning_rate": 0.00039893209200438117, "loss": 4.5364, "step": 946 }, { "epoch": 0.06416478220731593, "grad_norm": 0.6880437135696411, "learning_rate": 0.00039892661555312157, "loss": 4.7095, "step": 947 }, { "epoch": 0.06423253804913992, "grad_norm": 0.8595593571662903, "learning_rate": 0.00039892113910186197, "loss": 4.1686, "step": 948 }, { "epoch": 0.06430029389096391, "grad_norm": 1.2641078233718872, "learning_rate": 0.0003989156626506024, "loss": 4.3254, "step": 949 }, { "epoch": 0.0643680497327879, "grad_norm": 2.034414768218994, "learning_rate": 0.0003989101861993429, "loss": 4.7, "step": 950 }, { "epoch": 0.06443580557461188, "grad_norm": 0.6731711626052856, "learning_rate": 0.0003989047097480833, "loss": 3.9361, "step": 951 }, { "epoch": 0.06450356141643587, "grad_norm": 0.8040578365325928, "learning_rate": 0.0003988992332968237, "loss": 4.5364, "step": 952 }, { "epoch": 0.06457131725825986, "grad_norm": 1.1313163042068481, "learning_rate": 0.0003988937568455641, "loss": 4.3159, "step": 953 }, { "epoch": 0.06463907310008385, "grad_norm": 0.8646087646484375, "learning_rate": 0.00039888828039430453, "loss": 4.2919, "step": 954 }, { "epoch": 0.06470682894190784, "grad_norm": 1.0710028409957886, "learning_rate": 0.00039888280394304493, "loss": 4.3825, "step": 955 }, { "epoch": 0.06477458478373183, "grad_norm": 0.5882676839828491, "learning_rate": 0.00039887732749178533, "loss": 4.5684, "step": 956 }, { "epoch": 0.06484234062555581, "grad_norm": 0.8101402521133423, "learning_rate": 0.0003988718510405258, "loss": 4.6568, "step": 957 }, { "epoch": 0.0649100964673798, "grad_norm": 1.2646816968917847, "learning_rate": 0.0003988663745892662, "loss": 4.3971, "step": 958 }, { "epoch": 0.06497785230920379, "grad_norm": 0.813895046710968, "learning_rate": 0.0003988608981380066, "loss": 4.295, "step": 959 }, { "epoch": 0.06504560815102776, "grad_norm": 0.5896276831626892, "learning_rate": 0.00039885542168674703, "loss": 4.5589, "step": 960 }, { "epoch": 0.06511336399285175, "grad_norm": 0.8398045301437378, "learning_rate": 0.00039884994523548743, "loss": 4.5343, "step": 961 }, { "epoch": 0.06518111983467574, "grad_norm": 0.811392605304718, "learning_rate": 0.00039884446878422783, "loss": 4.5549, "step": 962 }, { "epoch": 0.06524887567649973, "grad_norm": 1.1811622381210327, "learning_rate": 0.00039883899233296823, "loss": 4.3639, "step": 963 }, { "epoch": 0.06531663151832372, "grad_norm": 0.8886870741844177, "learning_rate": 0.00039883351588170863, "loss": 4.344, "step": 964 }, { "epoch": 0.0653843873601477, "grad_norm": 0.6795641183853149, "learning_rate": 0.0003988280394304491, "loss": 4.0812, "step": 965 }, { "epoch": 0.0654521432019717, "grad_norm": 0.890152633190155, "learning_rate": 0.00039882256297918954, "loss": 4.3028, "step": 966 }, { "epoch": 0.06551989904379568, "grad_norm": 1.1197621822357178, "learning_rate": 0.00039881708652792994, "loss": 4.1714, "step": 967 }, { "epoch": 0.06558765488561967, "grad_norm": 0.7774588465690613, "learning_rate": 0.00039881161007667033, "loss": 4.5156, "step": 968 }, { "epoch": 0.06565541072744366, "grad_norm": 0.7460579872131348, "learning_rate": 0.00039880613362541073, "loss": 4.7706, "step": 969 }, { "epoch": 0.06572316656926765, "grad_norm": 0.9860925078392029, "learning_rate": 0.00039880065717415113, "loss": 4.3168, "step": 970 }, { "epoch": 0.06579092241109163, "grad_norm": 0.6449454426765442, "learning_rate": 0.0003987951807228916, "loss": 4.2623, "step": 971 }, { "epoch": 0.06585867825291562, "grad_norm": 1.4655145406723022, "learning_rate": 0.000398789704271632, "loss": 4.4417, "step": 972 }, { "epoch": 0.06592643409473961, "grad_norm": 0.7237178683280945, "learning_rate": 0.00039878422782037244, "loss": 4.7146, "step": 973 }, { "epoch": 0.0659941899365636, "grad_norm": 1.0752249956130981, "learning_rate": 0.00039877875136911284, "loss": 4.4108, "step": 974 }, { "epoch": 0.06606194577838757, "grad_norm": 0.8396843075752258, "learning_rate": 0.00039877327491785324, "loss": 4.6174, "step": 975 }, { "epoch": 0.06612970162021156, "grad_norm": 0.56378173828125, "learning_rate": 0.0003987677984665937, "loss": 4.5004, "step": 976 }, { "epoch": 0.06619745746203555, "grad_norm": 2.1583292484283447, "learning_rate": 0.0003987623220153341, "loss": 4.4253, "step": 977 }, { "epoch": 0.06626521330385954, "grad_norm": 0.8933156132698059, "learning_rate": 0.0003987568455640745, "loss": 4.0331, "step": 978 }, { "epoch": 0.06633296914568353, "grad_norm": 1.0771517753601074, "learning_rate": 0.0003987513691128149, "loss": 4.4169, "step": 979 }, { "epoch": 0.06640072498750751, "grad_norm": 0.991187334060669, "learning_rate": 0.0003987458926615553, "loss": 4.5004, "step": 980 }, { "epoch": 0.0664684808293315, "grad_norm": 1.509549856185913, "learning_rate": 0.00039874041621029574, "loss": 3.8146, "step": 981 }, { "epoch": 0.06653623667115549, "grad_norm": 1.5677217245101929, "learning_rate": 0.0003987349397590362, "loss": 4.6354, "step": 982 }, { "epoch": 0.06660399251297948, "grad_norm": 1.3638211488723755, "learning_rate": 0.0003987294633077766, "loss": 4.5198, "step": 983 }, { "epoch": 0.06667174835480347, "grad_norm": 1.1667355298995972, "learning_rate": 0.000398723986856517, "loss": 4.8187, "step": 984 }, { "epoch": 0.06673950419662746, "grad_norm": 0.5072071552276611, "learning_rate": 0.0003987185104052574, "loss": 4.4645, "step": 985 }, { "epoch": 0.06680726003845144, "grad_norm": 1.853524088859558, "learning_rate": 0.0003987130339539978, "loss": 3.9969, "step": 986 }, { "epoch": 0.06687501588027543, "grad_norm": 1.3862594366073608, "learning_rate": 0.00039870755750273825, "loss": 4.4346, "step": 987 }, { "epoch": 0.06694277172209942, "grad_norm": 1.3050211668014526, "learning_rate": 0.0003987020810514787, "loss": 4.1496, "step": 988 }, { "epoch": 0.06701052756392341, "grad_norm": 1.7112905979156494, "learning_rate": 0.0003986966046002191, "loss": 3.8965, "step": 989 }, { "epoch": 0.06707828340574738, "grad_norm": 0.9160133004188538, "learning_rate": 0.0003986911281489595, "loss": 4.0008, "step": 990 }, { "epoch": 0.06714603924757137, "grad_norm": 1.2426139116287231, "learning_rate": 0.0003986856516976999, "loss": 4.8977, "step": 991 }, { "epoch": 0.06721379508939536, "grad_norm": 1.0831527709960938, "learning_rate": 0.00039868017524644035, "loss": 4.548, "step": 992 }, { "epoch": 0.06728155093121935, "grad_norm": 1.182830572128296, "learning_rate": 0.00039867469879518075, "loss": 4.2618, "step": 993 }, { "epoch": 0.06734930677304334, "grad_norm": 1.413882851600647, "learning_rate": 0.00039866922234392115, "loss": 4.6476, "step": 994 }, { "epoch": 0.06741706261486732, "grad_norm": 0.7133269906044006, "learning_rate": 0.00039866374589266155, "loss": 4.3837, "step": 995 }, { "epoch": 0.06748481845669131, "grad_norm": 0.6412605047225952, "learning_rate": 0.000398658269441402, "loss": 4.1152, "step": 996 }, { "epoch": 0.0675525742985153, "grad_norm": 0.9335253834724426, "learning_rate": 0.0003986527929901424, "loss": 4.7547, "step": 997 }, { "epoch": 0.06762033014033929, "grad_norm": 1.1208101511001587, "learning_rate": 0.00039864731653888285, "loss": 3.9859, "step": 998 }, { "epoch": 0.06768808598216328, "grad_norm": 2.39587664604187, "learning_rate": 0.00039864184008762325, "loss": 4.3104, "step": 999 }, { "epoch": 0.06775584182398726, "grad_norm": 2.464878797531128, "learning_rate": 0.00039863636363636365, "loss": 4.2393, "step": 1000 }, { "epoch": 0.06782359766581125, "grad_norm": 1.4290380477905273, "learning_rate": 0.00039863088718510405, "loss": 4.3641, "step": 1001 }, { "epoch": 0.06789135350763524, "grad_norm": 1.573796033859253, "learning_rate": 0.00039862541073384445, "loss": 4.0094, "step": 1002 }, { "epoch": 0.06795910934945923, "grad_norm": 0.8379250168800354, "learning_rate": 0.0003986199342825849, "loss": 4.2881, "step": 1003 }, { "epoch": 0.06802686519128322, "grad_norm": 1.0250741243362427, "learning_rate": 0.00039861445783132536, "loss": 4.5162, "step": 1004 }, { "epoch": 0.06809462103310719, "grad_norm": 0.7703651785850525, "learning_rate": 0.00039860898138006576, "loss": 4.1853, "step": 1005 }, { "epoch": 0.06816237687493118, "grad_norm": 0.8128710389137268, "learning_rate": 0.00039860350492880616, "loss": 4.4538, "step": 1006 }, { "epoch": 0.06823013271675517, "grad_norm": 1.4643748998641968, "learning_rate": 0.00039859802847754655, "loss": 4.1139, "step": 1007 }, { "epoch": 0.06829788855857916, "grad_norm": 1.2365601062774658, "learning_rate": 0.00039859255202628695, "loss": 4.5798, "step": 1008 }, { "epoch": 0.06836564440040314, "grad_norm": 1.72235906124115, "learning_rate": 0.0003985870755750274, "loss": 4.0178, "step": 1009 }, { "epoch": 0.06843340024222713, "grad_norm": 1.3185621500015259, "learning_rate": 0.0003985815991237678, "loss": 4.2725, "step": 1010 }, { "epoch": 0.06850115608405112, "grad_norm": 1.703866958618164, "learning_rate": 0.0003985761226725082, "loss": 4.2919, "step": 1011 }, { "epoch": 0.06856891192587511, "grad_norm": 1.0072482824325562, "learning_rate": 0.00039857064622124866, "loss": 3.9373, "step": 1012 }, { "epoch": 0.0686366677676991, "grad_norm": 0.7422221302986145, "learning_rate": 0.00039856516976998906, "loss": 3.892, "step": 1013 }, { "epoch": 0.06870442360952309, "grad_norm": 1.128000020980835, "learning_rate": 0.0003985596933187295, "loss": 4.4625, "step": 1014 }, { "epoch": 0.06877217945134707, "grad_norm": 1.2138216495513916, "learning_rate": 0.0003985542168674699, "loss": 4.1466, "step": 1015 }, { "epoch": 0.06883993529317106, "grad_norm": 2.2494773864746094, "learning_rate": 0.0003985487404162103, "loss": 4.2887, "step": 1016 }, { "epoch": 0.06890769113499505, "grad_norm": 2.6325812339782715, "learning_rate": 0.0003985432639649507, "loss": 4.1611, "step": 1017 }, { "epoch": 0.06897544697681904, "grad_norm": 0.9742172956466675, "learning_rate": 0.0003985377875136911, "loss": 4.2634, "step": 1018 }, { "epoch": 0.06904320281864303, "grad_norm": 2.056999683380127, "learning_rate": 0.00039853231106243156, "loss": 4.0202, "step": 1019 }, { "epoch": 0.069110958660467, "grad_norm": 1.5828012228012085, "learning_rate": 0.000398526834611172, "loss": 4.4205, "step": 1020 }, { "epoch": 0.06917871450229099, "grad_norm": 0.9790469408035278, "learning_rate": 0.0003985213581599124, "loss": 4.1712, "step": 1021 }, { "epoch": 0.06924647034411498, "grad_norm": 1.1368647813796997, "learning_rate": 0.0003985158817086528, "loss": 4.4563, "step": 1022 }, { "epoch": 0.06931422618593897, "grad_norm": 1.9270405769348145, "learning_rate": 0.0003985104052573932, "loss": 4.3087, "step": 1023 }, { "epoch": 0.06938198202776295, "grad_norm": 1.3553575277328491, "learning_rate": 0.0003985049288061336, "loss": 3.8444, "step": 1024 }, { "epoch": 0.06944973786958694, "grad_norm": 0.9720080494880676, "learning_rate": 0.00039849945235487407, "loss": 4.2423, "step": 1025 }, { "epoch": 0.06951749371141093, "grad_norm": 1.716758370399475, "learning_rate": 0.00039849397590361447, "loss": 4.4032, "step": 1026 }, { "epoch": 0.06958524955323492, "grad_norm": 0.9964433908462524, "learning_rate": 0.0003984884994523549, "loss": 4.3942, "step": 1027 }, { "epoch": 0.0696530053950589, "grad_norm": 3.2764737606048584, "learning_rate": 0.0003984830230010953, "loss": 4.1631, "step": 1028 }, { "epoch": 0.0697207612368829, "grad_norm": 1.7656371593475342, "learning_rate": 0.0003984775465498357, "loss": 4.373, "step": 1029 }, { "epoch": 0.06978851707870688, "grad_norm": 1.1527352333068848, "learning_rate": 0.00039847207009857617, "loss": 4.2716, "step": 1030 }, { "epoch": 0.06985627292053087, "grad_norm": 1.7451441287994385, "learning_rate": 0.00039846659364731657, "loss": 4.2979, "step": 1031 }, { "epoch": 0.06992402876235486, "grad_norm": 1.229651689529419, "learning_rate": 0.00039846111719605697, "loss": 4.101, "step": 1032 }, { "epoch": 0.06999178460417885, "grad_norm": 1.1700464487075806, "learning_rate": 0.00039845564074479737, "loss": 4.4432, "step": 1033 }, { "epoch": 0.07005954044600284, "grad_norm": 1.0305942296981812, "learning_rate": 0.00039845016429353777, "loss": 3.8128, "step": 1034 }, { "epoch": 0.07012729628782681, "grad_norm": 0.9988133907318115, "learning_rate": 0.0003984446878422782, "loss": 4.2801, "step": 1035 }, { "epoch": 0.0701950521296508, "grad_norm": 1.3242058753967285, "learning_rate": 0.0003984392113910187, "loss": 4.1748, "step": 1036 }, { "epoch": 0.07026280797147479, "grad_norm": 1.203094720840454, "learning_rate": 0.0003984337349397591, "loss": 4.5268, "step": 1037 }, { "epoch": 0.07033056381329877, "grad_norm": 1.6429884433746338, "learning_rate": 0.00039842825848849947, "loss": 4.2603, "step": 1038 }, { "epoch": 0.07039831965512276, "grad_norm": 1.2461512088775635, "learning_rate": 0.00039842278203723987, "loss": 4.0363, "step": 1039 }, { "epoch": 0.07046607549694675, "grad_norm": 1.0287927389144897, "learning_rate": 0.00039841730558598027, "loss": 4.5749, "step": 1040 }, { "epoch": 0.07053383133877074, "grad_norm": 1.0760092735290527, "learning_rate": 0.0003984118291347207, "loss": 4.2876, "step": 1041 }, { "epoch": 0.07060158718059473, "grad_norm": 1.201130986213684, "learning_rate": 0.0003984063526834611, "loss": 4.0461, "step": 1042 }, { "epoch": 0.07066934302241872, "grad_norm": 1.146349549293518, "learning_rate": 0.0003984008762322016, "loss": 4.2335, "step": 1043 }, { "epoch": 0.0707370988642427, "grad_norm": 2.2435390949249268, "learning_rate": 0.000398395399780942, "loss": 4.5031, "step": 1044 }, { "epoch": 0.07080485470606669, "grad_norm": 1.7536578178405762, "learning_rate": 0.0003983899233296824, "loss": 4.1415, "step": 1045 }, { "epoch": 0.07087261054789068, "grad_norm": 2.5189902782440186, "learning_rate": 0.0003983844468784228, "loss": 3.8517, "step": 1046 }, { "epoch": 0.07094036638971467, "grad_norm": 1.8342978954315186, "learning_rate": 0.00039837897042716323, "loss": 4.3664, "step": 1047 }, { "epoch": 0.07100812223153866, "grad_norm": 1.0705716609954834, "learning_rate": 0.00039837349397590363, "loss": 4.6583, "step": 1048 }, { "epoch": 0.07107587807336264, "grad_norm": 2.1361026763916016, "learning_rate": 0.000398368017524644, "loss": 4.0603, "step": 1049 }, { "epoch": 0.07114363391518662, "grad_norm": 1.3097550868988037, "learning_rate": 0.0003983625410733845, "loss": 4.0418, "step": 1050 }, { "epoch": 0.0712113897570106, "grad_norm": 1.4214437007904053, "learning_rate": 0.0003983570646221249, "loss": 4.2993, "step": 1051 }, { "epoch": 0.0712791455988346, "grad_norm": 1.1609143018722534, "learning_rate": 0.00039835158817086533, "loss": 4.2596, "step": 1052 }, { "epoch": 0.07134690144065858, "grad_norm": 1.250801920890808, "learning_rate": 0.00039834611171960573, "loss": 4.3336, "step": 1053 }, { "epoch": 0.07141465728248257, "grad_norm": 1.3202877044677734, "learning_rate": 0.00039834063526834613, "loss": 4.3284, "step": 1054 }, { "epoch": 0.07148241312430656, "grad_norm": 4.244118690490723, "learning_rate": 0.00039833515881708653, "loss": 3.9084, "step": 1055 }, { "epoch": 0.07155016896613055, "grad_norm": 1.9815622568130493, "learning_rate": 0.00039832968236582693, "loss": 4.1809, "step": 1056 }, { "epoch": 0.07161792480795454, "grad_norm": 1.541872262954712, "learning_rate": 0.0003983242059145674, "loss": 4.3144, "step": 1057 }, { "epoch": 0.07168568064977852, "grad_norm": 1.379879117012024, "learning_rate": 0.00039831872946330784, "loss": 4.5101, "step": 1058 }, { "epoch": 0.07175343649160251, "grad_norm": 1.5362669229507446, "learning_rate": 0.00039831325301204824, "loss": 4.2083, "step": 1059 }, { "epoch": 0.0718211923334265, "grad_norm": 1.9623464345932007, "learning_rate": 0.00039830777656078863, "loss": 4.2309, "step": 1060 }, { "epoch": 0.07188894817525049, "grad_norm": 1.2217607498168945, "learning_rate": 0.00039830230010952903, "loss": 4.4965, "step": 1061 }, { "epoch": 0.07195670401707448, "grad_norm": 1.2235454320907593, "learning_rate": 0.00039829682365826943, "loss": 4.6595, "step": 1062 }, { "epoch": 0.07202445985889847, "grad_norm": 1.48357093334198, "learning_rate": 0.0003982913472070099, "loss": 4.3102, "step": 1063 }, { "epoch": 0.07209221570072245, "grad_norm": 0.9867489337921143, "learning_rate": 0.0003982858707557503, "loss": 4.4084, "step": 1064 }, { "epoch": 0.07215997154254643, "grad_norm": 2.2526583671569824, "learning_rate": 0.0003982803943044907, "loss": 3.8039, "step": 1065 }, { "epoch": 0.07222772738437042, "grad_norm": 1.3563092947006226, "learning_rate": 0.00039827491785323114, "loss": 3.8589, "step": 1066 }, { "epoch": 0.0722954832261944, "grad_norm": 2.4422338008880615, "learning_rate": 0.00039826944140197154, "loss": 3.8616, "step": 1067 }, { "epoch": 0.07236323906801839, "grad_norm": 1.7460412979125977, "learning_rate": 0.000398263964950712, "loss": 3.8777, "step": 1068 }, { "epoch": 0.07243099490984238, "grad_norm": 1.8586010932922363, "learning_rate": 0.0003982584884994524, "loss": 4.1959, "step": 1069 }, { "epoch": 0.07249875075166637, "grad_norm": 1.4706990718841553, "learning_rate": 0.0003982530120481928, "loss": 4.0381, "step": 1070 }, { "epoch": 0.07256650659349036, "grad_norm": 0.9791918992996216, "learning_rate": 0.0003982475355969332, "loss": 4.5752, "step": 1071 }, { "epoch": 0.07263426243531435, "grad_norm": 1.6259567737579346, "learning_rate": 0.0003982420591456736, "loss": 4.043, "step": 1072 }, { "epoch": 0.07270201827713833, "grad_norm": 4.648763179779053, "learning_rate": 0.00039823658269441404, "loss": 3.8579, "step": 1073 }, { "epoch": 0.07276977411896232, "grad_norm": 1.136399745941162, "learning_rate": 0.0003982311062431545, "loss": 3.8428, "step": 1074 }, { "epoch": 0.07283752996078631, "grad_norm": 1.1962316036224365, "learning_rate": 0.0003982256297918949, "loss": 4.3146, "step": 1075 }, { "epoch": 0.0729052858026103, "grad_norm": 1.6209527254104614, "learning_rate": 0.0003982201533406353, "loss": 4.1499, "step": 1076 }, { "epoch": 0.07297304164443429, "grad_norm": 1.22721529006958, "learning_rate": 0.0003982146768893757, "loss": 4.0989, "step": 1077 }, { "epoch": 0.07304079748625827, "grad_norm": 1.5025233030319214, "learning_rate": 0.0003982092004381161, "loss": 3.9824, "step": 1078 }, { "epoch": 0.07310855332808226, "grad_norm": 24.357927322387695, "learning_rate": 0.00039820372398685655, "loss": 4.501, "step": 1079 }, { "epoch": 0.07317630916990624, "grad_norm": 3.035731792449951, "learning_rate": 0.00039819824753559694, "loss": 4.2724, "step": 1080 }, { "epoch": 0.07324406501173022, "grad_norm": 2.525163173675537, "learning_rate": 0.0003981927710843374, "loss": 4.1938, "step": 1081 }, { "epoch": 0.07331182085355421, "grad_norm": 2.7789783477783203, "learning_rate": 0.0003981872946330778, "loss": 4.4936, "step": 1082 }, { "epoch": 0.0733795766953782, "grad_norm": 2.0346388816833496, "learning_rate": 0.0003981818181818182, "loss": 4.2575, "step": 1083 }, { "epoch": 0.07344733253720219, "grad_norm": 2.5741000175476074, "learning_rate": 0.0003981763417305586, "loss": 3.5467, "step": 1084 }, { "epoch": 0.07351508837902618, "grad_norm": 1.519612431526184, "learning_rate": 0.00039817086527929905, "loss": 4.0075, "step": 1085 }, { "epoch": 0.07358284422085017, "grad_norm": 1.7198134660720825, "learning_rate": 0.00039816538882803945, "loss": 3.7975, "step": 1086 }, { "epoch": 0.07365060006267415, "grad_norm": 1.966604232788086, "learning_rate": 0.00039815991237677985, "loss": 3.9286, "step": 1087 }, { "epoch": 0.07371835590449814, "grad_norm": 44.42156982421875, "learning_rate": 0.00039815443592552025, "loss": 4.1852, "step": 1088 }, { "epoch": 0.07378611174632213, "grad_norm": 6.972700119018555, "learning_rate": 0.0003981489594742607, "loss": 4.1075, "step": 1089 }, { "epoch": 0.07385386758814612, "grad_norm": 3.37237548828125, "learning_rate": 0.00039814348302300115, "loss": 4.036, "step": 1090 }, { "epoch": 0.0739216234299701, "grad_norm": 2.1974034309387207, "learning_rate": 0.00039813800657174155, "loss": 3.7596, "step": 1091 }, { "epoch": 0.0739893792717941, "grad_norm": 2.3063008785247803, "learning_rate": 0.00039813253012048195, "loss": 3.9324, "step": 1092 }, { "epoch": 0.07405713511361808, "grad_norm": 1.6853234767913818, "learning_rate": 0.00039812705366922235, "loss": 3.9795, "step": 1093 }, { "epoch": 0.07412489095544206, "grad_norm": 3.2866408824920654, "learning_rate": 0.00039812157721796275, "loss": 4.1397, "step": 1094 }, { "epoch": 0.07419264679726605, "grad_norm": 3.4523186683654785, "learning_rate": 0.0003981161007667032, "loss": 3.7963, "step": 1095 }, { "epoch": 0.07426040263909003, "grad_norm": 1.4153375625610352, "learning_rate": 0.0003981106243154436, "loss": 3.878, "step": 1096 }, { "epoch": 0.07432815848091402, "grad_norm": 1.4361470937728882, "learning_rate": 0.00039810514786418406, "loss": 4.0755, "step": 1097 }, { "epoch": 0.07439591432273801, "grad_norm": 1.1677697896957397, "learning_rate": 0.00039809967141292446, "loss": 4.3206, "step": 1098 }, { "epoch": 0.074463670164562, "grad_norm": 1.5047650337219238, "learning_rate": 0.00039809419496166485, "loss": 3.8538, "step": 1099 }, { "epoch": 0.07453142600638599, "grad_norm": 2.8860878944396973, "learning_rate": 0.00039808871851040525, "loss": 3.9605, "step": 1100 }, { "epoch": 0.07459918184820997, "grad_norm": 3.908668041229248, "learning_rate": 0.0003980832420591457, "loss": 3.5234, "step": 1101 }, { "epoch": 0.07466693769003396, "grad_norm": 3.668790340423584, "learning_rate": 0.0003980777656078861, "loss": 3.9832, "step": 1102 }, { "epoch": 0.07473469353185795, "grad_norm": 3.1576225757598877, "learning_rate": 0.0003980722891566265, "loss": 4.3703, "step": 1103 }, { "epoch": 0.07480244937368194, "grad_norm": 1.5882000923156738, "learning_rate": 0.0003980668127053669, "loss": 3.4142, "step": 1104 }, { "epoch": 0.07487020521550593, "grad_norm": 1.420247197151184, "learning_rate": 0.00039806133625410736, "loss": 4.2078, "step": 1105 }, { "epoch": 0.07493796105732992, "grad_norm": 1.231593370437622, "learning_rate": 0.0003980558598028478, "loss": 4.2299, "step": 1106 }, { "epoch": 0.0750057168991539, "grad_norm": 1.689089059829712, "learning_rate": 0.0003980503833515882, "loss": 3.762, "step": 1107 }, { "epoch": 0.07507347274097789, "grad_norm": 2.394148588180542, "learning_rate": 0.0003980449069003286, "loss": 4.3099, "step": 1108 }, { "epoch": 0.07514122858280187, "grad_norm": 1.8881683349609375, "learning_rate": 0.000398039430449069, "loss": 4.3448, "step": 1109 }, { "epoch": 0.07520898442462585, "grad_norm": 2.1728670597076416, "learning_rate": 0.0003980339539978094, "loss": 3.8618, "step": 1110 }, { "epoch": 0.07527674026644984, "grad_norm": 1.6774365901947021, "learning_rate": 0.00039802847754654986, "loss": 4.1364, "step": 1111 }, { "epoch": 0.07534449610827383, "grad_norm": 1.4898735284805298, "learning_rate": 0.00039802300109529026, "loss": 3.9005, "step": 1112 }, { "epoch": 0.07541225195009782, "grad_norm": 1.7666774988174438, "learning_rate": 0.0003980175246440307, "loss": 3.4955, "step": 1113 }, { "epoch": 0.07548000779192181, "grad_norm": 2.5947883129119873, "learning_rate": 0.0003980120481927711, "loss": 4.3883, "step": 1114 }, { "epoch": 0.0755477636337458, "grad_norm": 1.867881417274475, "learning_rate": 0.0003980065717415115, "loss": 3.8803, "step": 1115 }, { "epoch": 0.07561551947556978, "grad_norm": 1.6566728353500366, "learning_rate": 0.0003980010952902519, "loss": 4.3265, "step": 1116 }, { "epoch": 0.07568327531739377, "grad_norm": 2.073270559310913, "learning_rate": 0.00039799561883899237, "loss": 3.99, "step": 1117 }, { "epoch": 0.07575103115921776, "grad_norm": 2.2588183879852295, "learning_rate": 0.00039799014238773277, "loss": 4.1738, "step": 1118 }, { "epoch": 0.07581878700104175, "grad_norm": 1.4516103267669678, "learning_rate": 0.00039798466593647316, "loss": 4.043, "step": 1119 }, { "epoch": 0.07588654284286574, "grad_norm": 2.8987886905670166, "learning_rate": 0.0003979791894852136, "loss": 3.8589, "step": 1120 }, { "epoch": 0.07595429868468973, "grad_norm": 4.085813999176025, "learning_rate": 0.000397973713033954, "loss": 3.9801, "step": 1121 }, { "epoch": 0.07602205452651371, "grad_norm": 2.655787229537964, "learning_rate": 0.0003979682365826944, "loss": 4.2628, "step": 1122 }, { "epoch": 0.0760898103683377, "grad_norm": 2.1648433208465576, "learning_rate": 0.00039796276013143487, "loss": 4.2921, "step": 1123 }, { "epoch": 0.07615756621016168, "grad_norm": 27.031539916992188, "learning_rate": 0.00039795728368017527, "loss": 4.7289, "step": 1124 }, { "epoch": 0.07622532205198566, "grad_norm": 20.55954933166504, "learning_rate": 0.00039795180722891567, "loss": 4.0354, "step": 1125 }, { "epoch": 0.07629307789380965, "grad_norm": 3.843536615371704, "learning_rate": 0.00039794633077765607, "loss": 4.1474, "step": 1126 }, { "epoch": 0.07636083373563364, "grad_norm": 2.7076430320739746, "learning_rate": 0.0003979408543263965, "loss": 4.074, "step": 1127 }, { "epoch": 0.07642858957745763, "grad_norm": 1.980013132095337, "learning_rate": 0.000397935377875137, "loss": 4.0907, "step": 1128 }, { "epoch": 0.07649634541928162, "grad_norm": 1.9771209955215454, "learning_rate": 0.0003979299014238774, "loss": 3.8624, "step": 1129 }, { "epoch": 0.0765641012611056, "grad_norm": 3.319946050643921, "learning_rate": 0.00039792442497261777, "loss": 4.1016, "step": 1130 }, { "epoch": 0.0766318571029296, "grad_norm": 4.815307140350342, "learning_rate": 0.00039791894852135817, "loss": 4.1382, "step": 1131 }, { "epoch": 0.07669961294475358, "grad_norm": 3.3918800354003906, "learning_rate": 0.00039791347207009857, "loss": 4.0498, "step": 1132 }, { "epoch": 0.07676736878657757, "grad_norm": 1.7358074188232422, "learning_rate": 0.000397907995618839, "loss": 3.4853, "step": 1133 }, { "epoch": 0.07683512462840156, "grad_norm": 2.709115505218506, "learning_rate": 0.0003979025191675794, "loss": 3.5773, "step": 1134 }, { "epoch": 0.07690288047022555, "grad_norm": 2.2798941135406494, "learning_rate": 0.0003978970427163198, "loss": 3.6055, "step": 1135 }, { "epoch": 0.07697063631204953, "grad_norm": 2.559767723083496, "learning_rate": 0.0003978915662650603, "loss": 4.0622, "step": 1136 }, { "epoch": 0.07703839215387352, "grad_norm": 1.953681230545044, "learning_rate": 0.0003978860898138007, "loss": 3.9368, "step": 1137 }, { "epoch": 0.07710614799569751, "grad_norm": 1.8372457027435303, "learning_rate": 0.0003978806133625411, "loss": 3.8773, "step": 1138 }, { "epoch": 0.07717390383752148, "grad_norm": 2.1165294647216797, "learning_rate": 0.00039787513691128153, "loss": 3.9216, "step": 1139 }, { "epoch": 0.07724165967934547, "grad_norm": 1.4757955074310303, "learning_rate": 0.00039786966046002193, "loss": 4.2867, "step": 1140 }, { "epoch": 0.07730941552116946, "grad_norm": 2.4588067531585693, "learning_rate": 0.0003978641840087623, "loss": 4.1888, "step": 1141 }, { "epoch": 0.07737717136299345, "grad_norm": 1.3946064710617065, "learning_rate": 0.0003978587075575027, "loss": 4.1066, "step": 1142 }, { "epoch": 0.07744492720481744, "grad_norm": 2.1390771865844727, "learning_rate": 0.0003978532311062431, "loss": 4.1404, "step": 1143 }, { "epoch": 0.07751268304664143, "grad_norm": 1.8265283107757568, "learning_rate": 0.00039784775465498363, "loss": 4.1295, "step": 1144 }, { "epoch": 0.07758043888846541, "grad_norm": 1.5415360927581787, "learning_rate": 0.00039784227820372403, "loss": 4.4015, "step": 1145 }, { "epoch": 0.0776481947302894, "grad_norm": 2.158518075942993, "learning_rate": 0.00039783680175246443, "loss": 3.7127, "step": 1146 }, { "epoch": 0.07771595057211339, "grad_norm": 2.151182174682617, "learning_rate": 0.00039783132530120483, "loss": 4.1543, "step": 1147 }, { "epoch": 0.07778370641393738, "grad_norm": 2.291529893875122, "learning_rate": 0.00039782584884994523, "loss": 3.8075, "step": 1148 }, { "epoch": 0.07785146225576137, "grad_norm": 2.1098005771636963, "learning_rate": 0.0003978203723986857, "loss": 3.7958, "step": 1149 }, { "epoch": 0.07791921809758535, "grad_norm": 1.9717562198638916, "learning_rate": 0.0003978148959474261, "loss": 3.7032, "step": 1150 }, { "epoch": 0.07798697393940934, "grad_norm": 15.983768463134766, "learning_rate": 0.00039780941949616654, "loss": 3.6224, "step": 1151 }, { "epoch": 0.07805472978123333, "grad_norm": 2.297041654586792, "learning_rate": 0.00039780394304490693, "loss": 3.739, "step": 1152 }, { "epoch": 0.07812248562305732, "grad_norm": 1.7756987810134888, "learning_rate": 0.00039779846659364733, "loss": 3.9994, "step": 1153 }, { "epoch": 0.0781902414648813, "grad_norm": 2.3172268867492676, "learning_rate": 0.00039779299014238773, "loss": 4.0928, "step": 1154 }, { "epoch": 0.07825799730670528, "grad_norm": 1.8446166515350342, "learning_rate": 0.0003977875136911282, "loss": 3.9243, "step": 1155 }, { "epoch": 0.07832575314852927, "grad_norm": 4.68627405166626, "learning_rate": 0.0003977820372398686, "loss": 4.1586, "step": 1156 }, { "epoch": 0.07839350899035326, "grad_norm": 3.5925960540771484, "learning_rate": 0.000397776560788609, "loss": 3.6326, "step": 1157 }, { "epoch": 0.07846126483217725, "grad_norm": 2.144824743270874, "learning_rate": 0.0003977710843373494, "loss": 3.8534, "step": 1158 }, { "epoch": 0.07852902067400123, "grad_norm": 2.5469396114349365, "learning_rate": 0.00039776560788608984, "loss": 3.6471, "step": 1159 }, { "epoch": 0.07859677651582522, "grad_norm": 2.061211347579956, "learning_rate": 0.00039776013143483024, "loss": 3.7156, "step": 1160 }, { "epoch": 0.07866453235764921, "grad_norm": 1.4750325679779053, "learning_rate": 0.0003977546549835707, "loss": 3.9912, "step": 1161 }, { "epoch": 0.0787322881994732, "grad_norm": 3.591585159301758, "learning_rate": 0.0003977491785323111, "loss": 3.6267, "step": 1162 }, { "epoch": 0.07880004404129719, "grad_norm": 4.180237293243408, "learning_rate": 0.0003977437020810515, "loss": 3.6864, "step": 1163 }, { "epoch": 0.07886779988312118, "grad_norm": 2.0687642097473145, "learning_rate": 0.0003977382256297919, "loss": 3.9684, "step": 1164 }, { "epoch": 0.07893555572494516, "grad_norm": 1.9622997045516968, "learning_rate": 0.00039773274917853234, "loss": 4.2062, "step": 1165 }, { "epoch": 0.07900331156676915, "grad_norm": 2.522752285003662, "learning_rate": 0.00039772727272727274, "loss": 3.9249, "step": 1166 }, { "epoch": 0.07907106740859314, "grad_norm": 2.1261966228485107, "learning_rate": 0.0003977217962760132, "loss": 4.1087, "step": 1167 }, { "epoch": 0.07913882325041713, "grad_norm": 1.7126882076263428, "learning_rate": 0.0003977163198247536, "loss": 4.3366, "step": 1168 }, { "epoch": 0.0792065790922411, "grad_norm": 1.7277259826660156, "learning_rate": 0.000397710843373494, "loss": 3.702, "step": 1169 }, { "epoch": 0.07927433493406509, "grad_norm": 2.5878758430480957, "learning_rate": 0.0003977053669222344, "loss": 4.0585, "step": 1170 }, { "epoch": 0.07934209077588908, "grad_norm": 1.8253357410430908, "learning_rate": 0.00039769989047097484, "loss": 4.1687, "step": 1171 }, { "epoch": 0.07940984661771307, "grad_norm": 2.15036940574646, "learning_rate": 0.00039769441401971524, "loss": 3.8367, "step": 1172 }, { "epoch": 0.07947760245953706, "grad_norm": 2.114055871963501, "learning_rate": 0.00039768893756845564, "loss": 3.707, "step": 1173 }, { "epoch": 0.07954535830136104, "grad_norm": 2.469308614730835, "learning_rate": 0.00039768346111719604, "loss": 3.6145, "step": 1174 }, { "epoch": 0.07961311414318503, "grad_norm": 1.8836160898208618, "learning_rate": 0.0003976779846659365, "loss": 3.9521, "step": 1175 }, { "epoch": 0.07968086998500902, "grad_norm": 1.7649003267288208, "learning_rate": 0.0003976725082146769, "loss": 4.0314, "step": 1176 }, { "epoch": 0.07974862582683301, "grad_norm": 2.820612907409668, "learning_rate": 0.00039766703176341735, "loss": 3.6167, "step": 1177 }, { "epoch": 0.079816381668657, "grad_norm": 2.147737979888916, "learning_rate": 0.00039766155531215775, "loss": 3.6669, "step": 1178 }, { "epoch": 0.07988413751048098, "grad_norm": 1.8214514255523682, "learning_rate": 0.00039765607886089815, "loss": 3.5683, "step": 1179 }, { "epoch": 0.07995189335230497, "grad_norm": 1.6605969667434692, "learning_rate": 0.00039765060240963855, "loss": 4.1463, "step": 1180 }, { "epoch": 0.08001964919412896, "grad_norm": 2.7118911743164062, "learning_rate": 0.00039764512595837895, "loss": 3.3552, "step": 1181 }, { "epoch": 0.08008740503595295, "grad_norm": 2.5016448497772217, "learning_rate": 0.00039763964950711945, "loss": 3.3393, "step": 1182 }, { "epoch": 0.08015516087777694, "grad_norm": 3.1485249996185303, "learning_rate": 0.00039763417305585985, "loss": 4.069, "step": 1183 }, { "epoch": 0.08022291671960091, "grad_norm": 2.1321465969085693, "learning_rate": 0.00039762869660460025, "loss": 3.6277, "step": 1184 }, { "epoch": 0.0802906725614249, "grad_norm": 2.209886074066162, "learning_rate": 0.00039762322015334065, "loss": 3.8263, "step": 1185 }, { "epoch": 0.08035842840324889, "grad_norm": 2.7138776779174805, "learning_rate": 0.00039761774370208105, "loss": 3.3651, "step": 1186 }, { "epoch": 0.08042618424507288, "grad_norm": 1.907503366470337, "learning_rate": 0.0003976122672508215, "loss": 3.9095, "step": 1187 }, { "epoch": 0.08049394008689686, "grad_norm": 2.3787288665771484, "learning_rate": 0.0003976067907995619, "loss": 3.618, "step": 1188 }, { "epoch": 0.08056169592872085, "grad_norm": 1.8711837530136108, "learning_rate": 0.0003976013143483023, "loss": 3.8969, "step": 1189 }, { "epoch": 0.08062945177054484, "grad_norm": 2.116868257522583, "learning_rate": 0.00039759583789704276, "loss": 3.4202, "step": 1190 }, { "epoch": 0.08069720761236883, "grad_norm": 2.397768497467041, "learning_rate": 0.00039759036144578315, "loss": 3.5774, "step": 1191 }, { "epoch": 0.08076496345419282, "grad_norm": 2.316174030303955, "learning_rate": 0.00039758488499452355, "loss": 3.7317, "step": 1192 }, { "epoch": 0.0808327192960168, "grad_norm": 1.8714388608932495, "learning_rate": 0.000397579408543264, "loss": 3.7417, "step": 1193 }, { "epoch": 0.0809004751378408, "grad_norm": 3.3521344661712646, "learning_rate": 0.0003975739320920044, "loss": 3.8725, "step": 1194 }, { "epoch": 0.08096823097966478, "grad_norm": 2.0648295879364014, "learning_rate": 0.0003975684556407448, "loss": 3.9086, "step": 1195 }, { "epoch": 0.08103598682148877, "grad_norm": 2.3664958477020264, "learning_rate": 0.0003975629791894852, "loss": 4.2032, "step": 1196 }, { "epoch": 0.08110374266331276, "grad_norm": 1.995086908340454, "learning_rate": 0.0003975575027382256, "loss": 3.8502, "step": 1197 }, { "epoch": 0.08117149850513675, "grad_norm": 1.6786521673202515, "learning_rate": 0.00039755202628696606, "loss": 4.3441, "step": 1198 }, { "epoch": 0.08123925434696072, "grad_norm": 2.189594030380249, "learning_rate": 0.0003975465498357065, "loss": 4.1061, "step": 1199 }, { "epoch": 0.08130701018878471, "grad_norm": 2.766935110092163, "learning_rate": 0.0003975410733844469, "loss": 3.564, "step": 1200 }, { "epoch": 0.0813747660306087, "grad_norm": 2.5711405277252197, "learning_rate": 0.0003975355969331873, "loss": 3.6341, "step": 1201 }, { "epoch": 0.08144252187243269, "grad_norm": 2.426211357116699, "learning_rate": 0.0003975301204819277, "loss": 4.0764, "step": 1202 }, { "epoch": 0.08151027771425667, "grad_norm": 2.789097309112549, "learning_rate": 0.00039752464403066816, "loss": 3.7514, "step": 1203 }, { "epoch": 0.08157803355608066, "grad_norm": 2.1677470207214355, "learning_rate": 0.00039751916757940856, "loss": 3.9546, "step": 1204 }, { "epoch": 0.08164578939790465, "grad_norm": 2.308382511138916, "learning_rate": 0.00039751369112814896, "loss": 3.7583, "step": 1205 }, { "epoch": 0.08171354523972864, "grad_norm": 2.3552372455596924, "learning_rate": 0.0003975082146768894, "loss": 3.9251, "step": 1206 }, { "epoch": 0.08178130108155263, "grad_norm": 5.0255656242370605, "learning_rate": 0.0003975027382256298, "loss": 3.1247, "step": 1207 }, { "epoch": 0.08184905692337661, "grad_norm": 2.271611213684082, "learning_rate": 0.0003974972617743702, "loss": 3.9579, "step": 1208 }, { "epoch": 0.0819168127652006, "grad_norm": 2.6856629848480225, "learning_rate": 0.00039749178532311067, "loss": 3.4936, "step": 1209 }, { "epoch": 0.08198456860702459, "grad_norm": 2.271657943725586, "learning_rate": 0.00039748630887185106, "loss": 3.4154, "step": 1210 }, { "epoch": 0.08205232444884858, "grad_norm": 1.982198715209961, "learning_rate": 0.00039748083242059146, "loss": 4.0364, "step": 1211 }, { "epoch": 0.08212008029067257, "grad_norm": 2.110283136367798, "learning_rate": 0.00039747535596933186, "loss": 3.9218, "step": 1212 }, { "epoch": 0.08218783613249656, "grad_norm": 2.626458168029785, "learning_rate": 0.0003974698795180723, "loss": 3.7014, "step": 1213 }, { "epoch": 0.08225559197432053, "grad_norm": 2.199005365371704, "learning_rate": 0.0003974644030668127, "loss": 3.7801, "step": 1214 }, { "epoch": 0.08232334781614452, "grad_norm": 10.258249282836914, "learning_rate": 0.00039745892661555317, "loss": 3.7388, "step": 1215 }, { "epoch": 0.0823911036579685, "grad_norm": 6.497037887573242, "learning_rate": 0.00039745345016429357, "loss": 3.8982, "step": 1216 }, { "epoch": 0.0824588594997925, "grad_norm": 2.4071481227874756, "learning_rate": 0.00039744797371303397, "loss": 3.3666, "step": 1217 }, { "epoch": 0.08252661534161648, "grad_norm": 3.222608804702759, "learning_rate": 0.00039744249726177437, "loss": 3.4107, "step": 1218 }, { "epoch": 0.08259437118344047, "grad_norm": 2.414372682571411, "learning_rate": 0.00039743702081051477, "loss": 3.832, "step": 1219 }, { "epoch": 0.08266212702526446, "grad_norm": 2.607917070388794, "learning_rate": 0.0003974315443592552, "loss": 3.6625, "step": 1220 }, { "epoch": 0.08272988286708845, "grad_norm": 3.644857883453369, "learning_rate": 0.0003974260679079957, "loss": 3.3418, "step": 1221 }, { "epoch": 0.08279763870891244, "grad_norm": 4.009765625, "learning_rate": 0.00039742059145673607, "loss": 3.6009, "step": 1222 }, { "epoch": 0.08286539455073642, "grad_norm": 4.018517971038818, "learning_rate": 0.00039741511500547647, "loss": 3.2331, "step": 1223 }, { "epoch": 0.08293315039256041, "grad_norm": 3.6456215381622314, "learning_rate": 0.00039740963855421687, "loss": 3.8497, "step": 1224 }, { "epoch": 0.0830009062343844, "grad_norm": 2.227560520172119, "learning_rate": 0.0003974041621029573, "loss": 3.8611, "step": 1225 }, { "epoch": 0.08306866207620839, "grad_norm": 2.6084868907928467, "learning_rate": 0.0003973986856516977, "loss": 3.999, "step": 1226 }, { "epoch": 0.08313641791803238, "grad_norm": 2.6379199028015137, "learning_rate": 0.0003973932092004381, "loss": 3.446, "step": 1227 }, { "epoch": 0.08320417375985635, "grad_norm": 2.3672125339508057, "learning_rate": 0.0003973877327491785, "loss": 3.1848, "step": 1228 }, { "epoch": 0.08327192960168034, "grad_norm": 3.0808262825012207, "learning_rate": 0.000397382256297919, "loss": 3.4877, "step": 1229 }, { "epoch": 0.08333968544350433, "grad_norm": 5.694168567657471, "learning_rate": 0.0003973767798466594, "loss": 3.4092, "step": 1230 }, { "epoch": 0.08340744128532832, "grad_norm": 3.3568620681762695, "learning_rate": 0.00039737130339539983, "loss": 3.5356, "step": 1231 }, { "epoch": 0.0834751971271523, "grad_norm": 2.977346181869507, "learning_rate": 0.0003973658269441402, "loss": 3.4667, "step": 1232 }, { "epoch": 0.08354295296897629, "grad_norm": 2.025078773498535, "learning_rate": 0.0003973603504928806, "loss": 3.8365, "step": 1233 }, { "epoch": 0.08361070881080028, "grad_norm": 2.478451728820801, "learning_rate": 0.000397354874041621, "loss": 4.1945, "step": 1234 }, { "epoch": 0.08367846465262427, "grad_norm": 2.5750892162323, "learning_rate": 0.0003973493975903614, "loss": 3.6372, "step": 1235 }, { "epoch": 0.08374622049444826, "grad_norm": 1.994739055633545, "learning_rate": 0.0003973439211391019, "loss": 3.7147, "step": 1236 }, { "epoch": 0.08381397633627224, "grad_norm": 2.232776641845703, "learning_rate": 0.00039733844468784233, "loss": 3.7312, "step": 1237 }, { "epoch": 0.08388173217809623, "grad_norm": 2.582740068435669, "learning_rate": 0.00039733296823658273, "loss": 3.3792, "step": 1238 }, { "epoch": 0.08394948801992022, "grad_norm": 3.110473394393921, "learning_rate": 0.00039732749178532313, "loss": 3.5127, "step": 1239 }, { "epoch": 0.08401724386174421, "grad_norm": 5.40733003616333, "learning_rate": 0.00039732201533406353, "loss": 3.675, "step": 1240 }, { "epoch": 0.0840849997035682, "grad_norm": 3.805846691131592, "learning_rate": 0.000397316538882804, "loss": 3.7882, "step": 1241 }, { "epoch": 0.08415275554539219, "grad_norm": 2.9184610843658447, "learning_rate": 0.0003973110624315444, "loss": 3.1571, "step": 1242 }, { "epoch": 0.08422051138721616, "grad_norm": 2.2012500762939453, "learning_rate": 0.0003973055859802848, "loss": 3.5428, "step": 1243 }, { "epoch": 0.08428826722904015, "grad_norm": 4.310023784637451, "learning_rate": 0.00039730010952902523, "loss": 3.7799, "step": 1244 }, { "epoch": 0.08435602307086414, "grad_norm": 4.646803855895996, "learning_rate": 0.00039729463307776563, "loss": 3.6777, "step": 1245 }, { "epoch": 0.08442377891268812, "grad_norm": 3.766892194747925, "learning_rate": 0.00039728915662650603, "loss": 3.4044, "step": 1246 }, { "epoch": 0.08449153475451211, "grad_norm": 4.320135593414307, "learning_rate": 0.0003972836801752465, "loss": 3.8089, "step": 1247 }, { "epoch": 0.0845592905963361, "grad_norm": 2.767925500869751, "learning_rate": 0.0003972782037239869, "loss": 3.7742, "step": 1248 }, { "epoch": 0.08462704643816009, "grad_norm": 2.7828221321105957, "learning_rate": 0.0003972727272727273, "loss": 3.8359, "step": 1249 }, { "epoch": 0.08469480227998408, "grad_norm": 3.131392478942871, "learning_rate": 0.0003972672508214677, "loss": 3.4213, "step": 1250 }, { "epoch": 0.08476255812180807, "grad_norm": 5.914330959320068, "learning_rate": 0.0003972617743702081, "loss": 3.5991, "step": 1251 }, { "epoch": 0.08483031396363205, "grad_norm": 3.821100950241089, "learning_rate": 0.00039725629791894854, "loss": 3.7064, "step": 1252 }, { "epoch": 0.08489806980545604, "grad_norm": 2.647351026535034, "learning_rate": 0.000397250821467689, "loss": 3.5764, "step": 1253 }, { "epoch": 0.08496582564728003, "grad_norm": 2.7663350105285645, "learning_rate": 0.0003972453450164294, "loss": 3.799, "step": 1254 }, { "epoch": 0.08503358148910402, "grad_norm": 2.8820762634277344, "learning_rate": 0.0003972398685651698, "loss": 3.4514, "step": 1255 }, { "epoch": 0.085101337330928, "grad_norm": 3.7021536827087402, "learning_rate": 0.0003972343921139102, "loss": 3.6858, "step": 1256 }, { "epoch": 0.085169093172752, "grad_norm": 3.1065738201141357, "learning_rate": 0.0003972289156626506, "loss": 3.5724, "step": 1257 }, { "epoch": 0.08523684901457597, "grad_norm": 2.983675003051758, "learning_rate": 0.00039722343921139104, "loss": 3.6062, "step": 1258 }, { "epoch": 0.08530460485639996, "grad_norm": 3.379542112350464, "learning_rate": 0.00039721796276013144, "loss": 3.6398, "step": 1259 }, { "epoch": 0.08537236069822395, "grad_norm": 4.429060459136963, "learning_rate": 0.0003972124863088719, "loss": 3.5044, "step": 1260 }, { "epoch": 0.08544011654004793, "grad_norm": 3.0019874572753906, "learning_rate": 0.0003972070098576123, "loss": 3.5509, "step": 1261 }, { "epoch": 0.08550787238187192, "grad_norm": 1.8628062009811401, "learning_rate": 0.0003972015334063527, "loss": 3.919, "step": 1262 }, { "epoch": 0.08557562822369591, "grad_norm": 5.054527759552002, "learning_rate": 0.00039719605695509314, "loss": 2.9386, "step": 1263 }, { "epoch": 0.0856433840655199, "grad_norm": 2.693042516708374, "learning_rate": 0.00039719058050383354, "loss": 3.7089, "step": 1264 }, { "epoch": 0.08571113990734389, "grad_norm": 2.401817798614502, "learning_rate": 0.00039718510405257394, "loss": 3.6142, "step": 1265 }, { "epoch": 0.08577889574916787, "grad_norm": 2.529827117919922, "learning_rate": 0.00039717962760131434, "loss": 3.3647, "step": 1266 }, { "epoch": 0.08584665159099186, "grad_norm": 1.9811527729034424, "learning_rate": 0.00039717415115005474, "loss": 3.8447, "step": 1267 }, { "epoch": 0.08591440743281585, "grad_norm": 2.4195988178253174, "learning_rate": 0.0003971686746987952, "loss": 3.7935, "step": 1268 }, { "epoch": 0.08598216327463984, "grad_norm": 3.4873714447021484, "learning_rate": 0.00039716319824753565, "loss": 3.8628, "step": 1269 }, { "epoch": 0.08604991911646383, "grad_norm": 3.22560715675354, "learning_rate": 0.00039715772179627605, "loss": 3.545, "step": 1270 }, { "epoch": 0.08611767495828782, "grad_norm": 2.0998849868774414, "learning_rate": 0.00039715224534501645, "loss": 4.0359, "step": 1271 }, { "epoch": 0.0861854308001118, "grad_norm": 3.034543752670288, "learning_rate": 0.00039714676889375685, "loss": 3.6052, "step": 1272 }, { "epoch": 0.08625318664193578, "grad_norm": 3.707247734069824, "learning_rate": 0.00039714129244249725, "loss": 3.0152, "step": 1273 }, { "epoch": 0.08632094248375977, "grad_norm": 3.397484302520752, "learning_rate": 0.0003971358159912377, "loss": 3.2396, "step": 1274 }, { "epoch": 0.08638869832558375, "grad_norm": 2.2703237533569336, "learning_rate": 0.0003971303395399781, "loss": 3.5158, "step": 1275 }, { "epoch": 0.08645645416740774, "grad_norm": 3.1869099140167236, "learning_rate": 0.00039712486308871855, "loss": 3.7413, "step": 1276 }, { "epoch": 0.08652421000923173, "grad_norm": 7.536550521850586, "learning_rate": 0.00039711938663745895, "loss": 3.9123, "step": 1277 }, { "epoch": 0.08659196585105572, "grad_norm": 3.970386266708374, "learning_rate": 0.00039711391018619935, "loss": 3.7636, "step": 1278 }, { "epoch": 0.08665972169287971, "grad_norm": 8.451091766357422, "learning_rate": 0.0003971084337349398, "loss": 3.2727, "step": 1279 }, { "epoch": 0.0867274775347037, "grad_norm": 3.6506245136260986, "learning_rate": 0.0003971029572836802, "loss": 3.2431, "step": 1280 }, { "epoch": 0.08679523337652768, "grad_norm": 2.3779211044311523, "learning_rate": 0.0003970974808324206, "loss": 4.0591, "step": 1281 }, { "epoch": 0.08686298921835167, "grad_norm": 3.352254867553711, "learning_rate": 0.000397092004381161, "loss": 3.4645, "step": 1282 }, { "epoch": 0.08693074506017566, "grad_norm": 2.5513463020324707, "learning_rate": 0.00039708652792990145, "loss": 3.7434, "step": 1283 }, { "epoch": 0.08699850090199965, "grad_norm": 2.913905620574951, "learning_rate": 0.00039708105147864185, "loss": 3.9168, "step": 1284 }, { "epoch": 0.08706625674382364, "grad_norm": 2.863588333129883, "learning_rate": 0.0003970755750273823, "loss": 3.2169, "step": 1285 }, { "epoch": 0.08713401258564762, "grad_norm": 2.9639158248901367, "learning_rate": 0.0003970700985761227, "loss": 3.5472, "step": 1286 }, { "epoch": 0.08720176842747161, "grad_norm": 2.3217127323150635, "learning_rate": 0.0003970646221248631, "loss": 3.0184, "step": 1287 }, { "epoch": 0.08726952426929559, "grad_norm": 3.683032274246216, "learning_rate": 0.0003970591456736035, "loss": 3.6671, "step": 1288 }, { "epoch": 0.08733728011111958, "grad_norm": 4.01596736907959, "learning_rate": 0.0003970536692223439, "loss": 3.9717, "step": 1289 }, { "epoch": 0.08740503595294356, "grad_norm": 3.7086997032165527, "learning_rate": 0.00039704819277108436, "loss": 2.9476, "step": 1290 }, { "epoch": 0.08747279179476755, "grad_norm": 2.7542834281921387, "learning_rate": 0.0003970427163198248, "loss": 3.6172, "step": 1291 }, { "epoch": 0.08754054763659154, "grad_norm": 4.100141525268555, "learning_rate": 0.0003970372398685652, "loss": 3.2291, "step": 1292 }, { "epoch": 0.08760830347841553, "grad_norm": 3.0796902179718018, "learning_rate": 0.0003970317634173056, "loss": 3.712, "step": 1293 }, { "epoch": 0.08767605932023952, "grad_norm": 3.585057497024536, "learning_rate": 0.000397026286966046, "loss": 3.0526, "step": 1294 }, { "epoch": 0.0877438151620635, "grad_norm": 3.9870405197143555, "learning_rate": 0.0003970208105147864, "loss": 3.2653, "step": 1295 }, { "epoch": 0.08781157100388749, "grad_norm": 2.910722255706787, "learning_rate": 0.00039701533406352686, "loss": 3.4261, "step": 1296 }, { "epoch": 0.08787932684571148, "grad_norm": 2.6165783405303955, "learning_rate": 0.00039700985761226726, "loss": 3.7293, "step": 1297 }, { "epoch": 0.08794708268753547, "grad_norm": 4.945798397064209, "learning_rate": 0.00039700438116100766, "loss": 3.5293, "step": 1298 }, { "epoch": 0.08801483852935946, "grad_norm": 2.7560267448425293, "learning_rate": 0.0003969989047097481, "loss": 3.4106, "step": 1299 }, { "epoch": 0.08808259437118345, "grad_norm": 7.161624431610107, "learning_rate": 0.0003969934282584885, "loss": 3.4445, "step": 1300 }, { "epoch": 0.08815035021300743, "grad_norm": 5.628994464874268, "learning_rate": 0.00039698795180722897, "loss": 3.5507, "step": 1301 }, { "epoch": 0.08821810605483142, "grad_norm": 3.652656316757202, "learning_rate": 0.00039698247535596936, "loss": 3.2534, "step": 1302 }, { "epoch": 0.0882858618966554, "grad_norm": 6.7578959465026855, "learning_rate": 0.00039697699890470976, "loss": 4.0627, "step": 1303 }, { "epoch": 0.08835361773847938, "grad_norm": 8.673806190490723, "learning_rate": 0.00039697152245345016, "loss": 3.3421, "step": 1304 }, { "epoch": 0.08842137358030337, "grad_norm": 2.8153746128082275, "learning_rate": 0.00039696604600219056, "loss": 2.7227, "step": 1305 }, { "epoch": 0.08848912942212736, "grad_norm": 4.202126502990723, "learning_rate": 0.000396960569550931, "loss": 3.5121, "step": 1306 }, { "epoch": 0.08855688526395135, "grad_norm": 3.4320075511932373, "learning_rate": 0.00039695509309967147, "loss": 3.4808, "step": 1307 }, { "epoch": 0.08862464110577534, "grad_norm": 3.0245823860168457, "learning_rate": 0.00039694961664841187, "loss": 3.0227, "step": 1308 }, { "epoch": 0.08869239694759933, "grad_norm": 2.5735254287719727, "learning_rate": 0.00039694414019715227, "loss": 3.2619, "step": 1309 }, { "epoch": 0.08876015278942331, "grad_norm": 5.699209213256836, "learning_rate": 0.00039693866374589267, "loss": 3.2324, "step": 1310 }, { "epoch": 0.0888279086312473, "grad_norm": 4.076539993286133, "learning_rate": 0.00039693318729463307, "loss": 2.823, "step": 1311 }, { "epoch": 0.08889566447307129, "grad_norm": 4.222191333770752, "learning_rate": 0.0003969277108433735, "loss": 3.6137, "step": 1312 }, { "epoch": 0.08896342031489528, "grad_norm": 3.719456672668457, "learning_rate": 0.0003969222343921139, "loss": 3.6398, "step": 1313 }, { "epoch": 0.08903117615671927, "grad_norm": 2.9299585819244385, "learning_rate": 0.00039691675794085437, "loss": 3.5518, "step": 1314 }, { "epoch": 0.08909893199854325, "grad_norm": 3.1081583499908447, "learning_rate": 0.00039691128148959477, "loss": 3.7595, "step": 1315 }, { "epoch": 0.08916668784036724, "grad_norm": 2.9591050148010254, "learning_rate": 0.00039690580503833517, "loss": 3.9243, "step": 1316 }, { "epoch": 0.08923444368219123, "grad_norm": 2.3958845138549805, "learning_rate": 0.0003969003285870756, "loss": 3.5978, "step": 1317 }, { "epoch": 0.0893021995240152, "grad_norm": 2.804438591003418, "learning_rate": 0.000396894852135816, "loss": 2.9166, "step": 1318 }, { "epoch": 0.0893699553658392, "grad_norm": 3.00793194770813, "learning_rate": 0.0003968893756845564, "loss": 3.2201, "step": 1319 }, { "epoch": 0.08943771120766318, "grad_norm": 3.10834002494812, "learning_rate": 0.0003968838992332968, "loss": 3.7626, "step": 1320 }, { "epoch": 0.08950546704948717, "grad_norm": 4.358127117156982, "learning_rate": 0.0003968784227820372, "loss": 3.3129, "step": 1321 }, { "epoch": 0.08957322289131116, "grad_norm": 4.7332305908203125, "learning_rate": 0.0003968729463307777, "loss": 3.4599, "step": 1322 }, { "epoch": 0.08964097873313515, "grad_norm": 4.504525184631348, "learning_rate": 0.00039686746987951813, "loss": 3.0129, "step": 1323 }, { "epoch": 0.08970873457495913, "grad_norm": 3.25687837600708, "learning_rate": 0.0003968619934282585, "loss": 2.944, "step": 1324 }, { "epoch": 0.08977649041678312, "grad_norm": 4.91163444519043, "learning_rate": 0.0003968565169769989, "loss": 3.6705, "step": 1325 }, { "epoch": 0.08984424625860711, "grad_norm": 2.9517040252685547, "learning_rate": 0.0003968510405257393, "loss": 3.0409, "step": 1326 }, { "epoch": 0.0899120021004311, "grad_norm": 3.991425037384033, "learning_rate": 0.0003968455640744797, "loss": 3.1587, "step": 1327 }, { "epoch": 0.08997975794225509, "grad_norm": 2.9522857666015625, "learning_rate": 0.0003968400876232202, "loss": 3.3263, "step": 1328 }, { "epoch": 0.09004751378407908, "grad_norm": 3.084933280944824, "learning_rate": 0.0003968346111719606, "loss": 3.6117, "step": 1329 }, { "epoch": 0.09011526962590306, "grad_norm": 4.300806999206543, "learning_rate": 0.00039682913472070103, "loss": 3.3952, "step": 1330 }, { "epoch": 0.09018302546772705, "grad_norm": 3.70619797706604, "learning_rate": 0.00039682365826944143, "loss": 2.8007, "step": 1331 }, { "epoch": 0.09025078130955104, "grad_norm": 3.1480019092559814, "learning_rate": 0.00039681818181818183, "loss": 3.4314, "step": 1332 }, { "epoch": 0.09031853715137501, "grad_norm": 5.503115653991699, "learning_rate": 0.00039681270536692223, "loss": 3.2433, "step": 1333 }, { "epoch": 0.090386292993199, "grad_norm": 5.408228397369385, "learning_rate": 0.0003968072289156627, "loss": 2.953, "step": 1334 }, { "epoch": 0.09045404883502299, "grad_norm": 4.142291069030762, "learning_rate": 0.0003968017524644031, "loss": 3.5657, "step": 1335 }, { "epoch": 0.09052180467684698, "grad_norm": 6.977697849273682, "learning_rate": 0.0003967962760131435, "loss": 3.2821, "step": 1336 }, { "epoch": 0.09058956051867097, "grad_norm": 3.5408384799957275, "learning_rate": 0.0003967907995618839, "loss": 3.3353, "step": 1337 }, { "epoch": 0.09065731636049496, "grad_norm": 7.353470802307129, "learning_rate": 0.00039678532311062433, "loss": 3.4904, "step": 1338 }, { "epoch": 0.09072507220231894, "grad_norm": 5.857029914855957, "learning_rate": 0.0003967798466593648, "loss": 3.1007, "step": 1339 }, { "epoch": 0.09079282804414293, "grad_norm": 3.2496349811553955, "learning_rate": 0.0003967743702081052, "loss": 3.0114, "step": 1340 }, { "epoch": 0.09086058388596692, "grad_norm": 4.774024486541748, "learning_rate": 0.0003967688937568456, "loss": 3.5133, "step": 1341 }, { "epoch": 0.09092833972779091, "grad_norm": 4.487707138061523, "learning_rate": 0.000396763417305586, "loss": 3.2241, "step": 1342 }, { "epoch": 0.0909960955696149, "grad_norm": 4.201592445373535, "learning_rate": 0.0003967579408543264, "loss": 3.7208, "step": 1343 }, { "epoch": 0.09106385141143888, "grad_norm": 8.039995193481445, "learning_rate": 0.00039675246440306684, "loss": 3.5228, "step": 1344 }, { "epoch": 0.09113160725326287, "grad_norm": 8.409346580505371, "learning_rate": 0.0003967469879518073, "loss": 3.6083, "step": 1345 }, { "epoch": 0.09119936309508686, "grad_norm": 3.661201000213623, "learning_rate": 0.0003967415115005477, "loss": 3.6194, "step": 1346 }, { "epoch": 0.09126711893691083, "grad_norm": 3.2771658897399902, "learning_rate": 0.0003967360350492881, "loss": 3.1981, "step": 1347 }, { "epoch": 0.09133487477873482, "grad_norm": 2.745028257369995, "learning_rate": 0.0003967305585980285, "loss": 3.0765, "step": 1348 }, { "epoch": 0.09140263062055881, "grad_norm": 3.067625045776367, "learning_rate": 0.0003967250821467689, "loss": 3.4039, "step": 1349 }, { "epoch": 0.0914703864623828, "grad_norm": 4.588881015777588, "learning_rate": 0.00039671960569550934, "loss": 3.5035, "step": 1350 }, { "epoch": 0.09153814230420679, "grad_norm": 3.3754680156707764, "learning_rate": 0.00039671412924424974, "loss": 3.459, "step": 1351 }, { "epoch": 0.09160589814603078, "grad_norm": 3.2061920166015625, "learning_rate": 0.00039670865279299014, "loss": 3.6217, "step": 1352 }, { "epoch": 0.09167365398785476, "grad_norm": 3.9220097064971924, "learning_rate": 0.0003967031763417306, "loss": 3.4331, "step": 1353 }, { "epoch": 0.09174140982967875, "grad_norm": 3.916447639465332, "learning_rate": 0.000396697699890471, "loss": 3.1779, "step": 1354 }, { "epoch": 0.09180916567150274, "grad_norm": 5.143884181976318, "learning_rate": 0.00039669222343921144, "loss": 3.0871, "step": 1355 }, { "epoch": 0.09187692151332673, "grad_norm": 3.3875112533569336, "learning_rate": 0.00039668674698795184, "loss": 2.807, "step": 1356 }, { "epoch": 0.09194467735515072, "grad_norm": 4.5538506507873535, "learning_rate": 0.00039668127053669224, "loss": 3.0644, "step": 1357 }, { "epoch": 0.0920124331969747, "grad_norm": 4.417010307312012, "learning_rate": 0.00039667579408543264, "loss": 3.4697, "step": 1358 }, { "epoch": 0.0920801890387987, "grad_norm": 5.526939392089844, "learning_rate": 0.00039667031763417304, "loss": 3.6266, "step": 1359 }, { "epoch": 0.09214794488062268, "grad_norm": 5.602273941040039, "learning_rate": 0.0003966648411829135, "loss": 3.2932, "step": 1360 }, { "epoch": 0.09221570072244667, "grad_norm": 3.233726739883423, "learning_rate": 0.00039665936473165395, "loss": 3.3159, "step": 1361 }, { "epoch": 0.09228345656427064, "grad_norm": 4.651501178741455, "learning_rate": 0.00039665388828039435, "loss": 3.0679, "step": 1362 }, { "epoch": 0.09235121240609463, "grad_norm": 4.072600841522217, "learning_rate": 0.00039664841182913475, "loss": 3.5562, "step": 1363 }, { "epoch": 0.09241896824791862, "grad_norm": 3.0264618396759033, "learning_rate": 0.00039664293537787515, "loss": 3.1274, "step": 1364 }, { "epoch": 0.09248672408974261, "grad_norm": 3.4799113273620605, "learning_rate": 0.00039663745892661555, "loss": 3.1202, "step": 1365 }, { "epoch": 0.0925544799315666, "grad_norm": 3.753041982650757, "learning_rate": 0.000396631982475356, "loss": 3.2853, "step": 1366 }, { "epoch": 0.09262223577339058, "grad_norm": 3.1483142375946045, "learning_rate": 0.0003966265060240964, "loss": 2.8398, "step": 1367 }, { "epoch": 0.09268999161521457, "grad_norm": 4.377002716064453, "learning_rate": 0.0003966210295728368, "loss": 2.6961, "step": 1368 }, { "epoch": 0.09275774745703856, "grad_norm": 5.337320327758789, "learning_rate": 0.00039661555312157725, "loss": 3.8357, "step": 1369 }, { "epoch": 0.09282550329886255, "grad_norm": 8.339109420776367, "learning_rate": 0.00039661007667031765, "loss": 3.4294, "step": 1370 }, { "epoch": 0.09289325914068654, "grad_norm": 4.493359565734863, "learning_rate": 0.00039660460021905805, "loss": 3.004, "step": 1371 }, { "epoch": 0.09296101498251053, "grad_norm": 3.51812481880188, "learning_rate": 0.0003965991237677985, "loss": 3.3724, "step": 1372 }, { "epoch": 0.09302877082433451, "grad_norm": 6.700555801391602, "learning_rate": 0.0003965936473165389, "loss": 3.4482, "step": 1373 }, { "epoch": 0.0930965266661585, "grad_norm": 3.407228469848633, "learning_rate": 0.0003965881708652793, "loss": 3.3394, "step": 1374 }, { "epoch": 0.09316428250798249, "grad_norm": 3.750430107116699, "learning_rate": 0.0003965826944140197, "loss": 3.1796, "step": 1375 }, { "epoch": 0.09323203834980648, "grad_norm": 4.640967845916748, "learning_rate": 0.00039657721796276015, "loss": 3.3042, "step": 1376 }, { "epoch": 0.09329979419163045, "grad_norm": 6.119855880737305, "learning_rate": 0.0003965717415115006, "loss": 3.14, "step": 1377 }, { "epoch": 0.09336755003345444, "grad_norm": 3.8944735527038574, "learning_rate": 0.000396566265060241, "loss": 3.4573, "step": 1378 }, { "epoch": 0.09343530587527843, "grad_norm": 6.502141952514648, "learning_rate": 0.0003965607886089814, "loss": 3.5776, "step": 1379 }, { "epoch": 0.09350306171710242, "grad_norm": 2.7702245712280273, "learning_rate": 0.0003965553121577218, "loss": 3.4609, "step": 1380 }, { "epoch": 0.0935708175589264, "grad_norm": 3.210815668106079, "learning_rate": 0.0003965498357064622, "loss": 3.251, "step": 1381 }, { "epoch": 0.0936385734007504, "grad_norm": 6.19578742980957, "learning_rate": 0.00039654435925520266, "loss": 2.6267, "step": 1382 }, { "epoch": 0.09370632924257438, "grad_norm": 4.072349548339844, "learning_rate": 0.00039653888280394306, "loss": 3.2351, "step": 1383 }, { "epoch": 0.09377408508439837, "grad_norm": 4.1360392570495605, "learning_rate": 0.0003965334063526835, "loss": 3.162, "step": 1384 }, { "epoch": 0.09384184092622236, "grad_norm": 5.888863563537598, "learning_rate": 0.0003965279299014239, "loss": 3.9504, "step": 1385 }, { "epoch": 0.09390959676804635, "grad_norm": 5.3361430168151855, "learning_rate": 0.0003965224534501643, "loss": 3.3076, "step": 1386 }, { "epoch": 0.09397735260987033, "grad_norm": 3.3490030765533447, "learning_rate": 0.0003965169769989047, "loss": 3.0575, "step": 1387 }, { "epoch": 0.09404510845169432, "grad_norm": 3.54402756690979, "learning_rate": 0.00039651150054764516, "loss": 2.7207, "step": 1388 }, { "epoch": 0.09411286429351831, "grad_norm": 6.320133209228516, "learning_rate": 0.00039650602409638556, "loss": 3.2264, "step": 1389 }, { "epoch": 0.0941806201353423, "grad_norm": 4.098153591156006, "learning_rate": 0.00039650054764512596, "loss": 3.4084, "step": 1390 }, { "epoch": 0.09424837597716629, "grad_norm": 3.734311580657959, "learning_rate": 0.00039649507119386636, "loss": 2.9108, "step": 1391 }, { "epoch": 0.09431613181899026, "grad_norm": 8.233625411987305, "learning_rate": 0.0003964895947426068, "loss": 3.3912, "step": 1392 }, { "epoch": 0.09438388766081425, "grad_norm": 3.449410915374756, "learning_rate": 0.00039648411829134727, "loss": 2.9413, "step": 1393 }, { "epoch": 0.09445164350263824, "grad_norm": 4.187081336975098, "learning_rate": 0.00039647864184008766, "loss": 2.9833, "step": 1394 }, { "epoch": 0.09451939934446223, "grad_norm": 3.8117613792419434, "learning_rate": 0.00039647316538882806, "loss": 3.5015, "step": 1395 }, { "epoch": 0.09458715518628621, "grad_norm": 3.2268080711364746, "learning_rate": 0.00039646768893756846, "loss": 3.4804, "step": 1396 }, { "epoch": 0.0946549110281102, "grad_norm": 5.477669715881348, "learning_rate": 0.00039646221248630886, "loss": 2.9596, "step": 1397 }, { "epoch": 0.09472266686993419, "grad_norm": 3.455084800720215, "learning_rate": 0.0003964567360350493, "loss": 3.476, "step": 1398 }, { "epoch": 0.09479042271175818, "grad_norm": 3.612034559249878, "learning_rate": 0.0003964512595837897, "loss": 2.7545, "step": 1399 }, { "epoch": 0.09485817855358217, "grad_norm": 5.558725833892822, "learning_rate": 0.00039644578313253017, "loss": 2.6956, "step": 1400 }, { "epoch": 0.09492593439540616, "grad_norm": 4.886908054351807, "learning_rate": 0.00039644030668127057, "loss": 3.1163, "step": 1401 }, { "epoch": 0.09499369023723014, "grad_norm": 3.92747163772583, "learning_rate": 0.00039643483023001097, "loss": 3.2695, "step": 1402 }, { "epoch": 0.09506144607905413, "grad_norm": 5.023552894592285, "learning_rate": 0.00039642935377875137, "loss": 2.6324, "step": 1403 }, { "epoch": 0.09512920192087812, "grad_norm": 5.036474704742432, "learning_rate": 0.0003964238773274918, "loss": 3.3597, "step": 1404 }, { "epoch": 0.09519695776270211, "grad_norm": 7.500438213348389, "learning_rate": 0.0003964184008762322, "loss": 3.2728, "step": 1405 }, { "epoch": 0.0952647136045261, "grad_norm": 4.588469505310059, "learning_rate": 0.0003964129244249726, "loss": 2.8626, "step": 1406 }, { "epoch": 0.09533246944635007, "grad_norm": 4.683378219604492, "learning_rate": 0.000396407447973713, "loss": 3.4165, "step": 1407 }, { "epoch": 0.09540022528817406, "grad_norm": 3.172830581665039, "learning_rate": 0.00039640197152245347, "loss": 3.1271, "step": 1408 }, { "epoch": 0.09546798112999805, "grad_norm": 5.370691776275635, "learning_rate": 0.00039639649507119387, "loss": 2.6997, "step": 1409 }, { "epoch": 0.09553573697182204, "grad_norm": 4.910064697265625, "learning_rate": 0.0003963910186199343, "loss": 2.9232, "step": 1410 }, { "epoch": 0.09560349281364602, "grad_norm": 5.6997785568237305, "learning_rate": 0.0003963855421686747, "loss": 2.7393, "step": 1411 }, { "epoch": 0.09567124865547001, "grad_norm": 3.432875871658325, "learning_rate": 0.0003963800657174151, "loss": 2.9108, "step": 1412 }, { "epoch": 0.095739004497294, "grad_norm": 5.890623092651367, "learning_rate": 0.0003963745892661555, "loss": 3.1078, "step": 1413 }, { "epoch": 0.09580676033911799, "grad_norm": 3.961026668548584, "learning_rate": 0.000396369112814896, "loss": 3.2651, "step": 1414 }, { "epoch": 0.09587451618094198, "grad_norm": 5.667285442352295, "learning_rate": 0.00039636363636363643, "loss": 2.8413, "step": 1415 }, { "epoch": 0.09594227202276596, "grad_norm": 4.445078372955322, "learning_rate": 0.0003963581599123768, "loss": 3.1501, "step": 1416 }, { "epoch": 0.09601002786458995, "grad_norm": 5.004968643188477, "learning_rate": 0.0003963526834611172, "loss": 3.0727, "step": 1417 }, { "epoch": 0.09607778370641394, "grad_norm": 5.210635662078857, "learning_rate": 0.0003963472070098576, "loss": 2.9248, "step": 1418 }, { "epoch": 0.09614553954823793, "grad_norm": 7.102078914642334, "learning_rate": 0.000396341730558598, "loss": 2.6631, "step": 1419 }, { "epoch": 0.09621329539006192, "grad_norm": 3.516155242919922, "learning_rate": 0.0003963362541073385, "loss": 3.0647, "step": 1420 }, { "epoch": 0.0962810512318859, "grad_norm": 3.451227903366089, "learning_rate": 0.0003963307776560789, "loss": 3.6905, "step": 1421 }, { "epoch": 0.09634880707370988, "grad_norm": 4.446077823638916, "learning_rate": 0.0003963253012048193, "loss": 3.3557, "step": 1422 }, { "epoch": 0.09641656291553387, "grad_norm": 4.488996982574463, "learning_rate": 0.00039631982475355973, "loss": 2.5365, "step": 1423 }, { "epoch": 0.09648431875735786, "grad_norm": 6.061187744140625, "learning_rate": 0.00039631434830230013, "loss": 3.1525, "step": 1424 }, { "epoch": 0.09655207459918184, "grad_norm": 5.657248020172119, "learning_rate": 0.00039630887185104053, "loss": 2.6317, "step": 1425 }, { "epoch": 0.09661983044100583, "grad_norm": 6.425600528717041, "learning_rate": 0.000396303395399781, "loss": 2.9827, "step": 1426 }, { "epoch": 0.09668758628282982, "grad_norm": 5.036628723144531, "learning_rate": 0.0003962979189485214, "loss": 2.879, "step": 1427 }, { "epoch": 0.09675534212465381, "grad_norm": 4.367918968200684, "learning_rate": 0.0003962924424972618, "loss": 2.8053, "step": 1428 }, { "epoch": 0.0968230979664778, "grad_norm": 4.822283744812012, "learning_rate": 0.0003962869660460022, "loss": 3.023, "step": 1429 }, { "epoch": 0.09689085380830179, "grad_norm": 6.562445640563965, "learning_rate": 0.0003962814895947426, "loss": 3.0136, "step": 1430 }, { "epoch": 0.09695860965012577, "grad_norm": 4.266103267669678, "learning_rate": 0.0003962760131434831, "loss": 2.6668, "step": 1431 }, { "epoch": 0.09702636549194976, "grad_norm": 3.7837908267974854, "learning_rate": 0.0003962705366922235, "loss": 2.7827, "step": 1432 }, { "epoch": 0.09709412133377375, "grad_norm": 7.818897247314453, "learning_rate": 0.0003962650602409639, "loss": 2.8416, "step": 1433 }, { "epoch": 0.09716187717559774, "grad_norm": 7.434922695159912, "learning_rate": 0.0003962595837897043, "loss": 3.4634, "step": 1434 }, { "epoch": 0.09722963301742173, "grad_norm": 3.6432392597198486, "learning_rate": 0.0003962541073384447, "loss": 2.9409, "step": 1435 }, { "epoch": 0.09729738885924571, "grad_norm": 4.159408092498779, "learning_rate": 0.00039624863088718514, "loss": 3.1823, "step": 1436 }, { "epoch": 0.09736514470106969, "grad_norm": 3.582764148712158, "learning_rate": 0.00039624315443592554, "loss": 2.3781, "step": 1437 }, { "epoch": 0.09743290054289368, "grad_norm": 7.628521919250488, "learning_rate": 0.00039623767798466593, "loss": 3.2472, "step": 1438 }, { "epoch": 0.09750065638471767, "grad_norm": 5.080845832824707, "learning_rate": 0.0003962322015334064, "loss": 2.9873, "step": 1439 }, { "epoch": 0.09756841222654165, "grad_norm": 6.963984489440918, "learning_rate": 0.0003962267250821468, "loss": 2.9787, "step": 1440 }, { "epoch": 0.09763616806836564, "grad_norm": 4.882854461669922, "learning_rate": 0.0003962212486308872, "loss": 2.9118, "step": 1441 }, { "epoch": 0.09770392391018963, "grad_norm": 3.8537163734436035, "learning_rate": 0.00039621577217962764, "loss": 3.0972, "step": 1442 }, { "epoch": 0.09777167975201362, "grad_norm": 4.407215595245361, "learning_rate": 0.00039621029572836804, "loss": 3.0343, "step": 1443 }, { "epoch": 0.0978394355938376, "grad_norm": 6.060123443603516, "learning_rate": 0.00039620481927710844, "loss": 2.7828, "step": 1444 }, { "epoch": 0.0979071914356616, "grad_norm": 5.434008598327637, "learning_rate": 0.00039619934282584884, "loss": 2.5776, "step": 1445 }, { "epoch": 0.09797494727748558, "grad_norm": 4.19885778427124, "learning_rate": 0.0003961938663745893, "loss": 2.3513, "step": 1446 }, { "epoch": 0.09804270311930957, "grad_norm": 4.149463176727295, "learning_rate": 0.0003961883899233297, "loss": 2.7142, "step": 1447 }, { "epoch": 0.09811045896113356, "grad_norm": 6.602935791015625, "learning_rate": 0.00039618291347207014, "loss": 2.975, "step": 1448 }, { "epoch": 0.09817821480295755, "grad_norm": 5.896492958068848, "learning_rate": 0.00039617743702081054, "loss": 3.2241, "step": 1449 }, { "epoch": 0.09824597064478154, "grad_norm": 4.733145713806152, "learning_rate": 0.00039617196056955094, "loss": 2.801, "step": 1450 }, { "epoch": 0.09831372648660552, "grad_norm": 5.897229194641113, "learning_rate": 0.00039616648411829134, "loss": 2.1594, "step": 1451 }, { "epoch": 0.0983814823284295, "grad_norm": 4.709463596343994, "learning_rate": 0.0003961610076670318, "loss": 2.8955, "step": 1452 }, { "epoch": 0.09844923817025349, "grad_norm": 5.284852981567383, "learning_rate": 0.0003961555312157722, "loss": 2.5491, "step": 1453 }, { "epoch": 0.09851699401207747, "grad_norm": 8.136198043823242, "learning_rate": 0.00039615005476451265, "loss": 2.8608, "step": 1454 }, { "epoch": 0.09858474985390146, "grad_norm": 4.148588180541992, "learning_rate": 0.00039614457831325305, "loss": 2.7032, "step": 1455 }, { "epoch": 0.09865250569572545, "grad_norm": 7.193914890289307, "learning_rate": 0.00039613910186199345, "loss": 3.1814, "step": 1456 }, { "epoch": 0.09872026153754944, "grad_norm": 5.965203285217285, "learning_rate": 0.00039613362541073385, "loss": 2.9008, "step": 1457 }, { "epoch": 0.09878801737937343, "grad_norm": 8.79650592803955, "learning_rate": 0.0003961281489594743, "loss": 3.2172, "step": 1458 }, { "epoch": 0.09885577322119742, "grad_norm": 6.5463032722473145, "learning_rate": 0.0003961226725082147, "loss": 2.8966, "step": 1459 }, { "epoch": 0.0989235290630214, "grad_norm": 4.920925617218018, "learning_rate": 0.0003961171960569551, "loss": 2.7232, "step": 1460 }, { "epoch": 0.09899128490484539, "grad_norm": 5.8619771003723145, "learning_rate": 0.0003961117196056955, "loss": 2.5593, "step": 1461 }, { "epoch": 0.09905904074666938, "grad_norm": 4.172482967376709, "learning_rate": 0.00039610624315443595, "loss": 3.1805, "step": 1462 }, { "epoch": 0.09912679658849337, "grad_norm": 4.212977409362793, "learning_rate": 0.00039610076670317635, "loss": 2.6483, "step": 1463 }, { "epoch": 0.09919455243031736, "grad_norm": 6.384244441986084, "learning_rate": 0.0003960952902519168, "loss": 2.5341, "step": 1464 }, { "epoch": 0.09926230827214134, "grad_norm": 5.955776691436768, "learning_rate": 0.0003960898138006572, "loss": 2.9147, "step": 1465 }, { "epoch": 0.09933006411396533, "grad_norm": 5.747391223907471, "learning_rate": 0.0003960843373493976, "loss": 3.1934, "step": 1466 }, { "epoch": 0.09939781995578931, "grad_norm": 5.476200580596924, "learning_rate": 0.000396078860898138, "loss": 3.2506, "step": 1467 }, { "epoch": 0.0994655757976133, "grad_norm": 6.76502799987793, "learning_rate": 0.0003960733844468784, "loss": 2.8963, "step": 1468 }, { "epoch": 0.09953333163943728, "grad_norm": 5.352315425872803, "learning_rate": 0.00039606790799561885, "loss": 2.5278, "step": 1469 }, { "epoch": 0.09960108748126127, "grad_norm": 6.485702991485596, "learning_rate": 0.0003960624315443593, "loss": 2.7978, "step": 1470 }, { "epoch": 0.09966884332308526, "grad_norm": 7.389594554901123, "learning_rate": 0.0003960569550930997, "loss": 2.773, "step": 1471 }, { "epoch": 0.09973659916490925, "grad_norm": 6.473922252655029, "learning_rate": 0.0003960514786418401, "loss": 2.0795, "step": 1472 }, { "epoch": 0.09980435500673324, "grad_norm": 4.82306432723999, "learning_rate": 0.0003960460021905805, "loss": 2.3972, "step": 1473 }, { "epoch": 0.09987211084855722, "grad_norm": 6.37922477722168, "learning_rate": 0.00039604052573932096, "loss": 2.3796, "step": 1474 }, { "epoch": 0.09993986669038121, "grad_norm": 5.97680139541626, "learning_rate": 0.00039603504928806136, "loss": 2.878, "step": 1475 }, { "epoch": 0.1000076225322052, "grad_norm": 5.786788463592529, "learning_rate": 0.00039602957283680176, "loss": 2.7823, "step": 1476 }, { "epoch": 0.10007537837402919, "grad_norm": 4.16139030456543, "learning_rate": 0.0003960240963855422, "loss": 2.4569, "step": 1477 }, { "epoch": 0.10014313421585318, "grad_norm": 7.280211448669434, "learning_rate": 0.0003960186199342826, "loss": 2.6938, "step": 1478 }, { "epoch": 0.10021089005767717, "grad_norm": 6.102603435516357, "learning_rate": 0.000396013143483023, "loss": 3.0976, "step": 1479 }, { "epoch": 0.10027864589950115, "grad_norm": 10.204267501831055, "learning_rate": 0.00039600766703176346, "loss": 2.3684, "step": 1480 }, { "epoch": 0.10034640174132513, "grad_norm": 4.742012977600098, "learning_rate": 0.00039600219058050386, "loss": 2.2722, "step": 1481 }, { "epoch": 0.10041415758314912, "grad_norm": 6.115045070648193, "learning_rate": 0.00039599671412924426, "loss": 3.1026, "step": 1482 }, { "epoch": 0.1004819134249731, "grad_norm": 5.21455717086792, "learning_rate": 0.00039599123767798466, "loss": 2.4663, "step": 1483 }, { "epoch": 0.10054966926679709, "grad_norm": 5.136172294616699, "learning_rate": 0.00039598576122672506, "loss": 2.7271, "step": 1484 }, { "epoch": 0.10061742510862108, "grad_norm": 8.2840576171875, "learning_rate": 0.0003959802847754655, "loss": 2.7367, "step": 1485 }, { "epoch": 0.10068518095044507, "grad_norm": 7.235759735107422, "learning_rate": 0.00039597480832420596, "loss": 2.706, "step": 1486 }, { "epoch": 0.10075293679226906, "grad_norm": 5.551850318908691, "learning_rate": 0.00039596933187294636, "loss": 2.3654, "step": 1487 }, { "epoch": 0.10082069263409305, "grad_norm": 9.961389541625977, "learning_rate": 0.00039596385542168676, "loss": 3.0937, "step": 1488 }, { "epoch": 0.10088844847591703, "grad_norm": 7.169528484344482, "learning_rate": 0.00039595837897042716, "loss": 2.2312, "step": 1489 }, { "epoch": 0.10095620431774102, "grad_norm": 7.014019012451172, "learning_rate": 0.0003959529025191676, "loss": 2.3882, "step": 1490 }, { "epoch": 0.10102396015956501, "grad_norm": 6.1078948974609375, "learning_rate": 0.000395947426067908, "loss": 2.1489, "step": 1491 }, { "epoch": 0.101091716001389, "grad_norm": 8.101475715637207, "learning_rate": 0.0003959419496166484, "loss": 2.4828, "step": 1492 }, { "epoch": 0.10115947184321299, "grad_norm": 7.1105523109436035, "learning_rate": 0.00039593647316538887, "loss": 2.5171, "step": 1493 }, { "epoch": 0.10122722768503697, "grad_norm": 6.7352614402771, "learning_rate": 0.00039593099671412927, "loss": 2.5246, "step": 1494 }, { "epoch": 0.10129498352686096, "grad_norm": 5.006543159484863, "learning_rate": 0.00039592552026286967, "loss": 2.515, "step": 1495 }, { "epoch": 0.10136273936868494, "grad_norm": 12.342672348022461, "learning_rate": 0.0003959200438116101, "loss": 2.4808, "step": 1496 }, { "epoch": 0.10143049521050893, "grad_norm": 7.189533710479736, "learning_rate": 0.0003959145673603505, "loss": 2.9216, "step": 1497 }, { "epoch": 0.10149825105233291, "grad_norm": 6.30384635925293, "learning_rate": 0.0003959090909090909, "loss": 2.6325, "step": 1498 }, { "epoch": 0.1015660068941569, "grad_norm": 14.021766662597656, "learning_rate": 0.0003959036144578313, "loss": 2.9859, "step": 1499 }, { "epoch": 0.10163376273598089, "grad_norm": 5.380823612213135, "learning_rate": 0.0003958981380065717, "loss": 2.5742, "step": 1500 }, { "epoch": 0.10170151857780488, "grad_norm": 10.402581214904785, "learning_rate": 0.00039589266155531217, "loss": 2.3011, "step": 1501 }, { "epoch": 0.10176927441962887, "grad_norm": 10.72081184387207, "learning_rate": 0.0003958871851040526, "loss": 2.6849, "step": 1502 }, { "epoch": 0.10183703026145285, "grad_norm": 9.020021438598633, "learning_rate": 0.000395881708652793, "loss": 2.3963, "step": 1503 }, { "epoch": 0.10190478610327684, "grad_norm": 6.383272171020508, "learning_rate": 0.0003958762322015334, "loss": 1.6993, "step": 1504 }, { "epoch": 0.10197254194510083, "grad_norm": 10.206408500671387, "learning_rate": 0.0003958707557502738, "loss": 2.9702, "step": 1505 }, { "epoch": 0.10204029778692482, "grad_norm": 5.172316551208496, "learning_rate": 0.0003958652792990142, "loss": 2.1335, "step": 1506 }, { "epoch": 0.10210805362874881, "grad_norm": 8.15674114227295, "learning_rate": 0.0003958598028477547, "loss": 2.934, "step": 1507 }, { "epoch": 0.1021758094705728, "grad_norm": 6.668185234069824, "learning_rate": 0.0003958543263964951, "loss": 2.7704, "step": 1508 }, { "epoch": 0.10224356531239678, "grad_norm": 12.379969596862793, "learning_rate": 0.0003958488499452355, "loss": 2.6764, "step": 1509 }, { "epoch": 0.10231132115422077, "grad_norm": 11.362810134887695, "learning_rate": 0.0003958433734939759, "loss": 2.2466, "step": 1510 }, { "epoch": 0.10237907699604475, "grad_norm": 6.38102388381958, "learning_rate": 0.0003958378970427163, "loss": 2.6702, "step": 1511 }, { "epoch": 0.10244683283786873, "grad_norm": 5.729297637939453, "learning_rate": 0.0003958324205914568, "loss": 2.4596, "step": 1512 }, { "epoch": 0.10251458867969272, "grad_norm": 7.818333148956299, "learning_rate": 0.0003958269441401972, "loss": 2.7329, "step": 1513 }, { "epoch": 0.10258234452151671, "grad_norm": 6.059780597686768, "learning_rate": 0.0003958214676889376, "loss": 2.4393, "step": 1514 }, { "epoch": 0.1026501003633407, "grad_norm": 6.723374366760254, "learning_rate": 0.000395815991237678, "loss": 2.477, "step": 1515 }, { "epoch": 0.10271785620516469, "grad_norm": 6.627840518951416, "learning_rate": 0.00039581051478641843, "loss": 2.1288, "step": 1516 }, { "epoch": 0.10278561204698868, "grad_norm": 6.84044885635376, "learning_rate": 0.00039580503833515883, "loss": 2.5723, "step": 1517 }, { "epoch": 0.10285336788881266, "grad_norm": 6.076621055603027, "learning_rate": 0.0003957995618838993, "loss": 2.3418, "step": 1518 }, { "epoch": 0.10292112373063665, "grad_norm": 5.930188179016113, "learning_rate": 0.0003957940854326397, "loss": 2.8516, "step": 1519 }, { "epoch": 0.10298887957246064, "grad_norm": 6.278172492980957, "learning_rate": 0.0003957886089813801, "loss": 2.0256, "step": 1520 }, { "epoch": 0.10305663541428463, "grad_norm": 10.729782104492188, "learning_rate": 0.0003957831325301205, "loss": 2.2401, "step": 1521 }, { "epoch": 0.10312439125610862, "grad_norm": 6.3917741775512695, "learning_rate": 0.0003957776560788609, "loss": 2.8768, "step": 1522 }, { "epoch": 0.1031921470979326, "grad_norm": 7.535367012023926, "learning_rate": 0.00039577217962760133, "loss": 2.4606, "step": 1523 }, { "epoch": 0.10325990293975659, "grad_norm": 5.2437262535095215, "learning_rate": 0.0003957667031763418, "loss": 1.643, "step": 1524 }, { "epoch": 0.10332765878158058, "grad_norm": 10.855128288269043, "learning_rate": 0.0003957612267250822, "loss": 1.9913, "step": 1525 }, { "epoch": 0.10339541462340456, "grad_norm": 5.784173011779785, "learning_rate": 0.0003957557502738226, "loss": 2.0367, "step": 1526 }, { "epoch": 0.10346317046522854, "grad_norm": 12.547819137573242, "learning_rate": 0.000395750273822563, "loss": 2.4602, "step": 1527 }, { "epoch": 0.10353092630705253, "grad_norm": 12.875210762023926, "learning_rate": 0.00039574479737130344, "loss": 2.6349, "step": 1528 }, { "epoch": 0.10359868214887652, "grad_norm": 11.491096496582031, "learning_rate": 0.00039573932092004384, "loss": 2.8111, "step": 1529 }, { "epoch": 0.10366643799070051, "grad_norm": 10.651978492736816, "learning_rate": 0.00039573384446878423, "loss": 2.4111, "step": 1530 }, { "epoch": 0.1037341938325245, "grad_norm": 11.756603240966797, "learning_rate": 0.00039572836801752463, "loss": 2.1734, "step": 1531 }, { "epoch": 0.10380194967434848, "grad_norm": 7.683773994445801, "learning_rate": 0.0003957228915662651, "loss": 2.0083, "step": 1532 }, { "epoch": 0.10386970551617247, "grad_norm": 6.986481666564941, "learning_rate": 0.0003957174151150055, "loss": 1.7878, "step": 1533 }, { "epoch": 0.10393746135799646, "grad_norm": 7.494926929473877, "learning_rate": 0.00039571193866374594, "loss": 2.2895, "step": 1534 }, { "epoch": 0.10400521719982045, "grad_norm": 5.501714706420898, "learning_rate": 0.00039570646221248634, "loss": 1.9569, "step": 1535 }, { "epoch": 0.10407297304164444, "grad_norm": 10.523665428161621, "learning_rate": 0.00039570098576122674, "loss": 2.0419, "step": 1536 }, { "epoch": 0.10414072888346843, "grad_norm": 10.4854097366333, "learning_rate": 0.00039569550930996714, "loss": 2.2933, "step": 1537 }, { "epoch": 0.10420848472529241, "grad_norm": 19.430438995361328, "learning_rate": 0.00039569003285870754, "loss": 2.2288, "step": 1538 }, { "epoch": 0.1042762405671164, "grad_norm": 8.312188148498535, "learning_rate": 0.000395684556407448, "loss": 1.9245, "step": 1539 }, { "epoch": 0.10434399640894039, "grad_norm": 9.426054000854492, "learning_rate": 0.00039567907995618844, "loss": 2.2526, "step": 1540 }, { "epoch": 0.10441175225076436, "grad_norm": 10.691904067993164, "learning_rate": 0.00039567360350492884, "loss": 2.193, "step": 1541 }, { "epoch": 0.10447950809258835, "grad_norm": 6.674832820892334, "learning_rate": 0.00039566812705366924, "loss": 2.0514, "step": 1542 }, { "epoch": 0.10454726393441234, "grad_norm": 7.035017013549805, "learning_rate": 0.00039566265060240964, "loss": 2.0483, "step": 1543 }, { "epoch": 0.10461501977623633, "grad_norm": 7.759828090667725, "learning_rate": 0.00039565717415115004, "loss": 2.1638, "step": 1544 }, { "epoch": 0.10468277561806032, "grad_norm": 7.071404933929443, "learning_rate": 0.0003956516976998905, "loss": 2.0131, "step": 1545 }, { "epoch": 0.1047505314598843, "grad_norm": 8.26661491394043, "learning_rate": 0.0003956462212486309, "loss": 1.8065, "step": 1546 }, { "epoch": 0.1048182873017083, "grad_norm": 5.137301445007324, "learning_rate": 0.00039564074479737135, "loss": 1.8734, "step": 1547 }, { "epoch": 0.10488604314353228, "grad_norm": 6.565690994262695, "learning_rate": 0.00039563526834611175, "loss": 2.0718, "step": 1548 }, { "epoch": 0.10495379898535627, "grad_norm": 5.399538993835449, "learning_rate": 0.00039562979189485214, "loss": 1.6017, "step": 1549 }, { "epoch": 0.10502155482718026, "grad_norm": 8.50685977935791, "learning_rate": 0.0003956243154435926, "loss": 1.6937, "step": 1550 }, { "epoch": 0.10508931066900425, "grad_norm": 9.405255317687988, "learning_rate": 0.000395618838992333, "loss": 1.5298, "step": 1551 }, { "epoch": 0.10515706651082823, "grad_norm": 10.210711479187012, "learning_rate": 0.0003956133625410734, "loss": 1.9383, "step": 1552 }, { "epoch": 0.10522482235265222, "grad_norm": 5.813169002532959, "learning_rate": 0.0003956078860898138, "loss": 1.7489, "step": 1553 }, { "epoch": 0.10529257819447621, "grad_norm": 7.955734729766846, "learning_rate": 0.0003956024096385542, "loss": 2.0913, "step": 1554 }, { "epoch": 0.1053603340363002, "grad_norm": 6.004859447479248, "learning_rate": 0.00039559693318729465, "loss": 1.7586, "step": 1555 }, { "epoch": 0.10542808987812417, "grad_norm": 7.190276622772217, "learning_rate": 0.0003955914567360351, "loss": 1.8692, "step": 1556 }, { "epoch": 0.10549584571994816, "grad_norm": 6.522455215454102, "learning_rate": 0.0003955859802847755, "loss": 1.6489, "step": 1557 }, { "epoch": 0.10556360156177215, "grad_norm": 16.731210708618164, "learning_rate": 0.0003955805038335159, "loss": 2.1741, "step": 1558 }, { "epoch": 0.10563135740359614, "grad_norm": 9.050701141357422, "learning_rate": 0.0003955750273822563, "loss": 1.71, "step": 1559 }, { "epoch": 0.10569911324542013, "grad_norm": 5.514721870422363, "learning_rate": 0.0003955695509309967, "loss": 1.5551, "step": 1560 }, { "epoch": 0.10576686908724411, "grad_norm": 8.143468856811523, "learning_rate": 0.00039556407447973715, "loss": 1.3367, "step": 1561 }, { "epoch": 0.1058346249290681, "grad_norm": 7.43070650100708, "learning_rate": 0.00039555859802847755, "loss": 1.7636, "step": 1562 }, { "epoch": 0.10590238077089209, "grad_norm": 7.976333141326904, "learning_rate": 0.000395553121577218, "loss": 1.6309, "step": 1563 }, { "epoch": 0.10597013661271608, "grad_norm": 5.722769260406494, "learning_rate": 0.0003955476451259584, "loss": 1.7133, "step": 1564 }, { "epoch": 0.10603789245454007, "grad_norm": 6.700421333312988, "learning_rate": 0.0003955421686746988, "loss": 1.6723, "step": 1565 }, { "epoch": 0.10610564829636406, "grad_norm": 6.96038293838501, "learning_rate": 0.00039553669222343926, "loss": 1.8589, "step": 1566 }, { "epoch": 0.10617340413818804, "grad_norm": 5.43287467956543, "learning_rate": 0.00039553121577217966, "loss": 1.5708, "step": 1567 }, { "epoch": 0.10624115998001203, "grad_norm": 6.3512654304504395, "learning_rate": 0.00039552573932092006, "loss": 1.8835, "step": 1568 }, { "epoch": 0.10630891582183602, "grad_norm": 6.035426139831543, "learning_rate": 0.00039552026286966045, "loss": 1.3528, "step": 1569 }, { "epoch": 0.10637667166366001, "grad_norm": 6.670430660247803, "learning_rate": 0.00039551478641840085, "loss": 1.7812, "step": 1570 }, { "epoch": 0.10644442750548398, "grad_norm": 6.571809768676758, "learning_rate": 0.0003955093099671413, "loss": 1.532, "step": 1571 }, { "epoch": 0.10651218334730797, "grad_norm": 5.342930793762207, "learning_rate": 0.00039550383351588176, "loss": 1.284, "step": 1572 }, { "epoch": 0.10657993918913196, "grad_norm": 5.984511375427246, "learning_rate": 0.00039549835706462216, "loss": 1.4433, "step": 1573 }, { "epoch": 0.10664769503095595, "grad_norm": 5.874916076660156, "learning_rate": 0.00039549288061336256, "loss": 1.6582, "step": 1574 }, { "epoch": 0.10671545087277994, "grad_norm": 6.260425090789795, "learning_rate": 0.00039548740416210296, "loss": 1.3773, "step": 1575 }, { "epoch": 0.10678320671460392, "grad_norm": 7.710513114929199, "learning_rate": 0.00039548192771084336, "loss": 1.7306, "step": 1576 }, { "epoch": 0.10685096255642791, "grad_norm": 7.668156623840332, "learning_rate": 0.0003954764512595838, "loss": 1.1667, "step": 1577 }, { "epoch": 0.1069187183982519, "grad_norm": 4.716894626617432, "learning_rate": 0.00039547097480832426, "loss": 1.6682, "step": 1578 }, { "epoch": 0.10698647424007589, "grad_norm": 14.606276512145996, "learning_rate": 0.00039546549835706466, "loss": 1.8005, "step": 1579 }, { "epoch": 0.10705423008189988, "grad_norm": 8.625142097473145, "learning_rate": 0.00039546002190580506, "loss": 2.0248, "step": 1580 }, { "epoch": 0.10712198592372386, "grad_norm": 6.178574562072754, "learning_rate": 0.00039545454545454546, "loss": 1.6757, "step": 1581 }, { "epoch": 0.10718974176554785, "grad_norm": 6.782114028930664, "learning_rate": 0.00039544906900328586, "loss": 1.384, "step": 1582 }, { "epoch": 0.10725749760737184, "grad_norm": 4.977735996246338, "learning_rate": 0.0003954435925520263, "loss": 1.4704, "step": 1583 }, { "epoch": 0.10732525344919583, "grad_norm": 6.067221164703369, "learning_rate": 0.0003954381161007667, "loss": 1.5696, "step": 1584 }, { "epoch": 0.10739300929101982, "grad_norm": 8.837647438049316, "learning_rate": 0.0003954326396495071, "loss": 1.6017, "step": 1585 }, { "epoch": 0.10746076513284379, "grad_norm": 5.214369773864746, "learning_rate": 0.00039542716319824757, "loss": 1.4412, "step": 1586 }, { "epoch": 0.10752852097466778, "grad_norm": 7.491814136505127, "learning_rate": 0.00039542168674698797, "loss": 1.3705, "step": 1587 }, { "epoch": 0.10759627681649177, "grad_norm": 6.499130725860596, "learning_rate": 0.0003954162102957284, "loss": 1.237, "step": 1588 }, { "epoch": 0.10766403265831576, "grad_norm": 5.969573974609375, "learning_rate": 0.0003954107338444688, "loss": 1.2787, "step": 1589 }, { "epoch": 0.10773178850013974, "grad_norm": 5.7216267585754395, "learning_rate": 0.0003954052573932092, "loss": 1.5467, "step": 1590 }, { "epoch": 0.10779954434196373, "grad_norm": 4.942505836486816, "learning_rate": 0.0003953997809419496, "loss": 1.467, "step": 1591 }, { "epoch": 0.10786730018378772, "grad_norm": 5.39434814453125, "learning_rate": 0.00039539430449069, "loss": 1.0495, "step": 1592 }, { "epoch": 0.10793505602561171, "grad_norm": 6.367964744567871, "learning_rate": 0.00039538882803943047, "loss": 1.4729, "step": 1593 }, { "epoch": 0.1080028118674357, "grad_norm": 6.016597270965576, "learning_rate": 0.0003953833515881709, "loss": 1.5513, "step": 1594 }, { "epoch": 0.10807056770925969, "grad_norm": 7.340023517608643, "learning_rate": 0.0003953778751369113, "loss": 1.5264, "step": 1595 }, { "epoch": 0.10813832355108367, "grad_norm": 7.440442085266113, "learning_rate": 0.0003953723986856517, "loss": 1.272, "step": 1596 }, { "epoch": 0.10820607939290766, "grad_norm": 6.647540092468262, "learning_rate": 0.0003953669222343921, "loss": 1.2248, "step": 1597 }, { "epoch": 0.10827383523473165, "grad_norm": 4.900035858154297, "learning_rate": 0.0003953614457831325, "loss": 1.41, "step": 1598 }, { "epoch": 0.10834159107655564, "grad_norm": 4.665961742401123, "learning_rate": 0.000395355969331873, "loss": 1.4108, "step": 1599 }, { "epoch": 0.10840934691837961, "grad_norm": 4.730132579803467, "learning_rate": 0.00039535049288061337, "loss": 1.3128, "step": 1600 }, { "epoch": 0.1084771027602036, "grad_norm": 4.807755947113037, "learning_rate": 0.00039534501642935377, "loss": 1.0521, "step": 1601 }, { "epoch": 0.10854485860202759, "grad_norm": 6.883864402770996, "learning_rate": 0.0003953395399780942, "loss": 1.3626, "step": 1602 }, { "epoch": 0.10861261444385158, "grad_norm": 7.1888813972473145, "learning_rate": 0.0003953340635268346, "loss": 1.4059, "step": 1603 }, { "epoch": 0.10868037028567556, "grad_norm": 5.712009906768799, "learning_rate": 0.0003953285870755751, "loss": 1.01, "step": 1604 }, { "epoch": 0.10874812612749955, "grad_norm": 4.674473285675049, "learning_rate": 0.0003953231106243155, "loss": 1.2335, "step": 1605 }, { "epoch": 0.10881588196932354, "grad_norm": 6.6640706062316895, "learning_rate": 0.0003953176341730559, "loss": 1.5289, "step": 1606 }, { "epoch": 0.10888363781114753, "grad_norm": 4.0357794761657715, "learning_rate": 0.0003953121577217963, "loss": 0.9895, "step": 1607 }, { "epoch": 0.10895139365297152, "grad_norm": 4.879321575164795, "learning_rate": 0.0003953066812705367, "loss": 1.5579, "step": 1608 }, { "epoch": 0.1090191494947955, "grad_norm": 5.251523971557617, "learning_rate": 0.00039530120481927713, "loss": 1.4569, "step": 1609 }, { "epoch": 0.1090869053366195, "grad_norm": 7.1547956466674805, "learning_rate": 0.0003952957283680176, "loss": 1.6002, "step": 1610 }, { "epoch": 0.10915466117844348, "grad_norm": 8.155040740966797, "learning_rate": 0.000395290251916758, "loss": 1.4523, "step": 1611 }, { "epoch": 0.10922241702026747, "grad_norm": 5.886632919311523, "learning_rate": 0.0003952847754654984, "loss": 1.2546, "step": 1612 }, { "epoch": 0.10929017286209146, "grad_norm": 6.654346466064453, "learning_rate": 0.0003952792990142388, "loss": 1.3579, "step": 1613 }, { "epoch": 0.10935792870391545, "grad_norm": 4.759560585021973, "learning_rate": 0.0003952738225629792, "loss": 1.6299, "step": 1614 }, { "epoch": 0.10942568454573942, "grad_norm": 5.457400321960449, "learning_rate": 0.00039526834611171963, "loss": 1.1663, "step": 1615 }, { "epoch": 0.10949344038756341, "grad_norm": 5.7266645431518555, "learning_rate": 0.00039526286966046003, "loss": 1.4867, "step": 1616 }, { "epoch": 0.1095611962293874, "grad_norm": 5.383896827697754, "learning_rate": 0.0003952573932092005, "loss": 1.5384, "step": 1617 }, { "epoch": 0.10962895207121139, "grad_norm": 4.416357040405273, "learning_rate": 0.0003952519167579409, "loss": 1.2511, "step": 1618 }, { "epoch": 0.10969670791303537, "grad_norm": 5.343496322631836, "learning_rate": 0.0003952464403066813, "loss": 1.4638, "step": 1619 }, { "epoch": 0.10976446375485936, "grad_norm": 7.058844566345215, "learning_rate": 0.0003952409638554217, "loss": 1.444, "step": 1620 }, { "epoch": 0.10983221959668335, "grad_norm": 4.528946399688721, "learning_rate": 0.00039523548740416214, "loss": 1.2046, "step": 1621 }, { "epoch": 0.10989997543850734, "grad_norm": 4.877941131591797, "learning_rate": 0.00039523001095290253, "loss": 1.2193, "step": 1622 }, { "epoch": 0.10996773128033133, "grad_norm": 4.469624996185303, "learning_rate": 0.00039522453450164293, "loss": 1.2235, "step": 1623 }, { "epoch": 0.11003548712215531, "grad_norm": 5.840634822845459, "learning_rate": 0.00039521905805038333, "loss": 1.3656, "step": 1624 }, { "epoch": 0.1101032429639793, "grad_norm": 8.5831880569458, "learning_rate": 0.0003952135815991238, "loss": 1.4582, "step": 1625 }, { "epoch": 0.11017099880580329, "grad_norm": 6.826632976531982, "learning_rate": 0.00039520810514786424, "loss": 1.3172, "step": 1626 }, { "epoch": 0.11023875464762728, "grad_norm": 4.207441806793213, "learning_rate": 0.00039520262869660464, "loss": 1.3562, "step": 1627 }, { "epoch": 0.11030651048945127, "grad_norm": 4.119324207305908, "learning_rate": 0.00039519715224534504, "loss": 1.522, "step": 1628 }, { "epoch": 0.11037426633127526, "grad_norm": 3.9980592727661133, "learning_rate": 0.00039519167579408544, "loss": 0.942, "step": 1629 }, { "epoch": 0.11044202217309923, "grad_norm": 6.543672561645508, "learning_rate": 0.00039518619934282584, "loss": 1.2047, "step": 1630 }, { "epoch": 0.11050977801492322, "grad_norm": 5.766164302825928, "learning_rate": 0.0003951807228915663, "loss": 1.3645, "step": 1631 }, { "epoch": 0.1105775338567472, "grad_norm": 4.91998815536499, "learning_rate": 0.0003951752464403067, "loss": 1.2502, "step": 1632 }, { "epoch": 0.1106452896985712, "grad_norm": 3.525949001312256, "learning_rate": 0.00039516976998904714, "loss": 1.2463, "step": 1633 }, { "epoch": 0.11071304554039518, "grad_norm": 4.586998462677002, "learning_rate": 0.00039516429353778754, "loss": 1.1385, "step": 1634 }, { "epoch": 0.11078080138221917, "grad_norm": 3.7567641735076904, "learning_rate": 0.00039515881708652794, "loss": 0.9112, "step": 1635 }, { "epoch": 0.11084855722404316, "grad_norm": 2.997755527496338, "learning_rate": 0.00039515334063526834, "loss": 0.9114, "step": 1636 }, { "epoch": 0.11091631306586715, "grad_norm": 4.092390537261963, "learning_rate": 0.0003951478641840088, "loss": 1.0279, "step": 1637 }, { "epoch": 0.11098406890769114, "grad_norm": 5.978790760040283, "learning_rate": 0.0003951423877327492, "loss": 1.2139, "step": 1638 }, { "epoch": 0.11105182474951512, "grad_norm": 4.779129981994629, "learning_rate": 0.0003951369112814896, "loss": 1.2412, "step": 1639 }, { "epoch": 0.11111958059133911, "grad_norm": 5.832210540771484, "learning_rate": 0.00039513143483023005, "loss": 1.2601, "step": 1640 }, { "epoch": 0.1111873364331631, "grad_norm": 5.680749416351318, "learning_rate": 0.00039512595837897044, "loss": 1.4006, "step": 1641 }, { "epoch": 0.11125509227498709, "grad_norm": 15.622469902038574, "learning_rate": 0.0003951204819277109, "loss": 1.3316, "step": 1642 }, { "epoch": 0.11132284811681108, "grad_norm": 5.687697410583496, "learning_rate": 0.0003951150054764513, "loss": 0.9684, "step": 1643 }, { "epoch": 0.11139060395863506, "grad_norm": 4.347750186920166, "learning_rate": 0.0003951095290251917, "loss": 1.21, "step": 1644 }, { "epoch": 0.11145835980045904, "grad_norm": 4.140382289886475, "learning_rate": 0.0003951040525739321, "loss": 1.2795, "step": 1645 }, { "epoch": 0.11152611564228303, "grad_norm": 4.603139400482178, "learning_rate": 0.0003950985761226725, "loss": 1.1003, "step": 1646 }, { "epoch": 0.11159387148410702, "grad_norm": 4.206707000732422, "learning_rate": 0.00039509309967141295, "loss": 1.1876, "step": 1647 }, { "epoch": 0.111661627325931, "grad_norm": 4.687857627868652, "learning_rate": 0.0003950876232201534, "loss": 1.1081, "step": 1648 }, { "epoch": 0.11172938316775499, "grad_norm": 4.899212837219238, "learning_rate": 0.0003950821467688938, "loss": 1.4534, "step": 1649 }, { "epoch": 0.11179713900957898, "grad_norm": 4.628307342529297, "learning_rate": 0.0003950766703176342, "loss": 1.0362, "step": 1650 }, { "epoch": 0.11186489485140297, "grad_norm": 5.196741104125977, "learning_rate": 0.0003950711938663746, "loss": 1.3754, "step": 1651 }, { "epoch": 0.11193265069322696, "grad_norm": 6.112636089324951, "learning_rate": 0.000395065717415115, "loss": 0.9511, "step": 1652 }, { "epoch": 0.11200040653505094, "grad_norm": 4.267406463623047, "learning_rate": 0.00039506024096385545, "loss": 0.9151, "step": 1653 }, { "epoch": 0.11206816237687493, "grad_norm": 8.704758644104004, "learning_rate": 0.00039505476451259585, "loss": 1.1057, "step": 1654 }, { "epoch": 0.11213591821869892, "grad_norm": 4.903751850128174, "learning_rate": 0.00039504928806133625, "loss": 1.0817, "step": 1655 }, { "epoch": 0.11220367406052291, "grad_norm": 8.844914436340332, "learning_rate": 0.0003950438116100767, "loss": 1.4385, "step": 1656 }, { "epoch": 0.1122714299023469, "grad_norm": 4.817325115203857, "learning_rate": 0.0003950383351588171, "loss": 1.1953, "step": 1657 }, { "epoch": 0.11233918574417089, "grad_norm": 9.573628425598145, "learning_rate": 0.0003950328587075575, "loss": 1.4472, "step": 1658 }, { "epoch": 0.11240694158599487, "grad_norm": 7.392648220062256, "learning_rate": 0.00039502738225629796, "loss": 0.987, "step": 1659 }, { "epoch": 0.11247469742781885, "grad_norm": 7.680069446563721, "learning_rate": 0.00039502190580503835, "loss": 1.0029, "step": 1660 }, { "epoch": 0.11254245326964284, "grad_norm": 5.185049057006836, "learning_rate": 0.00039501642935377875, "loss": 0.9314, "step": 1661 }, { "epoch": 0.11261020911146682, "grad_norm": 4.063965797424316, "learning_rate": 0.00039501095290251915, "loss": 1.1307, "step": 1662 }, { "epoch": 0.11267796495329081, "grad_norm": 3.820652723312378, "learning_rate": 0.0003950054764512596, "loss": 1.161, "step": 1663 }, { "epoch": 0.1127457207951148, "grad_norm": 4.7121477127075195, "learning_rate": 0.00039500000000000006, "loss": 1.3195, "step": 1664 }, { "epoch": 0.11281347663693879, "grad_norm": 4.318291664123535, "learning_rate": 0.00039499452354874046, "loss": 1.1162, "step": 1665 }, { "epoch": 0.11288123247876278, "grad_norm": 3.9716618061065674, "learning_rate": 0.00039498904709748086, "loss": 1.1623, "step": 1666 }, { "epoch": 0.11294898832058677, "grad_norm": 5.931141376495361, "learning_rate": 0.00039498357064622126, "loss": 1.0076, "step": 1667 }, { "epoch": 0.11301674416241075, "grad_norm": 5.203697204589844, "learning_rate": 0.00039497809419496166, "loss": 1.2425, "step": 1668 }, { "epoch": 0.11308450000423474, "grad_norm": 6.24186897277832, "learning_rate": 0.0003949726177437021, "loss": 1.1169, "step": 1669 }, { "epoch": 0.11315225584605873, "grad_norm": 4.47183895111084, "learning_rate": 0.0003949671412924425, "loss": 1.4699, "step": 1670 }, { "epoch": 0.11322001168788272, "grad_norm": 5.265829563140869, "learning_rate": 0.00039496166484118296, "loss": 1.2083, "step": 1671 }, { "epoch": 0.1132877675297067, "grad_norm": 4.152087688446045, "learning_rate": 0.00039495618838992336, "loss": 1.0802, "step": 1672 }, { "epoch": 0.1133555233715307, "grad_norm": 4.926446437835693, "learning_rate": 0.00039495071193866376, "loss": 1.3733, "step": 1673 }, { "epoch": 0.11342327921335468, "grad_norm": 4.520811557769775, "learning_rate": 0.00039494523548740416, "loss": 1.292, "step": 1674 }, { "epoch": 0.11349103505517866, "grad_norm": 4.155094623565674, "learning_rate": 0.0003949397590361446, "loss": 1.1369, "step": 1675 }, { "epoch": 0.11355879089700265, "grad_norm": 4.683309078216553, "learning_rate": 0.000394934282584885, "loss": 1.141, "step": 1676 }, { "epoch": 0.11362654673882663, "grad_norm": 4.8344902992248535, "learning_rate": 0.0003949288061336254, "loss": 1.2673, "step": 1677 }, { "epoch": 0.11369430258065062, "grad_norm": 6.6621994972229, "learning_rate": 0.0003949233296823658, "loss": 1.1439, "step": 1678 }, { "epoch": 0.11376205842247461, "grad_norm": 5.894452095031738, "learning_rate": 0.00039491785323110627, "loss": 1.226, "step": 1679 }, { "epoch": 0.1138298142642986, "grad_norm": 4.4098615646362305, "learning_rate": 0.0003949123767798467, "loss": 0.973, "step": 1680 }, { "epoch": 0.11389757010612259, "grad_norm": 4.392683982849121, "learning_rate": 0.0003949069003285871, "loss": 1.3998, "step": 1681 }, { "epoch": 0.11396532594794657, "grad_norm": 6.045029640197754, "learning_rate": 0.0003949014238773275, "loss": 1.3963, "step": 1682 }, { "epoch": 0.11403308178977056, "grad_norm": 4.191753387451172, "learning_rate": 0.0003948959474260679, "loss": 1.1791, "step": 1683 }, { "epoch": 0.11410083763159455, "grad_norm": 4.844413757324219, "learning_rate": 0.0003948904709748083, "loss": 1.124, "step": 1684 }, { "epoch": 0.11416859347341854, "grad_norm": 3.790539264678955, "learning_rate": 0.00039488499452354877, "loss": 1.1751, "step": 1685 }, { "epoch": 0.11423634931524253, "grad_norm": 4.676909923553467, "learning_rate": 0.00039487951807228917, "loss": 1.2751, "step": 1686 }, { "epoch": 0.11430410515706652, "grad_norm": 4.819333553314209, "learning_rate": 0.0003948740416210296, "loss": 1.3905, "step": 1687 }, { "epoch": 0.1143718609988905, "grad_norm": 6.56510066986084, "learning_rate": 0.00039486856516977, "loss": 1.2123, "step": 1688 }, { "epoch": 0.11443961684071449, "grad_norm": 3.3898584842681885, "learning_rate": 0.0003948630887185104, "loss": 0.9921, "step": 1689 }, { "epoch": 0.11450737268253847, "grad_norm": 3.93350887298584, "learning_rate": 0.0003948576122672508, "loss": 0.9894, "step": 1690 }, { "epoch": 0.11457512852436245, "grad_norm": 4.13767671585083, "learning_rate": 0.00039485213581599127, "loss": 1.2404, "step": 1691 }, { "epoch": 0.11464288436618644, "grad_norm": 4.236894607543945, "learning_rate": 0.00039484665936473167, "loss": 0.9455, "step": 1692 }, { "epoch": 0.11471064020801043, "grad_norm": 5.689702987670898, "learning_rate": 0.00039484118291347207, "loss": 1.0252, "step": 1693 }, { "epoch": 0.11477839604983442, "grad_norm": 3.144671678543091, "learning_rate": 0.00039483570646221247, "loss": 0.9894, "step": 1694 }, { "epoch": 0.11484615189165841, "grad_norm": 4.160654544830322, "learning_rate": 0.0003948302300109529, "loss": 0.9231, "step": 1695 }, { "epoch": 0.1149139077334824, "grad_norm": 4.583846092224121, "learning_rate": 0.0003948247535596933, "loss": 1.0016, "step": 1696 }, { "epoch": 0.11498166357530638, "grad_norm": 3.437683343887329, "learning_rate": 0.0003948192771084338, "loss": 1.0387, "step": 1697 }, { "epoch": 0.11504941941713037, "grad_norm": 4.374656677246094, "learning_rate": 0.0003948138006571742, "loss": 1.3334, "step": 1698 }, { "epoch": 0.11511717525895436, "grad_norm": 5.45866060256958, "learning_rate": 0.0003948083242059146, "loss": 1.1768, "step": 1699 }, { "epoch": 0.11518493110077835, "grad_norm": 3.4499471187591553, "learning_rate": 0.000394802847754655, "loss": 1.1246, "step": 1700 }, { "epoch": 0.11525268694260234, "grad_norm": 5.838320732116699, "learning_rate": 0.00039479737130339543, "loss": 1.2268, "step": 1701 }, { "epoch": 0.11532044278442632, "grad_norm": 4.32048225402832, "learning_rate": 0.0003947918948521358, "loss": 1.3561, "step": 1702 }, { "epoch": 0.11538819862625031, "grad_norm": 5.305597305297852, "learning_rate": 0.0003947864184008763, "loss": 1.2363, "step": 1703 }, { "epoch": 0.1154559544680743, "grad_norm": 4.5266194343566895, "learning_rate": 0.0003947809419496167, "loss": 1.2523, "step": 1704 }, { "epoch": 0.11552371030989828, "grad_norm": 5.089867115020752, "learning_rate": 0.0003947754654983571, "loss": 1.1502, "step": 1705 }, { "epoch": 0.11559146615172226, "grad_norm": 3.994213104248047, "learning_rate": 0.0003947699890470975, "loss": 1.0087, "step": 1706 }, { "epoch": 0.11565922199354625, "grad_norm": 3.7438063621520996, "learning_rate": 0.00039476451259583793, "loss": 1.1033, "step": 1707 }, { "epoch": 0.11572697783537024, "grad_norm": 4.32094144821167, "learning_rate": 0.00039475903614457833, "loss": 1.2075, "step": 1708 }, { "epoch": 0.11579473367719423, "grad_norm": 5.756608963012695, "learning_rate": 0.00039475355969331873, "loss": 1.0526, "step": 1709 }, { "epoch": 0.11586248951901822, "grad_norm": 3.862917423248291, "learning_rate": 0.0003947480832420592, "loss": 0.9751, "step": 1710 }, { "epoch": 0.1159302453608422, "grad_norm": 4.8131866455078125, "learning_rate": 0.0003947426067907996, "loss": 1.0616, "step": 1711 }, { "epoch": 0.11599800120266619, "grad_norm": 4.730789661407471, "learning_rate": 0.00039473713033954, "loss": 1.3042, "step": 1712 }, { "epoch": 0.11606575704449018, "grad_norm": 4.7966790199279785, "learning_rate": 0.00039473165388828043, "loss": 1.2819, "step": 1713 }, { "epoch": 0.11613351288631417, "grad_norm": 7.302445888519287, "learning_rate": 0.00039472617743702083, "loss": 1.1682, "step": 1714 }, { "epoch": 0.11620126872813816, "grad_norm": 5.587337017059326, "learning_rate": 0.00039472070098576123, "loss": 1.2243, "step": 1715 }, { "epoch": 0.11626902456996215, "grad_norm": 5.829506874084473, "learning_rate": 0.00039471522453450163, "loss": 1.4658, "step": 1716 }, { "epoch": 0.11633678041178613, "grad_norm": 4.281583786010742, "learning_rate": 0.0003947097480832421, "loss": 1.1215, "step": 1717 }, { "epoch": 0.11640453625361012, "grad_norm": 5.28855562210083, "learning_rate": 0.00039470427163198254, "loss": 1.0329, "step": 1718 }, { "epoch": 0.11647229209543411, "grad_norm": 3.234163284301758, "learning_rate": 0.00039469879518072294, "loss": 0.926, "step": 1719 }, { "epoch": 0.11654004793725808, "grad_norm": 3.5200612545013428, "learning_rate": 0.00039469331872946334, "loss": 1.1168, "step": 1720 }, { "epoch": 0.11660780377908207, "grad_norm": 10.688859939575195, "learning_rate": 0.00039468784227820374, "loss": 1.0462, "step": 1721 }, { "epoch": 0.11667555962090606, "grad_norm": 4.175796985626221, "learning_rate": 0.00039468236582694414, "loss": 0.965, "step": 1722 }, { "epoch": 0.11674331546273005, "grad_norm": 4.7228102684021, "learning_rate": 0.0003946768893756846, "loss": 1.0021, "step": 1723 }, { "epoch": 0.11681107130455404, "grad_norm": 3.9389655590057373, "learning_rate": 0.000394671412924425, "loss": 1.3303, "step": 1724 }, { "epoch": 0.11687882714637803, "grad_norm": 4.331575870513916, "learning_rate": 0.0003946659364731654, "loss": 1.1472, "step": 1725 }, { "epoch": 0.11694658298820201, "grad_norm": 4.8456830978393555, "learning_rate": 0.00039466046002190584, "loss": 1.0899, "step": 1726 }, { "epoch": 0.117014338830026, "grad_norm": 4.778947353363037, "learning_rate": 0.00039465498357064624, "loss": 1.0251, "step": 1727 }, { "epoch": 0.11708209467184999, "grad_norm": 3.2696573734283447, "learning_rate": 0.00039464950711938664, "loss": 0.9866, "step": 1728 }, { "epoch": 0.11714985051367398, "grad_norm": 4.278584003448486, "learning_rate": 0.0003946440306681271, "loss": 1.2932, "step": 1729 }, { "epoch": 0.11721760635549797, "grad_norm": 4.096561908721924, "learning_rate": 0.0003946385542168675, "loss": 1.1068, "step": 1730 }, { "epoch": 0.11728536219732195, "grad_norm": 4.081788063049316, "learning_rate": 0.0003946330777656079, "loss": 1.0378, "step": 1731 }, { "epoch": 0.11735311803914594, "grad_norm": 4.094526767730713, "learning_rate": 0.0003946276013143483, "loss": 1.0869, "step": 1732 }, { "epoch": 0.11742087388096993, "grad_norm": 3.8240902423858643, "learning_rate": 0.0003946221248630887, "loss": 1.134, "step": 1733 }, { "epoch": 0.1174886297227939, "grad_norm": 5.050183296203613, "learning_rate": 0.00039461664841182914, "loss": 1.2009, "step": 1734 }, { "epoch": 0.1175563855646179, "grad_norm": 4.489464282989502, "learning_rate": 0.0003946111719605696, "loss": 1.0914, "step": 1735 }, { "epoch": 0.11762414140644188, "grad_norm": 5.214837551116943, "learning_rate": 0.00039460569550931, "loss": 0.9761, "step": 1736 }, { "epoch": 0.11769189724826587, "grad_norm": 5.526569843292236, "learning_rate": 0.0003946002190580504, "loss": 1.0425, "step": 1737 }, { "epoch": 0.11775965309008986, "grad_norm": 7.189176559448242, "learning_rate": 0.0003945947426067908, "loss": 1.3829, "step": 1738 }, { "epoch": 0.11782740893191385, "grad_norm": 4.321474075317383, "learning_rate": 0.00039458926615553125, "loss": 1.0042, "step": 1739 }, { "epoch": 0.11789516477373783, "grad_norm": 6.104100704193115, "learning_rate": 0.00039458378970427165, "loss": 1.1142, "step": 1740 }, { "epoch": 0.11796292061556182, "grad_norm": 3.5817818641662598, "learning_rate": 0.0003945783132530121, "loss": 0.7784, "step": 1741 }, { "epoch": 0.11803067645738581, "grad_norm": 5.243605136871338, "learning_rate": 0.0003945728368017525, "loss": 0.9711, "step": 1742 }, { "epoch": 0.1180984322992098, "grad_norm": 4.971038818359375, "learning_rate": 0.0003945673603504929, "loss": 1.1133, "step": 1743 }, { "epoch": 0.11816618814103379, "grad_norm": 4.007874965667725, "learning_rate": 0.0003945618838992333, "loss": 1.063, "step": 1744 }, { "epoch": 0.11823394398285778, "grad_norm": 3.4564332962036133, "learning_rate": 0.00039455640744797375, "loss": 1.0024, "step": 1745 }, { "epoch": 0.11830169982468176, "grad_norm": 5.168217658996582, "learning_rate": 0.00039455093099671415, "loss": 0.9845, "step": 1746 }, { "epoch": 0.11836945566650575, "grad_norm": 4.99029541015625, "learning_rate": 0.00039454545454545455, "loss": 1.3208, "step": 1747 }, { "epoch": 0.11843721150832974, "grad_norm": 4.303587436676025, "learning_rate": 0.00039453997809419495, "loss": 1.2922, "step": 1748 }, { "epoch": 0.11850496735015371, "grad_norm": 4.584913730621338, "learning_rate": 0.0003945345016429354, "loss": 1.4634, "step": 1749 }, { "epoch": 0.1185727231919777, "grad_norm": 3.5839524269104004, "learning_rate": 0.0003945290251916758, "loss": 0.9421, "step": 1750 }, { "epoch": 0.11864047903380169, "grad_norm": 3.8840532302856445, "learning_rate": 0.00039452354874041626, "loss": 0.9788, "step": 1751 }, { "epoch": 0.11870823487562568, "grad_norm": 4.87594747543335, "learning_rate": 0.00039451807228915665, "loss": 1.3483, "step": 1752 }, { "epoch": 0.11877599071744967, "grad_norm": 6.998860836029053, "learning_rate": 0.00039451259583789705, "loss": 0.9338, "step": 1753 }, { "epoch": 0.11884374655927366, "grad_norm": 5.057334899902344, "learning_rate": 0.00039450711938663745, "loss": 1.0174, "step": 1754 }, { "epoch": 0.11891150240109764, "grad_norm": 4.529388427734375, "learning_rate": 0.0003945016429353779, "loss": 1.3518, "step": 1755 }, { "epoch": 0.11897925824292163, "grad_norm": 3.73679780960083, "learning_rate": 0.0003944961664841183, "loss": 0.9592, "step": 1756 }, { "epoch": 0.11904701408474562, "grad_norm": 4.708619117736816, "learning_rate": 0.00039449069003285876, "loss": 1.157, "step": 1757 }, { "epoch": 0.11911476992656961, "grad_norm": 4.4102654457092285, "learning_rate": 0.00039448521358159916, "loss": 1.1355, "step": 1758 }, { "epoch": 0.1191825257683936, "grad_norm": 4.033970355987549, "learning_rate": 0.00039447973713033956, "loss": 1.1677, "step": 1759 }, { "epoch": 0.11925028161021758, "grad_norm": 4.176807403564453, "learning_rate": 0.00039447426067907996, "loss": 1.05, "step": 1760 }, { "epoch": 0.11931803745204157, "grad_norm": 5.637633323669434, "learning_rate": 0.0003944687842278204, "loss": 0.8578, "step": 1761 }, { "epoch": 0.11938579329386556, "grad_norm": 4.136836051940918, "learning_rate": 0.0003944633077765608, "loss": 1.1186, "step": 1762 }, { "epoch": 0.11945354913568955, "grad_norm": 3.8431918621063232, "learning_rate": 0.0003944578313253012, "loss": 0.882, "step": 1763 }, { "epoch": 0.11952130497751352, "grad_norm": 4.419764995574951, "learning_rate": 0.0003944523548740416, "loss": 1.173, "step": 1764 }, { "epoch": 0.11958906081933751, "grad_norm": 3.850884437561035, "learning_rate": 0.00039444687842278206, "loss": 1.2835, "step": 1765 }, { "epoch": 0.1196568166611615, "grad_norm": 3.596729278564453, "learning_rate": 0.00039444140197152246, "loss": 0.9831, "step": 1766 }, { "epoch": 0.11972457250298549, "grad_norm": 4.727614402770996, "learning_rate": 0.0003944359255202629, "loss": 1.1181, "step": 1767 }, { "epoch": 0.11979232834480948, "grad_norm": 3.492016315460205, "learning_rate": 0.0003944304490690033, "loss": 0.9477, "step": 1768 }, { "epoch": 0.11986008418663346, "grad_norm": 4.364447116851807, "learning_rate": 0.0003944249726177437, "loss": 1.1886, "step": 1769 }, { "epoch": 0.11992784002845745, "grad_norm": 3.8655457496643066, "learning_rate": 0.0003944194961664841, "loss": 1.1625, "step": 1770 }, { "epoch": 0.11999559587028144, "grad_norm": 3.2298905849456787, "learning_rate": 0.0003944140197152245, "loss": 1.0474, "step": 1771 }, { "epoch": 0.12006335171210543, "grad_norm": 3.3382368087768555, "learning_rate": 0.00039440854326396496, "loss": 0.987, "step": 1772 }, { "epoch": 0.12013110755392942, "grad_norm": 4.010012626647949, "learning_rate": 0.0003944030668127054, "loss": 0.783, "step": 1773 }, { "epoch": 0.1201988633957534, "grad_norm": 3.6999971866607666, "learning_rate": 0.0003943975903614458, "loss": 1.2317, "step": 1774 }, { "epoch": 0.1202666192375774, "grad_norm": 4.398083686828613, "learning_rate": 0.0003943921139101862, "loss": 1.1358, "step": 1775 }, { "epoch": 0.12033437507940138, "grad_norm": 4.35454797744751, "learning_rate": 0.0003943866374589266, "loss": 1.0099, "step": 1776 }, { "epoch": 0.12040213092122537, "grad_norm": 5.814678192138672, "learning_rate": 0.00039438116100766707, "loss": 1.2242, "step": 1777 }, { "epoch": 0.12046988676304936, "grad_norm": 3.9683175086975098, "learning_rate": 0.00039437568455640747, "loss": 0.9783, "step": 1778 }, { "epoch": 0.12053764260487333, "grad_norm": 5.008693218231201, "learning_rate": 0.00039437020810514787, "loss": 1.0379, "step": 1779 }, { "epoch": 0.12060539844669732, "grad_norm": 5.433497428894043, "learning_rate": 0.0003943647316538883, "loss": 1.3031, "step": 1780 }, { "epoch": 0.12067315428852131, "grad_norm": 5.59146785736084, "learning_rate": 0.0003943592552026287, "loss": 0.8908, "step": 1781 }, { "epoch": 0.1207409101303453, "grad_norm": 4.415099620819092, "learning_rate": 0.0003943537787513691, "loss": 1.1647, "step": 1782 }, { "epoch": 0.12080866597216929, "grad_norm": 5.488921642303467, "learning_rate": 0.00039434830230010957, "loss": 0.8395, "step": 1783 }, { "epoch": 0.12087642181399327, "grad_norm": 6.12910270690918, "learning_rate": 0.00039434282584884997, "loss": 1.4274, "step": 1784 }, { "epoch": 0.12094417765581726, "grad_norm": 4.462892532348633, "learning_rate": 0.00039433734939759037, "loss": 1.0254, "step": 1785 }, { "epoch": 0.12101193349764125, "grad_norm": 3.3209166526794434, "learning_rate": 0.00039433187294633077, "loss": 1.0128, "step": 1786 }, { "epoch": 0.12107968933946524, "grad_norm": 2.8834686279296875, "learning_rate": 0.00039432639649507117, "loss": 0.9423, "step": 1787 }, { "epoch": 0.12114744518128923, "grad_norm": 3.505333185195923, "learning_rate": 0.0003943209200438116, "loss": 0.9559, "step": 1788 }, { "epoch": 0.12121520102311321, "grad_norm": 4.342011451721191, "learning_rate": 0.0003943154435925521, "loss": 1.131, "step": 1789 }, { "epoch": 0.1212829568649372, "grad_norm": 3.5092380046844482, "learning_rate": 0.0003943099671412925, "loss": 0.944, "step": 1790 }, { "epoch": 0.12135071270676119, "grad_norm": 3.4735143184661865, "learning_rate": 0.0003943044906900329, "loss": 1.1781, "step": 1791 }, { "epoch": 0.12141846854858518, "grad_norm": 3.1634914875030518, "learning_rate": 0.0003942990142387733, "loss": 0.8058, "step": 1792 }, { "epoch": 0.12148622439040917, "grad_norm": 3.91396164894104, "learning_rate": 0.00039429353778751373, "loss": 1.0665, "step": 1793 }, { "epoch": 0.12155398023223314, "grad_norm": 3.8572874069213867, "learning_rate": 0.0003942880613362541, "loss": 1.2749, "step": 1794 }, { "epoch": 0.12162173607405713, "grad_norm": 4.059725761413574, "learning_rate": 0.0003942825848849945, "loss": 1.0422, "step": 1795 }, { "epoch": 0.12168949191588112, "grad_norm": 4.507661819458008, "learning_rate": 0.000394277108433735, "loss": 0.9989, "step": 1796 }, { "epoch": 0.1217572477577051, "grad_norm": 3.1628077030181885, "learning_rate": 0.0003942716319824754, "loss": 0.8951, "step": 1797 }, { "epoch": 0.1218250035995291, "grad_norm": 3.8676633834838867, "learning_rate": 0.0003942661555312158, "loss": 0.9236, "step": 1798 }, { "epoch": 0.12189275944135308, "grad_norm": 4.942222595214844, "learning_rate": 0.00039426067907995623, "loss": 1.3439, "step": 1799 }, { "epoch": 0.12196051528317707, "grad_norm": 3.636735439300537, "learning_rate": 0.00039425520262869663, "loss": 1.1559, "step": 1800 }, { "epoch": 0.12202827112500106, "grad_norm": 5.252878665924072, "learning_rate": 0.00039424972617743703, "loss": 1.1251, "step": 1801 }, { "epoch": 0.12209602696682505, "grad_norm": 3.1147611141204834, "learning_rate": 0.00039424424972617743, "loss": 0.8764, "step": 1802 }, { "epoch": 0.12216378280864904, "grad_norm": 4.047060966491699, "learning_rate": 0.0003942387732749179, "loss": 0.9859, "step": 1803 }, { "epoch": 0.12223153865047302, "grad_norm": 4.130135536193848, "learning_rate": 0.0003942332968236583, "loss": 1.109, "step": 1804 }, { "epoch": 0.12229929449229701, "grad_norm": 4.171240329742432, "learning_rate": 0.00039422782037239873, "loss": 1.0646, "step": 1805 }, { "epoch": 0.122367050334121, "grad_norm": 3.735419750213623, "learning_rate": 0.00039422234392113913, "loss": 0.9979, "step": 1806 }, { "epoch": 0.12243480617594499, "grad_norm": 4.967855453491211, "learning_rate": 0.00039421686746987953, "loss": 1.385, "step": 1807 }, { "epoch": 0.12250256201776898, "grad_norm": 5.171457290649414, "learning_rate": 0.00039421139101861993, "loss": 1.3712, "step": 1808 }, { "epoch": 0.12257031785959295, "grad_norm": 3.952385187149048, "learning_rate": 0.00039420591456736033, "loss": 1.129, "step": 1809 }, { "epoch": 0.12263807370141694, "grad_norm": 3.3985743522644043, "learning_rate": 0.0003942004381161008, "loss": 1.0455, "step": 1810 }, { "epoch": 0.12270582954324093, "grad_norm": 3.752293586730957, "learning_rate": 0.00039419496166484124, "loss": 0.9598, "step": 1811 }, { "epoch": 0.12277358538506492, "grad_norm": 3.917031764984131, "learning_rate": 0.00039418948521358164, "loss": 0.9095, "step": 1812 }, { "epoch": 0.1228413412268889, "grad_norm": 3.6257455348968506, "learning_rate": 0.00039418400876232204, "loss": 1.0676, "step": 1813 }, { "epoch": 0.12290909706871289, "grad_norm": 4.0780134201049805, "learning_rate": 0.00039417853231106244, "loss": 1.0206, "step": 1814 }, { "epoch": 0.12297685291053688, "grad_norm": 3.7420945167541504, "learning_rate": 0.0003941730558598029, "loss": 1.0192, "step": 1815 }, { "epoch": 0.12304460875236087, "grad_norm": 3.4092986583709717, "learning_rate": 0.0003941675794085433, "loss": 0.9628, "step": 1816 }, { "epoch": 0.12311236459418486, "grad_norm": 3.78250789642334, "learning_rate": 0.0003941621029572837, "loss": 1.1779, "step": 1817 }, { "epoch": 0.12318012043600884, "grad_norm": 4.220417022705078, "learning_rate": 0.0003941566265060241, "loss": 1.1888, "step": 1818 }, { "epoch": 0.12324787627783283, "grad_norm": 4.681819915771484, "learning_rate": 0.00039415115005476454, "loss": 1.5143, "step": 1819 }, { "epoch": 0.12331563211965682, "grad_norm": 4.093831539154053, "learning_rate": 0.00039414567360350494, "loss": 1.0043, "step": 1820 }, { "epoch": 0.12338338796148081, "grad_norm": 4.1450042724609375, "learning_rate": 0.0003941401971522454, "loss": 1.0525, "step": 1821 }, { "epoch": 0.1234511438033048, "grad_norm": 4.708794116973877, "learning_rate": 0.0003941347207009858, "loss": 1.1564, "step": 1822 }, { "epoch": 0.12351889964512879, "grad_norm": 3.7568252086639404, "learning_rate": 0.0003941292442497262, "loss": 1.0529, "step": 1823 }, { "epoch": 0.12358665548695276, "grad_norm": 5.402228832244873, "learning_rate": 0.0003941237677984666, "loss": 1.0908, "step": 1824 }, { "epoch": 0.12365441132877675, "grad_norm": 4.131001949310303, "learning_rate": 0.000394118291347207, "loss": 1.0465, "step": 1825 }, { "epoch": 0.12372216717060074, "grad_norm": 4.514806270599365, "learning_rate": 0.00039411281489594744, "loss": 1.2402, "step": 1826 }, { "epoch": 0.12378992301242472, "grad_norm": 4.582322120666504, "learning_rate": 0.0003941073384446879, "loss": 0.8958, "step": 1827 }, { "epoch": 0.12385767885424871, "grad_norm": 3.4016494750976562, "learning_rate": 0.0003941018619934283, "loss": 0.8938, "step": 1828 }, { "epoch": 0.1239254346960727, "grad_norm": 3.735915422439575, "learning_rate": 0.0003940963855421687, "loss": 1.0233, "step": 1829 }, { "epoch": 0.12399319053789669, "grad_norm": 4.394433498382568, "learning_rate": 0.0003940909090909091, "loss": 0.8388, "step": 1830 }, { "epoch": 0.12406094637972068, "grad_norm": 4.135808944702148, "learning_rate": 0.00039408543263964955, "loss": 1.1488, "step": 1831 }, { "epoch": 0.12412870222154467, "grad_norm": 3.190540075302124, "learning_rate": 0.00039407995618838995, "loss": 0.861, "step": 1832 }, { "epoch": 0.12419645806336865, "grad_norm": 4.317681789398193, "learning_rate": 0.00039407447973713035, "loss": 1.1285, "step": 1833 }, { "epoch": 0.12426421390519264, "grad_norm": 4.385807037353516, "learning_rate": 0.00039406900328587075, "loss": 0.9116, "step": 1834 }, { "epoch": 0.12433196974701663, "grad_norm": 3.790863275527954, "learning_rate": 0.0003940635268346112, "loss": 1.1933, "step": 1835 }, { "epoch": 0.12439972558884062, "grad_norm": 4.086167812347412, "learning_rate": 0.0003940580503833516, "loss": 1.1334, "step": 1836 }, { "epoch": 0.1244674814306646, "grad_norm": 5.536815166473389, "learning_rate": 0.00039405257393209205, "loss": 0.8746, "step": 1837 }, { "epoch": 0.1245352372724886, "grad_norm": 3.602147340774536, "learning_rate": 0.00039404709748083245, "loss": 1.003, "step": 1838 }, { "epoch": 0.12460299311431257, "grad_norm": 4.224771976470947, "learning_rate": 0.00039404162102957285, "loss": 1.2177, "step": 1839 }, { "epoch": 0.12467074895613656, "grad_norm": 4.504672527313232, "learning_rate": 0.00039403614457831325, "loss": 1.2838, "step": 1840 }, { "epoch": 0.12473850479796054, "grad_norm": 4.660628795623779, "learning_rate": 0.00039403066812705365, "loss": 0.8578, "step": 1841 }, { "epoch": 0.12480626063978453, "grad_norm": 7.820394039154053, "learning_rate": 0.0003940251916757941, "loss": 1.2228, "step": 1842 }, { "epoch": 0.12487401648160852, "grad_norm": 3.7876226902008057, "learning_rate": 0.00039401971522453456, "loss": 0.8574, "step": 1843 }, { "epoch": 0.12494177232343251, "grad_norm": 3.594557762145996, "learning_rate": 0.00039401423877327495, "loss": 1.0456, "step": 1844 }, { "epoch": 0.1250095281652565, "grad_norm": 3.5986955165863037, "learning_rate": 0.00039400876232201535, "loss": 0.9908, "step": 1845 }, { "epoch": 0.12507728400708049, "grad_norm": 3.9941565990448, "learning_rate": 0.00039400328587075575, "loss": 1.0189, "step": 1846 }, { "epoch": 0.12514503984890446, "grad_norm": 4.092831134796143, "learning_rate": 0.00039399780941949615, "loss": 0.9447, "step": 1847 }, { "epoch": 0.12521279569072846, "grad_norm": 4.5969390869140625, "learning_rate": 0.0003939923329682366, "loss": 1.3735, "step": 1848 }, { "epoch": 0.12528055153255244, "grad_norm": 3.631680965423584, "learning_rate": 0.000393986856516977, "loss": 0.9864, "step": 1849 }, { "epoch": 0.12534830737437644, "grad_norm": 4.72289514541626, "learning_rate": 0.00039398138006571746, "loss": 0.9884, "step": 1850 }, { "epoch": 0.1254160632162004, "grad_norm": 3.4760959148406982, "learning_rate": 0.00039397590361445786, "loss": 0.8979, "step": 1851 }, { "epoch": 0.12548381905802442, "grad_norm": 4.025509834289551, "learning_rate": 0.00039397042716319826, "loss": 1.0439, "step": 1852 }, { "epoch": 0.1255515748998484, "grad_norm": 3.7536063194274902, "learning_rate": 0.0003939649507119387, "loss": 1.0138, "step": 1853 }, { "epoch": 0.1256193307416724, "grad_norm": 3.739171028137207, "learning_rate": 0.0003939594742606791, "loss": 1.1156, "step": 1854 }, { "epoch": 0.12568708658349637, "grad_norm": 4.956604480743408, "learning_rate": 0.0003939539978094195, "loss": 1.2284, "step": 1855 }, { "epoch": 0.12575484242532037, "grad_norm": 3.817725896835327, "learning_rate": 0.0003939485213581599, "loss": 0.7855, "step": 1856 }, { "epoch": 0.12582259826714434, "grad_norm": 3.9724466800689697, "learning_rate": 0.0003939430449069003, "loss": 1.0788, "step": 1857 }, { "epoch": 0.12589035410896834, "grad_norm": 3.7098357677459717, "learning_rate": 0.00039393756845564076, "loss": 0.9621, "step": 1858 }, { "epoch": 0.12595810995079232, "grad_norm": 4.540678977966309, "learning_rate": 0.0003939320920043812, "loss": 1.2005, "step": 1859 }, { "epoch": 0.12602586579261632, "grad_norm": 4.152740955352783, "learning_rate": 0.0003939266155531216, "loss": 1.1063, "step": 1860 }, { "epoch": 0.1260936216344403, "grad_norm": 5.679985523223877, "learning_rate": 0.000393921139101862, "loss": 0.9053, "step": 1861 }, { "epoch": 0.12616137747626427, "grad_norm": 5.012297630310059, "learning_rate": 0.0003939156626506024, "loss": 1.3216, "step": 1862 }, { "epoch": 0.12622913331808827, "grad_norm": 3.621983528137207, "learning_rate": 0.0003939101861993428, "loss": 0.8771, "step": 1863 }, { "epoch": 0.12629688915991225, "grad_norm": 3.760590076446533, "learning_rate": 0.00039390470974808326, "loss": 1.1374, "step": 1864 }, { "epoch": 0.12636464500173625, "grad_norm": 4.252878665924072, "learning_rate": 0.00039389923329682366, "loss": 1.1744, "step": 1865 }, { "epoch": 0.12643240084356022, "grad_norm": 4.536449909210205, "learning_rate": 0.0003938937568455641, "loss": 1.1041, "step": 1866 }, { "epoch": 0.12650015668538422, "grad_norm": 3.6722726821899414, "learning_rate": 0.0003938882803943045, "loss": 1.2249, "step": 1867 }, { "epoch": 0.1265679125272082, "grad_norm": 3.4794342517852783, "learning_rate": 0.0003938828039430449, "loss": 0.9817, "step": 1868 }, { "epoch": 0.1266356683690322, "grad_norm": 3.702176570892334, "learning_rate": 0.00039387732749178537, "loss": 0.9495, "step": 1869 }, { "epoch": 0.12670342421085617, "grad_norm": 3.2054443359375, "learning_rate": 0.00039387185104052577, "loss": 1.0327, "step": 1870 }, { "epoch": 0.12677118005268018, "grad_norm": 3.591895341873169, "learning_rate": 0.00039386637458926617, "loss": 0.9529, "step": 1871 }, { "epoch": 0.12683893589450415, "grad_norm": 3.1956913471221924, "learning_rate": 0.00039386089813800657, "loss": 0.9698, "step": 1872 }, { "epoch": 0.12690669173632815, "grad_norm": 5.032118797302246, "learning_rate": 0.000393855421686747, "loss": 0.8259, "step": 1873 }, { "epoch": 0.12697444757815213, "grad_norm": 3.301806688308716, "learning_rate": 0.0003938499452354874, "loss": 0.8616, "step": 1874 }, { "epoch": 0.12704220341997613, "grad_norm": 5.120059013366699, "learning_rate": 0.00039384446878422787, "loss": 1.1377, "step": 1875 }, { "epoch": 0.1271099592618001, "grad_norm": 5.013489723205566, "learning_rate": 0.00039383899233296827, "loss": 1.2948, "step": 1876 }, { "epoch": 0.12717771510362408, "grad_norm": 4.0147271156311035, "learning_rate": 0.00039383351588170867, "loss": 1.2281, "step": 1877 }, { "epoch": 0.12724547094544808, "grad_norm": 3.3962347507476807, "learning_rate": 0.00039382803943044907, "loss": 0.9244, "step": 1878 }, { "epoch": 0.12731322678727205, "grad_norm": 3.543621063232422, "learning_rate": 0.00039382256297918947, "loss": 1.0228, "step": 1879 }, { "epoch": 0.12738098262909606, "grad_norm": 3.8247084617614746, "learning_rate": 0.0003938170865279299, "loss": 1.0812, "step": 1880 }, { "epoch": 0.12744873847092003, "grad_norm": 2.591374397277832, "learning_rate": 0.0003938116100766704, "loss": 0.6708, "step": 1881 }, { "epoch": 0.12751649431274403, "grad_norm": 3.8190577030181885, "learning_rate": 0.0003938061336254108, "loss": 1.0116, "step": 1882 }, { "epoch": 0.127584250154568, "grad_norm": 3.098743200302124, "learning_rate": 0.0003938006571741512, "loss": 0.8639, "step": 1883 }, { "epoch": 0.127652005996392, "grad_norm": 6.168898105621338, "learning_rate": 0.0003937951807228916, "loss": 1.5368, "step": 1884 }, { "epoch": 0.12771976183821598, "grad_norm": 4.254391193389893, "learning_rate": 0.000393789704271632, "loss": 1.1098, "step": 1885 }, { "epoch": 0.12778751768003999, "grad_norm": 3.7747035026550293, "learning_rate": 0.0003937842278203724, "loss": 1.1788, "step": 1886 }, { "epoch": 0.12785527352186396, "grad_norm": 4.273073196411133, "learning_rate": 0.0003937787513691128, "loss": 0.9761, "step": 1887 }, { "epoch": 0.12792302936368796, "grad_norm": 3.7713770866394043, "learning_rate": 0.0003937732749178532, "loss": 1.0652, "step": 1888 }, { "epoch": 0.12799078520551194, "grad_norm": 2.95625376701355, "learning_rate": 0.0003937677984665937, "loss": 0.9818, "step": 1889 }, { "epoch": 0.12805854104733594, "grad_norm": 3.0310325622558594, "learning_rate": 0.0003937623220153341, "loss": 0.8769, "step": 1890 }, { "epoch": 0.1281262968891599, "grad_norm": 3.168720006942749, "learning_rate": 0.00039375684556407453, "loss": 1.0757, "step": 1891 }, { "epoch": 0.1281940527309839, "grad_norm": 3.157223701477051, "learning_rate": 0.00039375136911281493, "loss": 0.9094, "step": 1892 }, { "epoch": 0.1282618085728079, "grad_norm": 4.76072883605957, "learning_rate": 0.00039374589266155533, "loss": 1.0586, "step": 1893 }, { "epoch": 0.12832956441463186, "grad_norm": 3.1946115493774414, "learning_rate": 0.00039374041621029573, "loss": 0.9054, "step": 1894 }, { "epoch": 0.12839732025645587, "grad_norm": 3.923870325088501, "learning_rate": 0.00039373493975903613, "loss": 1.1368, "step": 1895 }, { "epoch": 0.12846507609827984, "grad_norm": 4.610617637634277, "learning_rate": 0.0003937294633077766, "loss": 1.0974, "step": 1896 }, { "epoch": 0.12853283194010384, "grad_norm": 3.994490623474121, "learning_rate": 0.00039372398685651703, "loss": 1.2797, "step": 1897 }, { "epoch": 0.12860058778192782, "grad_norm": 3.981736183166504, "learning_rate": 0.00039371851040525743, "loss": 0.9522, "step": 1898 }, { "epoch": 0.12866834362375182, "grad_norm": 2.765848159790039, "learning_rate": 0.00039371303395399783, "loss": 0.7337, "step": 1899 }, { "epoch": 0.1287360994655758, "grad_norm": 3.7265939712524414, "learning_rate": 0.00039370755750273823, "loss": 1.0794, "step": 1900 }, { "epoch": 0.1288038553073998, "grad_norm": 3.501375198364258, "learning_rate": 0.00039370208105147863, "loss": 1.0447, "step": 1901 }, { "epoch": 0.12887161114922377, "grad_norm": 4.184782028198242, "learning_rate": 0.0003936966046002191, "loss": 1.1991, "step": 1902 }, { "epoch": 0.12893936699104777, "grad_norm": 8.838193893432617, "learning_rate": 0.0003936911281489595, "loss": 1.3164, "step": 1903 }, { "epoch": 0.12900712283287175, "grad_norm": 4.1561431884765625, "learning_rate": 0.00039368565169769994, "loss": 1.1864, "step": 1904 }, { "epoch": 0.12907487867469575, "grad_norm": 3.0376083850860596, "learning_rate": 0.00039368017524644034, "loss": 0.9058, "step": 1905 }, { "epoch": 0.12914263451651972, "grad_norm": 3.793778419494629, "learning_rate": 0.00039367469879518074, "loss": 0.977, "step": 1906 }, { "epoch": 0.1292103903583437, "grad_norm": 5.864257335662842, "learning_rate": 0.0003936692223439212, "loss": 1.194, "step": 1907 }, { "epoch": 0.1292781462001677, "grad_norm": 4.183814525604248, "learning_rate": 0.0003936637458926616, "loss": 0.8594, "step": 1908 }, { "epoch": 0.12934590204199167, "grad_norm": 3.8619446754455566, "learning_rate": 0.000393658269441402, "loss": 1.0024, "step": 1909 }, { "epoch": 0.12941365788381567, "grad_norm": 3.3930470943450928, "learning_rate": 0.0003936527929901424, "loss": 0.8368, "step": 1910 }, { "epoch": 0.12948141372563965, "grad_norm": 4.857607364654541, "learning_rate": 0.0003936473165388828, "loss": 1.2485, "step": 1911 }, { "epoch": 0.12954916956746365, "grad_norm": 3.1457479000091553, "learning_rate": 0.00039364184008762324, "loss": 1.1457, "step": 1912 }, { "epoch": 0.12961692540928763, "grad_norm": 3.3775463104248047, "learning_rate": 0.0003936363636363637, "loss": 0.8994, "step": 1913 }, { "epoch": 0.12968468125111163, "grad_norm": 3.782445192337036, "learning_rate": 0.0003936308871851041, "loss": 1.2035, "step": 1914 }, { "epoch": 0.1297524370929356, "grad_norm": 3.567512273788452, "learning_rate": 0.0003936254107338445, "loss": 1.2224, "step": 1915 }, { "epoch": 0.1298201929347596, "grad_norm": 4.4081926345825195, "learning_rate": 0.0003936199342825849, "loss": 1.0101, "step": 1916 }, { "epoch": 0.12988794877658358, "grad_norm": 3.556549072265625, "learning_rate": 0.0003936144578313253, "loss": 0.8286, "step": 1917 }, { "epoch": 0.12995570461840758, "grad_norm": 4.220504283905029, "learning_rate": 0.00039360898138006574, "loss": 0.9659, "step": 1918 }, { "epoch": 0.13002346046023155, "grad_norm": 3.6562061309814453, "learning_rate": 0.00039360350492880614, "loss": 0.9577, "step": 1919 }, { "epoch": 0.13009121630205553, "grad_norm": 2.9608206748962402, "learning_rate": 0.0003935980284775466, "loss": 0.887, "step": 1920 }, { "epoch": 0.13015897214387953, "grad_norm": 4.085014820098877, "learning_rate": 0.000393592552026287, "loss": 1.2644, "step": 1921 }, { "epoch": 0.1302267279857035, "grad_norm": 5.186523914337158, "learning_rate": 0.0003935870755750274, "loss": 1.1567, "step": 1922 }, { "epoch": 0.1302944838275275, "grad_norm": 4.298685073852539, "learning_rate": 0.0003935815991237678, "loss": 0.964, "step": 1923 }, { "epoch": 0.13036223966935148, "grad_norm": 3.5432169437408447, "learning_rate": 0.00039357612267250825, "loss": 1.0692, "step": 1924 }, { "epoch": 0.13042999551117548, "grad_norm": 3.3199679851531982, "learning_rate": 0.00039357064622124865, "loss": 1.0625, "step": 1925 }, { "epoch": 0.13049775135299946, "grad_norm": 3.9811880588531494, "learning_rate": 0.00039356516976998905, "loss": 1.1002, "step": 1926 }, { "epoch": 0.13056550719482346, "grad_norm": 3.4884250164031982, "learning_rate": 0.00039355969331872944, "loss": 0.8508, "step": 1927 }, { "epoch": 0.13063326303664743, "grad_norm": 3.6489617824554443, "learning_rate": 0.0003935542168674699, "loss": 0.9108, "step": 1928 }, { "epoch": 0.13070101887847144, "grad_norm": 3.4023690223693848, "learning_rate": 0.00039354874041621035, "loss": 0.9027, "step": 1929 }, { "epoch": 0.1307687747202954, "grad_norm": 3.6431820392608643, "learning_rate": 0.00039354326396495075, "loss": 1.0201, "step": 1930 }, { "epoch": 0.1308365305621194, "grad_norm": 2.8955748081207275, "learning_rate": 0.00039353778751369115, "loss": 0.7453, "step": 1931 }, { "epoch": 0.1309042864039434, "grad_norm": 4.0951972007751465, "learning_rate": 0.00039353231106243155, "loss": 1.0779, "step": 1932 }, { "epoch": 0.1309720422457674, "grad_norm": 4.259312629699707, "learning_rate": 0.00039352683461117195, "loss": 1.1478, "step": 1933 }, { "epoch": 0.13103979808759136, "grad_norm": 3.5302422046661377, "learning_rate": 0.0003935213581599124, "loss": 0.9817, "step": 1934 }, { "epoch": 0.13110755392941534, "grad_norm": 4.699799060821533, "learning_rate": 0.00039351588170865286, "loss": 1.0692, "step": 1935 }, { "epoch": 0.13117530977123934, "grad_norm": 3.059727191925049, "learning_rate": 0.00039351040525739325, "loss": 0.9202, "step": 1936 }, { "epoch": 0.13124306561306331, "grad_norm": 3.9536166191101074, "learning_rate": 0.00039350492880613365, "loss": 0.9628, "step": 1937 }, { "epoch": 0.13131082145488732, "grad_norm": 4.202176570892334, "learning_rate": 0.00039349945235487405, "loss": 1.3277, "step": 1938 }, { "epoch": 0.1313785772967113, "grad_norm": 4.248271465301514, "learning_rate": 0.00039349397590361445, "loss": 1.3216, "step": 1939 }, { "epoch": 0.1314463331385353, "grad_norm": 3.7766263484954834, "learning_rate": 0.0003934884994523549, "loss": 0.9515, "step": 1940 }, { "epoch": 0.13151408898035927, "grad_norm": 2.7945661544799805, "learning_rate": 0.0003934830230010953, "loss": 0.9346, "step": 1941 }, { "epoch": 0.13158184482218327, "grad_norm": 3.5524611473083496, "learning_rate": 0.0003934775465498357, "loss": 0.9856, "step": 1942 }, { "epoch": 0.13164960066400724, "grad_norm": 3.028143882751465, "learning_rate": 0.00039347207009857616, "loss": 0.7529, "step": 1943 }, { "epoch": 0.13171735650583125, "grad_norm": 3.4239816665649414, "learning_rate": 0.00039346659364731656, "loss": 0.9032, "step": 1944 }, { "epoch": 0.13178511234765522, "grad_norm": 4.826498985290527, "learning_rate": 0.000393461117196057, "loss": 1.208, "step": 1945 }, { "epoch": 0.13185286818947922, "grad_norm": 3.227996349334717, "learning_rate": 0.0003934556407447974, "loss": 1.0093, "step": 1946 }, { "epoch": 0.1319206240313032, "grad_norm": 3.463676691055298, "learning_rate": 0.0003934501642935378, "loss": 0.8302, "step": 1947 }, { "epoch": 0.1319883798731272, "grad_norm": 3.568833112716675, "learning_rate": 0.0003934446878422782, "loss": 0.8723, "step": 1948 }, { "epoch": 0.13205613571495117, "grad_norm": 4.0637030601501465, "learning_rate": 0.0003934392113910186, "loss": 0.8502, "step": 1949 }, { "epoch": 0.13212389155677515, "grad_norm": 3.7621164321899414, "learning_rate": 0.00039343373493975906, "loss": 1.0091, "step": 1950 }, { "epoch": 0.13219164739859915, "grad_norm": 4.1454386711120605, "learning_rate": 0.0003934282584884995, "loss": 1.1275, "step": 1951 }, { "epoch": 0.13225940324042312, "grad_norm": 5.30384635925293, "learning_rate": 0.0003934227820372399, "loss": 0.99, "step": 1952 }, { "epoch": 0.13232715908224713, "grad_norm": 3.7639780044555664, "learning_rate": 0.0003934173055859803, "loss": 1.0394, "step": 1953 }, { "epoch": 0.1323949149240711, "grad_norm": 4.136169910430908, "learning_rate": 0.0003934118291347207, "loss": 0.9947, "step": 1954 }, { "epoch": 0.1324626707658951, "grad_norm": 4.134731769561768, "learning_rate": 0.0003934063526834611, "loss": 1.0699, "step": 1955 }, { "epoch": 0.13253042660771908, "grad_norm": 3.162278652191162, "learning_rate": 0.00039340087623220156, "loss": 0.9359, "step": 1956 }, { "epoch": 0.13259818244954308, "grad_norm": 3.5070345401763916, "learning_rate": 0.00039339539978094196, "loss": 0.9803, "step": 1957 }, { "epoch": 0.13266593829136705, "grad_norm": 3.192782163619995, "learning_rate": 0.00039338992332968236, "loss": 0.907, "step": 1958 }, { "epoch": 0.13273369413319105, "grad_norm": 4.041593551635742, "learning_rate": 0.0003933844468784228, "loss": 0.878, "step": 1959 }, { "epoch": 0.13280144997501503, "grad_norm": 5.201786041259766, "learning_rate": 0.0003933789704271632, "loss": 1.1319, "step": 1960 }, { "epoch": 0.13286920581683903, "grad_norm": 3.7757298946380615, "learning_rate": 0.0003933734939759036, "loss": 0.8727, "step": 1961 }, { "epoch": 0.132936961658663, "grad_norm": 4.938465118408203, "learning_rate": 0.00039336801752464407, "loss": 1.0432, "step": 1962 }, { "epoch": 0.133004717500487, "grad_norm": 3.8576560020446777, "learning_rate": 0.00039336254107338447, "loss": 1.1053, "step": 1963 }, { "epoch": 0.13307247334231098, "grad_norm": 4.339880466461182, "learning_rate": 0.00039335706462212487, "loss": 1.0894, "step": 1964 }, { "epoch": 0.13314022918413496, "grad_norm": 3.7322065830230713, "learning_rate": 0.00039335158817086527, "loss": 0.8703, "step": 1965 }, { "epoch": 0.13320798502595896, "grad_norm": 3.177706718444824, "learning_rate": 0.0003933461117196057, "loss": 1.0181, "step": 1966 }, { "epoch": 0.13327574086778293, "grad_norm": 4.539488315582275, "learning_rate": 0.00039334063526834617, "loss": 0.9901, "step": 1967 }, { "epoch": 0.13334349670960693, "grad_norm": 4.758763790130615, "learning_rate": 0.00039333515881708657, "loss": 1.1513, "step": 1968 }, { "epoch": 0.1334112525514309, "grad_norm": 3.745053768157959, "learning_rate": 0.00039332968236582697, "loss": 1.0957, "step": 1969 }, { "epoch": 0.1334790083932549, "grad_norm": 4.3162841796875, "learning_rate": 0.00039332420591456737, "loss": 1.3349, "step": 1970 }, { "epoch": 0.13354676423507889, "grad_norm": 3.856776237487793, "learning_rate": 0.00039331872946330777, "loss": 0.7896, "step": 1971 }, { "epoch": 0.1336145200769029, "grad_norm": 2.958937168121338, "learning_rate": 0.0003933132530120482, "loss": 0.9366, "step": 1972 }, { "epoch": 0.13368227591872686, "grad_norm": 3.46213698387146, "learning_rate": 0.0003933077765607886, "loss": 1.0736, "step": 1973 }, { "epoch": 0.13375003176055086, "grad_norm": 3.272308588027954, "learning_rate": 0.0003933023001095291, "loss": 1.0175, "step": 1974 }, { "epoch": 0.13381778760237484, "grad_norm": 4.073008060455322, "learning_rate": 0.0003932968236582695, "loss": 0.8983, "step": 1975 }, { "epoch": 0.13388554344419884, "grad_norm": 4.09628438949585, "learning_rate": 0.0003932913472070099, "loss": 1.0099, "step": 1976 }, { "epoch": 0.13395329928602281, "grad_norm": 6.971934795379639, "learning_rate": 0.00039328587075575027, "loss": 1.1008, "step": 1977 }, { "epoch": 0.13402105512784682, "grad_norm": 4.075629711151123, "learning_rate": 0.0003932803943044907, "loss": 1.1905, "step": 1978 }, { "epoch": 0.1340888109696708, "grad_norm": 3.393519639968872, "learning_rate": 0.0003932749178532311, "loss": 0.9598, "step": 1979 }, { "epoch": 0.13415656681149477, "grad_norm": 3.1066389083862305, "learning_rate": 0.0003932694414019715, "loss": 0.7126, "step": 1980 }, { "epoch": 0.13422432265331877, "grad_norm": 3.3074233531951904, "learning_rate": 0.0003932639649507119, "loss": 0.9072, "step": 1981 }, { "epoch": 0.13429207849514274, "grad_norm": 3.258561611175537, "learning_rate": 0.0003932584884994524, "loss": 1.16, "step": 1982 }, { "epoch": 0.13435983433696674, "grad_norm": 4.268911361694336, "learning_rate": 0.00039325301204819283, "loss": 1.0758, "step": 1983 }, { "epoch": 0.13442759017879072, "grad_norm": 3.4436447620391846, "learning_rate": 0.00039324753559693323, "loss": 0.8295, "step": 1984 }, { "epoch": 0.13449534602061472, "grad_norm": 5.179892063140869, "learning_rate": 0.00039324205914567363, "loss": 1.1655, "step": 1985 }, { "epoch": 0.1345631018624387, "grad_norm": 3.867161989212036, "learning_rate": 0.00039323658269441403, "loss": 0.7784, "step": 1986 }, { "epoch": 0.1346308577042627, "grad_norm": 3.456005096435547, "learning_rate": 0.00039323110624315443, "loss": 0.9455, "step": 1987 }, { "epoch": 0.13469861354608667, "grad_norm": 4.781862735748291, "learning_rate": 0.0003932256297918949, "loss": 1.0181, "step": 1988 }, { "epoch": 0.13476636938791067, "grad_norm": 4.400859355926514, "learning_rate": 0.0003932201533406353, "loss": 0.9509, "step": 1989 }, { "epoch": 0.13483412522973465, "grad_norm": 3.301814317703247, "learning_rate": 0.00039321467688937573, "loss": 0.8368, "step": 1990 }, { "epoch": 0.13490188107155865, "grad_norm": 4.028049945831299, "learning_rate": 0.00039320920043811613, "loss": 1.231, "step": 1991 }, { "epoch": 0.13496963691338262, "grad_norm": 3.308310031890869, "learning_rate": 0.00039320372398685653, "loss": 0.8777, "step": 1992 }, { "epoch": 0.13503739275520663, "grad_norm": 3.7963478565216064, "learning_rate": 0.00039319824753559693, "loss": 0.9725, "step": 1993 }, { "epoch": 0.1351051485970306, "grad_norm": 3.3007848262786865, "learning_rate": 0.0003931927710843374, "loss": 0.9158, "step": 1994 }, { "epoch": 0.13517290443885457, "grad_norm": 3.9070980548858643, "learning_rate": 0.0003931872946330778, "loss": 1.0185, "step": 1995 }, { "epoch": 0.13524066028067858, "grad_norm": 4.237251281738281, "learning_rate": 0.0003931818181818182, "loss": 1.2018, "step": 1996 }, { "epoch": 0.13530841612250255, "grad_norm": 3.5162501335144043, "learning_rate": 0.0003931763417305586, "loss": 1.0947, "step": 1997 }, { "epoch": 0.13537617196432655, "grad_norm": 3.6359193325042725, "learning_rate": 0.00039317086527929904, "loss": 0.8624, "step": 1998 }, { "epoch": 0.13544392780615053, "grad_norm": 4.43295431137085, "learning_rate": 0.00039316538882803943, "loss": 0.8911, "step": 1999 }, { "epoch": 0.13551168364797453, "grad_norm": 2.9032492637634277, "learning_rate": 0.0003931599123767799, "loss": 1.0228, "step": 2000 }, { "epoch": 0.1355794394897985, "grad_norm": 3.0034313201904297, "learning_rate": 0.0003931544359255203, "loss": 0.9481, "step": 2001 }, { "epoch": 0.1356471953316225, "grad_norm": 3.2944109439849854, "learning_rate": 0.0003931489594742607, "loss": 0.8185, "step": 2002 }, { "epoch": 0.13571495117344648, "grad_norm": 3.4253170490264893, "learning_rate": 0.0003931434830230011, "loss": 0.6822, "step": 2003 }, { "epoch": 0.13578270701527048, "grad_norm": 4.060725688934326, "learning_rate": 0.00039313800657174154, "loss": 0.908, "step": 2004 }, { "epoch": 0.13585046285709446, "grad_norm": 4.162361145019531, "learning_rate": 0.000393132530120482, "loss": 1.1549, "step": 2005 }, { "epoch": 0.13591821869891846, "grad_norm": 4.029611587524414, "learning_rate": 0.0003931270536692224, "loss": 1.1323, "step": 2006 }, { "epoch": 0.13598597454074243, "grad_norm": 3.3221356868743896, "learning_rate": 0.0003931215772179628, "loss": 0.813, "step": 2007 }, { "epoch": 0.13605373038256643, "grad_norm": 4.053617000579834, "learning_rate": 0.0003931161007667032, "loss": 0.9458, "step": 2008 }, { "epoch": 0.1361214862243904, "grad_norm": 3.43747615814209, "learning_rate": 0.0003931106243154436, "loss": 0.9893, "step": 2009 }, { "epoch": 0.13618924206621438, "grad_norm": 3.0420141220092773, "learning_rate": 0.00039310514786418404, "loss": 0.7516, "step": 2010 }, { "epoch": 0.13625699790803839, "grad_norm": 3.4257984161376953, "learning_rate": 0.00039309967141292444, "loss": 0.9864, "step": 2011 }, { "epoch": 0.13632475374986236, "grad_norm": 3.67362904548645, "learning_rate": 0.00039309419496166484, "loss": 0.9954, "step": 2012 }, { "epoch": 0.13639250959168636, "grad_norm": 4.418880462646484, "learning_rate": 0.0003930887185104053, "loss": 0.9241, "step": 2013 }, { "epoch": 0.13646026543351034, "grad_norm": 3.5813469886779785, "learning_rate": 0.0003930832420591457, "loss": 0.8535, "step": 2014 }, { "epoch": 0.13652802127533434, "grad_norm": 3.273195743560791, "learning_rate": 0.0003930777656078861, "loss": 0.9462, "step": 2015 }, { "epoch": 0.1365957771171583, "grad_norm": 3.664151430130005, "learning_rate": 0.00039307228915662655, "loss": 0.9933, "step": 2016 }, { "epoch": 0.13666353295898231, "grad_norm": 3.130005359649658, "learning_rate": 0.00039306681270536695, "loss": 0.962, "step": 2017 }, { "epoch": 0.1367312888008063, "grad_norm": 2.9658141136169434, "learning_rate": 0.00039306133625410735, "loss": 0.8924, "step": 2018 }, { "epoch": 0.1367990446426303, "grad_norm": 3.34104585647583, "learning_rate": 0.00039305585980284774, "loss": 1.1484, "step": 2019 }, { "epoch": 0.13686680048445427, "grad_norm": 3.6597139835357666, "learning_rate": 0.00039305038335158814, "loss": 0.9039, "step": 2020 }, { "epoch": 0.13693455632627827, "grad_norm": 3.5139095783233643, "learning_rate": 0.00039304490690032865, "loss": 1.0545, "step": 2021 }, { "epoch": 0.13700231216810224, "grad_norm": 2.8280012607574463, "learning_rate": 0.00039303943044906905, "loss": 0.8668, "step": 2022 }, { "epoch": 0.13707006800992624, "grad_norm": 3.2996609210968018, "learning_rate": 0.00039303395399780945, "loss": 0.9786, "step": 2023 }, { "epoch": 0.13713782385175022, "grad_norm": 4.314419269561768, "learning_rate": 0.00039302847754654985, "loss": 1.3774, "step": 2024 }, { "epoch": 0.1372055796935742, "grad_norm": 3.809823513031006, "learning_rate": 0.00039302300109529025, "loss": 1.0177, "step": 2025 }, { "epoch": 0.1372733355353982, "grad_norm": 3.2323153018951416, "learning_rate": 0.0003930175246440307, "loss": 0.9028, "step": 2026 }, { "epoch": 0.13734109137722217, "grad_norm": 4.208483695983887, "learning_rate": 0.0003930120481927711, "loss": 1.0285, "step": 2027 }, { "epoch": 0.13740884721904617, "grad_norm": 5.641801834106445, "learning_rate": 0.0003930065717415115, "loss": 1.2101, "step": 2028 }, { "epoch": 0.13747660306087015, "grad_norm": 3.521711587905884, "learning_rate": 0.00039300109529025195, "loss": 1.0655, "step": 2029 }, { "epoch": 0.13754435890269415, "grad_norm": 2.7891299724578857, "learning_rate": 0.00039299561883899235, "loss": 0.8202, "step": 2030 }, { "epoch": 0.13761211474451812, "grad_norm": 2.9741005897521973, "learning_rate": 0.00039299014238773275, "loss": 0.7934, "step": 2031 }, { "epoch": 0.13767987058634212, "grad_norm": 7.217936038970947, "learning_rate": 0.0003929846659364732, "loss": 1.1702, "step": 2032 }, { "epoch": 0.1377476264281661, "grad_norm": 3.7688241004943848, "learning_rate": 0.0003929791894852136, "loss": 0.8708, "step": 2033 }, { "epoch": 0.1378153822699901, "grad_norm": 3.4128355979919434, "learning_rate": 0.000392973713033954, "loss": 1.1512, "step": 2034 }, { "epoch": 0.13788313811181407, "grad_norm": 4.0219597816467285, "learning_rate": 0.0003929682365826944, "loss": 1.1092, "step": 2035 }, { "epoch": 0.13795089395363808, "grad_norm": 4.2756571769714355, "learning_rate": 0.00039296276013143486, "loss": 1.0763, "step": 2036 }, { "epoch": 0.13801864979546205, "grad_norm": 5.709436416625977, "learning_rate": 0.00039295728368017526, "loss": 1.1948, "step": 2037 }, { "epoch": 0.13808640563728605, "grad_norm": 2.7488200664520264, "learning_rate": 0.0003929518072289157, "loss": 0.9372, "step": 2038 }, { "epoch": 0.13815416147911003, "grad_norm": 3.348262310028076, "learning_rate": 0.0003929463307776561, "loss": 0.8962, "step": 2039 }, { "epoch": 0.138221917320934, "grad_norm": 5.292445659637451, "learning_rate": 0.0003929408543263965, "loss": 1.1315, "step": 2040 }, { "epoch": 0.138289673162758, "grad_norm": 4.651246547698975, "learning_rate": 0.0003929353778751369, "loss": 1.0607, "step": 2041 }, { "epoch": 0.13835742900458198, "grad_norm": 3.3094773292541504, "learning_rate": 0.00039292990142387736, "loss": 1.0272, "step": 2042 }, { "epoch": 0.13842518484640598, "grad_norm": 4.301296710968018, "learning_rate": 0.00039292442497261776, "loss": 1.1787, "step": 2043 }, { "epoch": 0.13849294068822995, "grad_norm": 5.791898250579834, "learning_rate": 0.0003929189485213582, "loss": 0.9111, "step": 2044 }, { "epoch": 0.13856069653005396, "grad_norm": 4.063332557678223, "learning_rate": 0.0003929134720700986, "loss": 1.0756, "step": 2045 }, { "epoch": 0.13862845237187793, "grad_norm": 4.12991189956665, "learning_rate": 0.000392907995618839, "loss": 1.3346, "step": 2046 }, { "epoch": 0.13869620821370193, "grad_norm": 2.9076972007751465, "learning_rate": 0.0003929025191675794, "loss": 0.8134, "step": 2047 }, { "epoch": 0.1387639640555259, "grad_norm": 3.608147144317627, "learning_rate": 0.00039289704271631986, "loss": 1.0595, "step": 2048 }, { "epoch": 0.1388317198973499, "grad_norm": 2.986985683441162, "learning_rate": 0.00039289156626506026, "loss": 0.8501, "step": 2049 }, { "epoch": 0.13889947573917388, "grad_norm": 3.6966941356658936, "learning_rate": 0.00039288608981380066, "loss": 0.8774, "step": 2050 }, { "epoch": 0.13896723158099789, "grad_norm": 4.209543704986572, "learning_rate": 0.00039288061336254106, "loss": 1.0716, "step": 2051 }, { "epoch": 0.13903498742282186, "grad_norm": 3.5952212810516357, "learning_rate": 0.0003928751369112815, "loss": 0.8348, "step": 2052 }, { "epoch": 0.13910274326464586, "grad_norm": 3.881042242050171, "learning_rate": 0.0003928696604600219, "loss": 1.0374, "step": 2053 }, { "epoch": 0.13917049910646984, "grad_norm": 3.3455593585968018, "learning_rate": 0.00039286418400876237, "loss": 0.8962, "step": 2054 }, { "epoch": 0.1392382549482938, "grad_norm": 3.5047824382781982, "learning_rate": 0.00039285870755750277, "loss": 0.9041, "step": 2055 }, { "epoch": 0.1393060107901178, "grad_norm": 5.325544357299805, "learning_rate": 0.00039285323110624317, "loss": 1.0228, "step": 2056 }, { "epoch": 0.1393737666319418, "grad_norm": 3.436539649963379, "learning_rate": 0.00039284775465498357, "loss": 0.9214, "step": 2057 }, { "epoch": 0.1394415224737658, "grad_norm": 3.9321558475494385, "learning_rate": 0.00039284227820372396, "loss": 0.9105, "step": 2058 }, { "epoch": 0.13950927831558976, "grad_norm": 5.037657737731934, "learning_rate": 0.0003928368017524644, "loss": 0.9893, "step": 2059 }, { "epoch": 0.13957703415741377, "grad_norm": 4.227497577667236, "learning_rate": 0.00039283132530120487, "loss": 1.161, "step": 2060 }, { "epoch": 0.13964478999923774, "grad_norm": 3.6891133785247803, "learning_rate": 0.00039282584884994527, "loss": 0.983, "step": 2061 }, { "epoch": 0.13971254584106174, "grad_norm": 3.7053208351135254, "learning_rate": 0.00039282037239868567, "loss": 1.0068, "step": 2062 }, { "epoch": 0.13978030168288572, "grad_norm": 4.371405124664307, "learning_rate": 0.00039281489594742607, "loss": 1.168, "step": 2063 }, { "epoch": 0.13984805752470972, "grad_norm": 5.198989391326904, "learning_rate": 0.0003928094194961665, "loss": 1.0547, "step": 2064 }, { "epoch": 0.1399158133665337, "grad_norm": 4.24947452545166, "learning_rate": 0.0003928039430449069, "loss": 0.8769, "step": 2065 }, { "epoch": 0.1399835692083577, "grad_norm": 3.553696393966675, "learning_rate": 0.0003927984665936473, "loss": 0.927, "step": 2066 }, { "epoch": 0.14005132505018167, "grad_norm": 7.917940616607666, "learning_rate": 0.0003927929901423878, "loss": 0.8645, "step": 2067 }, { "epoch": 0.14011908089200567, "grad_norm": 3.103926181793213, "learning_rate": 0.0003927875136911282, "loss": 0.8436, "step": 2068 }, { "epoch": 0.14018683673382965, "grad_norm": 5.303426742553711, "learning_rate": 0.00039278203723986857, "loss": 1.0966, "step": 2069 }, { "epoch": 0.14025459257565362, "grad_norm": 4.286533832550049, "learning_rate": 0.000392776560788609, "loss": 1.0876, "step": 2070 }, { "epoch": 0.14032234841747762, "grad_norm": 4.240042209625244, "learning_rate": 0.0003927710843373494, "loss": 1.073, "step": 2071 }, { "epoch": 0.1403901042593016, "grad_norm": 3.280837059020996, "learning_rate": 0.0003927656078860898, "loss": 0.9107, "step": 2072 }, { "epoch": 0.1404578601011256, "grad_norm": 4.542739391326904, "learning_rate": 0.0003927601314348302, "loss": 1.0865, "step": 2073 }, { "epoch": 0.14052561594294957, "grad_norm": 7.275092601776123, "learning_rate": 0.0003927546549835706, "loss": 1.2329, "step": 2074 }, { "epoch": 0.14059337178477357, "grad_norm": 4.99735164642334, "learning_rate": 0.0003927491785323111, "loss": 0.9936, "step": 2075 }, { "epoch": 0.14066112762659755, "grad_norm": 5.522153854370117, "learning_rate": 0.00039274370208105153, "loss": 1.3785, "step": 2076 }, { "epoch": 0.14072888346842155, "grad_norm": 5.422906875610352, "learning_rate": 0.00039273822562979193, "loss": 1.2085, "step": 2077 }, { "epoch": 0.14079663931024552, "grad_norm": 3.4788289070129395, "learning_rate": 0.00039273274917853233, "loss": 0.9071, "step": 2078 }, { "epoch": 0.14086439515206953, "grad_norm": 3.3869335651397705, "learning_rate": 0.00039272727272727273, "loss": 0.7983, "step": 2079 }, { "epoch": 0.1409321509938935, "grad_norm": 5.545078277587891, "learning_rate": 0.0003927217962760132, "loss": 1.2516, "step": 2080 }, { "epoch": 0.1409999068357175, "grad_norm": 3.380993604660034, "learning_rate": 0.0003927163198247536, "loss": 0.9104, "step": 2081 }, { "epoch": 0.14106766267754148, "grad_norm": 2.994847536087036, "learning_rate": 0.000392710843373494, "loss": 0.8234, "step": 2082 }, { "epoch": 0.14113541851936548, "grad_norm": 4.117560386657715, "learning_rate": 0.00039270536692223443, "loss": 0.8144, "step": 2083 }, { "epoch": 0.14120317436118945, "grad_norm": 3.5420539379119873, "learning_rate": 0.00039269989047097483, "loss": 0.8388, "step": 2084 }, { "epoch": 0.14127093020301343, "grad_norm": 3.6176364421844482, "learning_rate": 0.00039269441401971523, "loss": 0.9861, "step": 2085 }, { "epoch": 0.14133868604483743, "grad_norm": 3.8295180797576904, "learning_rate": 0.0003926889375684557, "loss": 0.8231, "step": 2086 }, { "epoch": 0.1414064418866614, "grad_norm": 5.298388481140137, "learning_rate": 0.0003926834611171961, "loss": 1.2394, "step": 2087 }, { "epoch": 0.1414741977284854, "grad_norm": 4.269284725189209, "learning_rate": 0.0003926779846659365, "loss": 1.1129, "step": 2088 }, { "epoch": 0.14154195357030938, "grad_norm": 3.3076815605163574, "learning_rate": 0.0003926725082146769, "loss": 0.8602, "step": 2089 }, { "epoch": 0.14160970941213338, "grad_norm": 4.088136196136475, "learning_rate": 0.0003926670317634173, "loss": 1.0013, "step": 2090 }, { "epoch": 0.14167746525395736, "grad_norm": 4.127427577972412, "learning_rate": 0.00039266155531215773, "loss": 0.8866, "step": 2091 }, { "epoch": 0.14174522109578136, "grad_norm": 7.000527381896973, "learning_rate": 0.0003926560788608982, "loss": 0.7672, "step": 2092 }, { "epoch": 0.14181297693760533, "grad_norm": 3.2157084941864014, "learning_rate": 0.0003926506024096386, "loss": 0.8506, "step": 2093 }, { "epoch": 0.14188073277942934, "grad_norm": 3.456639051437378, "learning_rate": 0.000392645125958379, "loss": 1.1805, "step": 2094 }, { "epoch": 0.1419484886212533, "grad_norm": 2.636897087097168, "learning_rate": 0.0003926396495071194, "loss": 0.8704, "step": 2095 }, { "epoch": 0.1420162444630773, "grad_norm": 3.683851718902588, "learning_rate": 0.0003926341730558598, "loss": 0.8362, "step": 2096 }, { "epoch": 0.1420840003049013, "grad_norm": 3.9194719791412354, "learning_rate": 0.00039262869660460024, "loss": 1.0408, "step": 2097 }, { "epoch": 0.1421517561467253, "grad_norm": 5.0648884773254395, "learning_rate": 0.0003926232201533407, "loss": 1.3083, "step": 2098 }, { "epoch": 0.14221951198854926, "grad_norm": 3.2401018142700195, "learning_rate": 0.0003926177437020811, "loss": 0.8714, "step": 2099 }, { "epoch": 0.14228726783037324, "grad_norm": 2.7266671657562256, "learning_rate": 0.0003926122672508215, "loss": 0.5694, "step": 2100 }, { "epoch": 0.14235502367219724, "grad_norm": 4.627926349639893, "learning_rate": 0.0003926067907995619, "loss": 1.2818, "step": 2101 }, { "epoch": 0.1424227795140212, "grad_norm": 3.868546962738037, "learning_rate": 0.00039260131434830234, "loss": 0.8009, "step": 2102 }, { "epoch": 0.14249053535584522, "grad_norm": 4.255434989929199, "learning_rate": 0.00039259583789704274, "loss": 1.1021, "step": 2103 }, { "epoch": 0.1425582911976692, "grad_norm": 3.9906864166259766, "learning_rate": 0.00039259036144578314, "loss": 1.0281, "step": 2104 }, { "epoch": 0.1426260470394932, "grad_norm": 3.5460011959075928, "learning_rate": 0.00039258488499452354, "loss": 0.9633, "step": 2105 }, { "epoch": 0.14269380288131717, "grad_norm": 3.7452077865600586, "learning_rate": 0.000392579408543264, "loss": 0.936, "step": 2106 }, { "epoch": 0.14276155872314117, "grad_norm": 3.526322364807129, "learning_rate": 0.0003925739320920044, "loss": 1.2107, "step": 2107 }, { "epoch": 0.14282931456496514, "grad_norm": 3.7674813270568848, "learning_rate": 0.00039256845564074485, "loss": 0.9477, "step": 2108 }, { "epoch": 0.14289707040678915, "grad_norm": 3.5109872817993164, "learning_rate": 0.00039256297918948525, "loss": 0.9526, "step": 2109 }, { "epoch": 0.14296482624861312, "grad_norm": 3.6948585510253906, "learning_rate": 0.00039255750273822565, "loss": 0.8935, "step": 2110 }, { "epoch": 0.14303258209043712, "grad_norm": 3.927109479904175, "learning_rate": 0.00039255202628696604, "loss": 1.3078, "step": 2111 }, { "epoch": 0.1431003379322611, "grad_norm": 3.035163402557373, "learning_rate": 0.00039254654983570644, "loss": 0.913, "step": 2112 }, { "epoch": 0.1431680937740851, "grad_norm": 2.9310853481292725, "learning_rate": 0.0003925410733844469, "loss": 0.8595, "step": 2113 }, { "epoch": 0.14323584961590907, "grad_norm": 3.4072773456573486, "learning_rate": 0.00039253559693318735, "loss": 1.0167, "step": 2114 }, { "epoch": 0.14330360545773305, "grad_norm": 3.48146390914917, "learning_rate": 0.00039253012048192775, "loss": 1.0699, "step": 2115 }, { "epoch": 0.14337136129955705, "grad_norm": 3.5380361080169678, "learning_rate": 0.00039252464403066815, "loss": 1.0225, "step": 2116 }, { "epoch": 0.14343911714138102, "grad_norm": 3.9286787509918213, "learning_rate": 0.00039251916757940855, "loss": 1.2311, "step": 2117 }, { "epoch": 0.14350687298320502, "grad_norm": 3.2067854404449463, "learning_rate": 0.000392513691128149, "loss": 0.9134, "step": 2118 }, { "epoch": 0.143574628825029, "grad_norm": 4.170510292053223, "learning_rate": 0.0003925082146768894, "loss": 1.2088, "step": 2119 }, { "epoch": 0.143642384666853, "grad_norm": 2.894944667816162, "learning_rate": 0.0003925027382256298, "loss": 0.8306, "step": 2120 }, { "epoch": 0.14371014050867698, "grad_norm": 3.0361433029174805, "learning_rate": 0.0003924972617743702, "loss": 1.1654, "step": 2121 }, { "epoch": 0.14377789635050098, "grad_norm": 5.335592746734619, "learning_rate": 0.00039249178532311065, "loss": 1.1184, "step": 2122 }, { "epoch": 0.14384565219232495, "grad_norm": 3.661141872406006, "learning_rate": 0.00039248630887185105, "loss": 0.913, "step": 2123 }, { "epoch": 0.14391340803414895, "grad_norm": 3.199061393737793, "learning_rate": 0.0003924808324205915, "loss": 0.8722, "step": 2124 }, { "epoch": 0.14398116387597293, "grad_norm": 3.088141679763794, "learning_rate": 0.0003924753559693319, "loss": 0.8965, "step": 2125 }, { "epoch": 0.14404891971779693, "grad_norm": 4.142609596252441, "learning_rate": 0.0003924698795180723, "loss": 1.3532, "step": 2126 }, { "epoch": 0.1441166755596209, "grad_norm": 3.54492449760437, "learning_rate": 0.0003924644030668127, "loss": 1.1457, "step": 2127 }, { "epoch": 0.1441844314014449, "grad_norm": 3.0541512966156006, "learning_rate": 0.0003924589266155531, "loss": 0.8593, "step": 2128 }, { "epoch": 0.14425218724326888, "grad_norm": 2.877804756164551, "learning_rate": 0.00039245345016429356, "loss": 0.7873, "step": 2129 }, { "epoch": 0.14431994308509286, "grad_norm": 3.437086820602417, "learning_rate": 0.000392447973713034, "loss": 0.9796, "step": 2130 }, { "epoch": 0.14438769892691686, "grad_norm": 4.922691822052002, "learning_rate": 0.0003924424972617744, "loss": 1.1888, "step": 2131 }, { "epoch": 0.14445545476874083, "grad_norm": 7.429305553436279, "learning_rate": 0.0003924370208105148, "loss": 0.9245, "step": 2132 }, { "epoch": 0.14452321061056483, "grad_norm": 3.9140686988830566, "learning_rate": 0.0003924315443592552, "loss": 1.1263, "step": 2133 }, { "epoch": 0.1445909664523888, "grad_norm": 3.129779577255249, "learning_rate": 0.0003924260679079956, "loss": 0.8501, "step": 2134 }, { "epoch": 0.1446587222942128, "grad_norm": 3.4149117469787598, "learning_rate": 0.00039242059145673606, "loss": 0.8251, "step": 2135 }, { "epoch": 0.14472647813603678, "grad_norm": 4.53984260559082, "learning_rate": 0.00039241511500547646, "loss": 1.0949, "step": 2136 }, { "epoch": 0.1447942339778608, "grad_norm": 2.7803573608398438, "learning_rate": 0.0003924096385542169, "loss": 0.8716, "step": 2137 }, { "epoch": 0.14486198981968476, "grad_norm": 3.4807686805725098, "learning_rate": 0.0003924041621029573, "loss": 1.1694, "step": 2138 }, { "epoch": 0.14492974566150876, "grad_norm": 4.289523601531982, "learning_rate": 0.0003923986856516977, "loss": 0.8649, "step": 2139 }, { "epoch": 0.14499750150333274, "grad_norm": 3.568579912185669, "learning_rate": 0.00039239320920043816, "loss": 1.0478, "step": 2140 }, { "epoch": 0.14506525734515674, "grad_norm": 3.8929920196533203, "learning_rate": 0.00039238773274917856, "loss": 0.9281, "step": 2141 }, { "epoch": 0.1451330131869807, "grad_norm": 2.9407284259796143, "learning_rate": 0.00039238225629791896, "loss": 0.9095, "step": 2142 }, { "epoch": 0.14520076902880472, "grad_norm": 3.0965287685394287, "learning_rate": 0.00039237677984665936, "loss": 0.9094, "step": 2143 }, { "epoch": 0.1452685248706287, "grad_norm": 3.28955340385437, "learning_rate": 0.00039237130339539976, "loss": 0.8101, "step": 2144 }, { "epoch": 0.14533628071245266, "grad_norm": 3.4915881156921387, "learning_rate": 0.0003923658269441402, "loss": 0.7588, "step": 2145 }, { "epoch": 0.14540403655427667, "grad_norm": 3.6683669090270996, "learning_rate": 0.00039236035049288067, "loss": 1.001, "step": 2146 }, { "epoch": 0.14547179239610064, "grad_norm": 3.052374839782715, "learning_rate": 0.00039235487404162107, "loss": 0.838, "step": 2147 }, { "epoch": 0.14553954823792464, "grad_norm": 4.590811729431152, "learning_rate": 0.00039234939759036147, "loss": 0.9947, "step": 2148 }, { "epoch": 0.14560730407974862, "grad_norm": 4.013795375823975, "learning_rate": 0.00039234392113910186, "loss": 1.0772, "step": 2149 }, { "epoch": 0.14567505992157262, "grad_norm": 3.562592029571533, "learning_rate": 0.00039233844468784226, "loss": 1.023, "step": 2150 }, { "epoch": 0.1457428157633966, "grad_norm": 11.617969512939453, "learning_rate": 0.0003923329682365827, "loss": 1.1349, "step": 2151 }, { "epoch": 0.1458105716052206, "grad_norm": 3.8551318645477295, "learning_rate": 0.0003923274917853231, "loss": 0.8366, "step": 2152 }, { "epoch": 0.14587832744704457, "grad_norm": 4.113894939422607, "learning_rate": 0.00039232201533406357, "loss": 1.1338, "step": 2153 }, { "epoch": 0.14594608328886857, "grad_norm": 6.330554485321045, "learning_rate": 0.00039231653888280397, "loss": 0.7766, "step": 2154 }, { "epoch": 0.14601383913069255, "grad_norm": 3.9672188758850098, "learning_rate": 0.00039231106243154437, "loss": 0.907, "step": 2155 }, { "epoch": 0.14608159497251655, "grad_norm": 4.124312877655029, "learning_rate": 0.0003923055859802848, "loss": 1.0132, "step": 2156 }, { "epoch": 0.14614935081434052, "grad_norm": 2.816448450088501, "learning_rate": 0.0003923001095290252, "loss": 0.8018, "step": 2157 }, { "epoch": 0.14621710665616452, "grad_norm": 4.131261348724365, "learning_rate": 0.0003922946330777656, "loss": 1.0888, "step": 2158 }, { "epoch": 0.1462848624979885, "grad_norm": 4.663915157318115, "learning_rate": 0.000392289156626506, "loss": 1.2924, "step": 2159 }, { "epoch": 0.14635261833981247, "grad_norm": 6.631914138793945, "learning_rate": 0.0003922836801752464, "loss": 0.9737, "step": 2160 }, { "epoch": 0.14642037418163648, "grad_norm": 3.5624353885650635, "learning_rate": 0.00039227820372398687, "loss": 1.0281, "step": 2161 }, { "epoch": 0.14648813002346045, "grad_norm": 5.444281578063965, "learning_rate": 0.0003922727272727273, "loss": 0.8537, "step": 2162 }, { "epoch": 0.14655588586528445, "grad_norm": 3.349419116973877, "learning_rate": 0.0003922672508214677, "loss": 0.8149, "step": 2163 }, { "epoch": 0.14662364170710843, "grad_norm": 4.304533004760742, "learning_rate": 0.0003922617743702081, "loss": 0.9076, "step": 2164 }, { "epoch": 0.14669139754893243, "grad_norm": 2.891998529434204, "learning_rate": 0.0003922562979189485, "loss": 0.7654, "step": 2165 }, { "epoch": 0.1467591533907564, "grad_norm": 4.707879543304443, "learning_rate": 0.0003922508214676889, "loss": 1.1893, "step": 2166 }, { "epoch": 0.1468269092325804, "grad_norm": 4.528392791748047, "learning_rate": 0.0003922453450164294, "loss": 1.2779, "step": 2167 }, { "epoch": 0.14689466507440438, "grad_norm": 3.851830005645752, "learning_rate": 0.00039223986856516983, "loss": 1.4486, "step": 2168 }, { "epoch": 0.14696242091622838, "grad_norm": 3.184576988220215, "learning_rate": 0.00039223439211391023, "loss": 1.0507, "step": 2169 }, { "epoch": 0.14703017675805236, "grad_norm": 3.7840793132781982, "learning_rate": 0.00039222891566265063, "loss": 1.0007, "step": 2170 }, { "epoch": 0.14709793259987636, "grad_norm": 3.729084014892578, "learning_rate": 0.00039222343921139103, "loss": 0.9816, "step": 2171 }, { "epoch": 0.14716568844170033, "grad_norm": 4.772675037384033, "learning_rate": 0.0003922179627601314, "loss": 0.9053, "step": 2172 }, { "epoch": 0.1472334442835243, "grad_norm": 4.377356052398682, "learning_rate": 0.0003922124863088719, "loss": 1.3553, "step": 2173 }, { "epoch": 0.1473012001253483, "grad_norm": 3.304994821548462, "learning_rate": 0.0003922070098576123, "loss": 0.8358, "step": 2174 }, { "epoch": 0.14736895596717228, "grad_norm": 2.856403112411499, "learning_rate": 0.0003922015334063527, "loss": 0.8754, "step": 2175 }, { "epoch": 0.14743671180899628, "grad_norm": 3.035207748413086, "learning_rate": 0.00039219605695509313, "loss": 0.704, "step": 2176 }, { "epoch": 0.14750446765082026, "grad_norm": 3.813202381134033, "learning_rate": 0.00039219058050383353, "loss": 0.836, "step": 2177 }, { "epoch": 0.14757222349264426, "grad_norm": 11.135214805603027, "learning_rate": 0.000392185104052574, "loss": 0.9285, "step": 2178 }, { "epoch": 0.14763997933446824, "grad_norm": 4.281454563140869, "learning_rate": 0.0003921796276013144, "loss": 1.0635, "step": 2179 }, { "epoch": 0.14770773517629224, "grad_norm": 5.054574489593506, "learning_rate": 0.0003921741511500548, "loss": 1.0832, "step": 2180 }, { "epoch": 0.1477754910181162, "grad_norm": 2.9585464000701904, "learning_rate": 0.0003921686746987952, "loss": 0.8989, "step": 2181 }, { "epoch": 0.1478432468599402, "grad_norm": 4.758214950561523, "learning_rate": 0.0003921631982475356, "loss": 1.2327, "step": 2182 }, { "epoch": 0.1479110027017642, "grad_norm": 3.2253663539886475, "learning_rate": 0.00039215772179627603, "loss": 0.8912, "step": 2183 }, { "epoch": 0.1479787585435882, "grad_norm": 3.5604875087738037, "learning_rate": 0.0003921522453450165, "loss": 1.2369, "step": 2184 }, { "epoch": 0.14804651438541216, "grad_norm": 3.8764712810516357, "learning_rate": 0.0003921467688937569, "loss": 0.9069, "step": 2185 }, { "epoch": 0.14811427022723617, "grad_norm": 3.3568320274353027, "learning_rate": 0.0003921412924424973, "loss": 0.9533, "step": 2186 }, { "epoch": 0.14818202606906014, "grad_norm": 3.796172618865967, "learning_rate": 0.0003921358159912377, "loss": 1.1698, "step": 2187 }, { "epoch": 0.14824978191088412, "grad_norm": 2.7328951358795166, "learning_rate": 0.0003921303395399781, "loss": 0.9762, "step": 2188 }, { "epoch": 0.14831753775270812, "grad_norm": 3.548103094100952, "learning_rate": 0.00039212486308871854, "loss": 0.97, "step": 2189 }, { "epoch": 0.1483852935945321, "grad_norm": 3.534048080444336, "learning_rate": 0.00039211938663745894, "loss": 0.7716, "step": 2190 }, { "epoch": 0.1484530494363561, "grad_norm": 3.7900633811950684, "learning_rate": 0.00039211391018619934, "loss": 0.8052, "step": 2191 }, { "epoch": 0.14852080527818007, "grad_norm": 2.778799533843994, "learning_rate": 0.0003921084337349398, "loss": 0.846, "step": 2192 }, { "epoch": 0.14858856112000407, "grad_norm": 4.366217613220215, "learning_rate": 0.0003921029572836802, "loss": 1.2339, "step": 2193 }, { "epoch": 0.14865631696182804, "grad_norm": 3.8015713691711426, "learning_rate": 0.00039209748083242064, "loss": 1.2771, "step": 2194 }, { "epoch": 0.14872407280365205, "grad_norm": 3.049508571624756, "learning_rate": 0.00039209200438116104, "loss": 0.7798, "step": 2195 }, { "epoch": 0.14879182864547602, "grad_norm": 6.033420085906982, "learning_rate": 0.00039208652792990144, "loss": 1.2439, "step": 2196 }, { "epoch": 0.14885958448730002, "grad_norm": 3.342482089996338, "learning_rate": 0.00039208105147864184, "loss": 0.8499, "step": 2197 }, { "epoch": 0.148927340329124, "grad_norm": 2.993528127670288, "learning_rate": 0.00039207557502738224, "loss": 0.702, "step": 2198 }, { "epoch": 0.148995096170948, "grad_norm": 3.35360050201416, "learning_rate": 0.0003920700985761227, "loss": 0.9834, "step": 2199 }, { "epoch": 0.14906285201277197, "grad_norm": 3.9648444652557373, "learning_rate": 0.00039206462212486315, "loss": 0.9954, "step": 2200 }, { "epoch": 0.14913060785459598, "grad_norm": 5.362756252288818, "learning_rate": 0.00039205914567360355, "loss": 1.3304, "step": 2201 }, { "epoch": 0.14919836369641995, "grad_norm": 3.2520275115966797, "learning_rate": 0.00039205366922234394, "loss": 0.8701, "step": 2202 }, { "epoch": 0.14926611953824392, "grad_norm": 4.183501243591309, "learning_rate": 0.00039204819277108434, "loss": 1.0009, "step": 2203 }, { "epoch": 0.14933387538006793, "grad_norm": 3.7564728260040283, "learning_rate": 0.00039204271631982474, "loss": 0.819, "step": 2204 }, { "epoch": 0.1494016312218919, "grad_norm": 2.428234100341797, "learning_rate": 0.0003920372398685652, "loss": 0.6578, "step": 2205 }, { "epoch": 0.1494693870637159, "grad_norm": 3.571819305419922, "learning_rate": 0.0003920317634173056, "loss": 0.9744, "step": 2206 }, { "epoch": 0.14953714290553988, "grad_norm": 3.7363381385803223, "learning_rate": 0.00039202628696604605, "loss": 1.2083, "step": 2207 }, { "epoch": 0.14960489874736388, "grad_norm": 3.2642152309417725, "learning_rate": 0.00039202081051478645, "loss": 0.9189, "step": 2208 }, { "epoch": 0.14967265458918785, "grad_norm": 3.3988795280456543, "learning_rate": 0.00039201533406352685, "loss": 0.8617, "step": 2209 }, { "epoch": 0.14974041043101186, "grad_norm": 3.4839141368865967, "learning_rate": 0.00039200985761226725, "loss": 1.0504, "step": 2210 }, { "epoch": 0.14980816627283583, "grad_norm": 4.23141622543335, "learning_rate": 0.0003920043811610077, "loss": 1.2958, "step": 2211 }, { "epoch": 0.14987592211465983, "grad_norm": 3.6334457397460938, "learning_rate": 0.0003919989047097481, "loss": 1.0198, "step": 2212 }, { "epoch": 0.1499436779564838, "grad_norm": 3.187537908554077, "learning_rate": 0.0003919934282584885, "loss": 0.8468, "step": 2213 }, { "epoch": 0.1500114337983078, "grad_norm": 3.3582940101623535, "learning_rate": 0.0003919879518072289, "loss": 0.9453, "step": 2214 }, { "epoch": 0.15007918964013178, "grad_norm": 4.020011901855469, "learning_rate": 0.00039198247535596935, "loss": 0.9365, "step": 2215 }, { "epoch": 0.15014694548195578, "grad_norm": 4.460612773895264, "learning_rate": 0.0003919769989047098, "loss": 0.9106, "step": 2216 }, { "epoch": 0.15021470132377976, "grad_norm": 3.0218677520751953, "learning_rate": 0.0003919715224534502, "loss": 1.0, "step": 2217 }, { "epoch": 0.15028245716560373, "grad_norm": 3.0925698280334473, "learning_rate": 0.0003919660460021906, "loss": 1.0344, "step": 2218 }, { "epoch": 0.15035021300742774, "grad_norm": 7.2655487060546875, "learning_rate": 0.000391960569550931, "loss": 1.0533, "step": 2219 }, { "epoch": 0.1504179688492517, "grad_norm": 3.533228874206543, "learning_rate": 0.0003919550930996714, "loss": 1.1094, "step": 2220 }, { "epoch": 0.1504857246910757, "grad_norm": 3.9258177280426025, "learning_rate": 0.00039194961664841186, "loss": 1.0608, "step": 2221 }, { "epoch": 0.1505534805328997, "grad_norm": 2.8459794521331787, "learning_rate": 0.00039194414019715225, "loss": 0.8872, "step": 2222 }, { "epoch": 0.1506212363747237, "grad_norm": 3.4895272254943848, "learning_rate": 0.0003919386637458927, "loss": 0.9805, "step": 2223 }, { "epoch": 0.15068899221654766, "grad_norm": 3.941066026687622, "learning_rate": 0.0003919331872946331, "loss": 1.2712, "step": 2224 }, { "epoch": 0.15075674805837166, "grad_norm": 3.0331318378448486, "learning_rate": 0.0003919277108433735, "loss": 0.9326, "step": 2225 }, { "epoch": 0.15082450390019564, "grad_norm": 3.08465576171875, "learning_rate": 0.0003919222343921139, "loss": 0.9931, "step": 2226 }, { "epoch": 0.15089225974201964, "grad_norm": 3.2743256092071533, "learning_rate": 0.00039191675794085436, "loss": 0.8881, "step": 2227 }, { "epoch": 0.15096001558384362, "grad_norm": 3.4027581214904785, "learning_rate": 0.00039191128148959476, "loss": 1.0742, "step": 2228 }, { "epoch": 0.15102777142566762, "grad_norm": 3.3248753547668457, "learning_rate": 0.00039190580503833516, "loss": 0.8918, "step": 2229 }, { "epoch": 0.1510955272674916, "grad_norm": 2.8444812297821045, "learning_rate": 0.0003919003285870756, "loss": 0.8646, "step": 2230 }, { "epoch": 0.1511632831093156, "grad_norm": 2.828279733657837, "learning_rate": 0.000391894852135816, "loss": 1.0387, "step": 2231 }, { "epoch": 0.15123103895113957, "grad_norm": 3.376667022705078, "learning_rate": 0.00039188937568455646, "loss": 1.1138, "step": 2232 }, { "epoch": 0.15129879479296354, "grad_norm": 3.6115779876708984, "learning_rate": 0.00039188389923329686, "loss": 1.0158, "step": 2233 }, { "epoch": 0.15136655063478754, "grad_norm": 3.4668235778808594, "learning_rate": 0.00039187842278203726, "loss": 1.0549, "step": 2234 }, { "epoch": 0.15143430647661152, "grad_norm": 6.9779767990112305, "learning_rate": 0.00039187294633077766, "loss": 1.0743, "step": 2235 }, { "epoch": 0.15150206231843552, "grad_norm": 2.8324077129364014, "learning_rate": 0.00039186746987951806, "loss": 0.6378, "step": 2236 }, { "epoch": 0.1515698181602595, "grad_norm": 3.062481164932251, "learning_rate": 0.0003918619934282585, "loss": 0.9118, "step": 2237 }, { "epoch": 0.1516375740020835, "grad_norm": 2.9558863639831543, "learning_rate": 0.00039185651697699897, "loss": 0.8893, "step": 2238 }, { "epoch": 0.15170532984390747, "grad_norm": 5.563960075378418, "learning_rate": 0.00039185104052573937, "loss": 0.8774, "step": 2239 }, { "epoch": 0.15177308568573147, "grad_norm": 3.3860373497009277, "learning_rate": 0.00039184556407447977, "loss": 0.8095, "step": 2240 }, { "epoch": 0.15184084152755545, "grad_norm": 3.8272602558135986, "learning_rate": 0.00039184008762322016, "loss": 0.9593, "step": 2241 }, { "epoch": 0.15190859736937945, "grad_norm": 4.192650318145752, "learning_rate": 0.00039183461117196056, "loss": 0.9229, "step": 2242 }, { "epoch": 0.15197635321120342, "grad_norm": 4.173397541046143, "learning_rate": 0.000391829134720701, "loss": 1.1817, "step": 2243 }, { "epoch": 0.15204410905302743, "grad_norm": 3.5099539756774902, "learning_rate": 0.0003918236582694414, "loss": 1.0553, "step": 2244 }, { "epoch": 0.1521118648948514, "grad_norm": 3.4042956829071045, "learning_rate": 0.0003918181818181818, "loss": 0.8956, "step": 2245 }, { "epoch": 0.1521796207366754, "grad_norm": 4.20835542678833, "learning_rate": 0.00039181270536692227, "loss": 0.9878, "step": 2246 }, { "epoch": 0.15224737657849938, "grad_norm": 3.3840863704681396, "learning_rate": 0.00039180722891566267, "loss": 0.9552, "step": 2247 }, { "epoch": 0.15231513242032335, "grad_norm": 2.9864425659179688, "learning_rate": 0.00039180175246440307, "loss": 0.9833, "step": 2248 }, { "epoch": 0.15238288826214735, "grad_norm": 3.2888622283935547, "learning_rate": 0.0003917962760131435, "loss": 0.9713, "step": 2249 }, { "epoch": 0.15245064410397133, "grad_norm": 3.2475526332855225, "learning_rate": 0.0003917907995618839, "loss": 1.1102, "step": 2250 }, { "epoch": 0.15251839994579533, "grad_norm": 3.051023244857788, "learning_rate": 0.0003917853231106243, "loss": 0.9727, "step": 2251 }, { "epoch": 0.1525861557876193, "grad_norm": 3.0472633838653564, "learning_rate": 0.0003917798466593647, "loss": 0.895, "step": 2252 }, { "epoch": 0.1526539116294433, "grad_norm": 3.274644613265991, "learning_rate": 0.00039177437020810517, "loss": 0.8118, "step": 2253 }, { "epoch": 0.15272166747126728, "grad_norm": 4.101376533508301, "learning_rate": 0.0003917688937568456, "loss": 1.0857, "step": 2254 }, { "epoch": 0.15278942331309128, "grad_norm": 2.2067580223083496, "learning_rate": 0.000391763417305586, "loss": 0.6782, "step": 2255 }, { "epoch": 0.15285717915491526, "grad_norm": 3.0936734676361084, "learning_rate": 0.0003917579408543264, "loss": 0.7118, "step": 2256 }, { "epoch": 0.15292493499673926, "grad_norm": 4.483651161193848, "learning_rate": 0.0003917524644030668, "loss": 0.9089, "step": 2257 }, { "epoch": 0.15299269083856323, "grad_norm": 3.09566330909729, "learning_rate": 0.0003917469879518072, "loss": 0.9912, "step": 2258 }, { "epoch": 0.15306044668038724, "grad_norm": 2.877511978149414, "learning_rate": 0.0003917415115005477, "loss": 0.8348, "step": 2259 }, { "epoch": 0.1531282025222112, "grad_norm": 3.1886019706726074, "learning_rate": 0.0003917360350492881, "loss": 0.926, "step": 2260 }, { "epoch": 0.1531959583640352, "grad_norm": 3.413902997970581, "learning_rate": 0.0003917305585980285, "loss": 1.1722, "step": 2261 }, { "epoch": 0.1532637142058592, "grad_norm": 3.527569055557251, "learning_rate": 0.00039172508214676893, "loss": 0.9802, "step": 2262 }, { "epoch": 0.15333147004768316, "grad_norm": 4.155728816986084, "learning_rate": 0.0003917196056955093, "loss": 0.9751, "step": 2263 }, { "epoch": 0.15339922588950716, "grad_norm": 4.1871161460876465, "learning_rate": 0.0003917141292442497, "loss": 1.0094, "step": 2264 }, { "epoch": 0.15346698173133114, "grad_norm": 3.4102444648742676, "learning_rate": 0.0003917086527929902, "loss": 0.8676, "step": 2265 }, { "epoch": 0.15353473757315514, "grad_norm": 3.842257022857666, "learning_rate": 0.0003917031763417306, "loss": 1.2218, "step": 2266 }, { "epoch": 0.1536024934149791, "grad_norm": 4.487432956695557, "learning_rate": 0.000391697699890471, "loss": 1.1234, "step": 2267 }, { "epoch": 0.15367024925680312, "grad_norm": 4.266439437866211, "learning_rate": 0.0003916922234392114, "loss": 1.0491, "step": 2268 }, { "epoch": 0.1537380050986271, "grad_norm": 4.349552631378174, "learning_rate": 0.00039168674698795183, "loss": 0.8451, "step": 2269 }, { "epoch": 0.1538057609404511, "grad_norm": 3.5002074241638184, "learning_rate": 0.0003916812705366923, "loss": 1.1064, "step": 2270 }, { "epoch": 0.15387351678227507, "grad_norm": 3.354480028152466, "learning_rate": 0.0003916757940854327, "loss": 1.0278, "step": 2271 }, { "epoch": 0.15394127262409907, "grad_norm": 3.434291362762451, "learning_rate": 0.0003916703176341731, "loss": 0.8235, "step": 2272 }, { "epoch": 0.15400902846592304, "grad_norm": 3.9096696376800537, "learning_rate": 0.0003916648411829135, "loss": 1.1045, "step": 2273 }, { "epoch": 0.15407678430774704, "grad_norm": 3.05324125289917, "learning_rate": 0.0003916593647316539, "loss": 0.8646, "step": 2274 }, { "epoch": 0.15414454014957102, "grad_norm": 3.145240068435669, "learning_rate": 0.00039165388828039433, "loss": 0.9912, "step": 2275 }, { "epoch": 0.15421229599139502, "grad_norm": 6.391587257385254, "learning_rate": 0.00039164841182913473, "loss": 0.9912, "step": 2276 }, { "epoch": 0.154280051833219, "grad_norm": 3.384464740753174, "learning_rate": 0.0003916429353778752, "loss": 0.875, "step": 2277 }, { "epoch": 0.15434780767504297, "grad_norm": 3.7162532806396484, "learning_rate": 0.0003916374589266156, "loss": 0.9127, "step": 2278 }, { "epoch": 0.15441556351686697, "grad_norm": 4.7119879722595215, "learning_rate": 0.000391631982475356, "loss": 1.0822, "step": 2279 }, { "epoch": 0.15448331935869095, "grad_norm": 4.590001106262207, "learning_rate": 0.0003916265060240964, "loss": 0.9078, "step": 2280 }, { "epoch": 0.15455107520051495, "grad_norm": 4.780983924865723, "learning_rate": 0.00039162102957283684, "loss": 1.1061, "step": 2281 }, { "epoch": 0.15461883104233892, "grad_norm": 4.1171393394470215, "learning_rate": 0.00039161555312157724, "loss": 1.0692, "step": 2282 }, { "epoch": 0.15468658688416292, "grad_norm": 2.6301543712615967, "learning_rate": 0.00039161007667031764, "loss": 0.6736, "step": 2283 }, { "epoch": 0.1547543427259869, "grad_norm": 3.0278191566467285, "learning_rate": 0.00039160460021905804, "loss": 0.8822, "step": 2284 }, { "epoch": 0.1548220985678109, "grad_norm": 3.533506393432617, "learning_rate": 0.0003915991237677985, "loss": 1.0576, "step": 2285 }, { "epoch": 0.15488985440963488, "grad_norm": 4.43066930770874, "learning_rate": 0.0003915936473165389, "loss": 0.9728, "step": 2286 }, { "epoch": 0.15495761025145888, "grad_norm": 2.989530086517334, "learning_rate": 0.00039158817086527934, "loss": 0.9439, "step": 2287 }, { "epoch": 0.15502536609328285, "grad_norm": 4.224128246307373, "learning_rate": 0.00039158269441401974, "loss": 0.9691, "step": 2288 }, { "epoch": 0.15509312193510685, "grad_norm": 4.779012680053711, "learning_rate": 0.00039157721796276014, "loss": 1.0763, "step": 2289 }, { "epoch": 0.15516087777693083, "grad_norm": 4.328718662261963, "learning_rate": 0.00039157174151150054, "loss": 1.2291, "step": 2290 }, { "epoch": 0.15522863361875483, "grad_norm": 3.2869222164154053, "learning_rate": 0.000391566265060241, "loss": 0.9877, "step": 2291 }, { "epoch": 0.1552963894605788, "grad_norm": 2.935812473297119, "learning_rate": 0.0003915607886089814, "loss": 0.7392, "step": 2292 }, { "epoch": 0.15536414530240278, "grad_norm": 3.7724058628082275, "learning_rate": 0.00039155531215772185, "loss": 0.9363, "step": 2293 }, { "epoch": 0.15543190114422678, "grad_norm": 3.956517219543457, "learning_rate": 0.00039154983570646224, "loss": 1.124, "step": 2294 }, { "epoch": 0.15549965698605075, "grad_norm": 2.9776771068573, "learning_rate": 0.00039154435925520264, "loss": 0.7236, "step": 2295 }, { "epoch": 0.15556741282787476, "grad_norm": 2.835200548171997, "learning_rate": 0.00039153888280394304, "loss": 0.8119, "step": 2296 }, { "epoch": 0.15563516866969873, "grad_norm": 3.4048142433166504, "learning_rate": 0.0003915334063526835, "loss": 0.8828, "step": 2297 }, { "epoch": 0.15570292451152273, "grad_norm": 6.744691371917725, "learning_rate": 0.0003915279299014239, "loss": 1.2655, "step": 2298 }, { "epoch": 0.1557706803533467, "grad_norm": 4.069280624389648, "learning_rate": 0.0003915224534501643, "loss": 0.802, "step": 2299 }, { "epoch": 0.1558384361951707, "grad_norm": 4.398569107055664, "learning_rate": 0.00039151697699890475, "loss": 1.3301, "step": 2300 }, { "epoch": 0.15590619203699468, "grad_norm": 3.0462827682495117, "learning_rate": 0.00039151150054764515, "loss": 0.9454, "step": 2301 }, { "epoch": 0.1559739478788187, "grad_norm": 3.5707690715789795, "learning_rate": 0.00039150602409638555, "loss": 0.8728, "step": 2302 }, { "epoch": 0.15604170372064266, "grad_norm": 4.72637414932251, "learning_rate": 0.000391500547645126, "loss": 1.2063, "step": 2303 }, { "epoch": 0.15610945956246666, "grad_norm": 4.286033630371094, "learning_rate": 0.0003914950711938664, "loss": 0.9753, "step": 2304 }, { "epoch": 0.15617721540429064, "grad_norm": 3.4821553230285645, "learning_rate": 0.0003914895947426068, "loss": 1.0313, "step": 2305 }, { "epoch": 0.15624497124611464, "grad_norm": 4.716664791107178, "learning_rate": 0.0003914841182913472, "loss": 0.878, "step": 2306 }, { "epoch": 0.1563127270879386, "grad_norm": 4.626977920532227, "learning_rate": 0.0003914786418400876, "loss": 1.0571, "step": 2307 }, { "epoch": 0.1563804829297626, "grad_norm": 4.215327739715576, "learning_rate": 0.0003914731653888281, "loss": 0.988, "step": 2308 }, { "epoch": 0.1564482387715866, "grad_norm": 4.7321577072143555, "learning_rate": 0.0003914676889375685, "loss": 1.0299, "step": 2309 }, { "epoch": 0.15651599461341056, "grad_norm": 3.076568126678467, "learning_rate": 0.0003914622124863089, "loss": 0.9726, "step": 2310 }, { "epoch": 0.15658375045523457, "grad_norm": 3.5624375343322754, "learning_rate": 0.0003914567360350493, "loss": 0.9258, "step": 2311 }, { "epoch": 0.15665150629705854, "grad_norm": 2.8745973110198975, "learning_rate": 0.0003914512595837897, "loss": 0.7874, "step": 2312 }, { "epoch": 0.15671926213888254, "grad_norm": 3.484389543533325, "learning_rate": 0.00039144578313253016, "loss": 0.8271, "step": 2313 }, { "epoch": 0.15678701798070652, "grad_norm": 2.9967684745788574, "learning_rate": 0.00039144030668127055, "loss": 0.6841, "step": 2314 }, { "epoch": 0.15685477382253052, "grad_norm": 3.3432979583740234, "learning_rate": 0.00039143483023001095, "loss": 0.7757, "step": 2315 }, { "epoch": 0.1569225296643545, "grad_norm": 3.500347137451172, "learning_rate": 0.0003914293537787514, "loss": 1.0069, "step": 2316 }, { "epoch": 0.1569902855061785, "grad_norm": 4.281485557556152, "learning_rate": 0.0003914238773274918, "loss": 0.7904, "step": 2317 }, { "epoch": 0.15705804134800247, "grad_norm": 3.8250153064727783, "learning_rate": 0.0003914184008762322, "loss": 0.9104, "step": 2318 }, { "epoch": 0.15712579718982647, "grad_norm": 3.9165241718292236, "learning_rate": 0.00039141292442497266, "loss": 1.1665, "step": 2319 }, { "epoch": 0.15719355303165045, "grad_norm": 5.408055782318115, "learning_rate": 0.00039140744797371306, "loss": 1.0558, "step": 2320 }, { "epoch": 0.15726130887347445, "grad_norm": 2.858224868774414, "learning_rate": 0.00039140197152245346, "loss": 0.8598, "step": 2321 }, { "epoch": 0.15732906471529842, "grad_norm": 3.4219090938568115, "learning_rate": 0.00039139649507119386, "loss": 1.2276, "step": 2322 }, { "epoch": 0.1573968205571224, "grad_norm": 3.4176478385925293, "learning_rate": 0.00039139101861993426, "loss": 0.9964, "step": 2323 }, { "epoch": 0.1574645763989464, "grad_norm": 2.2429585456848145, "learning_rate": 0.0003913855421686747, "loss": 0.6614, "step": 2324 }, { "epoch": 0.15753233224077037, "grad_norm": 3.5706393718719482, "learning_rate": 0.00039138006571741516, "loss": 1.0506, "step": 2325 }, { "epoch": 0.15760008808259438, "grad_norm": 2.80277156829834, "learning_rate": 0.00039137458926615556, "loss": 0.8488, "step": 2326 }, { "epoch": 0.15766784392441835, "grad_norm": 3.518329620361328, "learning_rate": 0.00039136911281489596, "loss": 1.1364, "step": 2327 }, { "epoch": 0.15773559976624235, "grad_norm": 3.4713735580444336, "learning_rate": 0.00039136363636363636, "loss": 0.9814, "step": 2328 }, { "epoch": 0.15780335560806633, "grad_norm": 3.9110569953918457, "learning_rate": 0.0003913581599123768, "loss": 1.0061, "step": 2329 }, { "epoch": 0.15787111144989033, "grad_norm": 9.212739944458008, "learning_rate": 0.0003913526834611172, "loss": 1.1774, "step": 2330 }, { "epoch": 0.1579388672917143, "grad_norm": 3.064990282058716, "learning_rate": 0.00039134720700985767, "loss": 0.9529, "step": 2331 }, { "epoch": 0.1580066231335383, "grad_norm": 2.9996888637542725, "learning_rate": 0.00039134173055859807, "loss": 0.8527, "step": 2332 }, { "epoch": 0.15807437897536228, "grad_norm": 3.304190158843994, "learning_rate": 0.00039133625410733846, "loss": 1.0561, "step": 2333 }, { "epoch": 0.15814213481718628, "grad_norm": 3.0462400913238525, "learning_rate": 0.00039133077765607886, "loss": 0.9801, "step": 2334 }, { "epoch": 0.15820989065901025, "grad_norm": 4.078166961669922, "learning_rate": 0.0003913253012048193, "loss": 1.1188, "step": 2335 }, { "epoch": 0.15827764650083426, "grad_norm": 3.2843191623687744, "learning_rate": 0.0003913198247535597, "loss": 1.0667, "step": 2336 }, { "epoch": 0.15834540234265823, "grad_norm": 3.945307731628418, "learning_rate": 0.0003913143483023001, "loss": 1.1438, "step": 2337 }, { "epoch": 0.1584131581844822, "grad_norm": 3.158125162124634, "learning_rate": 0.0003913088718510405, "loss": 0.9246, "step": 2338 }, { "epoch": 0.1584809140263062, "grad_norm": 3.2488436698913574, "learning_rate": 0.00039130339539978097, "loss": 0.8155, "step": 2339 }, { "epoch": 0.15854866986813018, "grad_norm": 3.2344892024993896, "learning_rate": 0.00039129791894852137, "loss": 0.7457, "step": 2340 }, { "epoch": 0.15861642570995418, "grad_norm": 10.447481155395508, "learning_rate": 0.0003912924424972618, "loss": 0.9385, "step": 2341 }, { "epoch": 0.15868418155177816, "grad_norm": 3.2391421794891357, "learning_rate": 0.0003912869660460022, "loss": 0.9638, "step": 2342 }, { "epoch": 0.15875193739360216, "grad_norm": 3.150144577026367, "learning_rate": 0.0003912814895947426, "loss": 0.8102, "step": 2343 }, { "epoch": 0.15881969323542613, "grad_norm": 3.88244366645813, "learning_rate": 0.000391276013143483, "loss": 1.1372, "step": 2344 }, { "epoch": 0.15888744907725014, "grad_norm": 5.775148391723633, "learning_rate": 0.0003912705366922234, "loss": 1.0088, "step": 2345 }, { "epoch": 0.1589552049190741, "grad_norm": 2.7865357398986816, "learning_rate": 0.00039126506024096387, "loss": 0.829, "step": 2346 }, { "epoch": 0.1590229607608981, "grad_norm": 3.307279586791992, "learning_rate": 0.0003912595837897043, "loss": 1.0321, "step": 2347 }, { "epoch": 0.1590907166027221, "grad_norm": 3.713334798812866, "learning_rate": 0.0003912541073384447, "loss": 0.9425, "step": 2348 }, { "epoch": 0.1591584724445461, "grad_norm": 3.826998710632324, "learning_rate": 0.0003912486308871851, "loss": 0.9558, "step": 2349 }, { "epoch": 0.15922622828637006, "grad_norm": 3.291170120239258, "learning_rate": 0.0003912431544359255, "loss": 0.8875, "step": 2350 }, { "epoch": 0.15929398412819407, "grad_norm": 3.2902164459228516, "learning_rate": 0.000391237677984666, "loss": 0.9305, "step": 2351 }, { "epoch": 0.15936173997001804, "grad_norm": 4.9762372970581055, "learning_rate": 0.0003912322015334064, "loss": 0.9433, "step": 2352 }, { "epoch": 0.15942949581184201, "grad_norm": 4.354738235473633, "learning_rate": 0.0003912267250821468, "loss": 1.1192, "step": 2353 }, { "epoch": 0.15949725165366602, "grad_norm": 5.2057271003723145, "learning_rate": 0.0003912212486308872, "loss": 0.7609, "step": 2354 }, { "epoch": 0.15956500749549, "grad_norm": 3.0585134029388428, "learning_rate": 0.0003912157721796276, "loss": 0.8057, "step": 2355 }, { "epoch": 0.159632763337314, "grad_norm": 3.8493916988372803, "learning_rate": 0.000391210295728368, "loss": 0.9733, "step": 2356 }, { "epoch": 0.15970051917913797, "grad_norm": 4.423927307128906, "learning_rate": 0.0003912048192771085, "loss": 1.2549, "step": 2357 }, { "epoch": 0.15976827502096197, "grad_norm": 3.4531209468841553, "learning_rate": 0.0003911993428258489, "loss": 1.0354, "step": 2358 }, { "epoch": 0.15983603086278594, "grad_norm": 3.800834894180298, "learning_rate": 0.0003911938663745893, "loss": 0.9978, "step": 2359 }, { "epoch": 0.15990378670460995, "grad_norm": 4.611058712005615, "learning_rate": 0.0003911883899233297, "loss": 1.0681, "step": 2360 }, { "epoch": 0.15997154254643392, "grad_norm": 3.2351226806640625, "learning_rate": 0.0003911829134720701, "loss": 0.8756, "step": 2361 }, { "epoch": 0.16003929838825792, "grad_norm": 4.632431983947754, "learning_rate": 0.00039117743702081053, "loss": 1.2032, "step": 2362 }, { "epoch": 0.1601070542300819, "grad_norm": 3.07958722114563, "learning_rate": 0.000391171960569551, "loss": 0.9315, "step": 2363 }, { "epoch": 0.1601748100719059, "grad_norm": 3.0759241580963135, "learning_rate": 0.0003911664841182914, "loss": 0.747, "step": 2364 }, { "epoch": 0.16024256591372987, "grad_norm": 3.3531603813171387, "learning_rate": 0.0003911610076670318, "loss": 1.1043, "step": 2365 }, { "epoch": 0.16031032175555388, "grad_norm": 2.83420991897583, "learning_rate": 0.0003911555312157722, "loss": 0.8363, "step": 2366 }, { "epoch": 0.16037807759737785, "grad_norm": 3.5032336711883545, "learning_rate": 0.00039115005476451263, "loss": 0.9617, "step": 2367 }, { "epoch": 0.16044583343920182, "grad_norm": 3.935238838195801, "learning_rate": 0.00039114457831325303, "loss": 1.2056, "step": 2368 }, { "epoch": 0.16051358928102583, "grad_norm": 4.242410182952881, "learning_rate": 0.00039113910186199343, "loss": 1.1656, "step": 2369 }, { "epoch": 0.1605813451228498, "grad_norm": 2.4447858333587646, "learning_rate": 0.0003911336254107339, "loss": 0.6654, "step": 2370 }, { "epoch": 0.1606491009646738, "grad_norm": 3.3764092922210693, "learning_rate": 0.0003911281489594743, "loss": 1.0379, "step": 2371 }, { "epoch": 0.16071685680649778, "grad_norm": 2.7722201347351074, "learning_rate": 0.0003911226725082147, "loss": 0.7055, "step": 2372 }, { "epoch": 0.16078461264832178, "grad_norm": 4.085498809814453, "learning_rate": 0.00039111719605695514, "loss": 1.0092, "step": 2373 }, { "epoch": 0.16085236849014575, "grad_norm": 3.5239946842193604, "learning_rate": 0.00039111171960569554, "loss": 0.9611, "step": 2374 }, { "epoch": 0.16092012433196975, "grad_norm": 3.5129261016845703, "learning_rate": 0.00039110624315443594, "loss": 0.9589, "step": 2375 }, { "epoch": 0.16098788017379373, "grad_norm": 3.942143440246582, "learning_rate": 0.00039110076670317634, "loss": 0.9712, "step": 2376 }, { "epoch": 0.16105563601561773, "grad_norm": 2.9521069526672363, "learning_rate": 0.00039109529025191673, "loss": 0.8362, "step": 2377 }, { "epoch": 0.1611233918574417, "grad_norm": 3.5620615482330322, "learning_rate": 0.0003910898138006572, "loss": 0.9572, "step": 2378 }, { "epoch": 0.1611911476992657, "grad_norm": 3.334449529647827, "learning_rate": 0.00039108433734939764, "loss": 0.9318, "step": 2379 }, { "epoch": 0.16125890354108968, "grad_norm": 3.3526742458343506, "learning_rate": 0.00039107886089813804, "loss": 0.9934, "step": 2380 }, { "epoch": 0.16132665938291368, "grad_norm": 2.76525616645813, "learning_rate": 0.00039107338444687844, "loss": 0.7018, "step": 2381 }, { "epoch": 0.16139441522473766, "grad_norm": 3.9329357147216797, "learning_rate": 0.00039106790799561884, "loss": 0.8203, "step": 2382 }, { "epoch": 0.16146217106656163, "grad_norm": 3.5432684421539307, "learning_rate": 0.00039106243154435924, "loss": 0.9169, "step": 2383 }, { "epoch": 0.16152992690838563, "grad_norm": 3.117920398712158, "learning_rate": 0.0003910569550930997, "loss": 0.9765, "step": 2384 }, { "epoch": 0.1615976827502096, "grad_norm": 3.467099666595459, "learning_rate": 0.0003910514786418401, "loss": 0.9997, "step": 2385 }, { "epoch": 0.1616654385920336, "grad_norm": 3.366712808609009, "learning_rate": 0.00039104600219058054, "loss": 0.7379, "step": 2386 }, { "epoch": 0.16173319443385759, "grad_norm": 4.257052898406982, "learning_rate": 0.00039104052573932094, "loss": 0.9985, "step": 2387 }, { "epoch": 0.1618009502756816, "grad_norm": 3.339301824569702, "learning_rate": 0.00039103504928806134, "loss": 1.0006, "step": 2388 }, { "epoch": 0.16186870611750556, "grad_norm": 3.987985849380493, "learning_rate": 0.0003910295728368018, "loss": 1.1376, "step": 2389 }, { "epoch": 0.16193646195932956, "grad_norm": 2.9686198234558105, "learning_rate": 0.0003910240963855422, "loss": 0.9392, "step": 2390 }, { "epoch": 0.16200421780115354, "grad_norm": 2.9727485179901123, "learning_rate": 0.0003910186199342826, "loss": 1.0688, "step": 2391 }, { "epoch": 0.16207197364297754, "grad_norm": 4.215958595275879, "learning_rate": 0.000391013143483023, "loss": 1.4286, "step": 2392 }, { "epoch": 0.16213972948480151, "grad_norm": 3.9825403690338135, "learning_rate": 0.0003910076670317634, "loss": 1.0671, "step": 2393 }, { "epoch": 0.16220748532662552, "grad_norm": 3.5320558547973633, "learning_rate": 0.00039100219058050385, "loss": 1.0737, "step": 2394 }, { "epoch": 0.1622752411684495, "grad_norm": 3.767899990081787, "learning_rate": 0.0003909967141292443, "loss": 1.2116, "step": 2395 }, { "epoch": 0.1623429970102735, "grad_norm": 2.673516273498535, "learning_rate": 0.0003909912376779847, "loss": 0.7047, "step": 2396 }, { "epoch": 0.16241075285209747, "grad_norm": 2.674363136291504, "learning_rate": 0.0003909857612267251, "loss": 0.8814, "step": 2397 }, { "epoch": 0.16247850869392144, "grad_norm": 3.7027153968811035, "learning_rate": 0.0003909802847754655, "loss": 1.0225, "step": 2398 }, { "epoch": 0.16254626453574544, "grad_norm": 3.7984888553619385, "learning_rate": 0.0003909748083242059, "loss": 0.9809, "step": 2399 }, { "epoch": 0.16261402037756942, "grad_norm": 2.950029134750366, "learning_rate": 0.00039096933187294635, "loss": 0.6951, "step": 2400 }, { "epoch": 0.16268177621939342, "grad_norm": 3.2700440883636475, "learning_rate": 0.0003909638554216868, "loss": 0.943, "step": 2401 }, { "epoch": 0.1627495320612174, "grad_norm": 2.6874001026153564, "learning_rate": 0.0003909583789704272, "loss": 0.6593, "step": 2402 }, { "epoch": 0.1628172879030414, "grad_norm": 4.0272979736328125, "learning_rate": 0.0003909529025191676, "loss": 1.1069, "step": 2403 }, { "epoch": 0.16288504374486537, "grad_norm": 4.092314720153809, "learning_rate": 0.000390947426067908, "loss": 1.1179, "step": 2404 }, { "epoch": 0.16295279958668937, "grad_norm": 5.414695739746094, "learning_rate": 0.00039094194961664845, "loss": 1.0901, "step": 2405 }, { "epoch": 0.16302055542851335, "grad_norm": 3.7071361541748047, "learning_rate": 0.00039093647316538885, "loss": 1.2141, "step": 2406 }, { "epoch": 0.16308831127033735, "grad_norm": 4.393648147583008, "learning_rate": 0.00039093099671412925, "loss": 1.0145, "step": 2407 }, { "epoch": 0.16315606711216132, "grad_norm": 3.34543776512146, "learning_rate": 0.00039092552026286965, "loss": 1.0512, "step": 2408 }, { "epoch": 0.16322382295398533, "grad_norm": 3.7193424701690674, "learning_rate": 0.0003909200438116101, "loss": 1.1034, "step": 2409 }, { "epoch": 0.1632915787958093, "grad_norm": 2.5605249404907227, "learning_rate": 0.0003909145673603505, "loss": 0.6876, "step": 2410 }, { "epoch": 0.1633593346376333, "grad_norm": 4.167562961578369, "learning_rate": 0.00039090909090909096, "loss": 1.2861, "step": 2411 }, { "epoch": 0.16342709047945728, "grad_norm": 3.20143723487854, "learning_rate": 0.00039090361445783136, "loss": 1.006, "step": 2412 }, { "epoch": 0.16349484632128125, "grad_norm": 3.333951234817505, "learning_rate": 0.00039089813800657176, "loss": 1.0943, "step": 2413 }, { "epoch": 0.16356260216310525, "grad_norm": 3.614236831665039, "learning_rate": 0.00039089266155531216, "loss": 0.9313, "step": 2414 }, { "epoch": 0.16363035800492923, "grad_norm": 3.5606918334960938, "learning_rate": 0.00039088718510405256, "loss": 1.2799, "step": 2415 }, { "epoch": 0.16369811384675323, "grad_norm": 3.7573323249816895, "learning_rate": 0.000390881708652793, "loss": 1.2246, "step": 2416 }, { "epoch": 0.1637658696885772, "grad_norm": 3.0211334228515625, "learning_rate": 0.00039087623220153346, "loss": 0.7964, "step": 2417 }, { "epoch": 0.1638336255304012, "grad_norm": 2.8421409130096436, "learning_rate": 0.00039087075575027386, "loss": 0.7261, "step": 2418 }, { "epoch": 0.16390138137222518, "grad_norm": 4.219807147979736, "learning_rate": 0.00039086527929901426, "loss": 1.2506, "step": 2419 }, { "epoch": 0.16396913721404918, "grad_norm": 3.1810686588287354, "learning_rate": 0.00039085980284775466, "loss": 1.1376, "step": 2420 }, { "epoch": 0.16403689305587316, "grad_norm": 3.150115728378296, "learning_rate": 0.00039085432639649506, "loss": 0.8848, "step": 2421 }, { "epoch": 0.16410464889769716, "grad_norm": 2.913374662399292, "learning_rate": 0.0003908488499452355, "loss": 0.8255, "step": 2422 }, { "epoch": 0.16417240473952113, "grad_norm": 3.515133857727051, "learning_rate": 0.0003908433734939759, "loss": 1.0325, "step": 2423 }, { "epoch": 0.16424016058134513, "grad_norm": 4.310667991638184, "learning_rate": 0.0003908378970427163, "loss": 1.0862, "step": 2424 }, { "epoch": 0.1643079164231691, "grad_norm": 6.09130859375, "learning_rate": 0.00039083242059145676, "loss": 0.8767, "step": 2425 }, { "epoch": 0.1643756722649931, "grad_norm": 4.40318489074707, "learning_rate": 0.00039082694414019716, "loss": 1.0716, "step": 2426 }, { "epoch": 0.16444342810681709, "grad_norm": 3.8440544605255127, "learning_rate": 0.0003908214676889376, "loss": 1.1218, "step": 2427 }, { "epoch": 0.16451118394864106, "grad_norm": 4.074769496917725, "learning_rate": 0.000390815991237678, "loss": 1.2436, "step": 2428 }, { "epoch": 0.16457893979046506, "grad_norm": 3.9255714416503906, "learning_rate": 0.0003908105147864184, "loss": 1.0215, "step": 2429 }, { "epoch": 0.16464669563228904, "grad_norm": 3.4815566539764404, "learning_rate": 0.0003908050383351588, "loss": 1.0192, "step": 2430 }, { "epoch": 0.16471445147411304, "grad_norm": 4.408581256866455, "learning_rate": 0.0003907995618838992, "loss": 1.0681, "step": 2431 }, { "epoch": 0.164782207315937, "grad_norm": 3.2579798698425293, "learning_rate": 0.00039079408543263967, "loss": 0.9485, "step": 2432 }, { "epoch": 0.16484996315776101, "grad_norm": 3.922394037246704, "learning_rate": 0.0003907886089813801, "loss": 1.2523, "step": 2433 }, { "epoch": 0.164917718999585, "grad_norm": 2.5622668266296387, "learning_rate": 0.0003907831325301205, "loss": 0.8427, "step": 2434 }, { "epoch": 0.164985474841409, "grad_norm": 3.2815206050872803, "learning_rate": 0.0003907776560788609, "loss": 1.017, "step": 2435 }, { "epoch": 0.16505323068323297, "grad_norm": 4.11004638671875, "learning_rate": 0.0003907721796276013, "loss": 0.8469, "step": 2436 }, { "epoch": 0.16512098652505697, "grad_norm": 3.5815510749816895, "learning_rate": 0.0003907667031763417, "loss": 1.0321, "step": 2437 }, { "epoch": 0.16518874236688094, "grad_norm": 4.084681510925293, "learning_rate": 0.00039076122672508217, "loss": 1.1594, "step": 2438 }, { "epoch": 0.16525649820870494, "grad_norm": 4.138881206512451, "learning_rate": 0.00039075575027382257, "loss": 1.2214, "step": 2439 }, { "epoch": 0.16532425405052892, "grad_norm": 3.9706532955169678, "learning_rate": 0.000390750273822563, "loss": 1.0061, "step": 2440 }, { "epoch": 0.1653920098923529, "grad_norm": 2.5234005451202393, "learning_rate": 0.0003907447973713034, "loss": 0.729, "step": 2441 }, { "epoch": 0.1654597657341769, "grad_norm": 2.7967615127563477, "learning_rate": 0.0003907393209200438, "loss": 0.6525, "step": 2442 }, { "epoch": 0.16552752157600087, "grad_norm": 4.568713188171387, "learning_rate": 0.0003907338444687843, "loss": 0.8736, "step": 2443 }, { "epoch": 0.16559527741782487, "grad_norm": 3.099701166152954, "learning_rate": 0.0003907283680175247, "loss": 0.7611, "step": 2444 }, { "epoch": 0.16566303325964885, "grad_norm": 3.5989129543304443, "learning_rate": 0.0003907228915662651, "loss": 1.0886, "step": 2445 }, { "epoch": 0.16573078910147285, "grad_norm": 3.7296478748321533, "learning_rate": 0.0003907174151150055, "loss": 1.1618, "step": 2446 }, { "epoch": 0.16579854494329682, "grad_norm": 4.262120723724365, "learning_rate": 0.00039071193866374587, "loss": 1.056, "step": 2447 }, { "epoch": 0.16586630078512082, "grad_norm": 3.680967330932617, "learning_rate": 0.0003907064622124863, "loss": 0.9768, "step": 2448 }, { "epoch": 0.1659340566269448, "grad_norm": 2.9749538898468018, "learning_rate": 0.0003907009857612268, "loss": 0.93, "step": 2449 }, { "epoch": 0.1660018124687688, "grad_norm": 4.246497631072998, "learning_rate": 0.0003906955093099672, "loss": 1.1383, "step": 2450 }, { "epoch": 0.16606956831059277, "grad_norm": 3.7326085567474365, "learning_rate": 0.0003906900328587076, "loss": 1.0418, "step": 2451 }, { "epoch": 0.16613732415241678, "grad_norm": 3.5544731616973877, "learning_rate": 0.000390684556407448, "loss": 1.06, "step": 2452 }, { "epoch": 0.16620507999424075, "grad_norm": 3.8475277423858643, "learning_rate": 0.0003906790799561884, "loss": 1.1003, "step": 2453 }, { "epoch": 0.16627283583606475, "grad_norm": 4.518758296966553, "learning_rate": 0.00039067360350492883, "loss": 1.1254, "step": 2454 }, { "epoch": 0.16634059167788873, "grad_norm": 3.011996269226074, "learning_rate": 0.00039066812705366923, "loss": 0.9466, "step": 2455 }, { "epoch": 0.1664083475197127, "grad_norm": 3.706911563873291, "learning_rate": 0.0003906626506024097, "loss": 0.9137, "step": 2456 }, { "epoch": 0.1664761033615367, "grad_norm": 3.9900949001312256, "learning_rate": 0.0003906571741511501, "loss": 1.1883, "step": 2457 }, { "epoch": 0.16654385920336068, "grad_norm": 3.853766679763794, "learning_rate": 0.0003906516976998905, "loss": 0.9743, "step": 2458 }, { "epoch": 0.16661161504518468, "grad_norm": 4.99286413192749, "learning_rate": 0.0003906462212486309, "loss": 0.6058, "step": 2459 }, { "epoch": 0.16667937088700865, "grad_norm": 3.200807809829712, "learning_rate": 0.00039064074479737133, "loss": 0.7169, "step": 2460 }, { "epoch": 0.16674712672883266, "grad_norm": 4.306271553039551, "learning_rate": 0.00039063526834611173, "loss": 0.9279, "step": 2461 }, { "epoch": 0.16681488257065663, "grad_norm": 3.0968384742736816, "learning_rate": 0.00039062979189485213, "loss": 1.0192, "step": 2462 }, { "epoch": 0.16688263841248063, "grad_norm": 4.1242289543151855, "learning_rate": 0.0003906243154435926, "loss": 0.9908, "step": 2463 }, { "epoch": 0.1669503942543046, "grad_norm": 3.7863428592681885, "learning_rate": 0.000390618838992333, "loss": 0.9677, "step": 2464 }, { "epoch": 0.1670181500961286, "grad_norm": 5.367023468017578, "learning_rate": 0.00039061336254107344, "loss": 1.1125, "step": 2465 }, { "epoch": 0.16708590593795258, "grad_norm": 3.8724138736724854, "learning_rate": 0.00039060788608981384, "loss": 1.1829, "step": 2466 }, { "epoch": 0.16715366177977659, "grad_norm": 4.036258220672607, "learning_rate": 0.00039060240963855424, "loss": 1.0198, "step": 2467 }, { "epoch": 0.16722141762160056, "grad_norm": 2.6432082653045654, "learning_rate": 0.00039059693318729464, "loss": 0.6928, "step": 2468 }, { "epoch": 0.16728917346342456, "grad_norm": 4.638551712036133, "learning_rate": 0.00039059145673603503, "loss": 0.892, "step": 2469 }, { "epoch": 0.16735692930524854, "grad_norm": 4.4014387130737305, "learning_rate": 0.0003905859802847755, "loss": 0.9948, "step": 2470 }, { "epoch": 0.1674246851470725, "grad_norm": 6.303602695465088, "learning_rate": 0.00039058050383351594, "loss": 0.9433, "step": 2471 }, { "epoch": 0.1674924409888965, "grad_norm": 2.4351272583007812, "learning_rate": 0.00039057502738225634, "loss": 0.7639, "step": 2472 }, { "epoch": 0.1675601968307205, "grad_norm": 3.6072940826416016, "learning_rate": 0.00039056955093099674, "loss": 1.0302, "step": 2473 }, { "epoch": 0.1676279526725445, "grad_norm": 4.892384052276611, "learning_rate": 0.00039056407447973714, "loss": 0.7655, "step": 2474 }, { "epoch": 0.16769570851436846, "grad_norm": 2.934410572052002, "learning_rate": 0.00039055859802847754, "loss": 0.9017, "step": 2475 }, { "epoch": 0.16776346435619247, "grad_norm": 3.499940872192383, "learning_rate": 0.000390553121577218, "loss": 1.1146, "step": 2476 }, { "epoch": 0.16783122019801644, "grad_norm": 4.725958824157715, "learning_rate": 0.0003905476451259584, "loss": 1.2517, "step": 2477 }, { "epoch": 0.16789897603984044, "grad_norm": 2.8330156803131104, "learning_rate": 0.0003905421686746988, "loss": 0.8041, "step": 2478 }, { "epoch": 0.16796673188166442, "grad_norm": 3.1319074630737305, "learning_rate": 0.00039053669222343924, "loss": 0.9445, "step": 2479 }, { "epoch": 0.16803448772348842, "grad_norm": 3.1208853721618652, "learning_rate": 0.00039053121577217964, "loss": 0.8958, "step": 2480 }, { "epoch": 0.1681022435653124, "grad_norm": 3.205109119415283, "learning_rate": 0.0003905257393209201, "loss": 0.9979, "step": 2481 }, { "epoch": 0.1681699994071364, "grad_norm": 2.689314126968384, "learning_rate": 0.0003905202628696605, "loss": 0.7155, "step": 2482 }, { "epoch": 0.16823775524896037, "grad_norm": 3.5805270671844482, "learning_rate": 0.0003905147864184009, "loss": 0.874, "step": 2483 }, { "epoch": 0.16830551109078437, "grad_norm": 5.0327301025390625, "learning_rate": 0.0003905093099671413, "loss": 1.0809, "step": 2484 }, { "epoch": 0.16837326693260835, "grad_norm": 2.962639093399048, "learning_rate": 0.0003905038335158817, "loss": 0.6237, "step": 2485 }, { "epoch": 0.16844102277443232, "grad_norm": 3.0607898235321045, "learning_rate": 0.00039049835706462215, "loss": 1.0555, "step": 2486 }, { "epoch": 0.16850877861625632, "grad_norm": 3.251007318496704, "learning_rate": 0.0003904928806133626, "loss": 0.86, "step": 2487 }, { "epoch": 0.1685765344580803, "grad_norm": 7.316134929656982, "learning_rate": 0.000390487404162103, "loss": 1.0998, "step": 2488 }, { "epoch": 0.1686442902999043, "grad_norm": 3.2383830547332764, "learning_rate": 0.0003904819277108434, "loss": 0.8463, "step": 2489 }, { "epoch": 0.16871204614172827, "grad_norm": 3.8393495082855225, "learning_rate": 0.0003904764512595838, "loss": 1.0925, "step": 2490 }, { "epoch": 0.16877980198355227, "grad_norm": 4.367223262786865, "learning_rate": 0.0003904709748083242, "loss": 0.9895, "step": 2491 }, { "epoch": 0.16884755782537625, "grad_norm": 3.1378276348114014, "learning_rate": 0.00039046549835706465, "loss": 0.8643, "step": 2492 }, { "epoch": 0.16891531366720025, "grad_norm": 3.3862216472625732, "learning_rate": 0.00039046002190580505, "loss": 0.7248, "step": 2493 }, { "epoch": 0.16898306950902423, "grad_norm": 4.475974082946777, "learning_rate": 0.0003904545454545455, "loss": 0.959, "step": 2494 }, { "epoch": 0.16905082535084823, "grad_norm": 3.610344648361206, "learning_rate": 0.0003904490690032859, "loss": 0.7547, "step": 2495 }, { "epoch": 0.1691185811926722, "grad_norm": 2.978452444076538, "learning_rate": 0.0003904435925520263, "loss": 0.838, "step": 2496 }, { "epoch": 0.1691863370344962, "grad_norm": 4.296289443969727, "learning_rate": 0.0003904381161007667, "loss": 1.1953, "step": 2497 }, { "epoch": 0.16925409287632018, "grad_norm": 2.882884979248047, "learning_rate": 0.00039043263964950715, "loss": 0.7595, "step": 2498 }, { "epoch": 0.16932184871814418, "grad_norm": 5.182488918304443, "learning_rate": 0.00039042716319824755, "loss": 0.9124, "step": 2499 }, { "epoch": 0.16938960455996815, "grad_norm": 3.2569730281829834, "learning_rate": 0.00039042168674698795, "loss": 0.9455, "step": 2500 }, { "epoch": 0.16945736040179213, "grad_norm": 3.6530795097351074, "learning_rate": 0.00039041621029572835, "loss": 1.2466, "step": 2501 }, { "epoch": 0.16952511624361613, "grad_norm": 3.1642515659332275, "learning_rate": 0.0003904107338444688, "loss": 0.9318, "step": 2502 }, { "epoch": 0.1695928720854401, "grad_norm": 5.807140350341797, "learning_rate": 0.00039040525739320926, "loss": 0.9086, "step": 2503 }, { "epoch": 0.1696606279272641, "grad_norm": 3.1718502044677734, "learning_rate": 0.00039039978094194966, "loss": 0.715, "step": 2504 }, { "epoch": 0.16972838376908808, "grad_norm": 3.395629644393921, "learning_rate": 0.00039039430449069006, "loss": 0.8206, "step": 2505 }, { "epoch": 0.16979613961091208, "grad_norm": 3.5122809410095215, "learning_rate": 0.00039038882803943046, "loss": 0.8728, "step": 2506 }, { "epoch": 0.16986389545273606, "grad_norm": 2.7259223461151123, "learning_rate": 0.00039038335158817086, "loss": 0.7943, "step": 2507 }, { "epoch": 0.16993165129456006, "grad_norm": 2.6649601459503174, "learning_rate": 0.0003903778751369113, "loss": 0.8234, "step": 2508 }, { "epoch": 0.16999940713638403, "grad_norm": 3.652932643890381, "learning_rate": 0.0003903723986856517, "loss": 0.929, "step": 2509 }, { "epoch": 0.17006716297820804, "grad_norm": 2.9952287673950195, "learning_rate": 0.00039036692223439216, "loss": 0.8891, "step": 2510 }, { "epoch": 0.170134918820032, "grad_norm": 4.975259304046631, "learning_rate": 0.00039036144578313256, "loss": 0.7084, "step": 2511 }, { "epoch": 0.170202674661856, "grad_norm": 2.84755277633667, "learning_rate": 0.00039035596933187296, "loss": 0.8879, "step": 2512 }, { "epoch": 0.17027043050368, "grad_norm": 4.363710403442383, "learning_rate": 0.00039035049288061336, "loss": 0.9035, "step": 2513 }, { "epoch": 0.170338186345504, "grad_norm": 3.009085178375244, "learning_rate": 0.0003903450164293538, "loss": 0.8504, "step": 2514 }, { "epoch": 0.17040594218732796, "grad_norm": 4.749852657318115, "learning_rate": 0.0003903395399780942, "loss": 1.2194, "step": 2515 }, { "epoch": 0.17047369802915194, "grad_norm": 3.3849310874938965, "learning_rate": 0.0003903340635268346, "loss": 0.9086, "step": 2516 }, { "epoch": 0.17054145387097594, "grad_norm": 3.9758949279785156, "learning_rate": 0.000390328587075575, "loss": 0.9983, "step": 2517 }, { "epoch": 0.17060920971279991, "grad_norm": 3.2182397842407227, "learning_rate": 0.00039032311062431546, "loss": 0.7665, "step": 2518 }, { "epoch": 0.17067696555462392, "grad_norm": 2.9485557079315186, "learning_rate": 0.0003903176341730559, "loss": 0.8612, "step": 2519 }, { "epoch": 0.1707447213964479, "grad_norm": 5.343372344970703, "learning_rate": 0.0003903121577217963, "loss": 1.2397, "step": 2520 }, { "epoch": 0.1708124772382719, "grad_norm": 2.915400743484497, "learning_rate": 0.0003903066812705367, "loss": 0.8531, "step": 2521 }, { "epoch": 0.17088023308009587, "grad_norm": 3.5305919647216797, "learning_rate": 0.0003903012048192771, "loss": 1.0432, "step": 2522 }, { "epoch": 0.17094798892191987, "grad_norm": 3.0573630332946777, "learning_rate": 0.0003902957283680175, "loss": 0.9158, "step": 2523 }, { "epoch": 0.17101574476374384, "grad_norm": 5.164576053619385, "learning_rate": 0.00039029025191675797, "loss": 1.3094, "step": 2524 }, { "epoch": 0.17108350060556785, "grad_norm": 3.91410756111145, "learning_rate": 0.0003902847754654984, "loss": 0.757, "step": 2525 }, { "epoch": 0.17115125644739182, "grad_norm": 4.13603401184082, "learning_rate": 0.0003902792990142388, "loss": 1.0597, "step": 2526 }, { "epoch": 0.17121901228921582, "grad_norm": 4.983597755432129, "learning_rate": 0.0003902738225629792, "loss": 0.8252, "step": 2527 }, { "epoch": 0.1712867681310398, "grad_norm": 3.5605082511901855, "learning_rate": 0.0003902683461117196, "loss": 1.0352, "step": 2528 }, { "epoch": 0.1713545239728638, "grad_norm": 2.942303419113159, "learning_rate": 0.00039026286966046, "loss": 0.6983, "step": 2529 }, { "epoch": 0.17142227981468777, "grad_norm": 4.3727335929870605, "learning_rate": 0.00039025739320920047, "loss": 1.139, "step": 2530 }, { "epoch": 0.17149003565651175, "grad_norm": 3.518662452697754, "learning_rate": 0.00039025191675794087, "loss": 0.7531, "step": 2531 }, { "epoch": 0.17155779149833575, "grad_norm": 3.054861307144165, "learning_rate": 0.00039024644030668127, "loss": 0.9009, "step": 2532 }, { "epoch": 0.17162554734015972, "grad_norm": 2.9179372787475586, "learning_rate": 0.0003902409638554217, "loss": 0.717, "step": 2533 }, { "epoch": 0.17169330318198373, "grad_norm": 3.4097423553466797, "learning_rate": 0.0003902354874041621, "loss": 0.9311, "step": 2534 }, { "epoch": 0.1717610590238077, "grad_norm": 4.934108734130859, "learning_rate": 0.0003902300109529025, "loss": 1.1504, "step": 2535 }, { "epoch": 0.1718288148656317, "grad_norm": 4.038305282592773, "learning_rate": 0.000390224534501643, "loss": 1.1144, "step": 2536 }, { "epoch": 0.17189657070745568, "grad_norm": 3.374476671218872, "learning_rate": 0.0003902190580503834, "loss": 0.7995, "step": 2537 }, { "epoch": 0.17196432654927968, "grad_norm": 3.6881179809570312, "learning_rate": 0.0003902135815991238, "loss": 1.1833, "step": 2538 }, { "epoch": 0.17203208239110365, "grad_norm": 3.08377742767334, "learning_rate": 0.00039020810514786417, "loss": 0.931, "step": 2539 }, { "epoch": 0.17209983823292765, "grad_norm": 3.472093343734741, "learning_rate": 0.0003902026286966046, "loss": 1.1611, "step": 2540 }, { "epoch": 0.17216759407475163, "grad_norm": 4.440534591674805, "learning_rate": 0.0003901971522453451, "loss": 1.1486, "step": 2541 }, { "epoch": 0.17223534991657563, "grad_norm": 11.117719650268555, "learning_rate": 0.0003901916757940855, "loss": 1.2, "step": 2542 }, { "epoch": 0.1723031057583996, "grad_norm": 3.0526633262634277, "learning_rate": 0.0003901861993428259, "loss": 0.9043, "step": 2543 }, { "epoch": 0.1723708616002236, "grad_norm": 2.7024383544921875, "learning_rate": 0.0003901807228915663, "loss": 0.7794, "step": 2544 }, { "epoch": 0.17243861744204758, "grad_norm": 3.114699125289917, "learning_rate": 0.0003901752464403067, "loss": 0.8526, "step": 2545 }, { "epoch": 0.17250637328387156, "grad_norm": 3.6229288578033447, "learning_rate": 0.00039016976998904713, "loss": 1.0887, "step": 2546 }, { "epoch": 0.17257412912569556, "grad_norm": 3.174581527709961, "learning_rate": 0.00039016429353778753, "loss": 0.8004, "step": 2547 }, { "epoch": 0.17264188496751953, "grad_norm": 3.8656678199768066, "learning_rate": 0.00039015881708652793, "loss": 1.0335, "step": 2548 }, { "epoch": 0.17270964080934353, "grad_norm": 3.8717849254608154, "learning_rate": 0.0003901533406352684, "loss": 1.0159, "step": 2549 }, { "epoch": 0.1727773966511675, "grad_norm": 3.6877241134643555, "learning_rate": 0.0003901478641840088, "loss": 1.2281, "step": 2550 }, { "epoch": 0.1728451524929915, "grad_norm": 4.0649542808532715, "learning_rate": 0.0003901423877327492, "loss": 1.054, "step": 2551 }, { "epoch": 0.17291290833481548, "grad_norm": 2.8179779052734375, "learning_rate": 0.00039013691128148963, "loss": 0.8153, "step": 2552 }, { "epoch": 0.1729806641766395, "grad_norm": 3.6458847522735596, "learning_rate": 0.00039013143483023003, "loss": 0.9427, "step": 2553 }, { "epoch": 0.17304842001846346, "grad_norm": 4.747622013092041, "learning_rate": 0.00039012595837897043, "loss": 1.1424, "step": 2554 }, { "epoch": 0.17311617586028746, "grad_norm": 3.742690324783325, "learning_rate": 0.00039012048192771083, "loss": 0.7889, "step": 2555 }, { "epoch": 0.17318393170211144, "grad_norm": 2.595752716064453, "learning_rate": 0.00039011500547645123, "loss": 0.84, "step": 2556 }, { "epoch": 0.17325168754393544, "grad_norm": 3.23945689201355, "learning_rate": 0.00039010952902519174, "loss": 1.0411, "step": 2557 }, { "epoch": 0.17331944338575941, "grad_norm": 3.4730265140533447, "learning_rate": 0.00039010405257393214, "loss": 1.0891, "step": 2558 }, { "epoch": 0.17338719922758342, "grad_norm": 3.289047956466675, "learning_rate": 0.00039009857612267254, "loss": 0.9431, "step": 2559 }, { "epoch": 0.1734549550694074, "grad_norm": 4.19510555267334, "learning_rate": 0.00039009309967141294, "loss": 1.0412, "step": 2560 }, { "epoch": 0.17352271091123136, "grad_norm": 4.516744613647461, "learning_rate": 0.00039008762322015333, "loss": 1.2181, "step": 2561 }, { "epoch": 0.17359046675305537, "grad_norm": 3.5150797367095947, "learning_rate": 0.0003900821467688938, "loss": 1.0449, "step": 2562 }, { "epoch": 0.17365822259487934, "grad_norm": 3.157424211502075, "learning_rate": 0.0003900766703176342, "loss": 0.9585, "step": 2563 }, { "epoch": 0.17372597843670334, "grad_norm": 3.0821104049682617, "learning_rate": 0.00039007119386637464, "loss": 0.9302, "step": 2564 }, { "epoch": 0.17379373427852732, "grad_norm": 4.197528839111328, "learning_rate": 0.00039006571741511504, "loss": 0.9406, "step": 2565 }, { "epoch": 0.17386149012035132, "grad_norm": 3.539961576461792, "learning_rate": 0.00039006024096385544, "loss": 0.9513, "step": 2566 }, { "epoch": 0.1739292459621753, "grad_norm": 4.81611442565918, "learning_rate": 0.00039005476451259584, "loss": 1.1297, "step": 2567 }, { "epoch": 0.1739970018039993, "grad_norm": 3.3031651973724365, "learning_rate": 0.0003900492880613363, "loss": 0.8257, "step": 2568 }, { "epoch": 0.17406475764582327, "grad_norm": 3.876675605773926, "learning_rate": 0.0003900438116100767, "loss": 1.1138, "step": 2569 }, { "epoch": 0.17413251348764727, "grad_norm": 3.430529832839966, "learning_rate": 0.0003900383351588171, "loss": 1.0249, "step": 2570 }, { "epoch": 0.17420026932947125, "grad_norm": 3.6733033657073975, "learning_rate": 0.0003900328587075575, "loss": 0.929, "step": 2571 }, { "epoch": 0.17426802517129525, "grad_norm": 3.7165138721466064, "learning_rate": 0.00039002738225629794, "loss": 0.923, "step": 2572 }, { "epoch": 0.17433578101311922, "grad_norm": 3.363859176635742, "learning_rate": 0.00039002190580503834, "loss": 0.8085, "step": 2573 }, { "epoch": 0.17440353685494323, "grad_norm": 3.9822096824645996, "learning_rate": 0.0003900164293537788, "loss": 1.0058, "step": 2574 }, { "epoch": 0.1744712926967672, "grad_norm": 4.498959541320801, "learning_rate": 0.0003900109529025192, "loss": 0.8737, "step": 2575 }, { "epoch": 0.17453904853859117, "grad_norm": 2.949816942214966, "learning_rate": 0.0003900054764512596, "loss": 0.8397, "step": 2576 }, { "epoch": 0.17460680438041518, "grad_norm": 3.4892256259918213, "learning_rate": 0.00039, "loss": 0.8412, "step": 2577 }, { "epoch": 0.17467456022223915, "grad_norm": 3.707641124725342, "learning_rate": 0.00038999452354874045, "loss": 0.7764, "step": 2578 }, { "epoch": 0.17474231606406315, "grad_norm": 3.977794647216797, "learning_rate": 0.00038998904709748085, "loss": 1.0539, "step": 2579 }, { "epoch": 0.17481007190588713, "grad_norm": 3.536850929260254, "learning_rate": 0.0003899835706462213, "loss": 0.901, "step": 2580 }, { "epoch": 0.17487782774771113, "grad_norm": 4.956271171569824, "learning_rate": 0.0003899780941949617, "loss": 1.0666, "step": 2581 }, { "epoch": 0.1749455835895351, "grad_norm": 4.243458271026611, "learning_rate": 0.0003899726177437021, "loss": 0.8712, "step": 2582 }, { "epoch": 0.1750133394313591, "grad_norm": 4.35375452041626, "learning_rate": 0.0003899671412924425, "loss": 0.9773, "step": 2583 }, { "epoch": 0.17508109527318308, "grad_norm": 3.2398815155029297, "learning_rate": 0.00038996166484118295, "loss": 1.0486, "step": 2584 }, { "epoch": 0.17514885111500708, "grad_norm": 5.159060955047607, "learning_rate": 0.00038995618838992335, "loss": 1.0913, "step": 2585 }, { "epoch": 0.17521660695683106, "grad_norm": 3.0955114364624023, "learning_rate": 0.00038995071193866375, "loss": 0.9304, "step": 2586 }, { "epoch": 0.17528436279865506, "grad_norm": 3.279528856277466, "learning_rate": 0.00038994523548740415, "loss": 0.7707, "step": 2587 }, { "epoch": 0.17535211864047903, "grad_norm": 2.55900502204895, "learning_rate": 0.0003899397590361446, "loss": 0.7498, "step": 2588 }, { "epoch": 0.17541987448230303, "grad_norm": 3.802781105041504, "learning_rate": 0.000389934282584885, "loss": 1.1252, "step": 2589 }, { "epoch": 0.175487630324127, "grad_norm": 2.3951821327209473, "learning_rate": 0.00038992880613362545, "loss": 0.7257, "step": 2590 }, { "epoch": 0.17555538616595098, "grad_norm": 3.6497018337249756, "learning_rate": 0.00038992332968236585, "loss": 0.9651, "step": 2591 }, { "epoch": 0.17562314200777498, "grad_norm": 4.077710151672363, "learning_rate": 0.00038991785323110625, "loss": 1.0033, "step": 2592 }, { "epoch": 0.17569089784959896, "grad_norm": 3.225461959838867, "learning_rate": 0.00038991237677984665, "loss": 1.0121, "step": 2593 }, { "epoch": 0.17575865369142296, "grad_norm": 2.669999122619629, "learning_rate": 0.00038990690032858705, "loss": 0.8283, "step": 2594 }, { "epoch": 0.17582640953324694, "grad_norm": 3.697070837020874, "learning_rate": 0.00038990142387732756, "loss": 1.0134, "step": 2595 }, { "epoch": 0.17589416537507094, "grad_norm": 3.4952752590179443, "learning_rate": 0.00038989594742606796, "loss": 0.8584, "step": 2596 }, { "epoch": 0.1759619212168949, "grad_norm": 2.3571763038635254, "learning_rate": 0.00038989047097480836, "loss": 0.5339, "step": 2597 }, { "epoch": 0.17602967705871891, "grad_norm": 3.5802249908447266, "learning_rate": 0.00038988499452354876, "loss": 1.0767, "step": 2598 }, { "epoch": 0.1760974329005429, "grad_norm": 3.3723952770233154, "learning_rate": 0.00038987951807228916, "loss": 0.922, "step": 2599 }, { "epoch": 0.1761651887423669, "grad_norm": 3.4926297664642334, "learning_rate": 0.0003898740416210296, "loss": 1.0347, "step": 2600 }, { "epoch": 0.17623294458419086, "grad_norm": 2.861858606338501, "learning_rate": 0.00038986856516977, "loss": 0.7524, "step": 2601 }, { "epoch": 0.17630070042601487, "grad_norm": 3.37429141998291, "learning_rate": 0.0003898630887185104, "loss": 0.8898, "step": 2602 }, { "epoch": 0.17636845626783884, "grad_norm": 2.9697158336639404, "learning_rate": 0.00038985761226725086, "loss": 0.9308, "step": 2603 }, { "epoch": 0.17643621210966284, "grad_norm": 2.912766456604004, "learning_rate": 0.00038985213581599126, "loss": 0.8449, "step": 2604 }, { "epoch": 0.17650396795148682, "grad_norm": 2.9587221145629883, "learning_rate": 0.00038984665936473166, "loss": 0.8446, "step": 2605 }, { "epoch": 0.1765717237933108, "grad_norm": 2.7218613624572754, "learning_rate": 0.0003898411829134721, "loss": 0.7121, "step": 2606 }, { "epoch": 0.1766394796351348, "grad_norm": 3.5207486152648926, "learning_rate": 0.0003898357064622125, "loss": 0.9856, "step": 2607 }, { "epoch": 0.17670723547695877, "grad_norm": 3.294046401977539, "learning_rate": 0.0003898302300109529, "loss": 0.9977, "step": 2608 }, { "epoch": 0.17677499131878277, "grad_norm": 3.4100379943847656, "learning_rate": 0.0003898247535596933, "loss": 1.0713, "step": 2609 }, { "epoch": 0.17684274716060674, "grad_norm": 3.8984200954437256, "learning_rate": 0.0003898192771084337, "loss": 1.2285, "step": 2610 }, { "epoch": 0.17691050300243075, "grad_norm": 2.80888032913208, "learning_rate": 0.00038981380065717416, "loss": 0.7347, "step": 2611 }, { "epoch": 0.17697825884425472, "grad_norm": 2.8719544410705566, "learning_rate": 0.0003898083242059146, "loss": 0.7561, "step": 2612 }, { "epoch": 0.17704601468607872, "grad_norm": 4.0782551765441895, "learning_rate": 0.000389802847754655, "loss": 1.0882, "step": 2613 }, { "epoch": 0.1771137705279027, "grad_norm": 3.844801902770996, "learning_rate": 0.0003897973713033954, "loss": 0.8765, "step": 2614 }, { "epoch": 0.1771815263697267, "grad_norm": 4.475718021392822, "learning_rate": 0.0003897918948521358, "loss": 0.9182, "step": 2615 }, { "epoch": 0.17724928221155067, "grad_norm": 3.4106149673461914, "learning_rate": 0.00038978641840087627, "loss": 0.9449, "step": 2616 }, { "epoch": 0.17731703805337468, "grad_norm": 3.4275550842285156, "learning_rate": 0.00038978094194961667, "loss": 0.8974, "step": 2617 }, { "epoch": 0.17738479389519865, "grad_norm": 3.4054598808288574, "learning_rate": 0.00038977546549835707, "loss": 1.0757, "step": 2618 }, { "epoch": 0.17745254973702265, "grad_norm": 3.75724458694458, "learning_rate": 0.0003897699890470975, "loss": 1.12, "step": 2619 }, { "epoch": 0.17752030557884663, "grad_norm": 6.18004846572876, "learning_rate": 0.0003897645125958379, "loss": 1.2614, "step": 2620 }, { "epoch": 0.1775880614206706, "grad_norm": 3.289705276489258, "learning_rate": 0.0003897590361445783, "loss": 1.1055, "step": 2621 }, { "epoch": 0.1776558172624946, "grad_norm": 3.257082939147949, "learning_rate": 0.00038975355969331877, "loss": 0.9413, "step": 2622 }, { "epoch": 0.17772357310431858, "grad_norm": 3.874941349029541, "learning_rate": 0.00038974808324205917, "loss": 0.9413, "step": 2623 }, { "epoch": 0.17779132894614258, "grad_norm": 2.6239776611328125, "learning_rate": 0.00038974260679079957, "loss": 0.8165, "step": 2624 }, { "epoch": 0.17785908478796655, "grad_norm": 2.9095847606658936, "learning_rate": 0.00038973713033953997, "loss": 1.0151, "step": 2625 }, { "epoch": 0.17792684062979056, "grad_norm": 3.940357208251953, "learning_rate": 0.0003897316538882804, "loss": 0.893, "step": 2626 }, { "epoch": 0.17799459647161453, "grad_norm": 4.15380859375, "learning_rate": 0.0003897261774370208, "loss": 0.9203, "step": 2627 }, { "epoch": 0.17806235231343853, "grad_norm": 2.5981051921844482, "learning_rate": 0.0003897207009857613, "loss": 0.7267, "step": 2628 }, { "epoch": 0.1781301081552625, "grad_norm": 3.4140498638153076, "learning_rate": 0.0003897152245345017, "loss": 1.1709, "step": 2629 }, { "epoch": 0.1781978639970865, "grad_norm": 5.037257194519043, "learning_rate": 0.00038970974808324207, "loss": 0.9983, "step": 2630 }, { "epoch": 0.17826561983891048, "grad_norm": 3.5703980922698975, "learning_rate": 0.00038970427163198247, "loss": 0.914, "step": 2631 }, { "epoch": 0.17833337568073448, "grad_norm": 3.9696242809295654, "learning_rate": 0.00038969879518072287, "loss": 0.8462, "step": 2632 }, { "epoch": 0.17840113152255846, "grad_norm": 3.3272550106048584, "learning_rate": 0.0003896933187294633, "loss": 0.9579, "step": 2633 }, { "epoch": 0.17846888736438246, "grad_norm": 2.634965419769287, "learning_rate": 0.0003896878422782038, "loss": 0.8297, "step": 2634 }, { "epoch": 0.17853664320620644, "grad_norm": 4.258874893188477, "learning_rate": 0.0003896823658269442, "loss": 1.1595, "step": 2635 }, { "epoch": 0.1786043990480304, "grad_norm": 4.283937454223633, "learning_rate": 0.0003896768893756846, "loss": 0.8804, "step": 2636 }, { "epoch": 0.1786721548898544, "grad_norm": 3.6384127140045166, "learning_rate": 0.000389671412924425, "loss": 1.2789, "step": 2637 }, { "epoch": 0.1787399107316784, "grad_norm": 5.409982204437256, "learning_rate": 0.00038966593647316543, "loss": 0.9387, "step": 2638 }, { "epoch": 0.1788076665735024, "grad_norm": 5.593501091003418, "learning_rate": 0.00038966046002190583, "loss": 0.9363, "step": 2639 }, { "epoch": 0.17887542241532636, "grad_norm": 4.182191848754883, "learning_rate": 0.00038965498357064623, "loss": 0.8834, "step": 2640 }, { "epoch": 0.17894317825715036, "grad_norm": 3.317460298538208, "learning_rate": 0.0003896495071193866, "loss": 1.0, "step": 2641 }, { "epoch": 0.17901093409897434, "grad_norm": 4.710450649261475, "learning_rate": 0.0003896440306681271, "loss": 0.8489, "step": 2642 }, { "epoch": 0.17907868994079834, "grad_norm": 3.936033010482788, "learning_rate": 0.0003896385542168675, "loss": 0.9309, "step": 2643 }, { "epoch": 0.17914644578262232, "grad_norm": 3.5020408630371094, "learning_rate": 0.00038963307776560793, "loss": 0.9269, "step": 2644 }, { "epoch": 0.17921420162444632, "grad_norm": 7.226351261138916, "learning_rate": 0.00038962760131434833, "loss": 0.7649, "step": 2645 }, { "epoch": 0.1792819574662703, "grad_norm": 2.956892490386963, "learning_rate": 0.00038962212486308873, "loss": 0.6984, "step": 2646 }, { "epoch": 0.1793497133080943, "grad_norm": 4.771556377410889, "learning_rate": 0.00038961664841182913, "loss": 1.158, "step": 2647 }, { "epoch": 0.17941746914991827, "grad_norm": 2.7597439289093018, "learning_rate": 0.00038961117196056953, "loss": 0.7287, "step": 2648 }, { "epoch": 0.17948522499174227, "grad_norm": 2.55896258354187, "learning_rate": 0.00038960569550931, "loss": 0.8061, "step": 2649 }, { "epoch": 0.17955298083356624, "grad_norm": 3.6491754055023193, "learning_rate": 0.00038960021905805044, "loss": 0.9915, "step": 2650 }, { "epoch": 0.17962073667539022, "grad_norm": 2.76542592048645, "learning_rate": 0.00038959474260679084, "loss": 0.8947, "step": 2651 }, { "epoch": 0.17968849251721422, "grad_norm": 4.276484966278076, "learning_rate": 0.00038958926615553124, "loss": 0.9419, "step": 2652 }, { "epoch": 0.1797562483590382, "grad_norm": 2.909879684448242, "learning_rate": 0.00038958378970427163, "loss": 0.6983, "step": 2653 }, { "epoch": 0.1798240042008622, "grad_norm": 2.697392702102661, "learning_rate": 0.0003895783132530121, "loss": 0.6049, "step": 2654 }, { "epoch": 0.17989176004268617, "grad_norm": 2.8968899250030518, "learning_rate": 0.0003895728368017525, "loss": 0.881, "step": 2655 }, { "epoch": 0.17995951588451017, "grad_norm": 3.7650716304779053, "learning_rate": 0.0003895673603504929, "loss": 1.1511, "step": 2656 }, { "epoch": 0.18002727172633415, "grad_norm": 3.2984042167663574, "learning_rate": 0.00038956188389923334, "loss": 0.9816, "step": 2657 }, { "epoch": 0.18009502756815815, "grad_norm": 3.3338100910186768, "learning_rate": 0.00038955640744797374, "loss": 0.8021, "step": 2658 }, { "epoch": 0.18016278340998212, "grad_norm": 3.511577606201172, "learning_rate": 0.00038955093099671414, "loss": 1.0287, "step": 2659 }, { "epoch": 0.18023053925180613, "grad_norm": 3.08599591255188, "learning_rate": 0.0003895454545454546, "loss": 0.8372, "step": 2660 }, { "epoch": 0.1802982950936301, "grad_norm": 4.34332799911499, "learning_rate": 0.000389539978094195, "loss": 1.3726, "step": 2661 }, { "epoch": 0.1803660509354541, "grad_norm": 3.9376471042633057, "learning_rate": 0.0003895345016429354, "loss": 0.893, "step": 2662 }, { "epoch": 0.18043380677727808, "grad_norm": 3.613227367401123, "learning_rate": 0.0003895290251916758, "loss": 1.067, "step": 2663 }, { "epoch": 0.18050156261910208, "grad_norm": 4.009829044342041, "learning_rate": 0.0003895235487404162, "loss": 1.0189, "step": 2664 }, { "epoch": 0.18056931846092605, "grad_norm": 2.7565722465515137, "learning_rate": 0.00038951807228915664, "loss": 0.9221, "step": 2665 }, { "epoch": 0.18063707430275003, "grad_norm": 3.824349880218506, "learning_rate": 0.0003895125958378971, "loss": 1.1327, "step": 2666 }, { "epoch": 0.18070483014457403, "grad_norm": 3.4602041244506836, "learning_rate": 0.0003895071193866375, "loss": 0.9262, "step": 2667 }, { "epoch": 0.180772585986398, "grad_norm": 4.573212146759033, "learning_rate": 0.0003895016429353779, "loss": 1.0516, "step": 2668 }, { "epoch": 0.180840341828222, "grad_norm": 3.433784008026123, "learning_rate": 0.0003894961664841183, "loss": 0.9526, "step": 2669 }, { "epoch": 0.18090809767004598, "grad_norm": 3.506720781326294, "learning_rate": 0.0003894906900328587, "loss": 0.9541, "step": 2670 }, { "epoch": 0.18097585351186998, "grad_norm": 3.599555015563965, "learning_rate": 0.00038948521358159915, "loss": 1.0681, "step": 2671 }, { "epoch": 0.18104360935369396, "grad_norm": 3.4419429302215576, "learning_rate": 0.00038947973713033954, "loss": 1.1236, "step": 2672 }, { "epoch": 0.18111136519551796, "grad_norm": 3.681448221206665, "learning_rate": 0.00038947426067908, "loss": 0.9423, "step": 2673 }, { "epoch": 0.18117912103734193, "grad_norm": 3.2584047317504883, "learning_rate": 0.0003894687842278204, "loss": 0.9144, "step": 2674 }, { "epoch": 0.18124687687916594, "grad_norm": 4.071553707122803, "learning_rate": 0.0003894633077765608, "loss": 1.1134, "step": 2675 }, { "epoch": 0.1813146327209899, "grad_norm": 3.0014286041259766, "learning_rate": 0.00038945783132530125, "loss": 0.9765, "step": 2676 }, { "epoch": 0.1813823885628139, "grad_norm": 3.6688241958618164, "learning_rate": 0.00038945235487404165, "loss": 1.1312, "step": 2677 }, { "epoch": 0.1814501444046379, "grad_norm": 3.805133104324341, "learning_rate": 0.00038944687842278205, "loss": 1.153, "step": 2678 }, { "epoch": 0.1815179002464619, "grad_norm": 7.35417366027832, "learning_rate": 0.00038944140197152245, "loss": 1.0345, "step": 2679 }, { "epoch": 0.18158565608828586, "grad_norm": 3.128613233566284, "learning_rate": 0.00038943592552026285, "loss": 0.9284, "step": 2680 }, { "epoch": 0.18165341193010984, "grad_norm": 2.820746660232544, "learning_rate": 0.0003894304490690033, "loss": 0.7568, "step": 2681 }, { "epoch": 0.18172116777193384, "grad_norm": 4.232748508453369, "learning_rate": 0.00038942497261774375, "loss": 1.1921, "step": 2682 }, { "epoch": 0.1817889236137578, "grad_norm": 3.090925931930542, "learning_rate": 0.00038941949616648415, "loss": 0.9605, "step": 2683 }, { "epoch": 0.18185667945558182, "grad_norm": 3.2545409202575684, "learning_rate": 0.00038941401971522455, "loss": 0.7772, "step": 2684 }, { "epoch": 0.1819244352974058, "grad_norm": 2.8999624252319336, "learning_rate": 0.00038940854326396495, "loss": 0.7736, "step": 2685 }, { "epoch": 0.1819921911392298, "grad_norm": 3.194016218185425, "learning_rate": 0.00038940306681270535, "loss": 0.9554, "step": 2686 }, { "epoch": 0.18205994698105377, "grad_norm": 3.151383399963379, "learning_rate": 0.0003893975903614458, "loss": 1.0019, "step": 2687 }, { "epoch": 0.18212770282287777, "grad_norm": 3.5041158199310303, "learning_rate": 0.0003893921139101862, "loss": 0.9024, "step": 2688 }, { "epoch": 0.18219545866470174, "grad_norm": 3.3110926151275635, "learning_rate": 0.00038938663745892666, "loss": 0.9726, "step": 2689 }, { "epoch": 0.18226321450652574, "grad_norm": 4.077019691467285, "learning_rate": 0.00038938116100766706, "loss": 1.0472, "step": 2690 }, { "epoch": 0.18233097034834972, "grad_norm": 3.9141652584075928, "learning_rate": 0.00038937568455640745, "loss": 1.2568, "step": 2691 }, { "epoch": 0.18239872619017372, "grad_norm": 3.8966596126556396, "learning_rate": 0.0003893702081051479, "loss": 0.9997, "step": 2692 }, { "epoch": 0.1824664820319977, "grad_norm": 3.6254265308380127, "learning_rate": 0.0003893647316538883, "loss": 1.2175, "step": 2693 }, { "epoch": 0.18253423787382167, "grad_norm": 3.1052329540252686, "learning_rate": 0.0003893592552026287, "loss": 1.0351, "step": 2694 }, { "epoch": 0.18260199371564567, "grad_norm": 3.3898918628692627, "learning_rate": 0.0003893537787513691, "loss": 1.1057, "step": 2695 }, { "epoch": 0.18266974955746965, "grad_norm": 2.928323984146118, "learning_rate": 0.00038934830230010956, "loss": 0.8844, "step": 2696 }, { "epoch": 0.18273750539929365, "grad_norm": 2.679624080657959, "learning_rate": 0.00038934282584884996, "loss": 0.85, "step": 2697 }, { "epoch": 0.18280526124111762, "grad_norm": 2.8958041667938232, "learning_rate": 0.0003893373493975904, "loss": 0.8185, "step": 2698 }, { "epoch": 0.18287301708294162, "grad_norm": 3.57362961769104, "learning_rate": 0.0003893318729463308, "loss": 1.1278, "step": 2699 }, { "epoch": 0.1829407729247656, "grad_norm": 4.073363304138184, "learning_rate": 0.0003893263964950712, "loss": 1.1645, "step": 2700 }, { "epoch": 0.1830085287665896, "grad_norm": 3.3202733993530273, "learning_rate": 0.0003893209200438116, "loss": 1.0613, "step": 2701 }, { "epoch": 0.18307628460841358, "grad_norm": 3.0552022457122803, "learning_rate": 0.000389315443592552, "loss": 0.8995, "step": 2702 }, { "epoch": 0.18314404045023758, "grad_norm": 3.6667277812957764, "learning_rate": 0.00038930996714129246, "loss": 1.0548, "step": 2703 }, { "epoch": 0.18321179629206155, "grad_norm": 2.921941041946411, "learning_rate": 0.0003893044906900329, "loss": 0.8421, "step": 2704 }, { "epoch": 0.18327955213388555, "grad_norm": 3.0407462120056152, "learning_rate": 0.0003892990142387733, "loss": 0.8049, "step": 2705 }, { "epoch": 0.18334730797570953, "grad_norm": 3.932305335998535, "learning_rate": 0.0003892935377875137, "loss": 0.8372, "step": 2706 }, { "epoch": 0.18341506381753353, "grad_norm": 4.510745048522949, "learning_rate": 0.0003892880613362541, "loss": 1.4213, "step": 2707 }, { "epoch": 0.1834828196593575, "grad_norm": 3.7805044651031494, "learning_rate": 0.0003892825848849945, "loss": 1.235, "step": 2708 }, { "epoch": 0.18355057550118148, "grad_norm": 3.113142728805542, "learning_rate": 0.00038927710843373497, "loss": 0.9122, "step": 2709 }, { "epoch": 0.18361833134300548, "grad_norm": 3.3371307849884033, "learning_rate": 0.00038927163198247537, "loss": 1.185, "step": 2710 }, { "epoch": 0.18368608718482946, "grad_norm": 3.1075308322906494, "learning_rate": 0.00038926615553121576, "loss": 0.9452, "step": 2711 }, { "epoch": 0.18375384302665346, "grad_norm": 3.0181798934936523, "learning_rate": 0.0003892606790799562, "loss": 0.9451, "step": 2712 }, { "epoch": 0.18382159886847743, "grad_norm": 2.9030022621154785, "learning_rate": 0.0003892552026286966, "loss": 0.6971, "step": 2713 }, { "epoch": 0.18388935471030143, "grad_norm": 3.294637680053711, "learning_rate": 0.00038924972617743707, "loss": 0.8706, "step": 2714 }, { "epoch": 0.1839571105521254, "grad_norm": 3.6005096435546875, "learning_rate": 0.00038924424972617747, "loss": 0.6631, "step": 2715 }, { "epoch": 0.1840248663939494, "grad_norm": 2.861650228500366, "learning_rate": 0.00038923877327491787, "loss": 0.8788, "step": 2716 }, { "epoch": 0.18409262223577338, "grad_norm": 2.8314208984375, "learning_rate": 0.00038923329682365827, "loss": 0.8282, "step": 2717 }, { "epoch": 0.1841603780775974, "grad_norm": 3.6045475006103516, "learning_rate": 0.00038922782037239867, "loss": 0.8872, "step": 2718 }, { "epoch": 0.18422813391942136, "grad_norm": 4.004055500030518, "learning_rate": 0.0003892223439211391, "loss": 1.1149, "step": 2719 }, { "epoch": 0.18429588976124536, "grad_norm": 3.295501708984375, "learning_rate": 0.0003892168674698796, "loss": 1.0293, "step": 2720 }, { "epoch": 0.18436364560306934, "grad_norm": 3.769365072250366, "learning_rate": 0.00038921139101862, "loss": 1.042, "step": 2721 }, { "epoch": 0.18443140144489334, "grad_norm": 3.303382396697998, "learning_rate": 0.00038920591456736037, "loss": 1.0317, "step": 2722 }, { "epoch": 0.1844991572867173, "grad_norm": 5.053825378417969, "learning_rate": 0.00038920043811610077, "loss": 1.2384, "step": 2723 }, { "epoch": 0.1845669131285413, "grad_norm": 3.6436846256256104, "learning_rate": 0.00038919496166484117, "loss": 0.8183, "step": 2724 }, { "epoch": 0.1846346689703653, "grad_norm": 2.6752846240997314, "learning_rate": 0.0003891894852135816, "loss": 0.792, "step": 2725 }, { "epoch": 0.18470242481218926, "grad_norm": 3.5337884426116943, "learning_rate": 0.000389184008762322, "loss": 1.1572, "step": 2726 }, { "epoch": 0.18477018065401327, "grad_norm": 3.323737382888794, "learning_rate": 0.0003891785323110625, "loss": 0.9435, "step": 2727 }, { "epoch": 0.18483793649583724, "grad_norm": 3.610866069793701, "learning_rate": 0.0003891730558598029, "loss": 1.1143, "step": 2728 }, { "epoch": 0.18490569233766124, "grad_norm": 2.9186441898345947, "learning_rate": 0.0003891675794085433, "loss": 0.8383, "step": 2729 }, { "epoch": 0.18497344817948522, "grad_norm": 4.304622173309326, "learning_rate": 0.00038916210295728373, "loss": 1.1049, "step": 2730 }, { "epoch": 0.18504120402130922, "grad_norm": 3.8744423389434814, "learning_rate": 0.00038915662650602413, "loss": 0.9221, "step": 2731 }, { "epoch": 0.1851089598631332, "grad_norm": 3.7568271160125732, "learning_rate": 0.00038915115005476453, "loss": 1.0329, "step": 2732 }, { "epoch": 0.1851767157049572, "grad_norm": 5.976889133453369, "learning_rate": 0.0003891456736035049, "loss": 0.8101, "step": 2733 }, { "epoch": 0.18524447154678117, "grad_norm": 2.8122339248657227, "learning_rate": 0.0003891401971522453, "loss": 0.8191, "step": 2734 }, { "epoch": 0.18531222738860517, "grad_norm": 3.552717685699463, "learning_rate": 0.0003891347207009858, "loss": 1.1971, "step": 2735 }, { "epoch": 0.18537998323042915, "grad_norm": 2.736872911453247, "learning_rate": 0.00038912924424972623, "loss": 0.9329, "step": 2736 }, { "epoch": 0.18544773907225315, "grad_norm": 2.921700954437256, "learning_rate": 0.00038912376779846663, "loss": 0.868, "step": 2737 }, { "epoch": 0.18551549491407712, "grad_norm": 3.485687732696533, "learning_rate": 0.00038911829134720703, "loss": 0.9068, "step": 2738 }, { "epoch": 0.1855832507559011, "grad_norm": 3.2625999450683594, "learning_rate": 0.00038911281489594743, "loss": 0.7244, "step": 2739 }, { "epoch": 0.1856510065977251, "grad_norm": 3.078843355178833, "learning_rate": 0.00038910733844468783, "loss": 0.8292, "step": 2740 }, { "epoch": 0.18571876243954907, "grad_norm": 3.0648934841156006, "learning_rate": 0.0003891018619934283, "loss": 0.8904, "step": 2741 }, { "epoch": 0.18578651828137308, "grad_norm": 4.389811038970947, "learning_rate": 0.0003890963855421687, "loss": 0.8412, "step": 2742 }, { "epoch": 0.18585427412319705, "grad_norm": 3.422853469848633, "learning_rate": 0.00038909090909090914, "loss": 1.0376, "step": 2743 }, { "epoch": 0.18592202996502105, "grad_norm": 3.5476505756378174, "learning_rate": 0.00038908543263964953, "loss": 0.953, "step": 2744 }, { "epoch": 0.18598978580684503, "grad_norm": 3.744685649871826, "learning_rate": 0.00038907995618838993, "loss": 0.7457, "step": 2745 }, { "epoch": 0.18605754164866903, "grad_norm": 3.5987589359283447, "learning_rate": 0.00038907447973713033, "loss": 0.8563, "step": 2746 }, { "epoch": 0.186125297490493, "grad_norm": 3.329843759536743, "learning_rate": 0.0003890690032858708, "loss": 0.9502, "step": 2747 }, { "epoch": 0.186193053332317, "grad_norm": 3.714345693588257, "learning_rate": 0.0003890635268346112, "loss": 1.0397, "step": 2748 }, { "epoch": 0.18626080917414098, "grad_norm": 3.18338680267334, "learning_rate": 0.0003890580503833516, "loss": 0.6911, "step": 2749 }, { "epoch": 0.18632856501596498, "grad_norm": 3.047652006149292, "learning_rate": 0.000389052573932092, "loss": 0.9154, "step": 2750 }, { "epoch": 0.18639632085778896, "grad_norm": 2.831117630004883, "learning_rate": 0.00038904709748083244, "loss": 0.7986, "step": 2751 }, { "epoch": 0.18646407669961296, "grad_norm": 2.417776107788086, "learning_rate": 0.0003890416210295729, "loss": 0.8015, "step": 2752 }, { "epoch": 0.18653183254143693, "grad_norm": 8.800686836242676, "learning_rate": 0.0003890361445783133, "loss": 1.11, "step": 2753 }, { "epoch": 0.1865995883832609, "grad_norm": 3.318408727645874, "learning_rate": 0.0003890306681270537, "loss": 0.8978, "step": 2754 }, { "epoch": 0.1866673442250849, "grad_norm": 4.237911224365234, "learning_rate": 0.0003890251916757941, "loss": 0.9389, "step": 2755 }, { "epoch": 0.18673510006690888, "grad_norm": 3.7788336277008057, "learning_rate": 0.0003890197152245345, "loss": 1.0492, "step": 2756 }, { "epoch": 0.18680285590873288, "grad_norm": 3.272289752960205, "learning_rate": 0.00038901423877327494, "loss": 0.8189, "step": 2757 }, { "epoch": 0.18687061175055686, "grad_norm": 3.770719528198242, "learning_rate": 0.0003890087623220154, "loss": 1.0237, "step": 2758 }, { "epoch": 0.18693836759238086, "grad_norm": 3.5051212310791016, "learning_rate": 0.0003890032858707558, "loss": 0.8965, "step": 2759 }, { "epoch": 0.18700612343420484, "grad_norm": 3.461427688598633, "learning_rate": 0.0003889978094194962, "loss": 0.9546, "step": 2760 }, { "epoch": 0.18707387927602884, "grad_norm": 3.920598268508911, "learning_rate": 0.0003889923329682366, "loss": 0.8705, "step": 2761 }, { "epoch": 0.1871416351178528, "grad_norm": 3.849118232727051, "learning_rate": 0.000388986856516977, "loss": 1.0791, "step": 2762 }, { "epoch": 0.1872093909596768, "grad_norm": 3.188839912414551, "learning_rate": 0.00038898138006571745, "loss": 0.8578, "step": 2763 }, { "epoch": 0.1872771468015008, "grad_norm": 3.7328360080718994, "learning_rate": 0.00038897590361445784, "loss": 1.1852, "step": 2764 }, { "epoch": 0.1873449026433248, "grad_norm": 4.0043416023254395, "learning_rate": 0.00038897042716319824, "loss": 0.8806, "step": 2765 }, { "epoch": 0.18741265848514876, "grad_norm": 2.7096457481384277, "learning_rate": 0.0003889649507119387, "loss": 0.8455, "step": 2766 }, { "epoch": 0.18748041432697277, "grad_norm": 3.324594020843506, "learning_rate": 0.0003889594742606791, "loss": 1.0915, "step": 2767 }, { "epoch": 0.18754817016879674, "grad_norm": 3.4153079986572266, "learning_rate": 0.00038895399780941955, "loss": 1.2219, "step": 2768 }, { "epoch": 0.18761592601062071, "grad_norm": 3.73614764213562, "learning_rate": 0.00038894852135815995, "loss": 0.8139, "step": 2769 }, { "epoch": 0.18768368185244472, "grad_norm": 3.6013848781585693, "learning_rate": 0.00038894304490690035, "loss": 1.1202, "step": 2770 }, { "epoch": 0.1877514376942687, "grad_norm": 3.0717055797576904, "learning_rate": 0.00038893756845564075, "loss": 0.9865, "step": 2771 }, { "epoch": 0.1878191935360927, "grad_norm": 4.379116058349609, "learning_rate": 0.00038893209200438115, "loss": 1.0945, "step": 2772 }, { "epoch": 0.18788694937791667, "grad_norm": 2.782845973968506, "learning_rate": 0.0003889266155531216, "loss": 0.7597, "step": 2773 }, { "epoch": 0.18795470521974067, "grad_norm": 3.087285041809082, "learning_rate": 0.00038892113910186205, "loss": 0.8965, "step": 2774 }, { "epoch": 0.18802246106156464, "grad_norm": 3.3503403663635254, "learning_rate": 0.00038891566265060245, "loss": 0.7468, "step": 2775 }, { "epoch": 0.18809021690338865, "grad_norm": 3.1630403995513916, "learning_rate": 0.00038891018619934285, "loss": 0.9173, "step": 2776 }, { "epoch": 0.18815797274521262, "grad_norm": 3.3000705242156982, "learning_rate": 0.00038890470974808325, "loss": 0.823, "step": 2777 }, { "epoch": 0.18822572858703662, "grad_norm": 3.7373993396759033, "learning_rate": 0.00038889923329682365, "loss": 1.2386, "step": 2778 }, { "epoch": 0.1882934844288606, "grad_norm": 4.010168075561523, "learning_rate": 0.0003888937568455641, "loss": 1.0299, "step": 2779 }, { "epoch": 0.1883612402706846, "grad_norm": 3.214586019515991, "learning_rate": 0.0003888882803943045, "loss": 0.8643, "step": 2780 }, { "epoch": 0.18842899611250857, "grad_norm": 3.3871943950653076, "learning_rate": 0.0003888828039430449, "loss": 0.7314, "step": 2781 }, { "epoch": 0.18849675195433258, "grad_norm": 4.106551170349121, "learning_rate": 0.00038887732749178536, "loss": 0.9923, "step": 2782 }, { "epoch": 0.18856450779615655, "grad_norm": 2.8578834533691406, "learning_rate": 0.00038887185104052575, "loss": 0.7854, "step": 2783 }, { "epoch": 0.18863226363798052, "grad_norm": 5.243823528289795, "learning_rate": 0.00038886637458926615, "loss": 0.9607, "step": 2784 }, { "epoch": 0.18870001947980453, "grad_norm": 2.833411455154419, "learning_rate": 0.0003888608981380066, "loss": 0.7695, "step": 2785 }, { "epoch": 0.1887677753216285, "grad_norm": 2.9352073669433594, "learning_rate": 0.000388855421686747, "loss": 0.7108, "step": 2786 }, { "epoch": 0.1888355311634525, "grad_norm": 3.3581740856170654, "learning_rate": 0.0003888499452354874, "loss": 0.8294, "step": 2787 }, { "epoch": 0.18890328700527648, "grad_norm": 5.8704962730407715, "learning_rate": 0.0003888444687842278, "loss": 1.4642, "step": 2788 }, { "epoch": 0.18897104284710048, "grad_norm": 2.691150188446045, "learning_rate": 0.00038883899233296826, "loss": 0.6854, "step": 2789 }, { "epoch": 0.18903879868892445, "grad_norm": 4.402223110198975, "learning_rate": 0.0003888335158817087, "loss": 1.0299, "step": 2790 }, { "epoch": 0.18910655453074846, "grad_norm": 3.609065055847168, "learning_rate": 0.0003888280394304491, "loss": 0.951, "step": 2791 }, { "epoch": 0.18917431037257243, "grad_norm": 3.6020936965942383, "learning_rate": 0.0003888225629791895, "loss": 0.8829, "step": 2792 }, { "epoch": 0.18924206621439643, "grad_norm": 3.3175535202026367, "learning_rate": 0.0003888170865279299, "loss": 0.9027, "step": 2793 }, { "epoch": 0.1893098220562204, "grad_norm": 3.529503107070923, "learning_rate": 0.0003888116100766703, "loss": 0.8035, "step": 2794 }, { "epoch": 0.1893775778980444, "grad_norm": 3.649784564971924, "learning_rate": 0.00038880613362541076, "loss": 1.1049, "step": 2795 }, { "epoch": 0.18944533373986838, "grad_norm": 2.818162441253662, "learning_rate": 0.00038880065717415116, "loss": 0.8623, "step": 2796 }, { "epoch": 0.18951308958169238, "grad_norm": 3.7919747829437256, "learning_rate": 0.0003887951807228916, "loss": 1.0414, "step": 2797 }, { "epoch": 0.18958084542351636, "grad_norm": 3.144446611404419, "learning_rate": 0.000388789704271632, "loss": 0.9603, "step": 2798 }, { "epoch": 0.18964860126534033, "grad_norm": 4.011592864990234, "learning_rate": 0.0003887842278203724, "loss": 1.2122, "step": 2799 }, { "epoch": 0.18971635710716434, "grad_norm": 5.217103481292725, "learning_rate": 0.0003887787513691128, "loss": 0.8854, "step": 2800 }, { "epoch": 0.1897841129489883, "grad_norm": 4.114294052124023, "learning_rate": 0.00038877327491785327, "loss": 0.9165, "step": 2801 }, { "epoch": 0.1898518687908123, "grad_norm": 3.6854746341705322, "learning_rate": 0.00038876779846659367, "loss": 1.2091, "step": 2802 }, { "epoch": 0.18991962463263629, "grad_norm": 3.5141451358795166, "learning_rate": 0.00038876232201533406, "loss": 0.9723, "step": 2803 }, { "epoch": 0.1899873804744603, "grad_norm": 3.343451499938965, "learning_rate": 0.00038875684556407446, "loss": 0.9458, "step": 2804 }, { "epoch": 0.19005513631628426, "grad_norm": 3.792968511581421, "learning_rate": 0.0003887513691128149, "loss": 0.8935, "step": 2805 }, { "epoch": 0.19012289215810826, "grad_norm": 3.1623423099517822, "learning_rate": 0.00038874589266155537, "loss": 1.0347, "step": 2806 }, { "epoch": 0.19019064799993224, "grad_norm": 3.3526084423065186, "learning_rate": 0.00038874041621029577, "loss": 1.0338, "step": 2807 }, { "epoch": 0.19025840384175624, "grad_norm": 3.289684772491455, "learning_rate": 0.00038873493975903617, "loss": 0.961, "step": 2808 }, { "epoch": 0.19032615968358021, "grad_norm": 3.482077121734619, "learning_rate": 0.00038872946330777657, "loss": 1.014, "step": 2809 }, { "epoch": 0.19039391552540422, "grad_norm": 2.5455565452575684, "learning_rate": 0.00038872398685651697, "loss": 0.7763, "step": 2810 }, { "epoch": 0.1904616713672282, "grad_norm": 3.986311435699463, "learning_rate": 0.0003887185104052574, "loss": 1.0587, "step": 2811 }, { "epoch": 0.1905294272090522, "grad_norm": 3.5373101234436035, "learning_rate": 0.0003887130339539978, "loss": 1.0648, "step": 2812 }, { "epoch": 0.19059718305087617, "grad_norm": 3.479529619216919, "learning_rate": 0.0003887075575027383, "loss": 0.9923, "step": 2813 }, { "epoch": 0.19066493889270014, "grad_norm": 3.5464696884155273, "learning_rate": 0.00038870208105147867, "loss": 1.0059, "step": 2814 }, { "epoch": 0.19073269473452414, "grad_norm": 3.3405473232269287, "learning_rate": 0.00038869660460021907, "loss": 0.8901, "step": 2815 }, { "epoch": 0.19080045057634812, "grad_norm": 4.152277946472168, "learning_rate": 0.00038869112814895947, "loss": 1.0844, "step": 2816 }, { "epoch": 0.19086820641817212, "grad_norm": 3.700972557067871, "learning_rate": 0.0003886856516976999, "loss": 0.9723, "step": 2817 }, { "epoch": 0.1909359622599961, "grad_norm": 3.3416385650634766, "learning_rate": 0.0003886801752464403, "loss": 0.8549, "step": 2818 }, { "epoch": 0.1910037181018201, "grad_norm": 4.603366851806641, "learning_rate": 0.0003886746987951807, "loss": 1.0604, "step": 2819 }, { "epoch": 0.19107147394364407, "grad_norm": 2.6596109867095947, "learning_rate": 0.0003886692223439211, "loss": 0.6537, "step": 2820 }, { "epoch": 0.19113922978546807, "grad_norm": 3.3411309719085693, "learning_rate": 0.0003886637458926616, "loss": 0.9467, "step": 2821 }, { "epoch": 0.19120698562729205, "grad_norm": 3.437974691390991, "learning_rate": 0.000388658269441402, "loss": 0.9295, "step": 2822 }, { "epoch": 0.19127474146911605, "grad_norm": 3.7563765048980713, "learning_rate": 0.00038865279299014243, "loss": 1.0671, "step": 2823 }, { "epoch": 0.19134249731094002, "grad_norm": 3.3614068031311035, "learning_rate": 0.00038864731653888283, "loss": 0.8864, "step": 2824 }, { "epoch": 0.19141025315276403, "grad_norm": 3.8564016819000244, "learning_rate": 0.0003886418400876232, "loss": 1.2477, "step": 2825 }, { "epoch": 0.191478008994588, "grad_norm": 4.71919059753418, "learning_rate": 0.0003886363636363636, "loss": 0.9753, "step": 2826 }, { "epoch": 0.191545764836412, "grad_norm": 3.0620319843292236, "learning_rate": 0.0003886308871851041, "loss": 0.7808, "step": 2827 }, { "epoch": 0.19161352067823598, "grad_norm": 3.8438165187835693, "learning_rate": 0.00038862541073384453, "loss": 1.0785, "step": 2828 }, { "epoch": 0.19168127652005995, "grad_norm": 4.259598255157471, "learning_rate": 0.00038861993428258493, "loss": 1.1103, "step": 2829 }, { "epoch": 0.19174903236188395, "grad_norm": 3.251192808151245, "learning_rate": 0.00038861445783132533, "loss": 0.9754, "step": 2830 }, { "epoch": 0.19181678820370793, "grad_norm": 2.5088682174682617, "learning_rate": 0.00038860898138006573, "loss": 0.6504, "step": 2831 }, { "epoch": 0.19188454404553193, "grad_norm": 3.704563856124878, "learning_rate": 0.00038860350492880613, "loss": 0.9141, "step": 2832 }, { "epoch": 0.1919522998873559, "grad_norm": 4.012651443481445, "learning_rate": 0.0003885980284775466, "loss": 0.8808, "step": 2833 }, { "epoch": 0.1920200557291799, "grad_norm": 3.077089786529541, "learning_rate": 0.000388592552026287, "loss": 1.0051, "step": 2834 }, { "epoch": 0.19208781157100388, "grad_norm": 3.3188600540161133, "learning_rate": 0.0003885870755750274, "loss": 0.8485, "step": 2835 }, { "epoch": 0.19215556741282788, "grad_norm": 3.9176952838897705, "learning_rate": 0.00038858159912376783, "loss": 0.9251, "step": 2836 }, { "epoch": 0.19222332325465186, "grad_norm": 3.9131100177764893, "learning_rate": 0.00038857612267250823, "loss": 1.3158, "step": 2837 }, { "epoch": 0.19229107909647586, "grad_norm": 3.2665579319000244, "learning_rate": 0.00038857064622124863, "loss": 0.9732, "step": 2838 }, { "epoch": 0.19235883493829983, "grad_norm": 3.4108076095581055, "learning_rate": 0.0003885651697699891, "loss": 0.9345, "step": 2839 }, { "epoch": 0.19242659078012384, "grad_norm": 2.991594076156616, "learning_rate": 0.0003885596933187295, "loss": 0.8311, "step": 2840 }, { "epoch": 0.1924943466219478, "grad_norm": 3.9856655597686768, "learning_rate": 0.0003885542168674699, "loss": 1.0621, "step": 2841 }, { "epoch": 0.1925621024637718, "grad_norm": 2.8245956897735596, "learning_rate": 0.0003885487404162103, "loss": 0.9845, "step": 2842 }, { "epoch": 0.19262985830559579, "grad_norm": 4.349861145019531, "learning_rate": 0.0003885432639649507, "loss": 1.007, "step": 2843 }, { "epoch": 0.19269761414741976, "grad_norm": 2.9254069328308105, "learning_rate": 0.0003885377875136912, "loss": 0.9285, "step": 2844 }, { "epoch": 0.19276536998924376, "grad_norm": 3.744016647338867, "learning_rate": 0.0003885323110624316, "loss": 1.2372, "step": 2845 }, { "epoch": 0.19283312583106774, "grad_norm": 3.735994338989258, "learning_rate": 0.000388526834611172, "loss": 0.7919, "step": 2846 }, { "epoch": 0.19290088167289174, "grad_norm": 3.766037940979004, "learning_rate": 0.0003885213581599124, "loss": 1.1431, "step": 2847 }, { "epoch": 0.1929686375147157, "grad_norm": 2.934973955154419, "learning_rate": 0.0003885158817086528, "loss": 0.8543, "step": 2848 }, { "epoch": 0.19303639335653971, "grad_norm": 3.1117990016937256, "learning_rate": 0.00038851040525739324, "loss": 0.9102, "step": 2849 }, { "epoch": 0.1931041491983637, "grad_norm": 3.7863378524780273, "learning_rate": 0.00038850492880613364, "loss": 0.8891, "step": 2850 }, { "epoch": 0.1931719050401877, "grad_norm": 3.22411847114563, "learning_rate": 0.00038849945235487404, "loss": 0.8789, "step": 2851 }, { "epoch": 0.19323966088201167, "grad_norm": 3.320915937423706, "learning_rate": 0.0003884939759036145, "loss": 0.9193, "step": 2852 }, { "epoch": 0.19330741672383567, "grad_norm": 3.0854501724243164, "learning_rate": 0.0003884884994523549, "loss": 0.8459, "step": 2853 }, { "epoch": 0.19337517256565964, "grad_norm": 3.4609007835388184, "learning_rate": 0.0003884830230010953, "loss": 0.8557, "step": 2854 }, { "epoch": 0.19344292840748364, "grad_norm": 3.226769208908081, "learning_rate": 0.00038847754654983575, "loss": 0.7877, "step": 2855 }, { "epoch": 0.19351068424930762, "grad_norm": 3.452515125274658, "learning_rate": 0.00038847207009857614, "loss": 0.8445, "step": 2856 }, { "epoch": 0.19357844009113162, "grad_norm": 2.7265677452087402, "learning_rate": 0.00038846659364731654, "loss": 0.8042, "step": 2857 }, { "epoch": 0.1936461959329556, "grad_norm": 3.52346134185791, "learning_rate": 0.00038846111719605694, "loss": 0.8998, "step": 2858 }, { "epoch": 0.19371395177477957, "grad_norm": 3.444248914718628, "learning_rate": 0.0003884556407447974, "loss": 0.9179, "step": 2859 }, { "epoch": 0.19378170761660357, "grad_norm": 3.4823927879333496, "learning_rate": 0.0003884501642935378, "loss": 1.0943, "step": 2860 }, { "epoch": 0.19384946345842755, "grad_norm": 3.679563283920288, "learning_rate": 0.00038844468784227825, "loss": 1.0095, "step": 2861 }, { "epoch": 0.19391721930025155, "grad_norm": 2.840827465057373, "learning_rate": 0.00038843921139101865, "loss": 0.8466, "step": 2862 }, { "epoch": 0.19398497514207552, "grad_norm": 2.8723158836364746, "learning_rate": 0.00038843373493975905, "loss": 0.9074, "step": 2863 }, { "epoch": 0.19405273098389952, "grad_norm": 3.3672428131103516, "learning_rate": 0.00038842825848849945, "loss": 0.7702, "step": 2864 }, { "epoch": 0.1941204868257235, "grad_norm": 3.102827787399292, "learning_rate": 0.0003884227820372399, "loss": 0.8358, "step": 2865 }, { "epoch": 0.1941882426675475, "grad_norm": 3.773686170578003, "learning_rate": 0.0003884173055859803, "loss": 0.9157, "step": 2866 }, { "epoch": 0.19425599850937147, "grad_norm": 3.287602663040161, "learning_rate": 0.00038841182913472075, "loss": 0.9637, "step": 2867 }, { "epoch": 0.19432375435119548, "grad_norm": 3.0248587131500244, "learning_rate": 0.00038840635268346115, "loss": 0.7526, "step": 2868 }, { "epoch": 0.19439151019301945, "grad_norm": 3.0648927688598633, "learning_rate": 0.00038840087623220155, "loss": 0.7493, "step": 2869 }, { "epoch": 0.19445926603484345, "grad_norm": 3.211517572402954, "learning_rate": 0.00038839539978094195, "loss": 0.8731, "step": 2870 }, { "epoch": 0.19452702187666743, "grad_norm": 2.5447211265563965, "learning_rate": 0.0003883899233296824, "loss": 0.7819, "step": 2871 }, { "epoch": 0.19459477771849143, "grad_norm": 3.0504274368286133, "learning_rate": 0.0003883844468784228, "loss": 0.8631, "step": 2872 }, { "epoch": 0.1946625335603154, "grad_norm": 6.16290283203125, "learning_rate": 0.0003883789704271632, "loss": 1.1057, "step": 2873 }, { "epoch": 0.19473028940213938, "grad_norm": 3.770357131958008, "learning_rate": 0.0003883734939759036, "loss": 0.9779, "step": 2874 }, { "epoch": 0.19479804524396338, "grad_norm": 4.432990550994873, "learning_rate": 0.00038836801752464405, "loss": 1.088, "step": 2875 }, { "epoch": 0.19486580108578735, "grad_norm": 3.4904263019561768, "learning_rate": 0.00038836254107338445, "loss": 1.1733, "step": 2876 }, { "epoch": 0.19493355692761136, "grad_norm": 4.487160682678223, "learning_rate": 0.0003883570646221249, "loss": 0.8655, "step": 2877 }, { "epoch": 0.19500131276943533, "grad_norm": 3.2880842685699463, "learning_rate": 0.0003883515881708653, "loss": 0.7608, "step": 2878 }, { "epoch": 0.19506906861125933, "grad_norm": 3.5662264823913574, "learning_rate": 0.0003883461117196057, "loss": 0.7713, "step": 2879 }, { "epoch": 0.1951368244530833, "grad_norm": 3.6133651733398438, "learning_rate": 0.0003883406352683461, "loss": 1.1589, "step": 2880 }, { "epoch": 0.1952045802949073, "grad_norm": 3.9296798706054688, "learning_rate": 0.0003883351588170865, "loss": 0.8892, "step": 2881 }, { "epoch": 0.19527233613673128, "grad_norm": 3.160841464996338, "learning_rate": 0.00038832968236582696, "loss": 1.0361, "step": 2882 }, { "epoch": 0.19534009197855529, "grad_norm": 2.5656821727752686, "learning_rate": 0.0003883242059145674, "loss": 0.7439, "step": 2883 }, { "epoch": 0.19540784782037926, "grad_norm": 4.165538311004639, "learning_rate": 0.0003883187294633078, "loss": 1.2375, "step": 2884 }, { "epoch": 0.19547560366220326, "grad_norm": 3.054222583770752, "learning_rate": 0.0003883132530120482, "loss": 0.9817, "step": 2885 }, { "epoch": 0.19554335950402724, "grad_norm": 3.955233335494995, "learning_rate": 0.0003883077765607886, "loss": 0.8546, "step": 2886 }, { "epoch": 0.19561111534585124, "grad_norm": 3.7192070484161377, "learning_rate": 0.00038830230010952906, "loss": 1.0685, "step": 2887 }, { "epoch": 0.1956788711876752, "grad_norm": 2.6796040534973145, "learning_rate": 0.00038829682365826946, "loss": 0.6622, "step": 2888 }, { "epoch": 0.1957466270294992, "grad_norm": 2.8797216415405273, "learning_rate": 0.00038829134720700986, "loss": 0.9228, "step": 2889 }, { "epoch": 0.1958143828713232, "grad_norm": 2.3468785285949707, "learning_rate": 0.0003882858707557503, "loss": 0.7853, "step": 2890 }, { "epoch": 0.19588213871314716, "grad_norm": 2.93365478515625, "learning_rate": 0.0003882803943044907, "loss": 0.7724, "step": 2891 }, { "epoch": 0.19594989455497117, "grad_norm": 2.797774076461792, "learning_rate": 0.0003882749178532311, "loss": 0.7918, "step": 2892 }, { "epoch": 0.19601765039679514, "grad_norm": 3.337390661239624, "learning_rate": 0.00038826944140197157, "loss": 1.1268, "step": 2893 }, { "epoch": 0.19608540623861914, "grad_norm": 3.9615604877471924, "learning_rate": 0.00038826396495071196, "loss": 1.0514, "step": 2894 }, { "epoch": 0.19615316208044312, "grad_norm": 3.5322580337524414, "learning_rate": 0.00038825848849945236, "loss": 0.9163, "step": 2895 }, { "epoch": 0.19622091792226712, "grad_norm": 3.612990140914917, "learning_rate": 0.00038825301204819276, "loss": 0.9093, "step": 2896 }, { "epoch": 0.1962886737640911, "grad_norm": 3.0081143379211426, "learning_rate": 0.00038824753559693316, "loss": 0.9885, "step": 2897 }, { "epoch": 0.1963564296059151, "grad_norm": 3.5394842624664307, "learning_rate": 0.0003882420591456736, "loss": 1.0406, "step": 2898 }, { "epoch": 0.19642418544773907, "grad_norm": 3.30678129196167, "learning_rate": 0.00038823658269441407, "loss": 1.0223, "step": 2899 }, { "epoch": 0.19649194128956307, "grad_norm": 3.34920072555542, "learning_rate": 0.00038823110624315447, "loss": 0.7228, "step": 2900 }, { "epoch": 0.19655969713138705, "grad_norm": 3.10904860496521, "learning_rate": 0.00038822562979189487, "loss": 0.7764, "step": 2901 }, { "epoch": 0.19662745297321105, "grad_norm": 3.1832408905029297, "learning_rate": 0.00038822015334063527, "loss": 0.8856, "step": 2902 }, { "epoch": 0.19669520881503502, "grad_norm": 3.5309600830078125, "learning_rate": 0.0003882146768893757, "loss": 1.1269, "step": 2903 }, { "epoch": 0.196762964656859, "grad_norm": 2.922008514404297, "learning_rate": 0.0003882092004381161, "loss": 0.7044, "step": 2904 }, { "epoch": 0.196830720498683, "grad_norm": 2.6173906326293945, "learning_rate": 0.0003882037239868565, "loss": 0.8005, "step": 2905 }, { "epoch": 0.19689847634050697, "grad_norm": 3.036520004272461, "learning_rate": 0.00038819824753559697, "loss": 0.7915, "step": 2906 }, { "epoch": 0.19696623218233097, "grad_norm": 3.1924149990081787, "learning_rate": 0.00038819277108433737, "loss": 0.9731, "step": 2907 }, { "epoch": 0.19703398802415495, "grad_norm": 3.872278928756714, "learning_rate": 0.00038818729463307777, "loss": 1.0911, "step": 2908 }, { "epoch": 0.19710174386597895, "grad_norm": 2.464524984359741, "learning_rate": 0.0003881818181818182, "loss": 0.713, "step": 2909 }, { "epoch": 0.19716949970780293, "grad_norm": 3.9767656326293945, "learning_rate": 0.0003881763417305586, "loss": 1.0382, "step": 2910 }, { "epoch": 0.19723725554962693, "grad_norm": 5.209201812744141, "learning_rate": 0.000388170865279299, "loss": 1.1775, "step": 2911 }, { "epoch": 0.1973050113914509, "grad_norm": 2.733712673187256, "learning_rate": 0.0003881653888280394, "loss": 0.7543, "step": 2912 }, { "epoch": 0.1973727672332749, "grad_norm": 3.7703781127929688, "learning_rate": 0.0003881599123767798, "loss": 1.0978, "step": 2913 }, { "epoch": 0.19744052307509888, "grad_norm": 3.3680343627929688, "learning_rate": 0.0003881544359255203, "loss": 1.0289, "step": 2914 }, { "epoch": 0.19750827891692288, "grad_norm": 2.3637564182281494, "learning_rate": 0.00038814895947426073, "loss": 0.6622, "step": 2915 }, { "epoch": 0.19757603475874685, "grad_norm": 3.969456195831299, "learning_rate": 0.00038814348302300113, "loss": 0.9732, "step": 2916 }, { "epoch": 0.19764379060057086, "grad_norm": 2.699615240097046, "learning_rate": 0.0003881380065717415, "loss": 0.9226, "step": 2917 }, { "epoch": 0.19771154644239483, "grad_norm": 4.2555952072143555, "learning_rate": 0.0003881325301204819, "loss": 1.0636, "step": 2918 }, { "epoch": 0.1977793022842188, "grad_norm": 3.04146146774292, "learning_rate": 0.0003881270536692223, "loss": 1.0585, "step": 2919 }, { "epoch": 0.1978470581260428, "grad_norm": 2.6904592514038086, "learning_rate": 0.0003881215772179628, "loss": 0.817, "step": 2920 }, { "epoch": 0.19791481396786678, "grad_norm": 2.9807913303375244, "learning_rate": 0.00038811610076670323, "loss": 1.0955, "step": 2921 }, { "epoch": 0.19798256980969078, "grad_norm": 3.082659959793091, "learning_rate": 0.00038811062431544363, "loss": 0.8816, "step": 2922 }, { "epoch": 0.19805032565151476, "grad_norm": 3.4324140548706055, "learning_rate": 0.00038810514786418403, "loss": 0.9182, "step": 2923 }, { "epoch": 0.19811808149333876, "grad_norm": 3.4310765266418457, "learning_rate": 0.00038809967141292443, "loss": 1.0241, "step": 2924 }, { "epoch": 0.19818583733516273, "grad_norm": 3.3917229175567627, "learning_rate": 0.0003880941949616649, "loss": 0.7229, "step": 2925 }, { "epoch": 0.19825359317698674, "grad_norm": 3.621485471725464, "learning_rate": 0.0003880887185104053, "loss": 1.0176, "step": 2926 }, { "epoch": 0.1983213490188107, "grad_norm": 3.299516201019287, "learning_rate": 0.0003880832420591457, "loss": 0.8587, "step": 2927 }, { "epoch": 0.1983891048606347, "grad_norm": 3.3287224769592285, "learning_rate": 0.0003880777656078861, "loss": 0.9476, "step": 2928 }, { "epoch": 0.1984568607024587, "grad_norm": 7.810877799987793, "learning_rate": 0.00038807228915662653, "loss": 1.2695, "step": 2929 }, { "epoch": 0.1985246165442827, "grad_norm": 3.242846727371216, "learning_rate": 0.00038806681270536693, "loss": 1.0626, "step": 2930 }, { "epoch": 0.19859237238610666, "grad_norm": 3.5498478412628174, "learning_rate": 0.0003880613362541074, "loss": 0.8431, "step": 2931 }, { "epoch": 0.19866012822793067, "grad_norm": 4.93901252746582, "learning_rate": 0.0003880558598028478, "loss": 1.0355, "step": 2932 }, { "epoch": 0.19872788406975464, "grad_norm": 4.194961071014404, "learning_rate": 0.0003880503833515882, "loss": 1.1942, "step": 2933 }, { "epoch": 0.19879563991157861, "grad_norm": 3.144320249557495, "learning_rate": 0.0003880449069003286, "loss": 1.0055, "step": 2934 }, { "epoch": 0.19886339575340262, "grad_norm": 2.446681261062622, "learning_rate": 0.000388039430449069, "loss": 0.6765, "step": 2935 }, { "epoch": 0.1989311515952266, "grad_norm": 3.9585554599761963, "learning_rate": 0.00038803395399780944, "loss": 0.7786, "step": 2936 }, { "epoch": 0.1989989074370506, "grad_norm": 4.166809558868408, "learning_rate": 0.0003880284775465499, "loss": 0.7984, "step": 2937 }, { "epoch": 0.19906666327887457, "grad_norm": 3.4924962520599365, "learning_rate": 0.0003880230010952903, "loss": 0.785, "step": 2938 }, { "epoch": 0.19913441912069857, "grad_norm": 2.6020219326019287, "learning_rate": 0.0003880175246440307, "loss": 0.6835, "step": 2939 }, { "epoch": 0.19920217496252254, "grad_norm": 3.0359699726104736, "learning_rate": 0.0003880120481927711, "loss": 0.9068, "step": 2940 }, { "epoch": 0.19926993080434655, "grad_norm": 4.5870771408081055, "learning_rate": 0.00038800657174151154, "loss": 1.0493, "step": 2941 }, { "epoch": 0.19933768664617052, "grad_norm": 4.453811168670654, "learning_rate": 0.00038800109529025194, "loss": 0.9529, "step": 2942 }, { "epoch": 0.19940544248799452, "grad_norm": 3.0521836280822754, "learning_rate": 0.00038799561883899234, "loss": 0.6147, "step": 2943 }, { "epoch": 0.1994731983298185, "grad_norm": 2.691821813583374, "learning_rate": 0.00038799014238773274, "loss": 0.7854, "step": 2944 }, { "epoch": 0.1995409541716425, "grad_norm": 3.0034406185150146, "learning_rate": 0.0003879846659364732, "loss": 0.7829, "step": 2945 }, { "epoch": 0.19960871001346647, "grad_norm": 4.020975112915039, "learning_rate": 0.0003879791894852136, "loss": 1.032, "step": 2946 }, { "epoch": 0.19967646585529045, "grad_norm": 3.4426751136779785, "learning_rate": 0.00038797371303395404, "loss": 1.2749, "step": 2947 }, { "epoch": 0.19974422169711445, "grad_norm": 2.9995882511138916, "learning_rate": 0.00038796823658269444, "loss": 0.8232, "step": 2948 }, { "epoch": 0.19981197753893842, "grad_norm": 2.9870011806488037, "learning_rate": 0.00038796276013143484, "loss": 0.7232, "step": 2949 }, { "epoch": 0.19987973338076243, "grad_norm": 3.5426650047302246, "learning_rate": 0.00038795728368017524, "loss": 0.8878, "step": 2950 }, { "epoch": 0.1999474892225864, "grad_norm": 2.684795618057251, "learning_rate": 0.00038795180722891564, "loss": 0.9354, "step": 2951 }, { "epoch": 0.2000152450644104, "grad_norm": 3.869802951812744, "learning_rate": 0.0003879463307776561, "loss": 1.0431, "step": 2952 }, { "epoch": 0.20008300090623438, "grad_norm": 4.170981407165527, "learning_rate": 0.00038794085432639655, "loss": 1.2185, "step": 2953 }, { "epoch": 0.20015075674805838, "grad_norm": 2.944383144378662, "learning_rate": 0.00038793537787513695, "loss": 0.7583, "step": 2954 }, { "epoch": 0.20021851258988235, "grad_norm": 5.020541667938232, "learning_rate": 0.00038792990142387735, "loss": 0.9039, "step": 2955 }, { "epoch": 0.20028626843170635, "grad_norm": 3.3172452449798584, "learning_rate": 0.00038792442497261775, "loss": 0.8774, "step": 2956 }, { "epoch": 0.20035402427353033, "grad_norm": 4.42636775970459, "learning_rate": 0.00038791894852135815, "loss": 1.1344, "step": 2957 }, { "epoch": 0.20042178011535433, "grad_norm": 3.51851749420166, "learning_rate": 0.0003879134720700986, "loss": 1.0192, "step": 2958 }, { "epoch": 0.2004895359571783, "grad_norm": 2.7377655506134033, "learning_rate": 0.000387907995618839, "loss": 0.8097, "step": 2959 }, { "epoch": 0.2005572917990023, "grad_norm": 4.000570774078369, "learning_rate": 0.00038790251916757945, "loss": 0.9769, "step": 2960 }, { "epoch": 0.20062504764082628, "grad_norm": 3.289140462875366, "learning_rate": 0.00038789704271631985, "loss": 0.8663, "step": 2961 }, { "epoch": 0.20069280348265026, "grad_norm": 3.1356420516967773, "learning_rate": 0.00038789156626506025, "loss": 0.8782, "step": 2962 }, { "epoch": 0.20076055932447426, "grad_norm": 3.1205527782440186, "learning_rate": 0.0003878860898138007, "loss": 0.6768, "step": 2963 }, { "epoch": 0.20082831516629823, "grad_norm": 3.3293068408966064, "learning_rate": 0.0003878806133625411, "loss": 0.905, "step": 2964 }, { "epoch": 0.20089607100812223, "grad_norm": 4.211151599884033, "learning_rate": 0.0003878751369112815, "loss": 1.205, "step": 2965 }, { "epoch": 0.2009638268499462, "grad_norm": 5.293976306915283, "learning_rate": 0.0003878696604600219, "loss": 1.1434, "step": 2966 }, { "epoch": 0.2010315826917702, "grad_norm": 4.372865676879883, "learning_rate": 0.0003878641840087623, "loss": 1.0981, "step": 2967 }, { "epoch": 0.20109933853359419, "grad_norm": 4.066809177398682, "learning_rate": 0.00038785870755750275, "loss": 0.9831, "step": 2968 }, { "epoch": 0.2011670943754182, "grad_norm": 4.448725700378418, "learning_rate": 0.0003878532311062432, "loss": 0.995, "step": 2969 }, { "epoch": 0.20123485021724216, "grad_norm": 3.098972797393799, "learning_rate": 0.0003878477546549836, "loss": 0.9696, "step": 2970 }, { "epoch": 0.20130260605906616, "grad_norm": 3.9107375144958496, "learning_rate": 0.000387842278203724, "loss": 1.1097, "step": 2971 }, { "epoch": 0.20137036190089014, "grad_norm": 3.3919646739959717, "learning_rate": 0.0003878368017524644, "loss": 0.8155, "step": 2972 }, { "epoch": 0.20143811774271414, "grad_norm": 4.422256946563721, "learning_rate": 0.0003878313253012048, "loss": 0.9722, "step": 2973 }, { "epoch": 0.20150587358453811, "grad_norm": 3.44040846824646, "learning_rate": 0.00038782584884994526, "loss": 0.8639, "step": 2974 }, { "epoch": 0.20157362942636212, "grad_norm": 3.9588050842285156, "learning_rate": 0.00038782037239868566, "loss": 1.0882, "step": 2975 }, { "epoch": 0.2016413852681861, "grad_norm": 4.006252765655518, "learning_rate": 0.0003878148959474261, "loss": 1.0616, "step": 2976 }, { "epoch": 0.20170914111001007, "grad_norm": 2.9318010807037354, "learning_rate": 0.0003878094194961665, "loss": 0.8486, "step": 2977 }, { "epoch": 0.20177689695183407, "grad_norm": 2.7023775577545166, "learning_rate": 0.0003878039430449069, "loss": 0.8735, "step": 2978 }, { "epoch": 0.20184465279365804, "grad_norm": 3.5952556133270264, "learning_rate": 0.00038779846659364736, "loss": 0.8897, "step": 2979 }, { "epoch": 0.20191240863548204, "grad_norm": 2.8645665645599365, "learning_rate": 0.00038779299014238776, "loss": 0.7177, "step": 2980 }, { "epoch": 0.20198016447730602, "grad_norm": 3.171987533569336, "learning_rate": 0.00038778751369112816, "loss": 0.8614, "step": 2981 }, { "epoch": 0.20204792031913002, "grad_norm": 2.974250555038452, "learning_rate": 0.00038778203723986856, "loss": 0.9177, "step": 2982 }, { "epoch": 0.202115676160954, "grad_norm": 3.443190097808838, "learning_rate": 0.00038777656078860896, "loss": 0.8449, "step": 2983 }, { "epoch": 0.202183432002778, "grad_norm": 3.9158647060394287, "learning_rate": 0.0003877710843373494, "loss": 0.7569, "step": 2984 }, { "epoch": 0.20225118784460197, "grad_norm": 4.267241954803467, "learning_rate": 0.00038776560788608987, "loss": 0.9139, "step": 2985 }, { "epoch": 0.20231894368642597, "grad_norm": 3.2630350589752197, "learning_rate": 0.00038776013143483026, "loss": 0.8679, "step": 2986 }, { "epoch": 0.20238669952824995, "grad_norm": 9.023880958557129, "learning_rate": 0.00038775465498357066, "loss": 0.6787, "step": 2987 }, { "epoch": 0.20245445537007395, "grad_norm": 2.9599552154541016, "learning_rate": 0.00038774917853231106, "loss": 0.7772, "step": 2988 }, { "epoch": 0.20252221121189792, "grad_norm": 4.031742572784424, "learning_rate": 0.00038774370208105146, "loss": 0.9991, "step": 2989 }, { "epoch": 0.20258996705372193, "grad_norm": 2.2382352352142334, "learning_rate": 0.0003877382256297919, "loss": 0.5522, "step": 2990 }, { "epoch": 0.2026577228955459, "grad_norm": 2.7017180919647217, "learning_rate": 0.00038773274917853237, "loss": 0.8779, "step": 2991 }, { "epoch": 0.20272547873736987, "grad_norm": 2.520508289337158, "learning_rate": 0.00038772727272727277, "loss": 0.7455, "step": 2992 }, { "epoch": 0.20279323457919388, "grad_norm": 3.901690721511841, "learning_rate": 0.00038772179627601317, "loss": 0.9256, "step": 2993 }, { "epoch": 0.20286099042101785, "grad_norm": 3.673201322555542, "learning_rate": 0.00038771631982475357, "loss": 0.9651, "step": 2994 }, { "epoch": 0.20292874626284185, "grad_norm": 3.2190515995025635, "learning_rate": 0.00038771084337349397, "loss": 0.7486, "step": 2995 }, { "epoch": 0.20299650210466583, "grad_norm": 3.5136494636535645, "learning_rate": 0.0003877053669222344, "loss": 0.9897, "step": 2996 }, { "epoch": 0.20306425794648983, "grad_norm": 3.8646607398986816, "learning_rate": 0.0003876998904709748, "loss": 1.047, "step": 2997 }, { "epoch": 0.2031320137883138, "grad_norm": 3.926952600479126, "learning_rate": 0.0003876944140197152, "loss": 1.1799, "step": 2998 }, { "epoch": 0.2031997696301378, "grad_norm": 2.6277692317962646, "learning_rate": 0.00038768893756845567, "loss": 0.8088, "step": 2999 }, { "epoch": 0.20326752547196178, "grad_norm": 3.3782174587249756, "learning_rate": 0.00038768346111719607, "loss": 1.0853, "step": 3000 }, { "epoch": 0.20333528131378578, "grad_norm": 2.427452325820923, "learning_rate": 0.0003876779846659365, "loss": 0.7884, "step": 3001 }, { "epoch": 0.20340303715560976, "grad_norm": 5.444807529449463, "learning_rate": 0.0003876725082146769, "loss": 1.0673, "step": 3002 }, { "epoch": 0.20347079299743376, "grad_norm": 3.8093421459198, "learning_rate": 0.0003876670317634173, "loss": 1.1959, "step": 3003 }, { "epoch": 0.20353854883925773, "grad_norm": 3.9400832653045654, "learning_rate": 0.0003876615553121577, "loss": 0.9064, "step": 3004 }, { "epoch": 0.20360630468108173, "grad_norm": 2.7309412956237793, "learning_rate": 0.0003876560788608981, "loss": 0.7175, "step": 3005 }, { "epoch": 0.2036740605229057, "grad_norm": 3.4516921043395996, "learning_rate": 0.0003876506024096386, "loss": 0.9031, "step": 3006 }, { "epoch": 0.20374181636472968, "grad_norm": 3.0753893852233887, "learning_rate": 0.00038764512595837903, "loss": 0.9436, "step": 3007 }, { "epoch": 0.20380957220655369, "grad_norm": 3.9933080673217773, "learning_rate": 0.0003876396495071194, "loss": 1.0872, "step": 3008 }, { "epoch": 0.20387732804837766, "grad_norm": 3.4695262908935547, "learning_rate": 0.0003876341730558598, "loss": 1.0169, "step": 3009 }, { "epoch": 0.20394508389020166, "grad_norm": 4.436805248260498, "learning_rate": 0.0003876286966046002, "loss": 1.2448, "step": 3010 }, { "epoch": 0.20401283973202564, "grad_norm": 3.8343169689178467, "learning_rate": 0.0003876232201533406, "loss": 0.889, "step": 3011 }, { "epoch": 0.20408059557384964, "grad_norm": 3.2676072120666504, "learning_rate": 0.0003876177437020811, "loss": 1.0095, "step": 3012 }, { "epoch": 0.2041483514156736, "grad_norm": 3.435744047164917, "learning_rate": 0.0003876122672508215, "loss": 0.9986, "step": 3013 }, { "epoch": 0.20421610725749761, "grad_norm": 2.862062454223633, "learning_rate": 0.0003876067907995619, "loss": 0.7056, "step": 3014 }, { "epoch": 0.2042838630993216, "grad_norm": 3.753910779953003, "learning_rate": 0.00038760131434830233, "loss": 0.8493, "step": 3015 }, { "epoch": 0.2043516189411456, "grad_norm": 2.116185426712036, "learning_rate": 0.00038759583789704273, "loss": 0.6275, "step": 3016 }, { "epoch": 0.20441937478296957, "grad_norm": 3.509892702102661, "learning_rate": 0.0003875903614457832, "loss": 0.9279, "step": 3017 }, { "epoch": 0.20448713062479357, "grad_norm": 4.098609447479248, "learning_rate": 0.0003875848849945236, "loss": 1.1334, "step": 3018 }, { "epoch": 0.20455488646661754, "grad_norm": 3.273157835006714, "learning_rate": 0.000387579408543264, "loss": 0.9214, "step": 3019 }, { "epoch": 0.20462264230844154, "grad_norm": 3.5070319175720215, "learning_rate": 0.0003875739320920044, "loss": 0.815, "step": 3020 }, { "epoch": 0.20469039815026552, "grad_norm": 2.5359714031219482, "learning_rate": 0.0003875684556407448, "loss": 0.7331, "step": 3021 }, { "epoch": 0.2047581539920895, "grad_norm": 3.425753116607666, "learning_rate": 0.00038756297918948523, "loss": 0.7633, "step": 3022 }, { "epoch": 0.2048259098339135, "grad_norm": 3.675658941268921, "learning_rate": 0.0003875575027382257, "loss": 1.1037, "step": 3023 }, { "epoch": 0.20489366567573747, "grad_norm": 2.5491268634796143, "learning_rate": 0.0003875520262869661, "loss": 0.6575, "step": 3024 }, { "epoch": 0.20496142151756147, "grad_norm": 4.113846778869629, "learning_rate": 0.0003875465498357065, "loss": 0.8881, "step": 3025 }, { "epoch": 0.20502917735938544, "grad_norm": 3.828291893005371, "learning_rate": 0.0003875410733844469, "loss": 0.9057, "step": 3026 }, { "epoch": 0.20509693320120945, "grad_norm": 2.85843563079834, "learning_rate": 0.0003875355969331873, "loss": 0.719, "step": 3027 }, { "epoch": 0.20516468904303342, "grad_norm": 2.6541266441345215, "learning_rate": 0.00038753012048192774, "loss": 0.825, "step": 3028 }, { "epoch": 0.20523244488485742, "grad_norm": 3.2358591556549072, "learning_rate": 0.00038752464403066814, "loss": 0.9772, "step": 3029 }, { "epoch": 0.2053002007266814, "grad_norm": 3.1505749225616455, "learning_rate": 0.0003875191675794086, "loss": 0.8697, "step": 3030 }, { "epoch": 0.2053679565685054, "grad_norm": 3.205974578857422, "learning_rate": 0.000387513691128149, "loss": 1.0819, "step": 3031 }, { "epoch": 0.20543571241032937, "grad_norm": 3.558382749557495, "learning_rate": 0.0003875082146768894, "loss": 0.8795, "step": 3032 }, { "epoch": 0.20550346825215338, "grad_norm": 3.715158700942993, "learning_rate": 0.0003875027382256298, "loss": 1.2681, "step": 3033 }, { "epoch": 0.20557122409397735, "grad_norm": 3.849266529083252, "learning_rate": 0.00038749726177437024, "loss": 1.1093, "step": 3034 }, { "epoch": 0.20563897993580135, "grad_norm": 2.420072555541992, "learning_rate": 0.00038749178532311064, "loss": 0.7177, "step": 3035 }, { "epoch": 0.20570673577762533, "grad_norm": 3.3123974800109863, "learning_rate": 0.00038748630887185104, "loss": 0.973, "step": 3036 }, { "epoch": 0.2057744916194493, "grad_norm": 5.708742618560791, "learning_rate": 0.00038748083242059144, "loss": 0.8842, "step": 3037 }, { "epoch": 0.2058422474612733, "grad_norm": 4.232296943664551, "learning_rate": 0.0003874753559693319, "loss": 1.3733, "step": 3038 }, { "epoch": 0.20591000330309728, "grad_norm": 4.40988302230835, "learning_rate": 0.00038746987951807234, "loss": 0.8927, "step": 3039 }, { "epoch": 0.20597775914492128, "grad_norm": 2.910693883895874, "learning_rate": 0.00038746440306681274, "loss": 0.8717, "step": 3040 }, { "epoch": 0.20604551498674525, "grad_norm": 3.609844207763672, "learning_rate": 0.00038745892661555314, "loss": 0.9452, "step": 3041 }, { "epoch": 0.20611327082856926, "grad_norm": 3.769583225250244, "learning_rate": 0.00038745345016429354, "loss": 1.0423, "step": 3042 }, { "epoch": 0.20618102667039323, "grad_norm": 2.7205891609191895, "learning_rate": 0.00038744797371303394, "loss": 0.7227, "step": 3043 }, { "epoch": 0.20624878251221723, "grad_norm": 3.284769058227539, "learning_rate": 0.0003874424972617744, "loss": 0.8812, "step": 3044 }, { "epoch": 0.2063165383540412, "grad_norm": 3.0832784175872803, "learning_rate": 0.0003874370208105148, "loss": 0.9679, "step": 3045 }, { "epoch": 0.2063842941958652, "grad_norm": 4.454905986785889, "learning_rate": 0.00038743154435925525, "loss": 1.18, "step": 3046 }, { "epoch": 0.20645205003768918, "grad_norm": 5.9112548828125, "learning_rate": 0.00038742606790799565, "loss": 0.8368, "step": 3047 }, { "epoch": 0.20651980587951319, "grad_norm": 4.081108570098877, "learning_rate": 0.00038742059145673605, "loss": 0.9317, "step": 3048 }, { "epoch": 0.20658756172133716, "grad_norm": 3.965015411376953, "learning_rate": 0.00038741511500547645, "loss": 0.9889, "step": 3049 }, { "epoch": 0.20665531756316116, "grad_norm": 3.206282377243042, "learning_rate": 0.0003874096385542169, "loss": 0.8683, "step": 3050 }, { "epoch": 0.20672307340498514, "grad_norm": 3.1499555110931396, "learning_rate": 0.0003874041621029573, "loss": 0.9119, "step": 3051 }, { "epoch": 0.2067908292468091, "grad_norm": 4.203317165374756, "learning_rate": 0.0003873986856516977, "loss": 1.1122, "step": 3052 }, { "epoch": 0.2068585850886331, "grad_norm": 3.7479679584503174, "learning_rate": 0.00038739320920043815, "loss": 0.8592, "step": 3053 }, { "epoch": 0.2069263409304571, "grad_norm": 2.524534225463867, "learning_rate": 0.00038738773274917855, "loss": 0.6836, "step": 3054 }, { "epoch": 0.2069940967722811, "grad_norm": 3.2653846740722656, "learning_rate": 0.000387382256297919, "loss": 0.7931, "step": 3055 }, { "epoch": 0.20706185261410506, "grad_norm": 3.3260385990142822, "learning_rate": 0.0003873767798466594, "loss": 0.8665, "step": 3056 }, { "epoch": 0.20712960845592907, "grad_norm": 3.0397768020629883, "learning_rate": 0.0003873713033953998, "loss": 0.7832, "step": 3057 }, { "epoch": 0.20719736429775304, "grad_norm": 3.086559295654297, "learning_rate": 0.0003873658269441402, "loss": 0.8847, "step": 3058 }, { "epoch": 0.20726512013957704, "grad_norm": 3.479982614517212, "learning_rate": 0.0003873603504928806, "loss": 0.9956, "step": 3059 }, { "epoch": 0.20733287598140102, "grad_norm": 3.2430217266082764, "learning_rate": 0.00038735487404162105, "loss": 0.8956, "step": 3060 }, { "epoch": 0.20740063182322502, "grad_norm": 2.7269043922424316, "learning_rate": 0.0003873493975903615, "loss": 0.6624, "step": 3061 }, { "epoch": 0.207468387665049, "grad_norm": 2.7710084915161133, "learning_rate": 0.0003873439211391019, "loss": 0.84, "step": 3062 }, { "epoch": 0.207536143506873, "grad_norm": 3.279461145401001, "learning_rate": 0.0003873384446878423, "loss": 0.9323, "step": 3063 }, { "epoch": 0.20760389934869697, "grad_norm": 2.4109456539154053, "learning_rate": 0.0003873329682365827, "loss": 0.7564, "step": 3064 }, { "epoch": 0.20767165519052097, "grad_norm": 3.398273229598999, "learning_rate": 0.0003873274917853231, "loss": 0.9354, "step": 3065 }, { "epoch": 0.20773941103234494, "grad_norm": 3.6894140243530273, "learning_rate": 0.00038732201533406356, "loss": 1.1319, "step": 3066 }, { "epoch": 0.20780716687416892, "grad_norm": 3.255312204360962, "learning_rate": 0.00038731653888280396, "loss": 0.879, "step": 3067 }, { "epoch": 0.20787492271599292, "grad_norm": 2.3051600456237793, "learning_rate": 0.00038731106243154436, "loss": 0.7277, "step": 3068 }, { "epoch": 0.2079426785578169, "grad_norm": 2.9135687351226807, "learning_rate": 0.0003873055859802848, "loss": 0.9002, "step": 3069 }, { "epoch": 0.2080104343996409, "grad_norm": 4.4830522537231445, "learning_rate": 0.0003873001095290252, "loss": 0.9469, "step": 3070 }, { "epoch": 0.20807819024146487, "grad_norm": 3.2175025939941406, "learning_rate": 0.0003872946330777656, "loss": 0.943, "step": 3071 }, { "epoch": 0.20814594608328887, "grad_norm": 3.3367183208465576, "learning_rate": 0.00038728915662650606, "loss": 0.9921, "step": 3072 }, { "epoch": 0.20821370192511285, "grad_norm": 3.4488627910614014, "learning_rate": 0.00038728368017524646, "loss": 0.8805, "step": 3073 }, { "epoch": 0.20828145776693685, "grad_norm": 3.7233943939208984, "learning_rate": 0.00038727820372398686, "loss": 1.2822, "step": 3074 }, { "epoch": 0.20834921360876082, "grad_norm": 2.8348429203033447, "learning_rate": 0.00038727272727272726, "loss": 0.7892, "step": 3075 }, { "epoch": 0.20841696945058483, "grad_norm": 3.1588516235351562, "learning_rate": 0.0003872672508214677, "loss": 1.0107, "step": 3076 }, { "epoch": 0.2084847252924088, "grad_norm": 2.604145050048828, "learning_rate": 0.00038726177437020817, "loss": 0.8472, "step": 3077 }, { "epoch": 0.2085524811342328, "grad_norm": 3.444066047668457, "learning_rate": 0.00038725629791894856, "loss": 0.7593, "step": 3078 }, { "epoch": 0.20862023697605678, "grad_norm": 2.6919965744018555, "learning_rate": 0.00038725082146768896, "loss": 0.9178, "step": 3079 }, { "epoch": 0.20868799281788078, "grad_norm": 3.8902170658111572, "learning_rate": 0.00038724534501642936, "loss": 1.2513, "step": 3080 }, { "epoch": 0.20875574865970475, "grad_norm": 4.317392349243164, "learning_rate": 0.00038723986856516976, "loss": 1.026, "step": 3081 }, { "epoch": 0.20882350450152873, "grad_norm": 2.897770881652832, "learning_rate": 0.0003872343921139102, "loss": 0.91, "step": 3082 }, { "epoch": 0.20889126034335273, "grad_norm": 3.7551252841949463, "learning_rate": 0.0003872289156626506, "loss": 1.0085, "step": 3083 }, { "epoch": 0.2089590161851767, "grad_norm": 2.75354266166687, "learning_rate": 0.00038722343921139107, "loss": 0.7836, "step": 3084 }, { "epoch": 0.2090267720270007, "grad_norm": 2.7469089031219482, "learning_rate": 0.00038721796276013147, "loss": 0.8302, "step": 3085 }, { "epoch": 0.20909452786882468, "grad_norm": 3.335780382156372, "learning_rate": 0.00038721248630887187, "loss": 0.988, "step": 3086 }, { "epoch": 0.20916228371064868, "grad_norm": 3.8519208431243896, "learning_rate": 0.00038720700985761227, "loss": 1.077, "step": 3087 }, { "epoch": 0.20923003955247266, "grad_norm": 3.1594107151031494, "learning_rate": 0.0003872015334063527, "loss": 0.8523, "step": 3088 }, { "epoch": 0.20929779539429666, "grad_norm": 3.792149305343628, "learning_rate": 0.0003871960569550931, "loss": 1.0773, "step": 3089 }, { "epoch": 0.20936555123612063, "grad_norm": 4.654153823852539, "learning_rate": 0.0003871905805038335, "loss": 1.1554, "step": 3090 }, { "epoch": 0.20943330707794464, "grad_norm": 4.500635147094727, "learning_rate": 0.0003871851040525739, "loss": 1.3423, "step": 3091 }, { "epoch": 0.2095010629197686, "grad_norm": 2.9006171226501465, "learning_rate": 0.00038717962760131437, "loss": 0.8354, "step": 3092 }, { "epoch": 0.2095688187615926, "grad_norm": 3.1118552684783936, "learning_rate": 0.0003871741511500548, "loss": 0.9194, "step": 3093 }, { "epoch": 0.2096365746034166, "grad_norm": 3.410881996154785, "learning_rate": 0.0003871686746987952, "loss": 0.8852, "step": 3094 }, { "epoch": 0.2097043304452406, "grad_norm": 3.783707618713379, "learning_rate": 0.0003871631982475356, "loss": 0.9804, "step": 3095 }, { "epoch": 0.20977208628706456, "grad_norm": 3.084458589553833, "learning_rate": 0.000387157721796276, "loss": 0.8004, "step": 3096 }, { "epoch": 0.20983984212888854, "grad_norm": 2.707623243331909, "learning_rate": 0.0003871522453450164, "loss": 0.8773, "step": 3097 }, { "epoch": 0.20990759797071254, "grad_norm": 3.749215841293335, "learning_rate": 0.0003871467688937569, "loss": 0.9985, "step": 3098 }, { "epoch": 0.2099753538125365, "grad_norm": 3.891214609146118, "learning_rate": 0.0003871412924424973, "loss": 1.0906, "step": 3099 }, { "epoch": 0.21004310965436052, "grad_norm": 3.404768943786621, "learning_rate": 0.0003871358159912377, "loss": 0.967, "step": 3100 }, { "epoch": 0.2101108654961845, "grad_norm": 3.955134153366089, "learning_rate": 0.0003871303395399781, "loss": 0.9697, "step": 3101 }, { "epoch": 0.2101786213380085, "grad_norm": 3.0474636554718018, "learning_rate": 0.0003871248630887185, "loss": 0.8256, "step": 3102 }, { "epoch": 0.21024637717983247, "grad_norm": 3.5473544597625732, "learning_rate": 0.0003871193866374589, "loss": 0.8495, "step": 3103 }, { "epoch": 0.21031413302165647, "grad_norm": 4.0794854164123535, "learning_rate": 0.0003871139101861994, "loss": 1.0686, "step": 3104 }, { "epoch": 0.21038188886348044, "grad_norm": 3.6187057495117188, "learning_rate": 0.0003871084337349398, "loss": 0.9369, "step": 3105 }, { "epoch": 0.21044964470530445, "grad_norm": 3.510897397994995, "learning_rate": 0.0003871029572836802, "loss": 1.1107, "step": 3106 }, { "epoch": 0.21051740054712842, "grad_norm": 3.079817295074463, "learning_rate": 0.0003870974808324206, "loss": 0.8698, "step": 3107 }, { "epoch": 0.21058515638895242, "grad_norm": 3.6142637729644775, "learning_rate": 0.00038709200438116103, "loss": 0.9384, "step": 3108 }, { "epoch": 0.2106529122307764, "grad_norm": 3.942026138305664, "learning_rate": 0.00038708652792990143, "loss": 1.0713, "step": 3109 }, { "epoch": 0.2107206680726004, "grad_norm": 3.389601945877075, "learning_rate": 0.0003870810514786419, "loss": 0.9488, "step": 3110 }, { "epoch": 0.21078842391442437, "grad_norm": 2.8886373043060303, "learning_rate": 0.0003870755750273823, "loss": 1.0434, "step": 3111 }, { "epoch": 0.21085617975624835, "grad_norm": 2.563314199447632, "learning_rate": 0.0003870700985761227, "loss": 0.794, "step": 3112 }, { "epoch": 0.21092393559807235, "grad_norm": 2.9957382678985596, "learning_rate": 0.0003870646221248631, "loss": 0.7061, "step": 3113 }, { "epoch": 0.21099169143989632, "grad_norm": 4.0754170417785645, "learning_rate": 0.00038705914567360353, "loss": 0.9812, "step": 3114 }, { "epoch": 0.21105944728172032, "grad_norm": 3.1081702709198, "learning_rate": 0.00038705366922234393, "loss": 0.6549, "step": 3115 }, { "epoch": 0.2111272031235443, "grad_norm": 3.2139029502868652, "learning_rate": 0.0003870481927710844, "loss": 0.7877, "step": 3116 }, { "epoch": 0.2111949589653683, "grad_norm": 3.662554979324341, "learning_rate": 0.0003870427163198248, "loss": 1.1303, "step": 3117 }, { "epoch": 0.21126271480719228, "grad_norm": 4.457626819610596, "learning_rate": 0.0003870372398685652, "loss": 0.9923, "step": 3118 }, { "epoch": 0.21133047064901628, "grad_norm": 7.068685531616211, "learning_rate": 0.0003870317634173056, "loss": 0.941, "step": 3119 }, { "epoch": 0.21139822649084025, "grad_norm": 3.881685972213745, "learning_rate": 0.00038702628696604604, "loss": 1.0968, "step": 3120 }, { "epoch": 0.21146598233266425, "grad_norm": 4.029959201812744, "learning_rate": 0.00038702081051478644, "loss": 1.0996, "step": 3121 }, { "epoch": 0.21153373817448823, "grad_norm": 2.6053762435913086, "learning_rate": 0.00038701533406352683, "loss": 0.8367, "step": 3122 }, { "epoch": 0.21160149401631223, "grad_norm": 3.769785165786743, "learning_rate": 0.0003870098576122673, "loss": 0.9545, "step": 3123 }, { "epoch": 0.2116692498581362, "grad_norm": 3.5461783409118652, "learning_rate": 0.0003870043811610077, "loss": 0.9154, "step": 3124 }, { "epoch": 0.2117370056999602, "grad_norm": 4.105605125427246, "learning_rate": 0.0003869989047097481, "loss": 1.1228, "step": 3125 }, { "epoch": 0.21180476154178418, "grad_norm": 3.5911357402801514, "learning_rate": 0.00038699342825848854, "loss": 0.9285, "step": 3126 }, { "epoch": 0.21187251738360816, "grad_norm": 2.5789434909820557, "learning_rate": 0.00038698795180722894, "loss": 0.6774, "step": 3127 }, { "epoch": 0.21194027322543216, "grad_norm": 5.305942058563232, "learning_rate": 0.00038698247535596934, "loss": 0.8623, "step": 3128 }, { "epoch": 0.21200802906725613, "grad_norm": 3.409611225128174, "learning_rate": 0.00038697699890470974, "loss": 1.1521, "step": 3129 }, { "epoch": 0.21207578490908013, "grad_norm": 3.1284379959106445, "learning_rate": 0.00038697152245345014, "loss": 0.7601, "step": 3130 }, { "epoch": 0.2121435407509041, "grad_norm": 2.8303184509277344, "learning_rate": 0.00038696604600219064, "loss": 0.8204, "step": 3131 }, { "epoch": 0.2122112965927281, "grad_norm": 3.074784278869629, "learning_rate": 0.00038696056955093104, "loss": 0.7463, "step": 3132 }, { "epoch": 0.21227905243455208, "grad_norm": 4.0078229904174805, "learning_rate": 0.00038695509309967144, "loss": 0.995, "step": 3133 }, { "epoch": 0.2123468082763761, "grad_norm": 2.8091914653778076, "learning_rate": 0.00038694961664841184, "loss": 0.6638, "step": 3134 }, { "epoch": 0.21241456411820006, "grad_norm": 4.509549140930176, "learning_rate": 0.00038694414019715224, "loss": 0.9929, "step": 3135 }, { "epoch": 0.21248231996002406, "grad_norm": 3.168811321258545, "learning_rate": 0.0003869386637458927, "loss": 0.7937, "step": 3136 }, { "epoch": 0.21255007580184804, "grad_norm": 4.677186012268066, "learning_rate": 0.0003869331872946331, "loss": 0.9251, "step": 3137 }, { "epoch": 0.21261783164367204, "grad_norm": 3.096297264099121, "learning_rate": 0.0003869277108433735, "loss": 0.8592, "step": 3138 }, { "epoch": 0.212685587485496, "grad_norm": 4.172886371612549, "learning_rate": 0.00038692223439211395, "loss": 1.0343, "step": 3139 }, { "epoch": 0.21275334332732002, "grad_norm": 3.6096184253692627, "learning_rate": 0.00038691675794085435, "loss": 1.0737, "step": 3140 }, { "epoch": 0.212821099169144, "grad_norm": 3.596954345703125, "learning_rate": 0.00038691128148959475, "loss": 0.9474, "step": 3141 }, { "epoch": 0.21288885501096796, "grad_norm": 3.5519378185272217, "learning_rate": 0.0003869058050383352, "loss": 1.0144, "step": 3142 }, { "epoch": 0.21295661085279197, "grad_norm": 3.6861398220062256, "learning_rate": 0.0003869003285870756, "loss": 1.1177, "step": 3143 }, { "epoch": 0.21302436669461594, "grad_norm": 3.0142388343811035, "learning_rate": 0.000386894852135816, "loss": 1.0247, "step": 3144 }, { "epoch": 0.21309212253643994, "grad_norm": 2.605402708053589, "learning_rate": 0.0003868893756845564, "loss": 0.783, "step": 3145 }, { "epoch": 0.21315987837826392, "grad_norm": 4.018885612487793, "learning_rate": 0.0003868838992332968, "loss": 1.1006, "step": 3146 }, { "epoch": 0.21322763422008792, "grad_norm": 2.9617702960968018, "learning_rate": 0.00038687842278203725, "loss": 0.9044, "step": 3147 }, { "epoch": 0.2132953900619119, "grad_norm": 3.6294941902160645, "learning_rate": 0.0003868729463307777, "loss": 0.885, "step": 3148 }, { "epoch": 0.2133631459037359, "grad_norm": 4.113534450531006, "learning_rate": 0.0003868674698795181, "loss": 1.2714, "step": 3149 }, { "epoch": 0.21343090174555987, "grad_norm": 2.841132164001465, "learning_rate": 0.0003868619934282585, "loss": 0.9587, "step": 3150 }, { "epoch": 0.21349865758738387, "grad_norm": 3.4558990001678467, "learning_rate": 0.0003868565169769989, "loss": 0.9674, "step": 3151 }, { "epoch": 0.21356641342920785, "grad_norm": 3.669705629348755, "learning_rate": 0.00038685104052573935, "loss": 0.9453, "step": 3152 }, { "epoch": 0.21363416927103185, "grad_norm": 4.54177188873291, "learning_rate": 0.00038684556407447975, "loss": 0.9363, "step": 3153 }, { "epoch": 0.21370192511285582, "grad_norm": 5.341638565063477, "learning_rate": 0.0003868400876232202, "loss": 0.8942, "step": 3154 }, { "epoch": 0.21376968095467982, "grad_norm": 3.188917636871338, "learning_rate": 0.0003868346111719606, "loss": 0.9156, "step": 3155 }, { "epoch": 0.2138374367965038, "grad_norm": 2.9051613807678223, "learning_rate": 0.000386829134720701, "loss": 0.9587, "step": 3156 }, { "epoch": 0.21390519263832777, "grad_norm": 4.053856372833252, "learning_rate": 0.0003868236582694414, "loss": 1.0757, "step": 3157 }, { "epoch": 0.21397294848015178, "grad_norm": 3.427412509918213, "learning_rate": 0.00038681818181818186, "loss": 0.8125, "step": 3158 }, { "epoch": 0.21404070432197575, "grad_norm": 4.6371917724609375, "learning_rate": 0.00038681270536692226, "loss": 1.1889, "step": 3159 }, { "epoch": 0.21410846016379975, "grad_norm": 2.95896053314209, "learning_rate": 0.00038680722891566266, "loss": 0.8206, "step": 3160 }, { "epoch": 0.21417621600562373, "grad_norm": 3.255932569503784, "learning_rate": 0.00038680175246440305, "loss": 1.0005, "step": 3161 }, { "epoch": 0.21424397184744773, "grad_norm": 3.863858938217163, "learning_rate": 0.0003867962760131435, "loss": 0.9829, "step": 3162 }, { "epoch": 0.2143117276892717, "grad_norm": 3.808131217956543, "learning_rate": 0.0003867907995618839, "loss": 1.1583, "step": 3163 }, { "epoch": 0.2143794835310957, "grad_norm": 3.7781200408935547, "learning_rate": 0.00038678532311062436, "loss": 0.732, "step": 3164 }, { "epoch": 0.21444723937291968, "grad_norm": 2.9623594284057617, "learning_rate": 0.00038677984665936476, "loss": 0.8591, "step": 3165 }, { "epoch": 0.21451499521474368, "grad_norm": 2.8965985774993896, "learning_rate": 0.00038677437020810516, "loss": 0.9441, "step": 3166 }, { "epoch": 0.21458275105656766, "grad_norm": 2.694587469100952, "learning_rate": 0.00038676889375684556, "loss": 0.8323, "step": 3167 }, { "epoch": 0.21465050689839166, "grad_norm": 2.5995397567749023, "learning_rate": 0.00038676341730558596, "loss": 0.7472, "step": 3168 }, { "epoch": 0.21471826274021563, "grad_norm": 3.668618679046631, "learning_rate": 0.0003867579408543264, "loss": 0.7061, "step": 3169 }, { "epoch": 0.21478601858203963, "grad_norm": 4.599798202514648, "learning_rate": 0.00038675246440306686, "loss": 0.9915, "step": 3170 }, { "epoch": 0.2148537744238636, "grad_norm": 3.325688362121582, "learning_rate": 0.00038674698795180726, "loss": 0.9439, "step": 3171 }, { "epoch": 0.21492153026568758, "grad_norm": 2.875779151916504, "learning_rate": 0.00038674151150054766, "loss": 0.6822, "step": 3172 }, { "epoch": 0.21498928610751158, "grad_norm": 3.5521769523620605, "learning_rate": 0.00038673603504928806, "loss": 0.8075, "step": 3173 }, { "epoch": 0.21505704194933556, "grad_norm": 4.692173004150391, "learning_rate": 0.0003867305585980285, "loss": 1.2805, "step": 3174 }, { "epoch": 0.21512479779115956, "grad_norm": 3.8736305236816406, "learning_rate": 0.0003867250821467689, "loss": 0.8393, "step": 3175 }, { "epoch": 0.21519255363298354, "grad_norm": 3.734736204147339, "learning_rate": 0.0003867196056955093, "loss": 1.0848, "step": 3176 }, { "epoch": 0.21526030947480754, "grad_norm": 2.8379523754119873, "learning_rate": 0.0003867141292442497, "loss": 0.8798, "step": 3177 }, { "epoch": 0.2153280653166315, "grad_norm": 2.8278801441192627, "learning_rate": 0.00038670865279299017, "loss": 0.8569, "step": 3178 }, { "epoch": 0.2153958211584555, "grad_norm": 3.4038522243499756, "learning_rate": 0.00038670317634173057, "loss": 0.8173, "step": 3179 }, { "epoch": 0.2154635770002795, "grad_norm": 3.670212507247925, "learning_rate": 0.000386697699890471, "loss": 0.9258, "step": 3180 }, { "epoch": 0.2155313328421035, "grad_norm": 2.3656914234161377, "learning_rate": 0.0003866922234392114, "loss": 0.6878, "step": 3181 }, { "epoch": 0.21559908868392746, "grad_norm": 3.1545722484588623, "learning_rate": 0.0003866867469879518, "loss": 0.964, "step": 3182 }, { "epoch": 0.21566684452575147, "grad_norm": 3.670808792114258, "learning_rate": 0.0003866812705366922, "loss": 1.0518, "step": 3183 }, { "epoch": 0.21573460036757544, "grad_norm": 2.640873908996582, "learning_rate": 0.0003866757940854326, "loss": 0.7049, "step": 3184 }, { "epoch": 0.21580235620939944, "grad_norm": 3.5294442176818848, "learning_rate": 0.00038667031763417307, "loss": 0.8125, "step": 3185 }, { "epoch": 0.21587011205122342, "grad_norm": 2.996356725692749, "learning_rate": 0.0003866648411829135, "loss": 1.0532, "step": 3186 }, { "epoch": 0.2159378678930474, "grad_norm": 3.573580026626587, "learning_rate": 0.0003866593647316539, "loss": 0.8542, "step": 3187 }, { "epoch": 0.2160056237348714, "grad_norm": 4.565465450286865, "learning_rate": 0.0003866538882803943, "loss": 1.0028, "step": 3188 }, { "epoch": 0.21607337957669537, "grad_norm": 3.44734525680542, "learning_rate": 0.0003866484118291347, "loss": 1.1246, "step": 3189 }, { "epoch": 0.21614113541851937, "grad_norm": 2.922597646713257, "learning_rate": 0.0003866429353778752, "loss": 0.6692, "step": 3190 }, { "epoch": 0.21620889126034334, "grad_norm": 6.10439395904541, "learning_rate": 0.0003866374589266156, "loss": 0.9091, "step": 3191 }, { "epoch": 0.21627664710216735, "grad_norm": 3.820887327194214, "learning_rate": 0.00038663198247535597, "loss": 0.8253, "step": 3192 }, { "epoch": 0.21634440294399132, "grad_norm": 3.5457265377044678, "learning_rate": 0.0003866265060240964, "loss": 0.9971, "step": 3193 }, { "epoch": 0.21641215878581532, "grad_norm": 5.0301361083984375, "learning_rate": 0.0003866210295728368, "loss": 0.8361, "step": 3194 }, { "epoch": 0.2164799146276393, "grad_norm": 4.070746421813965, "learning_rate": 0.0003866155531215772, "loss": 1.129, "step": 3195 }, { "epoch": 0.2165476704694633, "grad_norm": 3.0056049823760986, "learning_rate": 0.0003866100766703177, "loss": 0.8028, "step": 3196 }, { "epoch": 0.21661542631128727, "grad_norm": 3.5070459842681885, "learning_rate": 0.0003866046002190581, "loss": 1.0661, "step": 3197 }, { "epoch": 0.21668318215311128, "grad_norm": 4.141531944274902, "learning_rate": 0.0003865991237677985, "loss": 1.0264, "step": 3198 }, { "epoch": 0.21675093799493525, "grad_norm": 3.307365655899048, "learning_rate": 0.0003865936473165389, "loss": 0.8389, "step": 3199 }, { "epoch": 0.21681869383675922, "grad_norm": 3.986813545227051, "learning_rate": 0.0003865881708652793, "loss": 0.8368, "step": 3200 }, { "epoch": 0.21688644967858323, "grad_norm": 3.709231376647949, "learning_rate": 0.00038658269441401973, "loss": 1.0078, "step": 3201 }, { "epoch": 0.2169542055204072, "grad_norm": 3.3721349239349365, "learning_rate": 0.0003865772179627602, "loss": 0.9174, "step": 3202 }, { "epoch": 0.2170219613622312, "grad_norm": 3.6890652179718018, "learning_rate": 0.0003865717415115006, "loss": 0.9724, "step": 3203 }, { "epoch": 0.21708971720405518, "grad_norm": 3.4990837574005127, "learning_rate": 0.000386566265060241, "loss": 0.9037, "step": 3204 }, { "epoch": 0.21715747304587918, "grad_norm": 3.7443063259124756, "learning_rate": 0.0003865607886089814, "loss": 0.9933, "step": 3205 }, { "epoch": 0.21722522888770315, "grad_norm": 4.51320219039917, "learning_rate": 0.0003865553121577218, "loss": 1.2428, "step": 3206 }, { "epoch": 0.21729298472952716, "grad_norm": 3.54730224609375, "learning_rate": 0.00038654983570646223, "loss": 0.8428, "step": 3207 }, { "epoch": 0.21736074057135113, "grad_norm": 2.9358415603637695, "learning_rate": 0.00038654435925520263, "loss": 0.7421, "step": 3208 }, { "epoch": 0.21742849641317513, "grad_norm": 3.6926915645599365, "learning_rate": 0.0003865388828039431, "loss": 0.8903, "step": 3209 }, { "epoch": 0.2174962522549991, "grad_norm": 3.064396858215332, "learning_rate": 0.0003865334063526835, "loss": 0.7064, "step": 3210 }, { "epoch": 0.2175640080968231, "grad_norm": 3.6823248863220215, "learning_rate": 0.0003865279299014239, "loss": 0.8738, "step": 3211 }, { "epoch": 0.21763176393864708, "grad_norm": 3.0942206382751465, "learning_rate": 0.00038652245345016434, "loss": 0.865, "step": 3212 }, { "epoch": 0.21769951978047108, "grad_norm": 2.995757818222046, "learning_rate": 0.00038651697699890474, "loss": 0.8362, "step": 3213 }, { "epoch": 0.21776727562229506, "grad_norm": 3.0151493549346924, "learning_rate": 0.00038651150054764513, "loss": 0.7578, "step": 3214 }, { "epoch": 0.21783503146411903, "grad_norm": 3.0874853134155273, "learning_rate": 0.00038650602409638553, "loss": 0.683, "step": 3215 }, { "epoch": 0.21790278730594304, "grad_norm": 3.3230679035186768, "learning_rate": 0.000386500547645126, "loss": 1.1995, "step": 3216 }, { "epoch": 0.217970543147767, "grad_norm": 3.588575601577759, "learning_rate": 0.0003864950711938664, "loss": 0.6937, "step": 3217 }, { "epoch": 0.218038298989591, "grad_norm": 4.005873203277588, "learning_rate": 0.00038648959474260684, "loss": 0.8626, "step": 3218 }, { "epoch": 0.21810605483141499, "grad_norm": 3.6696572303771973, "learning_rate": 0.00038648411829134724, "loss": 1.0165, "step": 3219 }, { "epoch": 0.218173810673239, "grad_norm": 3.4730069637298584, "learning_rate": 0.00038647864184008764, "loss": 0.8819, "step": 3220 }, { "epoch": 0.21824156651506296, "grad_norm": 3.596906900405884, "learning_rate": 0.00038647316538882804, "loss": 0.9022, "step": 3221 }, { "epoch": 0.21830932235688696, "grad_norm": 5.192864894866943, "learning_rate": 0.00038646768893756844, "loss": 1.1926, "step": 3222 }, { "epoch": 0.21837707819871094, "grad_norm": 4.08858585357666, "learning_rate": 0.0003864622124863089, "loss": 1.1517, "step": 3223 }, { "epoch": 0.21844483404053494, "grad_norm": 3.3537964820861816, "learning_rate": 0.00038645673603504934, "loss": 0.8688, "step": 3224 }, { "epoch": 0.21851258988235892, "grad_norm": 3.6729698181152344, "learning_rate": 0.00038645125958378974, "loss": 0.9936, "step": 3225 }, { "epoch": 0.21858034572418292, "grad_norm": 4.167658805847168, "learning_rate": 0.00038644578313253014, "loss": 1.1169, "step": 3226 }, { "epoch": 0.2186481015660069, "grad_norm": 2.9464125633239746, "learning_rate": 0.00038644030668127054, "loss": 0.8648, "step": 3227 }, { "epoch": 0.2187158574078309, "grad_norm": 4.127668380737305, "learning_rate": 0.000386434830230011, "loss": 1.1282, "step": 3228 }, { "epoch": 0.21878361324965487, "grad_norm": 3.5825445652008057, "learning_rate": 0.0003864293537787514, "loss": 0.8803, "step": 3229 }, { "epoch": 0.21885136909147884, "grad_norm": 3.144740343093872, "learning_rate": 0.0003864238773274918, "loss": 0.8019, "step": 3230 }, { "epoch": 0.21891912493330284, "grad_norm": 3.5088999271392822, "learning_rate": 0.0003864184008762322, "loss": 0.9729, "step": 3231 }, { "epoch": 0.21898688077512682, "grad_norm": 4.277069568634033, "learning_rate": 0.00038641292442497265, "loss": 1.2088, "step": 3232 }, { "epoch": 0.21905463661695082, "grad_norm": 3.9226300716400146, "learning_rate": 0.00038640744797371304, "loss": 0.8578, "step": 3233 }, { "epoch": 0.2191223924587748, "grad_norm": 3.2542054653167725, "learning_rate": 0.0003864019715224535, "loss": 0.8491, "step": 3234 }, { "epoch": 0.2191901483005988, "grad_norm": 4.1487555503845215, "learning_rate": 0.0003863964950711939, "loss": 0.937, "step": 3235 }, { "epoch": 0.21925790414242277, "grad_norm": 3.1875462532043457, "learning_rate": 0.0003863910186199343, "loss": 0.8816, "step": 3236 }, { "epoch": 0.21932565998424677, "grad_norm": 3.487698554992676, "learning_rate": 0.0003863855421686747, "loss": 0.8083, "step": 3237 }, { "epoch": 0.21939341582607075, "grad_norm": 2.930428981781006, "learning_rate": 0.0003863800657174151, "loss": 0.8553, "step": 3238 }, { "epoch": 0.21946117166789475, "grad_norm": 3.352018356323242, "learning_rate": 0.00038637458926615555, "loss": 1.0124, "step": 3239 }, { "epoch": 0.21952892750971872, "grad_norm": 3.139572858810425, "learning_rate": 0.000386369112814896, "loss": 0.9454, "step": 3240 }, { "epoch": 0.21959668335154273, "grad_norm": 3.50776743888855, "learning_rate": 0.0003863636363636364, "loss": 1.0861, "step": 3241 }, { "epoch": 0.2196644391933667, "grad_norm": 4.472263813018799, "learning_rate": 0.0003863581599123768, "loss": 1.1886, "step": 3242 }, { "epoch": 0.2197321950351907, "grad_norm": 3.2098097801208496, "learning_rate": 0.0003863526834611172, "loss": 0.759, "step": 3243 }, { "epoch": 0.21979995087701468, "grad_norm": 3.656064748764038, "learning_rate": 0.0003863472070098576, "loss": 0.7931, "step": 3244 }, { "epoch": 0.21986770671883865, "grad_norm": 3.4928789138793945, "learning_rate": 0.00038634173055859805, "loss": 1.1735, "step": 3245 }, { "epoch": 0.21993546256066265, "grad_norm": 3.3951058387756348, "learning_rate": 0.00038633625410733845, "loss": 0.8016, "step": 3246 }, { "epoch": 0.22000321840248663, "grad_norm": 5.2316508293151855, "learning_rate": 0.0003863307776560789, "loss": 1.4139, "step": 3247 }, { "epoch": 0.22007097424431063, "grad_norm": 4.763803958892822, "learning_rate": 0.0003863253012048193, "loss": 1.0731, "step": 3248 }, { "epoch": 0.2201387300861346, "grad_norm": 3.7690165042877197, "learning_rate": 0.0003863198247535597, "loss": 0.8311, "step": 3249 }, { "epoch": 0.2202064859279586, "grad_norm": 3.196415424346924, "learning_rate": 0.00038631434830230016, "loss": 0.8192, "step": 3250 }, { "epoch": 0.22027424176978258, "grad_norm": 3.4688775539398193, "learning_rate": 0.00038630887185104056, "loss": 0.8734, "step": 3251 }, { "epoch": 0.22034199761160658, "grad_norm": 2.9779715538024902, "learning_rate": 0.00038630339539978096, "loss": 0.9098, "step": 3252 }, { "epoch": 0.22040975345343056, "grad_norm": 4.155899524688721, "learning_rate": 0.00038629791894852135, "loss": 1.1303, "step": 3253 }, { "epoch": 0.22047750929525456, "grad_norm": 2.8792154788970947, "learning_rate": 0.00038629244249726175, "loss": 0.8388, "step": 3254 }, { "epoch": 0.22054526513707853, "grad_norm": 3.0970845222473145, "learning_rate": 0.0003862869660460022, "loss": 0.7923, "step": 3255 }, { "epoch": 0.22061302097890254, "grad_norm": 2.97513747215271, "learning_rate": 0.00038628148959474266, "loss": 0.8896, "step": 3256 }, { "epoch": 0.2206807768207265, "grad_norm": 3.0410900115966797, "learning_rate": 0.00038627601314348306, "loss": 0.9037, "step": 3257 }, { "epoch": 0.2207485326625505, "grad_norm": 4.922347545623779, "learning_rate": 0.00038627053669222346, "loss": 1.2886, "step": 3258 }, { "epoch": 0.22081628850437449, "grad_norm": 3.047973155975342, "learning_rate": 0.00038626506024096386, "loss": 0.8655, "step": 3259 }, { "epoch": 0.22088404434619846, "grad_norm": 4.60847806930542, "learning_rate": 0.00038625958378970426, "loss": 0.928, "step": 3260 }, { "epoch": 0.22095180018802246, "grad_norm": 3.3728489875793457, "learning_rate": 0.0003862541073384447, "loss": 0.9607, "step": 3261 }, { "epoch": 0.22101955602984644, "grad_norm": 4.806407928466797, "learning_rate": 0.0003862486308871851, "loss": 1.2125, "step": 3262 }, { "epoch": 0.22108731187167044, "grad_norm": 3.5676422119140625, "learning_rate": 0.00038624315443592556, "loss": 0.8729, "step": 3263 }, { "epoch": 0.2211550677134944, "grad_norm": 3.9356565475463867, "learning_rate": 0.00038623767798466596, "loss": 0.9958, "step": 3264 }, { "epoch": 0.22122282355531842, "grad_norm": 2.7237555980682373, "learning_rate": 0.00038623220153340636, "loss": 0.7361, "step": 3265 }, { "epoch": 0.2212905793971424, "grad_norm": 3.026766300201416, "learning_rate": 0.0003862267250821468, "loss": 0.9174, "step": 3266 }, { "epoch": 0.2213583352389664, "grad_norm": 3.4095518589019775, "learning_rate": 0.0003862212486308872, "loss": 1.0723, "step": 3267 }, { "epoch": 0.22142609108079037, "grad_norm": 5.167072296142578, "learning_rate": 0.0003862157721796276, "loss": 0.8879, "step": 3268 }, { "epoch": 0.22149384692261437, "grad_norm": 2.764686107635498, "learning_rate": 0.000386210295728368, "loss": 0.7583, "step": 3269 }, { "epoch": 0.22156160276443834, "grad_norm": 4.608572483062744, "learning_rate": 0.0003862048192771084, "loss": 1.243, "step": 3270 }, { "epoch": 0.22162935860626234, "grad_norm": 3.428542375564575, "learning_rate": 0.00038619934282584887, "loss": 0.8606, "step": 3271 }, { "epoch": 0.22169711444808632, "grad_norm": 3.8383169174194336, "learning_rate": 0.0003861938663745893, "loss": 1.0265, "step": 3272 }, { "epoch": 0.22176487028991032, "grad_norm": 3.5066847801208496, "learning_rate": 0.0003861883899233297, "loss": 0.9652, "step": 3273 }, { "epoch": 0.2218326261317343, "grad_norm": 3.234522819519043, "learning_rate": 0.0003861829134720701, "loss": 0.986, "step": 3274 }, { "epoch": 0.22190038197355827, "grad_norm": 3.424370765686035, "learning_rate": 0.0003861774370208105, "loss": 0.9005, "step": 3275 }, { "epoch": 0.22196813781538227, "grad_norm": 3.1895580291748047, "learning_rate": 0.0003861719605695509, "loss": 0.8121, "step": 3276 }, { "epoch": 0.22203589365720625, "grad_norm": 3.1825530529022217, "learning_rate": 0.00038616648411829137, "loss": 0.9125, "step": 3277 }, { "epoch": 0.22210364949903025, "grad_norm": 3.2350778579711914, "learning_rate": 0.00038616100766703177, "loss": 0.9312, "step": 3278 }, { "epoch": 0.22217140534085422, "grad_norm": 3.469127655029297, "learning_rate": 0.0003861555312157722, "loss": 0.9424, "step": 3279 }, { "epoch": 0.22223916118267822, "grad_norm": 2.920137882232666, "learning_rate": 0.0003861500547645126, "loss": 0.7592, "step": 3280 }, { "epoch": 0.2223069170245022, "grad_norm": 3.4994685649871826, "learning_rate": 0.000386144578313253, "loss": 1.076, "step": 3281 }, { "epoch": 0.2223746728663262, "grad_norm": 3.880659818649292, "learning_rate": 0.0003861391018619934, "loss": 0.9621, "step": 3282 }, { "epoch": 0.22244242870815017, "grad_norm": 4.283945083618164, "learning_rate": 0.0003861336254107339, "loss": 1.321, "step": 3283 }, { "epoch": 0.22251018454997418, "grad_norm": 3.0589187145233154, "learning_rate": 0.00038612814895947427, "loss": 0.9018, "step": 3284 }, { "epoch": 0.22257794039179815, "grad_norm": 3.9865033626556396, "learning_rate": 0.00038612267250821467, "loss": 1.0113, "step": 3285 }, { "epoch": 0.22264569623362215, "grad_norm": 3.5975570678710938, "learning_rate": 0.0003861171960569551, "loss": 1.2628, "step": 3286 }, { "epoch": 0.22271345207544613, "grad_norm": 3.0980255603790283, "learning_rate": 0.0003861117196056955, "loss": 0.9407, "step": 3287 }, { "epoch": 0.22278120791727013, "grad_norm": 3.8600916862487793, "learning_rate": 0.000386106243154436, "loss": 0.9274, "step": 3288 }, { "epoch": 0.2228489637590941, "grad_norm": 3.670548439025879, "learning_rate": 0.0003861007667031764, "loss": 0.8947, "step": 3289 }, { "epoch": 0.22291671960091808, "grad_norm": 3.208301067352295, "learning_rate": 0.0003860952902519168, "loss": 0.7756, "step": 3290 }, { "epoch": 0.22298447544274208, "grad_norm": 2.8278963565826416, "learning_rate": 0.0003860898138006572, "loss": 0.6993, "step": 3291 }, { "epoch": 0.22305223128456605, "grad_norm": 9.34854793548584, "learning_rate": 0.0003860843373493976, "loss": 0.9927, "step": 3292 }, { "epoch": 0.22311998712639006, "grad_norm": 3.388206720352173, "learning_rate": 0.00038607886089813803, "loss": 1.0319, "step": 3293 }, { "epoch": 0.22318774296821403, "grad_norm": 4.022616863250732, "learning_rate": 0.0003860733844468785, "loss": 0.9612, "step": 3294 }, { "epoch": 0.22325549881003803, "grad_norm": 3.1215081214904785, "learning_rate": 0.0003860679079956189, "loss": 0.824, "step": 3295 }, { "epoch": 0.223323254651862, "grad_norm": 2.60457181930542, "learning_rate": 0.0003860624315443593, "loss": 0.6243, "step": 3296 }, { "epoch": 0.223391010493686, "grad_norm": 2.602731704711914, "learning_rate": 0.0003860569550930997, "loss": 0.8362, "step": 3297 }, { "epoch": 0.22345876633550998, "grad_norm": 4.482247829437256, "learning_rate": 0.0003860514786418401, "loss": 0.9832, "step": 3298 }, { "epoch": 0.22352652217733399, "grad_norm": 3.846708059310913, "learning_rate": 0.00038604600219058053, "loss": 1.0121, "step": 3299 }, { "epoch": 0.22359427801915796, "grad_norm": 4.125100135803223, "learning_rate": 0.00038604052573932093, "loss": 1.015, "step": 3300 }, { "epoch": 0.22366203386098196, "grad_norm": 3.5581533908843994, "learning_rate": 0.00038603504928806133, "loss": 1.0234, "step": 3301 }, { "epoch": 0.22372978970280594, "grad_norm": 3.0680363178253174, "learning_rate": 0.0003860295728368018, "loss": 0.8798, "step": 3302 }, { "epoch": 0.22379754554462994, "grad_norm": 4.137269496917725, "learning_rate": 0.0003860240963855422, "loss": 1.146, "step": 3303 }, { "epoch": 0.2238653013864539, "grad_norm": 3.1315622329711914, "learning_rate": 0.00038601861993428264, "loss": 0.8505, "step": 3304 }, { "epoch": 0.2239330572282779, "grad_norm": 2.8968334197998047, "learning_rate": 0.00038601314348302304, "loss": 0.7175, "step": 3305 }, { "epoch": 0.2240008130701019, "grad_norm": 2.7565760612487793, "learning_rate": 0.00038600766703176343, "loss": 0.7164, "step": 3306 }, { "epoch": 0.22406856891192586, "grad_norm": 4.303454875946045, "learning_rate": 0.00038600219058050383, "loss": 1.217, "step": 3307 }, { "epoch": 0.22413632475374987, "grad_norm": 3.8144803047180176, "learning_rate": 0.00038599671412924423, "loss": 0.7857, "step": 3308 }, { "epoch": 0.22420408059557384, "grad_norm": 5.346485137939453, "learning_rate": 0.0003859912376779847, "loss": 1.0353, "step": 3309 }, { "epoch": 0.22427183643739784, "grad_norm": 3.725198984146118, "learning_rate": 0.00038598576122672514, "loss": 0.8318, "step": 3310 }, { "epoch": 0.22433959227922182, "grad_norm": 4.739047527313232, "learning_rate": 0.00038598028477546554, "loss": 1.013, "step": 3311 }, { "epoch": 0.22440734812104582, "grad_norm": 3.1888105869293213, "learning_rate": 0.00038597480832420594, "loss": 0.7755, "step": 3312 }, { "epoch": 0.2244751039628698, "grad_norm": 3.478789806365967, "learning_rate": 0.00038596933187294634, "loss": 0.7876, "step": 3313 }, { "epoch": 0.2245428598046938, "grad_norm": 3.094334363937378, "learning_rate": 0.00038596385542168674, "loss": 0.8198, "step": 3314 }, { "epoch": 0.22461061564651777, "grad_norm": 3.139134407043457, "learning_rate": 0.0003859583789704272, "loss": 0.6934, "step": 3315 }, { "epoch": 0.22467837148834177, "grad_norm": 4.222721099853516, "learning_rate": 0.0003859529025191676, "loss": 0.995, "step": 3316 }, { "epoch": 0.22474612733016575, "grad_norm": 6.486714839935303, "learning_rate": 0.00038594742606790804, "loss": 1.092, "step": 3317 }, { "epoch": 0.22481388317198975, "grad_norm": 5.751864910125732, "learning_rate": 0.00038594194961664844, "loss": 1.3278, "step": 3318 }, { "epoch": 0.22488163901381372, "grad_norm": 5.510232448577881, "learning_rate": 0.00038593647316538884, "loss": 0.9396, "step": 3319 }, { "epoch": 0.2249493948556377, "grad_norm": 3.875171422958374, "learning_rate": 0.00038593099671412924, "loss": 0.9943, "step": 3320 }, { "epoch": 0.2250171506974617, "grad_norm": 4.330343246459961, "learning_rate": 0.0003859255202628697, "loss": 0.9082, "step": 3321 }, { "epoch": 0.22508490653928567, "grad_norm": 3.1461009979248047, "learning_rate": 0.0003859200438116101, "loss": 0.8741, "step": 3322 }, { "epoch": 0.22515266238110968, "grad_norm": 3.735506772994995, "learning_rate": 0.0003859145673603505, "loss": 1.1276, "step": 3323 }, { "epoch": 0.22522041822293365, "grad_norm": 3.0065128803253174, "learning_rate": 0.0003859090909090909, "loss": 0.8093, "step": 3324 }, { "epoch": 0.22528817406475765, "grad_norm": 4.059788703918457, "learning_rate": 0.00038590361445783134, "loss": 1.1634, "step": 3325 }, { "epoch": 0.22535592990658163, "grad_norm": 3.328057289123535, "learning_rate": 0.0003858981380065718, "loss": 1.0669, "step": 3326 }, { "epoch": 0.22542368574840563, "grad_norm": 3.5306568145751953, "learning_rate": 0.0003858926615553122, "loss": 0.9011, "step": 3327 }, { "epoch": 0.2254914415902296, "grad_norm": 3.616150140762329, "learning_rate": 0.0003858871851040526, "loss": 0.7039, "step": 3328 }, { "epoch": 0.2255591974320536, "grad_norm": 3.138782262802124, "learning_rate": 0.000385881708652793, "loss": 0.8822, "step": 3329 }, { "epoch": 0.22562695327387758, "grad_norm": 2.966705560684204, "learning_rate": 0.0003858762322015334, "loss": 0.6991, "step": 3330 }, { "epoch": 0.22569470911570158, "grad_norm": 2.5443427562713623, "learning_rate": 0.00038587075575027385, "loss": 0.765, "step": 3331 }, { "epoch": 0.22576246495752555, "grad_norm": 3.599168300628662, "learning_rate": 0.00038586527929901425, "loss": 1.0406, "step": 3332 }, { "epoch": 0.22583022079934956, "grad_norm": 3.751220226287842, "learning_rate": 0.0003858598028477547, "loss": 1.2506, "step": 3333 }, { "epoch": 0.22589797664117353, "grad_norm": 4.456590175628662, "learning_rate": 0.0003858543263964951, "loss": 1.2845, "step": 3334 }, { "epoch": 0.2259657324829975, "grad_norm": 4.015080451965332, "learning_rate": 0.0003858488499452355, "loss": 1.0018, "step": 3335 }, { "epoch": 0.2260334883248215, "grad_norm": 3.462250232696533, "learning_rate": 0.0003858433734939759, "loss": 0.9436, "step": 3336 }, { "epoch": 0.22610124416664548, "grad_norm": 4.5326385498046875, "learning_rate": 0.00038583789704271635, "loss": 0.9292, "step": 3337 }, { "epoch": 0.22616900000846948, "grad_norm": 2.59982967376709, "learning_rate": 0.00038583242059145675, "loss": 0.6989, "step": 3338 }, { "epoch": 0.22623675585029346, "grad_norm": 4.161834716796875, "learning_rate": 0.00038582694414019715, "loss": 0.968, "step": 3339 }, { "epoch": 0.22630451169211746, "grad_norm": 3.513258218765259, "learning_rate": 0.00038582146768893755, "loss": 1.1126, "step": 3340 }, { "epoch": 0.22637226753394143, "grad_norm": 3.807417869567871, "learning_rate": 0.000385815991237678, "loss": 0.9582, "step": 3341 }, { "epoch": 0.22644002337576544, "grad_norm": 2.849374532699585, "learning_rate": 0.00038581051478641846, "loss": 0.6842, "step": 3342 }, { "epoch": 0.2265077792175894, "grad_norm": 2.980308771133423, "learning_rate": 0.00038580503833515886, "loss": 0.8717, "step": 3343 }, { "epoch": 0.2265755350594134, "grad_norm": 5.147204399108887, "learning_rate": 0.00038579956188389926, "loss": 1.1525, "step": 3344 }, { "epoch": 0.2266432909012374, "grad_norm": 2.9929161071777344, "learning_rate": 0.00038579408543263965, "loss": 0.7712, "step": 3345 }, { "epoch": 0.2267110467430614, "grad_norm": 17.001392364501953, "learning_rate": 0.00038578860898138005, "loss": 0.9209, "step": 3346 }, { "epoch": 0.22677880258488536, "grad_norm": 90.29581451416016, "learning_rate": 0.0003857831325301205, "loss": 4.8926, "step": 3347 }, { "epoch": 0.22684655842670937, "grad_norm": 2.4456446170806885, "learning_rate": 0.00038577765607886096, "loss": 0.6685, "step": 3348 }, { "epoch": 0.22691431426853334, "grad_norm": 3.6926991939544678, "learning_rate": 0.00038577217962760136, "loss": 1.1418, "step": 3349 }, { "epoch": 0.22698207011035731, "grad_norm": 3.314230442047119, "learning_rate": 0.00038576670317634176, "loss": 0.9646, "step": 3350 }, { "epoch": 0.22704982595218132, "grad_norm": 4.015190601348877, "learning_rate": 0.00038576122672508216, "loss": 1.0425, "step": 3351 }, { "epoch": 0.2271175817940053, "grad_norm": 2.56612491607666, "learning_rate": 0.00038575575027382256, "loss": 0.7559, "step": 3352 }, { "epoch": 0.2271853376358293, "grad_norm": 3.4861040115356445, "learning_rate": 0.000385750273822563, "loss": 0.7995, "step": 3353 }, { "epoch": 0.22725309347765327, "grad_norm": 2.8572921752929688, "learning_rate": 0.0003857447973713034, "loss": 0.9158, "step": 3354 }, { "epoch": 0.22732084931947727, "grad_norm": 2.811467409133911, "learning_rate": 0.0003857393209200438, "loss": 0.772, "step": 3355 }, { "epoch": 0.22738860516130124, "grad_norm": 2.514394521713257, "learning_rate": 0.00038573384446878426, "loss": 0.7494, "step": 3356 }, { "epoch": 0.22745636100312525, "grad_norm": 2.8694980144500732, "learning_rate": 0.00038572836801752466, "loss": 0.8199, "step": 3357 }, { "epoch": 0.22752411684494922, "grad_norm": 2.919438362121582, "learning_rate": 0.00038572289156626506, "loss": 0.7662, "step": 3358 }, { "epoch": 0.22759187268677322, "grad_norm": 2.8485865592956543, "learning_rate": 0.0003857174151150055, "loss": 0.8239, "step": 3359 }, { "epoch": 0.2276596285285972, "grad_norm": 3.329538345336914, "learning_rate": 0.0003857119386637459, "loss": 0.9703, "step": 3360 }, { "epoch": 0.2277273843704212, "grad_norm": 6.457708835601807, "learning_rate": 0.0003857064622124863, "loss": 0.7194, "step": 3361 }, { "epoch": 0.22779514021224517, "grad_norm": 3.5136470794677734, "learning_rate": 0.0003857009857612267, "loss": 1.0188, "step": 3362 }, { "epoch": 0.22786289605406918, "grad_norm": 3.066953182220459, "learning_rate": 0.00038569550930996717, "loss": 0.9819, "step": 3363 }, { "epoch": 0.22793065189589315, "grad_norm": 3.5111289024353027, "learning_rate": 0.0003856900328587076, "loss": 0.9593, "step": 3364 }, { "epoch": 0.22799840773771712, "grad_norm": 3.7057344913482666, "learning_rate": 0.000385684556407448, "loss": 0.9064, "step": 3365 }, { "epoch": 0.22806616357954113, "grad_norm": 4.136794567108154, "learning_rate": 0.0003856790799561884, "loss": 0.8064, "step": 3366 }, { "epoch": 0.2281339194213651, "grad_norm": 5.08695650100708, "learning_rate": 0.0003856736035049288, "loss": 0.8585, "step": 3367 }, { "epoch": 0.2282016752631891, "grad_norm": 3.313508987426758, "learning_rate": 0.0003856681270536692, "loss": 0.9318, "step": 3368 }, { "epoch": 0.22826943110501308, "grad_norm": 3.9543840885162354, "learning_rate": 0.00038566265060240967, "loss": 0.9329, "step": 3369 }, { "epoch": 0.22833718694683708, "grad_norm": 4.701480388641357, "learning_rate": 0.00038565717415115007, "loss": 0.8461, "step": 3370 }, { "epoch": 0.22840494278866105, "grad_norm": 3.0694451332092285, "learning_rate": 0.00038565169769989047, "loss": 0.8549, "step": 3371 }, { "epoch": 0.22847269863048505, "grad_norm": 4.941560745239258, "learning_rate": 0.0003856462212486309, "loss": 0.9822, "step": 3372 }, { "epoch": 0.22854045447230903, "grad_norm": 3.8070406913757324, "learning_rate": 0.0003856407447973713, "loss": 1.073, "step": 3373 }, { "epoch": 0.22860821031413303, "grad_norm": 2.937474012374878, "learning_rate": 0.0003856352683461117, "loss": 0.8574, "step": 3374 }, { "epoch": 0.228675966155957, "grad_norm": 3.4327750205993652, "learning_rate": 0.00038562979189485217, "loss": 0.9061, "step": 3375 }, { "epoch": 0.228743721997781, "grad_norm": 3.4981727600097656, "learning_rate": 0.00038562431544359257, "loss": 0.9346, "step": 3376 }, { "epoch": 0.22881147783960498, "grad_norm": 3.516223430633545, "learning_rate": 0.00038561883899233297, "loss": 0.8493, "step": 3377 }, { "epoch": 0.22887923368142898, "grad_norm": 3.2403836250305176, "learning_rate": 0.00038561336254107337, "loss": 1.0263, "step": 3378 }, { "epoch": 0.22894698952325296, "grad_norm": 3.306828737258911, "learning_rate": 0.0003856078860898138, "loss": 0.9544, "step": 3379 }, { "epoch": 0.22901474536507693, "grad_norm": 2.5655391216278076, "learning_rate": 0.0003856024096385543, "loss": 0.7852, "step": 3380 }, { "epoch": 0.22908250120690093, "grad_norm": 2.8811683654785156, "learning_rate": 0.0003855969331872947, "loss": 0.8604, "step": 3381 }, { "epoch": 0.2291502570487249, "grad_norm": 4.709669589996338, "learning_rate": 0.0003855914567360351, "loss": 0.9792, "step": 3382 }, { "epoch": 0.2292180128905489, "grad_norm": 3.180150032043457, "learning_rate": 0.0003855859802847755, "loss": 0.9009, "step": 3383 }, { "epoch": 0.22928576873237289, "grad_norm": 4.754932880401611, "learning_rate": 0.0003855805038335159, "loss": 1.0886, "step": 3384 }, { "epoch": 0.2293535245741969, "grad_norm": 3.078165054321289, "learning_rate": 0.00038557502738225633, "loss": 0.7447, "step": 3385 }, { "epoch": 0.22942128041602086, "grad_norm": 4.425560474395752, "learning_rate": 0.0003855695509309967, "loss": 0.9518, "step": 3386 }, { "epoch": 0.22948903625784486, "grad_norm": 3.0872108936309814, "learning_rate": 0.0003855640744797372, "loss": 0.8921, "step": 3387 }, { "epoch": 0.22955679209966884, "grad_norm": 3.4044382572174072, "learning_rate": 0.0003855585980284776, "loss": 0.9192, "step": 3388 }, { "epoch": 0.22962454794149284, "grad_norm": 3.983921766281128, "learning_rate": 0.000385553121577218, "loss": 0.9262, "step": 3389 }, { "epoch": 0.22969230378331681, "grad_norm": 3.5800323486328125, "learning_rate": 0.0003855476451259584, "loss": 1.1124, "step": 3390 }, { "epoch": 0.22976005962514082, "grad_norm": 3.3075690269470215, "learning_rate": 0.00038554216867469883, "loss": 0.6932, "step": 3391 }, { "epoch": 0.2298278154669648, "grad_norm": 3.4148035049438477, "learning_rate": 0.00038553669222343923, "loss": 1.0199, "step": 3392 }, { "epoch": 0.2298955713087888, "grad_norm": 3.788975238800049, "learning_rate": 0.00038553121577217963, "loss": 1.0431, "step": 3393 }, { "epoch": 0.22996332715061277, "grad_norm": 4.082870006561279, "learning_rate": 0.00038552573932092003, "loss": 0.9449, "step": 3394 }, { "epoch": 0.23003108299243674, "grad_norm": 4.332869052886963, "learning_rate": 0.0003855202628696605, "loss": 1.0971, "step": 3395 }, { "epoch": 0.23009883883426074, "grad_norm": 3.3177127838134766, "learning_rate": 0.0003855147864184009, "loss": 0.8408, "step": 3396 }, { "epoch": 0.23016659467608472, "grad_norm": 3.0329833030700684, "learning_rate": 0.00038550930996714133, "loss": 0.7704, "step": 3397 }, { "epoch": 0.23023435051790872, "grad_norm": 3.886368751525879, "learning_rate": 0.00038550383351588173, "loss": 1.2036, "step": 3398 }, { "epoch": 0.2303021063597327, "grad_norm": 2.6347174644470215, "learning_rate": 0.00038549835706462213, "loss": 0.7221, "step": 3399 }, { "epoch": 0.2303698622015567, "grad_norm": 3.0649521350860596, "learning_rate": 0.00038549288061336253, "loss": 0.8738, "step": 3400 }, { "epoch": 0.23043761804338067, "grad_norm": 3.466278314590454, "learning_rate": 0.000385487404162103, "loss": 0.746, "step": 3401 }, { "epoch": 0.23050537388520467, "grad_norm": 3.7289535999298096, "learning_rate": 0.0003854819277108434, "loss": 0.8961, "step": 3402 }, { "epoch": 0.23057312972702865, "grad_norm": 3.3163657188415527, "learning_rate": 0.00038547645125958384, "loss": 0.8762, "step": 3403 }, { "epoch": 0.23064088556885265, "grad_norm": 3.7055232524871826, "learning_rate": 0.00038547097480832424, "loss": 0.915, "step": 3404 }, { "epoch": 0.23070864141067662, "grad_norm": 4.6322407722473145, "learning_rate": 0.00038546549835706464, "loss": 0.9481, "step": 3405 }, { "epoch": 0.23077639725250063, "grad_norm": 3.7476983070373535, "learning_rate": 0.00038546002190580504, "loss": 1.107, "step": 3406 }, { "epoch": 0.2308441530943246, "grad_norm": 4.79381799697876, "learning_rate": 0.0003854545454545455, "loss": 0.8741, "step": 3407 }, { "epoch": 0.2309119089361486, "grad_norm": 2.9633309841156006, "learning_rate": 0.0003854490690032859, "loss": 0.8226, "step": 3408 }, { "epoch": 0.23097966477797258, "grad_norm": 3.6430296897888184, "learning_rate": 0.0003854435925520263, "loss": 0.7634, "step": 3409 }, { "epoch": 0.23104742061979655, "grad_norm": 3.2793545722961426, "learning_rate": 0.0003854381161007667, "loss": 0.9798, "step": 3410 }, { "epoch": 0.23111517646162055, "grad_norm": 3.385540246963501, "learning_rate": 0.00038543263964950714, "loss": 1.0561, "step": 3411 }, { "epoch": 0.23118293230344453, "grad_norm": 2.9721062183380127, "learning_rate": 0.00038542716319824754, "loss": 0.7212, "step": 3412 }, { "epoch": 0.23125068814526853, "grad_norm": 3.544701337814331, "learning_rate": 0.000385421686746988, "loss": 0.9381, "step": 3413 }, { "epoch": 0.2313184439870925, "grad_norm": 2.6547632217407227, "learning_rate": 0.0003854162102957284, "loss": 0.8176, "step": 3414 }, { "epoch": 0.2313861998289165, "grad_norm": 3.5925381183624268, "learning_rate": 0.0003854107338444688, "loss": 1.0717, "step": 3415 }, { "epoch": 0.23145395567074048, "grad_norm": 2.7051608562469482, "learning_rate": 0.0003854052573932092, "loss": 0.8203, "step": 3416 }, { "epoch": 0.23152171151256448, "grad_norm": 4.542996883392334, "learning_rate": 0.00038539978094194964, "loss": 0.9346, "step": 3417 }, { "epoch": 0.23158946735438846, "grad_norm": 3.785576581954956, "learning_rate": 0.0003853943044906901, "loss": 1.0932, "step": 3418 }, { "epoch": 0.23165722319621246, "grad_norm": 3.438602924346924, "learning_rate": 0.0003853888280394305, "loss": 0.835, "step": 3419 }, { "epoch": 0.23172497903803643, "grad_norm": 2.952747106552124, "learning_rate": 0.0003853833515881709, "loss": 0.8179, "step": 3420 }, { "epoch": 0.23179273487986043, "grad_norm": 3.240570545196533, "learning_rate": 0.0003853778751369113, "loss": 0.8928, "step": 3421 }, { "epoch": 0.2318604907216844, "grad_norm": 2.6418635845184326, "learning_rate": 0.0003853723986856517, "loss": 0.6984, "step": 3422 }, { "epoch": 0.2319282465635084, "grad_norm": 4.2474541664123535, "learning_rate": 0.00038536692223439215, "loss": 1.0001, "step": 3423 }, { "epoch": 0.23199600240533239, "grad_norm": 3.3554728031158447, "learning_rate": 0.00038536144578313255, "loss": 0.8196, "step": 3424 }, { "epoch": 0.23206375824715636, "grad_norm": 3.1898434162139893, "learning_rate": 0.00038535596933187295, "loss": 0.9446, "step": 3425 }, { "epoch": 0.23213151408898036, "grad_norm": 4.611783027648926, "learning_rate": 0.0003853504928806134, "loss": 0.884, "step": 3426 }, { "epoch": 0.23219926993080434, "grad_norm": 3.369152784347534, "learning_rate": 0.0003853450164293538, "loss": 0.8741, "step": 3427 }, { "epoch": 0.23226702577262834, "grad_norm": 3.848773241043091, "learning_rate": 0.0003853395399780942, "loss": 0.7307, "step": 3428 }, { "epoch": 0.2323347816144523, "grad_norm": 3.504258394241333, "learning_rate": 0.00038533406352683465, "loss": 0.9336, "step": 3429 }, { "epoch": 0.23240253745627631, "grad_norm": 3.5585741996765137, "learning_rate": 0.00038532858707557505, "loss": 0.7945, "step": 3430 }, { "epoch": 0.2324702932981003, "grad_norm": 3.4187331199645996, "learning_rate": 0.00038532311062431545, "loss": 0.9932, "step": 3431 }, { "epoch": 0.2325380491399243, "grad_norm": 3.8284411430358887, "learning_rate": 0.00038531763417305585, "loss": 1.0536, "step": 3432 }, { "epoch": 0.23260580498174827, "grad_norm": 2.968740224838257, "learning_rate": 0.00038531215772179625, "loss": 0.8009, "step": 3433 }, { "epoch": 0.23267356082357227, "grad_norm": 3.044172763824463, "learning_rate": 0.0003853066812705367, "loss": 0.6621, "step": 3434 }, { "epoch": 0.23274131666539624, "grad_norm": 3.588742971420288, "learning_rate": 0.00038530120481927716, "loss": 1.0881, "step": 3435 }, { "epoch": 0.23280907250722024, "grad_norm": 6.516396522521973, "learning_rate": 0.00038529572836801755, "loss": 0.7891, "step": 3436 }, { "epoch": 0.23287682834904422, "grad_norm": 2.7659990787506104, "learning_rate": 0.00038529025191675795, "loss": 0.8283, "step": 3437 }, { "epoch": 0.23294458419086822, "grad_norm": 3.939044237136841, "learning_rate": 0.00038528477546549835, "loss": 0.9839, "step": 3438 }, { "epoch": 0.2330123400326922, "grad_norm": 3.7872183322906494, "learning_rate": 0.0003852792990142388, "loss": 0.8786, "step": 3439 }, { "epoch": 0.23308009587451617, "grad_norm": 3.5533335208892822, "learning_rate": 0.0003852738225629792, "loss": 1.1712, "step": 3440 }, { "epoch": 0.23314785171634017, "grad_norm": 3.5558178424835205, "learning_rate": 0.0003852683461117196, "loss": 0.9026, "step": 3441 }, { "epoch": 0.23321560755816415, "grad_norm": 5.821711540222168, "learning_rate": 0.00038526286966046006, "loss": 1.2298, "step": 3442 }, { "epoch": 0.23328336339998815, "grad_norm": 3.5713350772857666, "learning_rate": 0.00038525739320920046, "loss": 0.9837, "step": 3443 }, { "epoch": 0.23335111924181212, "grad_norm": 3.7953977584838867, "learning_rate": 0.00038525191675794086, "loss": 0.9427, "step": 3444 }, { "epoch": 0.23341887508363612, "grad_norm": 3.675002098083496, "learning_rate": 0.0003852464403066813, "loss": 0.7763, "step": 3445 }, { "epoch": 0.2334866309254601, "grad_norm": 2.8685288429260254, "learning_rate": 0.0003852409638554217, "loss": 0.8524, "step": 3446 }, { "epoch": 0.2335543867672841, "grad_norm": 3.051927328109741, "learning_rate": 0.0003852354874041621, "loss": 0.7917, "step": 3447 }, { "epoch": 0.23362214260910807, "grad_norm": 3.42769718170166, "learning_rate": 0.0003852300109529025, "loss": 0.7829, "step": 3448 }, { "epoch": 0.23368989845093208, "grad_norm": 3.420353889465332, "learning_rate": 0.00038522453450164296, "loss": 0.9384, "step": 3449 }, { "epoch": 0.23375765429275605, "grad_norm": 4.482166290283203, "learning_rate": 0.00038521905805038336, "loss": 0.8393, "step": 3450 }, { "epoch": 0.23382541013458005, "grad_norm": 3.7440428733825684, "learning_rate": 0.0003852135815991238, "loss": 1.0556, "step": 3451 }, { "epoch": 0.23389316597640403, "grad_norm": 4.012784004211426, "learning_rate": 0.0003852081051478642, "loss": 0.9514, "step": 3452 }, { "epoch": 0.233960921818228, "grad_norm": 3.5449283123016357, "learning_rate": 0.0003852026286966046, "loss": 0.7942, "step": 3453 }, { "epoch": 0.234028677660052, "grad_norm": 3.6617982387542725, "learning_rate": 0.000385197152245345, "loss": 0.9537, "step": 3454 }, { "epoch": 0.23409643350187598, "grad_norm": 3.6189146041870117, "learning_rate": 0.00038519167579408547, "loss": 1.041, "step": 3455 }, { "epoch": 0.23416418934369998, "grad_norm": 3.9514875411987305, "learning_rate": 0.00038518619934282586, "loss": 0.9627, "step": 3456 }, { "epoch": 0.23423194518552395, "grad_norm": 3.212777614593506, "learning_rate": 0.0003851807228915663, "loss": 0.9405, "step": 3457 }, { "epoch": 0.23429970102734796, "grad_norm": 3.4932689666748047, "learning_rate": 0.0003851752464403067, "loss": 1.075, "step": 3458 }, { "epoch": 0.23436745686917193, "grad_norm": 2.6850225925445557, "learning_rate": 0.0003851697699890471, "loss": 0.8483, "step": 3459 }, { "epoch": 0.23443521271099593, "grad_norm": 3.243868589401245, "learning_rate": 0.0003851642935377875, "loss": 0.8097, "step": 3460 }, { "epoch": 0.2345029685528199, "grad_norm": 3.221231698989868, "learning_rate": 0.00038515881708652797, "loss": 0.9772, "step": 3461 }, { "epoch": 0.2345707243946439, "grad_norm": 3.6649513244628906, "learning_rate": 0.00038515334063526837, "loss": 0.9905, "step": 3462 }, { "epoch": 0.23463848023646788, "grad_norm": 2.813424587249756, "learning_rate": 0.00038514786418400877, "loss": 0.7572, "step": 3463 }, { "epoch": 0.23470623607829189, "grad_norm": 4.343161106109619, "learning_rate": 0.00038514238773274917, "loss": 0.7859, "step": 3464 }, { "epoch": 0.23477399192011586, "grad_norm": 3.841916799545288, "learning_rate": 0.0003851369112814896, "loss": 1.0273, "step": 3465 }, { "epoch": 0.23484174776193986, "grad_norm": 3.720700263977051, "learning_rate": 0.00038513143483023, "loss": 1.0152, "step": 3466 }, { "epoch": 0.23490950360376384, "grad_norm": 4.789869785308838, "learning_rate": 0.00038512595837897047, "loss": 0.9471, "step": 3467 }, { "epoch": 0.2349772594455878, "grad_norm": 3.556755304336548, "learning_rate": 0.00038512048192771087, "loss": 0.9468, "step": 3468 }, { "epoch": 0.2350450152874118, "grad_norm": 3.355682849884033, "learning_rate": 0.00038511500547645127, "loss": 0.9641, "step": 3469 }, { "epoch": 0.2351127711292358, "grad_norm": 2.8267431259155273, "learning_rate": 0.00038510952902519167, "loss": 0.6691, "step": 3470 }, { "epoch": 0.2351805269710598, "grad_norm": 4.00866174697876, "learning_rate": 0.00038510405257393207, "loss": 0.9212, "step": 3471 }, { "epoch": 0.23524828281288376, "grad_norm": 3.6491472721099854, "learning_rate": 0.0003850985761226725, "loss": 1.061, "step": 3472 }, { "epoch": 0.23531603865470777, "grad_norm": 2.7635536193847656, "learning_rate": 0.000385093099671413, "loss": 0.625, "step": 3473 }, { "epoch": 0.23538379449653174, "grad_norm": 3.980212450027466, "learning_rate": 0.0003850876232201534, "loss": 0.9621, "step": 3474 }, { "epoch": 0.23545155033835574, "grad_norm": 3.0759336948394775, "learning_rate": 0.0003850821467688938, "loss": 0.7169, "step": 3475 }, { "epoch": 0.23551930618017972, "grad_norm": 2.9005484580993652, "learning_rate": 0.0003850766703176342, "loss": 0.8001, "step": 3476 }, { "epoch": 0.23558706202200372, "grad_norm": 3.1984972953796387, "learning_rate": 0.00038507119386637463, "loss": 0.8531, "step": 3477 }, { "epoch": 0.2356548178638277, "grad_norm": 3.1212916374206543, "learning_rate": 0.000385065717415115, "loss": 0.8028, "step": 3478 }, { "epoch": 0.2357225737056517, "grad_norm": 4.0563578605651855, "learning_rate": 0.0003850602409638554, "loss": 1.0634, "step": 3479 }, { "epoch": 0.23579032954747567, "grad_norm": 3.4054884910583496, "learning_rate": 0.0003850547645125959, "loss": 1.0115, "step": 3480 }, { "epoch": 0.23585808538929967, "grad_norm": 3.257073402404785, "learning_rate": 0.0003850492880613363, "loss": 1.047, "step": 3481 }, { "epoch": 0.23592584123112365, "grad_norm": 3.7565500736236572, "learning_rate": 0.0003850438116100767, "loss": 0.9759, "step": 3482 }, { "epoch": 0.23599359707294762, "grad_norm": 6.433558940887451, "learning_rate": 0.00038503833515881713, "loss": 0.9733, "step": 3483 }, { "epoch": 0.23606135291477162, "grad_norm": 3.777838945388794, "learning_rate": 0.00038503285870755753, "loss": 0.8168, "step": 3484 }, { "epoch": 0.2361291087565956, "grad_norm": 3.1287827491760254, "learning_rate": 0.00038502738225629793, "loss": 0.8772, "step": 3485 }, { "epoch": 0.2361968645984196, "grad_norm": 2.5524959564208984, "learning_rate": 0.00038502190580503833, "loss": 0.8454, "step": 3486 }, { "epoch": 0.23626462044024357, "grad_norm": 2.5516135692596436, "learning_rate": 0.00038501642935377873, "loss": 0.8229, "step": 3487 }, { "epoch": 0.23633237628206757, "grad_norm": 3.3609588146209717, "learning_rate": 0.0003850109529025192, "loss": 0.9663, "step": 3488 }, { "epoch": 0.23640013212389155, "grad_norm": 3.0082757472991943, "learning_rate": 0.00038500547645125963, "loss": 0.7959, "step": 3489 }, { "epoch": 0.23646788796571555, "grad_norm": 3.1708953380584717, "learning_rate": 0.00038500000000000003, "loss": 0.9676, "step": 3490 }, { "epoch": 0.23653564380753953, "grad_norm": 3.4143319129943848, "learning_rate": 0.00038499452354874043, "loss": 0.8733, "step": 3491 }, { "epoch": 0.23660339964936353, "grad_norm": 5.2941999435424805, "learning_rate": 0.00038498904709748083, "loss": 0.9099, "step": 3492 }, { "epoch": 0.2366711554911875, "grad_norm": 3.165513038635254, "learning_rate": 0.0003849835706462213, "loss": 0.8935, "step": 3493 }, { "epoch": 0.2367389113330115, "grad_norm": 4.730727672576904, "learning_rate": 0.0003849780941949617, "loss": 1.0097, "step": 3494 }, { "epoch": 0.23680666717483548, "grad_norm": 2.826871395111084, "learning_rate": 0.0003849726177437021, "loss": 0.8137, "step": 3495 }, { "epoch": 0.23687442301665948, "grad_norm": 3.622166633605957, "learning_rate": 0.00038496714129244254, "loss": 1.1167, "step": 3496 }, { "epoch": 0.23694217885848345, "grad_norm": 3.233224868774414, "learning_rate": 0.00038496166484118294, "loss": 0.8041, "step": 3497 }, { "epoch": 0.23700993470030743, "grad_norm": 5.051055431365967, "learning_rate": 0.00038495618838992334, "loss": 1.0417, "step": 3498 }, { "epoch": 0.23707769054213143, "grad_norm": 2.873582124710083, "learning_rate": 0.0003849507119386638, "loss": 0.8768, "step": 3499 }, { "epoch": 0.2371454463839554, "grad_norm": 3.918994665145874, "learning_rate": 0.0003849452354874042, "loss": 0.7139, "step": 3500 }, { "epoch": 0.2372132022257794, "grad_norm": 2.7591445446014404, "learning_rate": 0.0003849397590361446, "loss": 0.7101, "step": 3501 }, { "epoch": 0.23728095806760338, "grad_norm": 2.902021646499634, "learning_rate": 0.000384934282584885, "loss": 0.7779, "step": 3502 }, { "epoch": 0.23734871390942738, "grad_norm": 3.3899986743927, "learning_rate": 0.0003849288061336254, "loss": 1.0221, "step": 3503 }, { "epoch": 0.23741646975125136, "grad_norm": 3.125257968902588, "learning_rate": 0.00038492332968236584, "loss": 0.8824, "step": 3504 }, { "epoch": 0.23748422559307536, "grad_norm": 2.6779568195343018, "learning_rate": 0.0003849178532311063, "loss": 0.6717, "step": 3505 }, { "epoch": 0.23755198143489933, "grad_norm": 3.4571759700775146, "learning_rate": 0.0003849123767798467, "loss": 0.9512, "step": 3506 }, { "epoch": 0.23761973727672334, "grad_norm": 3.9164414405822754, "learning_rate": 0.0003849069003285871, "loss": 1.0673, "step": 3507 }, { "epoch": 0.2376874931185473, "grad_norm": 3.7116291522979736, "learning_rate": 0.0003849014238773275, "loss": 0.8889, "step": 3508 }, { "epoch": 0.2377552489603713, "grad_norm": 3.6754040718078613, "learning_rate": 0.0003848959474260679, "loss": 0.9608, "step": 3509 }, { "epoch": 0.2378230048021953, "grad_norm": 3.6156272888183594, "learning_rate": 0.00038489047097480834, "loss": 1.0282, "step": 3510 }, { "epoch": 0.2378907606440193, "grad_norm": 3.9002699851989746, "learning_rate": 0.0003848849945235488, "loss": 0.9404, "step": 3511 }, { "epoch": 0.23795851648584326, "grad_norm": 4.33397102355957, "learning_rate": 0.0003848795180722892, "loss": 0.9854, "step": 3512 }, { "epoch": 0.23802627232766724, "grad_norm": 3.4970011711120605, "learning_rate": 0.0003848740416210296, "loss": 0.8529, "step": 3513 }, { "epoch": 0.23809402816949124, "grad_norm": 3.519333600997925, "learning_rate": 0.00038486856516977, "loss": 0.952, "step": 3514 }, { "epoch": 0.23816178401131521, "grad_norm": 3.3597099781036377, "learning_rate": 0.00038486308871851045, "loss": 1.0487, "step": 3515 }, { "epoch": 0.23822953985313922, "grad_norm": 4.351285934448242, "learning_rate": 0.00038485761226725085, "loss": 0.881, "step": 3516 }, { "epoch": 0.2382972956949632, "grad_norm": 4.334640026092529, "learning_rate": 0.00038485213581599125, "loss": 0.8924, "step": 3517 }, { "epoch": 0.2383650515367872, "grad_norm": 3.781426191329956, "learning_rate": 0.00038484665936473165, "loss": 0.7698, "step": 3518 }, { "epoch": 0.23843280737861117, "grad_norm": 4.889671802520752, "learning_rate": 0.0003848411829134721, "loss": 0.89, "step": 3519 }, { "epoch": 0.23850056322043517, "grad_norm": 3.7696127891540527, "learning_rate": 0.0003848357064622125, "loss": 1.0221, "step": 3520 }, { "epoch": 0.23856831906225914, "grad_norm": 3.7246408462524414, "learning_rate": 0.00038483023001095295, "loss": 1.0707, "step": 3521 }, { "epoch": 0.23863607490408315, "grad_norm": 3.759204149246216, "learning_rate": 0.00038482475355969335, "loss": 1.0519, "step": 3522 }, { "epoch": 0.23870383074590712, "grad_norm": 3.753209114074707, "learning_rate": 0.00038481927710843375, "loss": 0.8522, "step": 3523 }, { "epoch": 0.23877158658773112, "grad_norm": 2.9252936840057373, "learning_rate": 0.00038481380065717415, "loss": 1.0213, "step": 3524 }, { "epoch": 0.2388393424295551, "grad_norm": 3.0121278762817383, "learning_rate": 0.00038480832420591455, "loss": 0.9012, "step": 3525 }, { "epoch": 0.2389070982713791, "grad_norm": 3.9473061561584473, "learning_rate": 0.000384802847754655, "loss": 1.0919, "step": 3526 }, { "epoch": 0.23897485411320307, "grad_norm": 5.666573524475098, "learning_rate": 0.00038479737130339546, "loss": 0.7552, "step": 3527 }, { "epoch": 0.23904260995502705, "grad_norm": 4.097467422485352, "learning_rate": 0.00038479189485213585, "loss": 1.1349, "step": 3528 }, { "epoch": 0.23911036579685105, "grad_norm": 3.3828299045562744, "learning_rate": 0.00038478641840087625, "loss": 0.9631, "step": 3529 }, { "epoch": 0.23917812163867502, "grad_norm": 3.756110191345215, "learning_rate": 0.00038478094194961665, "loss": 0.9188, "step": 3530 }, { "epoch": 0.23924587748049903, "grad_norm": 3.6144723892211914, "learning_rate": 0.0003847754654983571, "loss": 0.8087, "step": 3531 }, { "epoch": 0.239313633322323, "grad_norm": 3.5948920249938965, "learning_rate": 0.0003847699890470975, "loss": 0.8524, "step": 3532 }, { "epoch": 0.239381389164147, "grad_norm": 3.6412158012390137, "learning_rate": 0.0003847645125958379, "loss": 0.8304, "step": 3533 }, { "epoch": 0.23944914500597098, "grad_norm": 3.1989147663116455, "learning_rate": 0.0003847590361445783, "loss": 0.8756, "step": 3534 }, { "epoch": 0.23951690084779498, "grad_norm": 5.724002361297607, "learning_rate": 0.00038475355969331876, "loss": 1.0716, "step": 3535 }, { "epoch": 0.23958465668961895, "grad_norm": 3.8183436393737793, "learning_rate": 0.00038474808324205916, "loss": 1.0032, "step": 3536 }, { "epoch": 0.23965241253144295, "grad_norm": 3.840303897857666, "learning_rate": 0.0003847426067907996, "loss": 1.042, "step": 3537 }, { "epoch": 0.23972016837326693, "grad_norm": 3.46730637550354, "learning_rate": 0.00038473713033954, "loss": 0.7219, "step": 3538 }, { "epoch": 0.23978792421509093, "grad_norm": 3.6060099601745605, "learning_rate": 0.0003847316538882804, "loss": 0.8408, "step": 3539 }, { "epoch": 0.2398556800569149, "grad_norm": 3.0371289253234863, "learning_rate": 0.0003847261774370208, "loss": 0.8476, "step": 3540 }, { "epoch": 0.2399234358987389, "grad_norm": 4.043479919433594, "learning_rate": 0.0003847207009857612, "loss": 0.8577, "step": 3541 }, { "epoch": 0.23999119174056288, "grad_norm": 3.5064337253570557, "learning_rate": 0.00038471522453450166, "loss": 1.0555, "step": 3542 }, { "epoch": 0.24005894758238686, "grad_norm": 3.5323991775512695, "learning_rate": 0.0003847097480832421, "loss": 0.8611, "step": 3543 }, { "epoch": 0.24012670342421086, "grad_norm": 4.869713306427002, "learning_rate": 0.0003847042716319825, "loss": 1.1686, "step": 3544 }, { "epoch": 0.24019445926603483, "grad_norm": 3.4327447414398193, "learning_rate": 0.0003846987951807229, "loss": 0.806, "step": 3545 }, { "epoch": 0.24026221510785883, "grad_norm": 3.1635055541992188, "learning_rate": 0.0003846933187294633, "loss": 0.9743, "step": 3546 }, { "epoch": 0.2403299709496828, "grad_norm": 2.770780324935913, "learning_rate": 0.0003846878422782037, "loss": 0.7404, "step": 3547 }, { "epoch": 0.2403977267915068, "grad_norm": 2.9262092113494873, "learning_rate": 0.00038468236582694416, "loss": 0.9241, "step": 3548 }, { "epoch": 0.24046548263333078, "grad_norm": 2.8331191539764404, "learning_rate": 0.00038467688937568456, "loss": 0.8595, "step": 3549 }, { "epoch": 0.2405332384751548, "grad_norm": 3.5966126918792725, "learning_rate": 0.000384671412924425, "loss": 1.1457, "step": 3550 }, { "epoch": 0.24060099431697876, "grad_norm": 3.2753615379333496, "learning_rate": 0.0003846659364731654, "loss": 0.7817, "step": 3551 }, { "epoch": 0.24066875015880276, "grad_norm": 3.666522264480591, "learning_rate": 0.0003846604600219058, "loss": 1.0483, "step": 3552 }, { "epoch": 0.24073650600062674, "grad_norm": 3.658071994781494, "learning_rate": 0.00038465498357064627, "loss": 0.8674, "step": 3553 }, { "epoch": 0.24080426184245074, "grad_norm": 2.676384210586548, "learning_rate": 0.00038464950711938667, "loss": 0.8398, "step": 3554 }, { "epoch": 0.24087201768427471, "grad_norm": 2.86118221282959, "learning_rate": 0.00038464403066812707, "loss": 0.8704, "step": 3555 }, { "epoch": 0.24093977352609872, "grad_norm": 2.81595778465271, "learning_rate": 0.00038463855421686747, "loss": 0.7396, "step": 3556 }, { "epoch": 0.2410075293679227, "grad_norm": 4.023396968841553, "learning_rate": 0.00038463307776560787, "loss": 1.0471, "step": 3557 }, { "epoch": 0.24107528520974666, "grad_norm": 2.933497667312622, "learning_rate": 0.0003846276013143483, "loss": 0.8205, "step": 3558 }, { "epoch": 0.24114304105157067, "grad_norm": 4.319410800933838, "learning_rate": 0.00038462212486308877, "loss": 1.0236, "step": 3559 }, { "epoch": 0.24121079689339464, "grad_norm": 2.817918539047241, "learning_rate": 0.00038461664841182917, "loss": 0.9278, "step": 3560 }, { "epoch": 0.24127855273521864, "grad_norm": 8.175501823425293, "learning_rate": 0.00038461117196056957, "loss": 0.932, "step": 3561 }, { "epoch": 0.24134630857704262, "grad_norm": 2.6383726596832275, "learning_rate": 0.00038460569550930997, "loss": 0.746, "step": 3562 }, { "epoch": 0.24141406441886662, "grad_norm": 3.5217936038970947, "learning_rate": 0.00038460021905805037, "loss": 0.9239, "step": 3563 }, { "epoch": 0.2414818202606906, "grad_norm": 4.681241989135742, "learning_rate": 0.0003845947426067908, "loss": 0.789, "step": 3564 }, { "epoch": 0.2415495761025146, "grad_norm": 4.095000267028809, "learning_rate": 0.0003845892661555312, "loss": 1.0066, "step": 3565 }, { "epoch": 0.24161733194433857, "grad_norm": 2.88785457611084, "learning_rate": 0.0003845837897042717, "loss": 0.7456, "step": 3566 }, { "epoch": 0.24168508778616257, "grad_norm": 3.384957790374756, "learning_rate": 0.0003845783132530121, "loss": 0.8015, "step": 3567 }, { "epoch": 0.24175284362798655, "grad_norm": 2.8680386543273926, "learning_rate": 0.0003845728368017525, "loss": 0.9248, "step": 3568 }, { "epoch": 0.24182059946981055, "grad_norm": 3.7184362411499023, "learning_rate": 0.0003845673603504929, "loss": 0.8646, "step": 3569 }, { "epoch": 0.24188835531163452, "grad_norm": 3.1110970973968506, "learning_rate": 0.0003845618838992333, "loss": 0.6712, "step": 3570 }, { "epoch": 0.24195611115345853, "grad_norm": 6.174555778503418, "learning_rate": 0.0003845564074479737, "loss": 0.9191, "step": 3571 }, { "epoch": 0.2420238669952825, "grad_norm": 3.1740756034851074, "learning_rate": 0.0003845509309967141, "loss": 0.7829, "step": 3572 }, { "epoch": 0.24209162283710647, "grad_norm": 3.3278098106384277, "learning_rate": 0.0003845454545454545, "loss": 0.9331, "step": 3573 }, { "epoch": 0.24215937867893048, "grad_norm": 3.312324285507202, "learning_rate": 0.000384539978094195, "loss": 0.785, "step": 3574 }, { "epoch": 0.24222713452075445, "grad_norm": 3.792536497116089, "learning_rate": 0.00038453450164293543, "loss": 1.0792, "step": 3575 }, { "epoch": 0.24229489036257845, "grad_norm": 3.6044788360595703, "learning_rate": 0.00038452902519167583, "loss": 0.8771, "step": 3576 }, { "epoch": 0.24236264620440243, "grad_norm": 5.130095481872559, "learning_rate": 0.00038452354874041623, "loss": 0.8476, "step": 3577 }, { "epoch": 0.24243040204622643, "grad_norm": 3.3492159843444824, "learning_rate": 0.00038451807228915663, "loss": 0.8217, "step": 3578 }, { "epoch": 0.2424981578880504, "grad_norm": 4.7940826416015625, "learning_rate": 0.00038451259583789703, "loss": 1.1556, "step": 3579 }, { "epoch": 0.2425659137298744, "grad_norm": 4.007096290588379, "learning_rate": 0.0003845071193866375, "loss": 0.8479, "step": 3580 }, { "epoch": 0.24263366957169838, "grad_norm": 3.4432461261749268, "learning_rate": 0.00038450164293537793, "loss": 1.1665, "step": 3581 }, { "epoch": 0.24270142541352238, "grad_norm": 4.2289042472839355, "learning_rate": 0.00038449616648411833, "loss": 0.8096, "step": 3582 }, { "epoch": 0.24276918125534636, "grad_norm": 3.279273509979248, "learning_rate": 0.00038449069003285873, "loss": 1.0627, "step": 3583 }, { "epoch": 0.24283693709717036, "grad_norm": 3.1666338443756104, "learning_rate": 0.00038448521358159913, "loss": 0.8252, "step": 3584 }, { "epoch": 0.24290469293899433, "grad_norm": 5.830041885375977, "learning_rate": 0.00038447973713033953, "loss": 0.8419, "step": 3585 }, { "epoch": 0.24297244878081833, "grad_norm": 3.1612913608551025, "learning_rate": 0.00038447426067908, "loss": 0.963, "step": 3586 }, { "epoch": 0.2430402046226423, "grad_norm": 3.0637261867523193, "learning_rate": 0.0003844687842278204, "loss": 0.7428, "step": 3587 }, { "epoch": 0.24310796046446628, "grad_norm": 4.1766357421875, "learning_rate": 0.0003844633077765608, "loss": 0.9666, "step": 3588 }, { "epoch": 0.24317571630629028, "grad_norm": 2.6001627445220947, "learning_rate": 0.00038445783132530124, "loss": 0.6888, "step": 3589 }, { "epoch": 0.24324347214811426, "grad_norm": 4.364114761352539, "learning_rate": 0.00038445235487404164, "loss": 1.0244, "step": 3590 }, { "epoch": 0.24331122798993826, "grad_norm": 3.9264612197875977, "learning_rate": 0.0003844468784227821, "loss": 0.985, "step": 3591 }, { "epoch": 0.24337898383176224, "grad_norm": 3.905233860015869, "learning_rate": 0.0003844414019715225, "loss": 0.892, "step": 3592 }, { "epoch": 0.24344673967358624, "grad_norm": 2.610853433609009, "learning_rate": 0.0003844359255202629, "loss": 0.7664, "step": 3593 }, { "epoch": 0.2435144955154102, "grad_norm": 3.0197854042053223, "learning_rate": 0.0003844304490690033, "loss": 0.8688, "step": 3594 }, { "epoch": 0.24358225135723421, "grad_norm": 2.698666572570801, "learning_rate": 0.0003844249726177437, "loss": 0.8186, "step": 3595 }, { "epoch": 0.2436500071990582, "grad_norm": 3.040863275527954, "learning_rate": 0.00038441949616648414, "loss": 1.0042, "step": 3596 }, { "epoch": 0.2437177630408822, "grad_norm": 3.1363484859466553, "learning_rate": 0.0003844140197152246, "loss": 0.8847, "step": 3597 }, { "epoch": 0.24378551888270616, "grad_norm": 2.8835747241973877, "learning_rate": 0.000384408543263965, "loss": 0.8451, "step": 3598 }, { "epoch": 0.24385327472453017, "grad_norm": 3.3994038105010986, "learning_rate": 0.0003844030668127054, "loss": 0.72, "step": 3599 }, { "epoch": 0.24392103056635414, "grad_norm": 3.5576508045196533, "learning_rate": 0.0003843975903614458, "loss": 0.9419, "step": 3600 }, { "epoch": 0.24398878640817814, "grad_norm": 2.9685850143432617, "learning_rate": 0.0003843921139101862, "loss": 0.7367, "step": 3601 }, { "epoch": 0.24405654225000212, "grad_norm": 2.9869747161865234, "learning_rate": 0.00038438663745892664, "loss": 0.9096, "step": 3602 }, { "epoch": 0.2441242980918261, "grad_norm": 3.1118412017822266, "learning_rate": 0.00038438116100766704, "loss": 0.7894, "step": 3603 }, { "epoch": 0.2441920539336501, "grad_norm": 3.8750014305114746, "learning_rate": 0.00038437568455640744, "loss": 0.9525, "step": 3604 }, { "epoch": 0.24425980977547407, "grad_norm": 3.0228381156921387, "learning_rate": 0.0003843702081051479, "loss": 0.6956, "step": 3605 }, { "epoch": 0.24432756561729807, "grad_norm": 4.395471096038818, "learning_rate": 0.0003843647316538883, "loss": 1.2877, "step": 3606 }, { "epoch": 0.24439532145912204, "grad_norm": 3.6463301181793213, "learning_rate": 0.0003843592552026287, "loss": 1.0584, "step": 3607 }, { "epoch": 0.24446307730094605, "grad_norm": 3.233916759490967, "learning_rate": 0.00038435377875136915, "loss": 0.6924, "step": 3608 }, { "epoch": 0.24453083314277002, "grad_norm": 3.1753616333007812, "learning_rate": 0.00038434830230010955, "loss": 0.7678, "step": 3609 }, { "epoch": 0.24459858898459402, "grad_norm": 2.5761382579803467, "learning_rate": 0.00038434282584884995, "loss": 0.7123, "step": 3610 }, { "epoch": 0.244666344826418, "grad_norm": 4.036708831787109, "learning_rate": 0.00038433734939759034, "loss": 0.8565, "step": 3611 }, { "epoch": 0.244734100668242, "grad_norm": 4.253012657165527, "learning_rate": 0.0003843318729463308, "loss": 0.8407, "step": 3612 }, { "epoch": 0.24480185651006597, "grad_norm": 3.0888428688049316, "learning_rate": 0.00038432639649507125, "loss": 0.7569, "step": 3613 }, { "epoch": 0.24486961235188998, "grad_norm": 3.641345500946045, "learning_rate": 0.00038432092004381165, "loss": 0.9349, "step": 3614 }, { "epoch": 0.24493736819371395, "grad_norm": 3.3644180297851562, "learning_rate": 0.00038431544359255205, "loss": 1.1062, "step": 3615 }, { "epoch": 0.24500512403553795, "grad_norm": 4.179390907287598, "learning_rate": 0.00038430996714129245, "loss": 0.9211, "step": 3616 }, { "epoch": 0.24507287987736193, "grad_norm": 3.50895619392395, "learning_rate": 0.00038430449069003285, "loss": 0.9339, "step": 3617 }, { "epoch": 0.2451406357191859, "grad_norm": 5.801991939544678, "learning_rate": 0.0003842990142387733, "loss": 1.0981, "step": 3618 }, { "epoch": 0.2452083915610099, "grad_norm": 3.113947629928589, "learning_rate": 0.0003842935377875137, "loss": 1.0309, "step": 3619 }, { "epoch": 0.24527614740283388, "grad_norm": 3.378770351409912, "learning_rate": 0.00038428806133625415, "loss": 0.689, "step": 3620 }, { "epoch": 0.24534390324465788, "grad_norm": 3.041553497314453, "learning_rate": 0.00038428258488499455, "loss": 0.8364, "step": 3621 }, { "epoch": 0.24541165908648185, "grad_norm": 3.19685959815979, "learning_rate": 0.00038427710843373495, "loss": 0.7991, "step": 3622 }, { "epoch": 0.24547941492830586, "grad_norm": 3.5349223613739014, "learning_rate": 0.00038427163198247535, "loss": 1.0397, "step": 3623 }, { "epoch": 0.24554717077012983, "grad_norm": 4.404790878295898, "learning_rate": 0.0003842661555312158, "loss": 1.0932, "step": 3624 }, { "epoch": 0.24561492661195383, "grad_norm": 3.362189769744873, "learning_rate": 0.0003842606790799562, "loss": 0.8153, "step": 3625 }, { "epoch": 0.2456826824537778, "grad_norm": 3.7418336868286133, "learning_rate": 0.0003842552026286966, "loss": 0.9646, "step": 3626 }, { "epoch": 0.2457504382956018, "grad_norm": 2.701110601425171, "learning_rate": 0.000384249726177437, "loss": 0.7417, "step": 3627 }, { "epoch": 0.24581819413742578, "grad_norm": 3.8121039867401123, "learning_rate": 0.00038424424972617746, "loss": 1.3079, "step": 3628 }, { "epoch": 0.24588594997924978, "grad_norm": 2.9305543899536133, "learning_rate": 0.0003842387732749179, "loss": 0.6759, "step": 3629 }, { "epoch": 0.24595370582107376, "grad_norm": 3.2821357250213623, "learning_rate": 0.0003842332968236583, "loss": 0.8017, "step": 3630 }, { "epoch": 0.24602146166289776, "grad_norm": 6.656386852264404, "learning_rate": 0.0003842278203723987, "loss": 1.005, "step": 3631 }, { "epoch": 0.24608921750472174, "grad_norm": 3.3356103897094727, "learning_rate": 0.0003842223439211391, "loss": 0.8939, "step": 3632 }, { "epoch": 0.2461569733465457, "grad_norm": 2.781855583190918, "learning_rate": 0.0003842168674698795, "loss": 0.7511, "step": 3633 }, { "epoch": 0.2462247291883697, "grad_norm": 3.0457804203033447, "learning_rate": 0.00038421139101861996, "loss": 0.9523, "step": 3634 }, { "epoch": 0.2462924850301937, "grad_norm": 2.585984945297241, "learning_rate": 0.00038420591456736036, "loss": 0.7108, "step": 3635 }, { "epoch": 0.2463602408720177, "grad_norm": 2.7748820781707764, "learning_rate": 0.0003842004381161008, "loss": 0.7178, "step": 3636 }, { "epoch": 0.24642799671384166, "grad_norm": 3.4772417545318604, "learning_rate": 0.0003841949616648412, "loss": 0.8056, "step": 3637 }, { "epoch": 0.24649575255566566, "grad_norm": 3.6336772441864014, "learning_rate": 0.0003841894852135816, "loss": 0.9024, "step": 3638 }, { "epoch": 0.24656350839748964, "grad_norm": 2.643249750137329, "learning_rate": 0.000384184008762322, "loss": 0.6573, "step": 3639 }, { "epoch": 0.24663126423931364, "grad_norm": 3.0842294692993164, "learning_rate": 0.00038417853231106246, "loss": 0.9045, "step": 3640 }, { "epoch": 0.24669902008113762, "grad_norm": 2.8832128047943115, "learning_rate": 0.00038417305585980286, "loss": 0.7339, "step": 3641 }, { "epoch": 0.24676677592296162, "grad_norm": 3.883462905883789, "learning_rate": 0.00038416757940854326, "loss": 1.0677, "step": 3642 }, { "epoch": 0.2468345317647856, "grad_norm": 4.286027908325195, "learning_rate": 0.0003841621029572837, "loss": 1.0355, "step": 3643 }, { "epoch": 0.2469022876066096, "grad_norm": 3.275578737258911, "learning_rate": 0.0003841566265060241, "loss": 0.758, "step": 3644 }, { "epoch": 0.24697004344843357, "grad_norm": 3.220132350921631, "learning_rate": 0.0003841511500547645, "loss": 0.8784, "step": 3645 }, { "epoch": 0.24703779929025757, "grad_norm": 2.976148843765259, "learning_rate": 0.00038414567360350497, "loss": 0.8917, "step": 3646 }, { "epoch": 0.24710555513208154, "grad_norm": 3.7267327308654785, "learning_rate": 0.00038414019715224537, "loss": 1.0633, "step": 3647 }, { "epoch": 0.24717331097390552, "grad_norm": 3.152019739151001, "learning_rate": 0.00038413472070098577, "loss": 0.7872, "step": 3648 }, { "epoch": 0.24724106681572952, "grad_norm": 3.847932815551758, "learning_rate": 0.00038412924424972617, "loss": 0.9946, "step": 3649 }, { "epoch": 0.2473088226575535, "grad_norm": 2.987445831298828, "learning_rate": 0.0003841237677984666, "loss": 0.8574, "step": 3650 }, { "epoch": 0.2473765784993775, "grad_norm": 3.6175968647003174, "learning_rate": 0.00038411829134720707, "loss": 0.7839, "step": 3651 }, { "epoch": 0.24744433434120147, "grad_norm": 2.919098138809204, "learning_rate": 0.00038411281489594747, "loss": 0.9751, "step": 3652 }, { "epoch": 0.24751209018302547, "grad_norm": 3.8991780281066895, "learning_rate": 0.00038410733844468787, "loss": 1.0142, "step": 3653 }, { "epoch": 0.24757984602484945, "grad_norm": 3.620443105697632, "learning_rate": 0.00038410186199342827, "loss": 0.9295, "step": 3654 }, { "epoch": 0.24764760186667345, "grad_norm": 4.178861618041992, "learning_rate": 0.00038409638554216867, "loss": 0.9741, "step": 3655 }, { "epoch": 0.24771535770849742, "grad_norm": 2.68290376663208, "learning_rate": 0.0003840909090909091, "loss": 0.758, "step": 3656 }, { "epoch": 0.24778311355032143, "grad_norm": 4.443676471710205, "learning_rate": 0.0003840854326396495, "loss": 1.1349, "step": 3657 }, { "epoch": 0.2478508693921454, "grad_norm": 3.5604474544525146, "learning_rate": 0.0003840799561883899, "loss": 0.8092, "step": 3658 }, { "epoch": 0.2479186252339694, "grad_norm": 3.1093716621398926, "learning_rate": 0.0003840744797371304, "loss": 0.8698, "step": 3659 }, { "epoch": 0.24798638107579338, "grad_norm": 3.4392693042755127, "learning_rate": 0.0003840690032858708, "loss": 1.0999, "step": 3660 }, { "epoch": 0.24805413691761738, "grad_norm": 4.430802822113037, "learning_rate": 0.00038406352683461117, "loss": 0.8996, "step": 3661 }, { "epoch": 0.24812189275944135, "grad_norm": 3.1374351978302, "learning_rate": 0.0003840580503833516, "loss": 0.8434, "step": 3662 }, { "epoch": 0.24818964860126533, "grad_norm": 3.239025115966797, "learning_rate": 0.000384052573932092, "loss": 0.6915, "step": 3663 }, { "epoch": 0.24825740444308933, "grad_norm": 5.312155723571777, "learning_rate": 0.0003840470974808324, "loss": 0.8982, "step": 3664 }, { "epoch": 0.2483251602849133, "grad_norm": 4.996529579162598, "learning_rate": 0.0003840416210295728, "loss": 1.2739, "step": 3665 }, { "epoch": 0.2483929161267373, "grad_norm": 3.8779194355010986, "learning_rate": 0.0003840361445783133, "loss": 0.8554, "step": 3666 }, { "epoch": 0.24846067196856128, "grad_norm": 4.90615177154541, "learning_rate": 0.00038403066812705373, "loss": 1.1101, "step": 3667 }, { "epoch": 0.24852842781038528, "grad_norm": 3.0782370567321777, "learning_rate": 0.00038402519167579413, "loss": 0.9249, "step": 3668 }, { "epoch": 0.24859618365220926, "grad_norm": 3.553218364715576, "learning_rate": 0.00038401971522453453, "loss": 1.0345, "step": 3669 }, { "epoch": 0.24866393949403326, "grad_norm": 4.608200550079346, "learning_rate": 0.00038401423877327493, "loss": 1.0789, "step": 3670 }, { "epoch": 0.24873169533585723, "grad_norm": 2.9339096546173096, "learning_rate": 0.00038400876232201533, "loss": 0.8319, "step": 3671 }, { "epoch": 0.24879945117768124, "grad_norm": 3.093270778656006, "learning_rate": 0.0003840032858707558, "loss": 0.8568, "step": 3672 }, { "epoch": 0.2488672070195052, "grad_norm": 3.6574976444244385, "learning_rate": 0.0003839978094194962, "loss": 0.9469, "step": 3673 }, { "epoch": 0.2489349628613292, "grad_norm": 2.7866458892822266, "learning_rate": 0.00038399233296823663, "loss": 0.7737, "step": 3674 }, { "epoch": 0.2490027187031532, "grad_norm": 4.70128059387207, "learning_rate": 0.00038398685651697703, "loss": 1.1959, "step": 3675 }, { "epoch": 0.2490704745449772, "grad_norm": 3.5798444747924805, "learning_rate": 0.00038398138006571743, "loss": 1.0328, "step": 3676 }, { "epoch": 0.24913823038680116, "grad_norm": 3.2754852771759033, "learning_rate": 0.00038397590361445783, "loss": 0.9629, "step": 3677 }, { "epoch": 0.24920598622862514, "grad_norm": 3.3738067150115967, "learning_rate": 0.0003839704271631983, "loss": 1.0307, "step": 3678 }, { "epoch": 0.24927374207044914, "grad_norm": 2.7617058753967285, "learning_rate": 0.0003839649507119387, "loss": 0.8117, "step": 3679 }, { "epoch": 0.2493414979122731, "grad_norm": 3.428276538848877, "learning_rate": 0.0003839594742606791, "loss": 1.1433, "step": 3680 }, { "epoch": 0.24940925375409712, "grad_norm": 2.8827009201049805, "learning_rate": 0.0003839539978094195, "loss": 0.7803, "step": 3681 }, { "epoch": 0.2494770095959211, "grad_norm": 3.3733298778533936, "learning_rate": 0.00038394852135815994, "loss": 0.8253, "step": 3682 }, { "epoch": 0.2495447654377451, "grad_norm": 3.572075605392456, "learning_rate": 0.00038394304490690034, "loss": 0.7953, "step": 3683 }, { "epoch": 0.24961252127956907, "grad_norm": 3.9213287830352783, "learning_rate": 0.0003839375684556408, "loss": 0.9151, "step": 3684 }, { "epoch": 0.24968027712139307, "grad_norm": 2.8898632526397705, "learning_rate": 0.0003839320920043812, "loss": 0.8617, "step": 3685 }, { "epoch": 0.24974803296321704, "grad_norm": 3.041987180709839, "learning_rate": 0.0003839266155531216, "loss": 0.8889, "step": 3686 }, { "epoch": 0.24981578880504104, "grad_norm": 3.3263607025146484, "learning_rate": 0.000383921139101862, "loss": 0.9432, "step": 3687 }, { "epoch": 0.24988354464686502, "grad_norm": 2.908055543899536, "learning_rate": 0.00038391566265060244, "loss": 0.7205, "step": 3688 }, { "epoch": 0.24995130048868902, "grad_norm": 3.964475154876709, "learning_rate": 0.00038391018619934284, "loss": 0.8656, "step": 3689 }, { "epoch": 0.24995130048868902, "eval_loss": 0.8432719707489014, "eval_noise_accuracy": 0.0, "eval_runtime": 15206.8395, "eval_samples_per_second": 0.338, "eval_steps_per_second": 0.085, "eval_wer": 62.08277152527448, "step": 3689 } ], "logging_steps": 1, "max_steps": 73790, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 3689, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.104229336317952e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }