{ "best_metric": 3.1351470947265625, "best_model_checkpoint": "miner_id_24/checkpoint-600", "epoch": 0.5759078547432411, "eval_steps": 200, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00031994880819068947, "grad_norm": 22.934980392456055, "learning_rate": 4e-05, "loss": 6.7197, "step": 1 }, { "epoch": 0.00031994880819068947, "eval_loss": 3.3727481365203857, "eval_runtime": 231.8319, "eval_samples_per_second": 5.677, "eval_steps_per_second": 1.419, "step": 1 }, { "epoch": 0.0006398976163813789, "grad_norm": 17.148805618286133, "learning_rate": 8e-05, "loss": 6.7449, "step": 2 }, { "epoch": 0.0009598464245720685, "grad_norm": 7.445838451385498, "learning_rate": 0.00012, "loss": 5.9164, "step": 3 }, { "epoch": 0.0012797952327627579, "grad_norm": 5.818986415863037, "learning_rate": 0.00016, "loss": 5.9623, "step": 4 }, { "epoch": 0.0015997440409534474, "grad_norm": 45.06270980834961, "learning_rate": 0.0002, "loss": 6.327, "step": 5 }, { "epoch": 0.001919692849144137, "grad_norm": 2.642199993133545, "learning_rate": 0.00024, "loss": 5.9756, "step": 6 }, { "epoch": 0.0022396416573348264, "grad_norm": 10.692281723022461, "learning_rate": 0.00028, "loss": 6.0676, "step": 7 }, { "epoch": 0.0025595904655255157, "grad_norm": 18.375436782836914, "learning_rate": 0.00032, "loss": 6.2809, "step": 8 }, { "epoch": 0.0028795392737162055, "grad_norm": 8.690791130065918, "learning_rate": 0.00036, "loss": 6.5217, "step": 9 }, { "epoch": 0.003199488081906895, "grad_norm": 278.91900634765625, "learning_rate": 0.0004, "loss": 20.744, "step": 10 }, { "epoch": 0.0035194368900975845, "grad_norm": 62.391605377197266, "learning_rate": 0.0003999998992085638, "loss": 8.1234, "step": 11 }, { "epoch": 0.003839385698288274, "grad_norm": 67.343017578125, "learning_rate": 0.00039999959683436215, "loss": 8.2013, "step": 12 }, { "epoch": 0.004159334506478964, "grad_norm": 30.584428787231445, "learning_rate": 0.0003999990928777159, "loss": 7.7704, "step": 13 }, { "epoch": 0.004479283314669653, "grad_norm": 35.005210876464844, "learning_rate": 0.0003999983873391596, "loss": 9.1118, "step": 14 }, { "epoch": 0.004799232122860342, "grad_norm": 23.695714950561523, "learning_rate": 0.00039999748021944193, "loss": 7.7276, "step": 15 }, { "epoch": 0.0051191809310510315, "grad_norm": 34.03984451293945, "learning_rate": 0.0003999963715195253, "loss": 7.4958, "step": 16 }, { "epoch": 0.005439129739241722, "grad_norm": 13.545119285583496, "learning_rate": 0.0003999950612405859, "loss": 6.6119, "step": 17 }, { "epoch": 0.005759078547432411, "grad_norm": 21.298782348632812, "learning_rate": 0.0003999935493840139, "loss": 7.3331, "step": 18 }, { "epoch": 0.0060790273556231, "grad_norm": 8.125480651855469, "learning_rate": 0.0003999918359514135, "loss": 6.7062, "step": 19 }, { "epoch": 0.00639897616381379, "grad_norm": 12.093789100646973, "learning_rate": 0.0003999899209446023, "loss": 6.6066, "step": 20 }, { "epoch": 0.006718924972004479, "grad_norm": 5.6320085525512695, "learning_rate": 0.00039998780436561234, "loss": 6.4927, "step": 21 }, { "epoch": 0.007038873780195169, "grad_norm": 4.095371723175049, "learning_rate": 0.00039998548621668904, "loss": 6.3477, "step": 22 }, { "epoch": 0.007358822588385858, "grad_norm": 6.548256874084473, "learning_rate": 0.00039998296650029197, "loss": 6.627, "step": 23 }, { "epoch": 0.007678771396576548, "grad_norm": 5.446329116821289, "learning_rate": 0.0003999802452190944, "loss": 6.4111, "step": 24 }, { "epoch": 0.007998720204767237, "grad_norm": 338.24359130859375, "learning_rate": 0.0003999773223759835, "loss": 17.7788, "step": 25 }, { "epoch": 0.008318669012957927, "grad_norm": 5.659632205963135, "learning_rate": 0.0003999741979740603, "loss": 6.6552, "step": 26 }, { "epoch": 0.008638617821148616, "grad_norm": 5.3004679679870605, "learning_rate": 0.00039997087201663976, "loss": 6.6323, "step": 27 }, { "epoch": 0.008958566629339306, "grad_norm": 4.444129943847656, "learning_rate": 0.00039996734450725046, "loss": 6.5521, "step": 28 }, { "epoch": 0.009278515437529996, "grad_norm": 5.8855695724487305, "learning_rate": 0.0003999636154496351, "loss": 6.4258, "step": 29 }, { "epoch": 0.009598464245720684, "grad_norm": 4.542850971221924, "learning_rate": 0.00039995968484774993, "loss": 6.635, "step": 30 }, { "epoch": 0.009918413053911375, "grad_norm": 11.79590892791748, "learning_rate": 0.0003999555527057653, "loss": 6.1833, "step": 31 }, { "epoch": 0.010238361862102063, "grad_norm": 7.367285251617432, "learning_rate": 0.00039995121902806506, "loss": 6.7455, "step": 32 }, { "epoch": 0.010558310670292753, "grad_norm": 5.325396537780762, "learning_rate": 0.0003999466838192473, "loss": 6.6042, "step": 33 }, { "epoch": 0.010878259478483443, "grad_norm": 4.978571891784668, "learning_rate": 0.00039994194708412365, "loss": 6.8796, "step": 34 }, { "epoch": 0.011198208286674132, "grad_norm": 7.301144599914551, "learning_rate": 0.0003999370088277195, "loss": 6.5597, "step": 35 }, { "epoch": 0.011518157094864822, "grad_norm": 3.852827787399292, "learning_rate": 0.00039993186905527427, "loss": 6.4115, "step": 36 }, { "epoch": 0.01183810590305551, "grad_norm": 12.045527458190918, "learning_rate": 0.000399926527772241, "loss": 6.6072, "step": 37 }, { "epoch": 0.0121580547112462, "grad_norm": 4.803719997406006, "learning_rate": 0.00039992098498428663, "loss": 6.6006, "step": 38 }, { "epoch": 0.01247800351943689, "grad_norm": 14.13237476348877, "learning_rate": 0.0003999152406972919, "loss": 6.3613, "step": 39 }, { "epoch": 0.01279795232762758, "grad_norm": 5.234755516052246, "learning_rate": 0.00039990929491735117, "loss": 6.4241, "step": 40 }, { "epoch": 0.01311790113581827, "grad_norm": 913.4176635742188, "learning_rate": 0.0003999031476507727, "loss": 9.6577, "step": 41 }, { "epoch": 0.013437849944008958, "grad_norm": 203.76280212402344, "learning_rate": 0.0003998967989040786, "loss": 11.1443, "step": 42 }, { "epoch": 0.013757798752199648, "grad_norm": 201.66928100585938, "learning_rate": 0.0003998902486840046, "loss": 8.6425, "step": 43 }, { "epoch": 0.014077747560390338, "grad_norm": 100.8696060180664, "learning_rate": 0.0003998834969975002, "loss": 9.0582, "step": 44 }, { "epoch": 0.014397696368581027, "grad_norm": 39.69711685180664, "learning_rate": 0.0003998765438517287, "loss": 7.4971, "step": 45 }, { "epoch": 0.014717645176771717, "grad_norm": 37.98813247680664, "learning_rate": 0.0003998693892540672, "loss": 6.8231, "step": 46 }, { "epoch": 0.015037593984962405, "grad_norm": 10.686429977416992, "learning_rate": 0.0003998620332121064, "loss": 6.8307, "step": 47 }, { "epoch": 0.015357542793153095, "grad_norm": 23.25743865966797, "learning_rate": 0.0003998544757336509, "loss": 7.0079, "step": 48 }, { "epoch": 0.015677491601343786, "grad_norm": 22.725971221923828, "learning_rate": 0.0003998467168267187, "loss": 6.8757, "step": 49 }, { "epoch": 0.015997440409534474, "grad_norm": 8.51931095123291, "learning_rate": 0.0003998387564995418, "loss": 6.8021, "step": 50 }, { "epoch": 0.016317389217725162, "grad_norm": 6.992290496826172, "learning_rate": 0.0003998305947605658, "loss": 7.0493, "step": 51 }, { "epoch": 0.016637338025915854, "grad_norm": 4.017070770263672, "learning_rate": 0.0003998222316184501, "loss": 6.7286, "step": 52 }, { "epoch": 0.016957286834106543, "grad_norm": 4.484210968017578, "learning_rate": 0.00039981366708206743, "loss": 6.7036, "step": 53 }, { "epoch": 0.01727723564229723, "grad_norm": 8.955873489379883, "learning_rate": 0.0003998049011605047, "loss": 7.0503, "step": 54 }, { "epoch": 0.017597184450487923, "grad_norm": 6.5257792472839355, "learning_rate": 0.00039979593386306223, "loss": 6.905, "step": 55 }, { "epoch": 0.01791713325867861, "grad_norm": 6.154618740081787, "learning_rate": 0.00039978676519925374, "loss": 6.761, "step": 56 }, { "epoch": 0.0182370820668693, "grad_norm": 6.142489910125732, "learning_rate": 0.00039977739517880703, "loss": 6.6128, "step": 57 }, { "epoch": 0.018557030875059992, "grad_norm": 10.0971040725708, "learning_rate": 0.0003997678238116633, "loss": 6.9919, "step": 58 }, { "epoch": 0.01887697968325068, "grad_norm": 6.254273414611816, "learning_rate": 0.00039975805110797745, "loss": 6.7167, "step": 59 }, { "epoch": 0.01919692849144137, "grad_norm": 6.39977502822876, "learning_rate": 0.0003997480770781178, "loss": 6.6531, "step": 60 }, { "epoch": 0.019516877299632057, "grad_norm": 4.507251739501953, "learning_rate": 0.0003997379017326666, "loss": 6.7717, "step": 61 }, { "epoch": 0.01983682610782275, "grad_norm": 5.046263694763184, "learning_rate": 0.00039972752508241944, "loss": 6.807, "step": 62 }, { "epoch": 0.020156774916013438, "grad_norm": 7.079762935638428, "learning_rate": 0.0003997169471383855, "loss": 6.7065, "step": 63 }, { "epoch": 0.020476723724204126, "grad_norm": 6.362200736999512, "learning_rate": 0.00039970616791178777, "loss": 6.8103, "step": 64 }, { "epoch": 0.020796672532394818, "grad_norm": 41.3747673034668, "learning_rate": 0.00039969518741406234, "loss": 6.8975, "step": 65 }, { "epoch": 0.021116621340585506, "grad_norm": 5.478196620941162, "learning_rate": 0.0003996840056568593, "loss": 6.658, "step": 66 }, { "epoch": 0.021436570148776195, "grad_norm": 3.9833760261535645, "learning_rate": 0.000399672622652042, "loss": 6.7767, "step": 67 }, { "epoch": 0.021756518956966887, "grad_norm": 3.953526496887207, "learning_rate": 0.0003996610384116874, "loss": 6.4851, "step": 68 }, { "epoch": 0.022076467765157575, "grad_norm": 4.118646621704102, "learning_rate": 0.0003996492529480859, "loss": 6.6975, "step": 69 }, { "epoch": 0.022396416573348264, "grad_norm": 3.562244415283203, "learning_rate": 0.00039963726627374155, "loss": 6.619, "step": 70 }, { "epoch": 0.022716365381538955, "grad_norm": 7.297116279602051, "learning_rate": 0.00039962507840137163, "loss": 6.8697, "step": 71 }, { "epoch": 0.023036314189729644, "grad_norm": 4.813419818878174, "learning_rate": 0.0003996126893439071, "loss": 6.8995, "step": 72 }, { "epoch": 0.023356262997920332, "grad_norm": 7.643682956695557, "learning_rate": 0.0003996000991144922, "loss": 7.0324, "step": 73 }, { "epoch": 0.02367621180611102, "grad_norm": 5.103664398193359, "learning_rate": 0.00039958730772648483, "loss": 6.7137, "step": 74 }, { "epoch": 0.023996160614301713, "grad_norm": 19.361007690429688, "learning_rate": 0.000399574315193456, "loss": 6.6955, "step": 75 }, { "epoch": 0.0243161094224924, "grad_norm": 191.5496826171875, "learning_rate": 0.0003995611215291904, "loss": 12.0782, "step": 76 }, { "epoch": 0.02463605823068309, "grad_norm": 52.53076171875, "learning_rate": 0.00039954772674768605, "loss": 8.514, "step": 77 }, { "epoch": 0.02495600703887378, "grad_norm": 138.49893188476562, "learning_rate": 0.0003995341308631543, "loss": 31.316, "step": 78 }, { "epoch": 0.02527595584706447, "grad_norm": 22.098583221435547, "learning_rate": 0.00039952033389001985, "loss": 8.0865, "step": 79 }, { "epoch": 0.02559590465525516, "grad_norm": 17.31378936767578, "learning_rate": 0.00039950633584292063, "loss": 7.4476, "step": 80 }, { "epoch": 0.02591585346344585, "grad_norm": 12.283071517944336, "learning_rate": 0.00039949213673670826, "loss": 7.1162, "step": 81 }, { "epoch": 0.02623580227163654, "grad_norm": 11.07091999053955, "learning_rate": 0.00039947773658644735, "loss": 6.9467, "step": 82 }, { "epoch": 0.026555751079827227, "grad_norm": 5.554511547088623, "learning_rate": 0.00039946313540741593, "loss": 6.9002, "step": 83 }, { "epoch": 0.026875699888017916, "grad_norm": 7.668940544128418, "learning_rate": 0.0003994483332151053, "loss": 6.7886, "step": 84 }, { "epoch": 0.027195648696208607, "grad_norm": 6.739316463470459, "learning_rate": 0.0003994333300252201, "loss": 6.7452, "step": 85 }, { "epoch": 0.027515597504399296, "grad_norm": 4.137218952178955, "learning_rate": 0.0003994181258536781, "loss": 6.4451, "step": 86 }, { "epoch": 0.027835546312589984, "grad_norm": 6.443964958190918, "learning_rate": 0.0003994027207166103, "loss": 6.5939, "step": 87 }, { "epoch": 0.028155495120780676, "grad_norm": 5.331020832061768, "learning_rate": 0.00039938711463036105, "loss": 6.473, "step": 88 }, { "epoch": 0.028475443928971365, "grad_norm": 3.314025640487671, "learning_rate": 0.00039937130761148775, "loss": 6.4812, "step": 89 }, { "epoch": 0.028795392737162053, "grad_norm": 3.9543004035949707, "learning_rate": 0.0003993552996767611, "loss": 6.7518, "step": 90 }, { "epoch": 0.029115341545352745, "grad_norm": 4.591469764709473, "learning_rate": 0.00039933909084316493, "loss": 6.8772, "step": 91 }, { "epoch": 0.029435290353543433, "grad_norm": 5.4851837158203125, "learning_rate": 0.00039932268112789624, "loss": 6.7562, "step": 92 }, { "epoch": 0.029755239161734122, "grad_norm": 3.0986509323120117, "learning_rate": 0.00039930607054836504, "loss": 6.7457, "step": 93 }, { "epoch": 0.03007518796992481, "grad_norm": 6.524487018585205, "learning_rate": 0.00039928925912219456, "loss": 6.6129, "step": 94 }, { "epoch": 0.030395136778115502, "grad_norm": 5.0719313621521, "learning_rate": 0.0003992722468672211, "loss": 6.5837, "step": 95 }, { "epoch": 0.03071508558630619, "grad_norm": 4.822458744049072, "learning_rate": 0.00039925503380149405, "loss": 6.5082, "step": 96 }, { "epoch": 0.03103503439449688, "grad_norm": 5.4806599617004395, "learning_rate": 0.00039923761994327574, "loss": 6.8089, "step": 97 }, { "epoch": 0.03135498320268757, "grad_norm": 5.32270622253418, "learning_rate": 0.00039922000531104174, "loss": 6.8169, "step": 98 }, { "epoch": 0.03167493201087826, "grad_norm": 4.849973678588867, "learning_rate": 0.00039920218992348046, "loss": 6.5797, "step": 99 }, { "epoch": 0.03199488081906895, "grad_norm": 3.636552095413208, "learning_rate": 0.00039918417379949326, "loss": 6.6796, "step": 100 }, { "epoch": 0.032314829627259636, "grad_norm": 7.389275550842285, "learning_rate": 0.0003991659569581948, "loss": 6.5838, "step": 101 }, { "epoch": 0.032634778435450325, "grad_norm": 6.951813220977783, "learning_rate": 0.0003991475394189123, "loss": 6.6162, "step": 102 }, { "epoch": 0.03295472724364102, "grad_norm": 6.038928985595703, "learning_rate": 0.000399128921201186, "loss": 6.5367, "step": 103 }, { "epoch": 0.03327467605183171, "grad_norm": 4.697129249572754, "learning_rate": 0.0003991101023247693, "loss": 6.822, "step": 104 }, { "epoch": 0.0335946248600224, "grad_norm": 3.3993539810180664, "learning_rate": 0.00039909108280962826, "loss": 6.7785, "step": 105 }, { "epoch": 0.033914573668213085, "grad_norm": 5.630346775054932, "learning_rate": 0.0003990718626759419, "loss": 6.6505, "step": 106 }, { "epoch": 0.034234522476403774, "grad_norm": 4.182599067687988, "learning_rate": 0.00039905244194410203, "loss": 6.7451, "step": 107 }, { "epoch": 0.03455447128459446, "grad_norm": 3.2485625743865967, "learning_rate": 0.00039903282063471324, "loss": 6.5288, "step": 108 }, { "epoch": 0.03487442009278516, "grad_norm": 10.302080154418945, "learning_rate": 0.00039901299876859313, "loss": 6.9176, "step": 109 }, { "epoch": 0.035194368900975846, "grad_norm": 10.48038387298584, "learning_rate": 0.00039899297636677197, "loss": 6.9726, "step": 110 }, { "epoch": 0.035514317709166535, "grad_norm": 5.058242321014404, "learning_rate": 0.00039897275345049263, "loss": 6.6908, "step": 111 }, { "epoch": 0.03583426651735722, "grad_norm": 6.116095066070557, "learning_rate": 0.000398952330041211, "loss": 6.5533, "step": 112 }, { "epoch": 0.03615421532554791, "grad_norm": 6.3986639976501465, "learning_rate": 0.0003989317061605955, "loss": 6.8069, "step": 113 }, { "epoch": 0.0364741641337386, "grad_norm": 5.98941707611084, "learning_rate": 0.0003989108818305273, "loss": 6.8029, "step": 114 }, { "epoch": 0.03679411294192929, "grad_norm": 4.1077470779418945, "learning_rate": 0.00039888985707310024, "loss": 6.5364, "step": 115 }, { "epoch": 0.037114061750119984, "grad_norm": 7.463810443878174, "learning_rate": 0.00039886863191062076, "loss": 6.7412, "step": 116 }, { "epoch": 0.03743401055831067, "grad_norm": 4.7363362312316895, "learning_rate": 0.000398847206365608, "loss": 6.6096, "step": 117 }, { "epoch": 0.03775395936650136, "grad_norm": 7.381870269775391, "learning_rate": 0.00039882558046079364, "loss": 6.9752, "step": 118 }, { "epoch": 0.03807390817469205, "grad_norm": 11.667238235473633, "learning_rate": 0.000398803754219122, "loss": 6.9014, "step": 119 }, { "epoch": 0.03839385698288274, "grad_norm": 10.061962127685547, "learning_rate": 0.0003987817276637498, "loss": 6.6761, "step": 120 }, { "epoch": 0.038713805791073426, "grad_norm": 6.85370397567749, "learning_rate": 0.00039875950081804653, "loss": 6.7025, "step": 121 }, { "epoch": 0.039033754599264114, "grad_norm": 6.230566501617432, "learning_rate": 0.0003987370737055939, "loss": 6.7836, "step": 122 }, { "epoch": 0.03935370340745481, "grad_norm": 9.128447532653809, "learning_rate": 0.0003987144463501864, "loss": 7.0278, "step": 123 }, { "epoch": 0.0396736522156455, "grad_norm": 4.422046184539795, "learning_rate": 0.0003986916187758306, "loss": 6.7092, "step": 124 }, { "epoch": 0.03999360102383619, "grad_norm": 5.277736186981201, "learning_rate": 0.00039866859100674585, "loss": 6.5118, "step": 125 }, { "epoch": 0.040313549832026875, "grad_norm": 5.670833110809326, "learning_rate": 0.0003986453630673637, "loss": 6.6172, "step": 126 }, { "epoch": 0.040633498640217564, "grad_norm": 8.088812828063965, "learning_rate": 0.00039862193498232815, "loss": 6.7693, "step": 127 }, { "epoch": 0.04095344744840825, "grad_norm": 4.622250556945801, "learning_rate": 0.0003985983067764955, "loss": 6.7558, "step": 128 }, { "epoch": 0.04127339625659895, "grad_norm": 3.96929669380188, "learning_rate": 0.0003985744784749343, "loss": 6.6975, "step": 129 }, { "epoch": 0.041593345064789636, "grad_norm": 4.888178825378418, "learning_rate": 0.00039855045010292565, "loss": 6.7552, "step": 130 }, { "epoch": 0.041913293872980324, "grad_norm": 3.579385280609131, "learning_rate": 0.0003985262216859627, "loss": 6.5003, "step": 131 }, { "epoch": 0.04223324268117101, "grad_norm": 7.001453876495361, "learning_rate": 0.0003985017932497508, "loss": 6.6921, "step": 132 }, { "epoch": 0.0425531914893617, "grad_norm": 3.956249713897705, "learning_rate": 0.00039847716482020767, "loss": 6.6835, "step": 133 }, { "epoch": 0.04287314029755239, "grad_norm": 6.118314266204834, "learning_rate": 0.0003984523364234632, "loss": 6.6699, "step": 134 }, { "epoch": 0.04319308910574308, "grad_norm": 6.215986728668213, "learning_rate": 0.00039842730808585926, "loss": 6.5631, "step": 135 }, { "epoch": 0.04351303791393377, "grad_norm": 5.343611717224121, "learning_rate": 0.00039840207983395017, "loss": 6.4691, "step": 136 }, { "epoch": 0.04383298672212446, "grad_norm": 5.891202449798584, "learning_rate": 0.00039837665169450195, "loss": 6.3602, "step": 137 }, { "epoch": 0.04415293553031515, "grad_norm": 5.650427341461182, "learning_rate": 0.000398351023694493, "loss": 6.5209, "step": 138 }, { "epoch": 0.04447288433850584, "grad_norm": 8.644763946533203, "learning_rate": 0.0003983251958611137, "loss": 6.7151, "step": 139 }, { "epoch": 0.04479283314669653, "grad_norm": 6.320985317230225, "learning_rate": 0.00039829916822176634, "loss": 6.615, "step": 140 }, { "epoch": 0.045112781954887216, "grad_norm": 6.1446428298950195, "learning_rate": 0.0003982729408040653, "loss": 6.5516, "step": 141 }, { "epoch": 0.04543273076307791, "grad_norm": 5.089375972747803, "learning_rate": 0.00039824651363583693, "loss": 6.5904, "step": 142 }, { "epoch": 0.0457526795712686, "grad_norm": 5.653002738952637, "learning_rate": 0.00039821988674511934, "loss": 6.5341, "step": 143 }, { "epoch": 0.04607262837945929, "grad_norm": 9.837624549865723, "learning_rate": 0.0003981930601601628, "loss": 6.656, "step": 144 }, { "epoch": 0.046392577187649976, "grad_norm": 5.145659446716309, "learning_rate": 0.0003981660339094293, "loss": 6.3306, "step": 145 }, { "epoch": 0.046712525995840665, "grad_norm": 4.123940944671631, "learning_rate": 0.00039813880802159254, "loss": 6.5867, "step": 146 }, { "epoch": 0.04703247480403135, "grad_norm": 7.826724529266357, "learning_rate": 0.0003981113825255383, "loss": 6.6373, "step": 147 }, { "epoch": 0.04735242361222204, "grad_norm": 7.781490802764893, "learning_rate": 0.00039808375745036396, "loss": 6.6397, "step": 148 }, { "epoch": 0.04767237242041274, "grad_norm": 5.552570343017578, "learning_rate": 0.0003980559328253787, "loss": 6.673, "step": 149 }, { "epoch": 0.047992321228603425, "grad_norm": 6.54767370223999, "learning_rate": 0.00039802790868010335, "loss": 6.5612, "step": 150 }, { "epoch": 0.048312270036794114, "grad_norm": 8.244477272033691, "learning_rate": 0.00039799968504427056, "loss": 6.74, "step": 151 }, { "epoch": 0.0486322188449848, "grad_norm": 7.447508811950684, "learning_rate": 0.0003979712619478245, "loss": 6.7098, "step": 152 }, { "epoch": 0.04895216765317549, "grad_norm": 7.424586772918701, "learning_rate": 0.00039794263942092103, "loss": 6.6281, "step": 153 }, { "epoch": 0.04927211646136618, "grad_norm": 6.1032891273498535, "learning_rate": 0.00039791381749392754, "loss": 6.5981, "step": 154 }, { "epoch": 0.04959206526955687, "grad_norm": 5.076866149902344, "learning_rate": 0.00039788479619742314, "loss": 6.5594, "step": 155 }, { "epoch": 0.04991201407774756, "grad_norm": 6.170379161834717, "learning_rate": 0.00039785557556219807, "loss": 6.6772, "step": 156 }, { "epoch": 0.05023196288593825, "grad_norm": 3.279794454574585, "learning_rate": 0.00039782615561925457, "loss": 6.6905, "step": 157 }, { "epoch": 0.05055191169412894, "grad_norm": 4.933487892150879, "learning_rate": 0.000397796536399806, "loss": 6.5733, "step": 158 }, { "epoch": 0.05087186050231963, "grad_norm": 4.821317672729492, "learning_rate": 0.00039776671793527734, "loss": 6.5997, "step": 159 }, { "epoch": 0.05119180931051032, "grad_norm": 4.446855068206787, "learning_rate": 0.00039773670025730466, "loss": 6.5848, "step": 160 }, { "epoch": 0.051511758118701005, "grad_norm": 5.981055736541748, "learning_rate": 0.0003977064833977358, "loss": 6.5767, "step": 161 }, { "epoch": 0.0518317069268917, "grad_norm": 5.806812286376953, "learning_rate": 0.0003976760673886296, "loss": 6.6296, "step": 162 }, { "epoch": 0.05215165573508239, "grad_norm": 7.575981140136719, "learning_rate": 0.0003976454522622563, "loss": 6.521, "step": 163 }, { "epoch": 0.05247160454327308, "grad_norm": 7.472120761871338, "learning_rate": 0.00039761463805109744, "loss": 6.6819, "step": 164 }, { "epoch": 0.052791553351463766, "grad_norm": 5.2565412521362305, "learning_rate": 0.0003975836247878458, "loss": 6.5501, "step": 165 }, { "epoch": 0.053111502159654454, "grad_norm": 5.9353413581848145, "learning_rate": 0.0003975524125054051, "loss": 6.7042, "step": 166 }, { "epoch": 0.05343145096784514, "grad_norm": 4.687541961669922, "learning_rate": 0.00039752100123689065, "loss": 6.6537, "step": 167 }, { "epoch": 0.05375139977603583, "grad_norm": 5.580118179321289, "learning_rate": 0.00039748939101562846, "loss": 6.6087, "step": 168 }, { "epoch": 0.054071348584226527, "grad_norm": 6.239872932434082, "learning_rate": 0.00039745758187515585, "loss": 6.6247, "step": 169 }, { "epoch": 0.054391297392417215, "grad_norm": 7.768801689147949, "learning_rate": 0.000397425573849221, "loss": 6.6399, "step": 170 }, { "epoch": 0.0547112462006079, "grad_norm": 7.151449203491211, "learning_rate": 0.00039739336697178343, "loss": 6.4196, "step": 171 }, { "epoch": 0.05503119500879859, "grad_norm": 9.474916458129883, "learning_rate": 0.0003973609612770133, "loss": 6.8067, "step": 172 }, { "epoch": 0.05535114381698928, "grad_norm": 7.509045124053955, "learning_rate": 0.00039732835679929184, "loss": 6.91, "step": 173 }, { "epoch": 0.05567109262517997, "grad_norm": 6.453274726867676, "learning_rate": 0.00039729555357321123, "loss": 6.6127, "step": 174 }, { "epoch": 0.055991041433370664, "grad_norm": 6.131414413452148, "learning_rate": 0.00039726255163357444, "loss": 6.6598, "step": 175 }, { "epoch": 0.05631099024156135, "grad_norm": 7.787036895751953, "learning_rate": 0.00039722935101539527, "loss": 6.6519, "step": 176 }, { "epoch": 0.05663093904975204, "grad_norm": 7.098258018493652, "learning_rate": 0.00039719595175389833, "loss": 6.6283, "step": 177 }, { "epoch": 0.05695088785794273, "grad_norm": 6.19675350189209, "learning_rate": 0.0003971623538845191, "loss": 6.5764, "step": 178 }, { "epoch": 0.05727083666613342, "grad_norm": 6.365749359130859, "learning_rate": 0.0003971285574429034, "loss": 6.8207, "step": 179 }, { "epoch": 0.057590785474324106, "grad_norm": 6.72816801071167, "learning_rate": 0.0003970945624649082, "loss": 6.5185, "step": 180 }, { "epoch": 0.057910734282514795, "grad_norm": 6.747672080993652, "learning_rate": 0.00039706036898660095, "loss": 6.5095, "step": 181 }, { "epoch": 0.05823068309070549, "grad_norm": 8.517967224121094, "learning_rate": 0.0003970259770442594, "loss": 6.6087, "step": 182 }, { "epoch": 0.05855063189889618, "grad_norm": 7.719740390777588, "learning_rate": 0.00039699138667437234, "loss": 6.6355, "step": 183 }, { "epoch": 0.05887058070708687, "grad_norm": 5.7464518547058105, "learning_rate": 0.0003969565979136387, "loss": 6.5026, "step": 184 }, { "epoch": 0.059190529515277555, "grad_norm": 7.535585403442383, "learning_rate": 0.00039692161079896816, "loss": 6.5819, "step": 185 }, { "epoch": 0.059510478323468244, "grad_norm": 10.726088523864746, "learning_rate": 0.0003968864253674806, "loss": 6.6332, "step": 186 }, { "epoch": 0.05983042713165893, "grad_norm": 6.952598571777344, "learning_rate": 0.0003968510416565067, "loss": 6.4593, "step": 187 }, { "epoch": 0.06015037593984962, "grad_norm": 5.812473297119141, "learning_rate": 0.0003968154597035869, "loss": 6.4498, "step": 188 }, { "epoch": 0.060470324748040316, "grad_norm": 6.252615928649902, "learning_rate": 0.00039677967954647263, "loss": 6.4724, "step": 189 }, { "epoch": 0.060790273556231005, "grad_norm": 5.516043663024902, "learning_rate": 0.00039674370122312505, "loss": 6.6256, "step": 190 }, { "epoch": 0.06111022236442169, "grad_norm": 7.377323627471924, "learning_rate": 0.00039670752477171604, "loss": 6.8132, "step": 191 }, { "epoch": 0.06143017117261238, "grad_norm": 7.111225128173828, "learning_rate": 0.0003966711502306273, "loss": 6.6017, "step": 192 }, { "epoch": 0.06175011998080307, "grad_norm": 7.866201877593994, "learning_rate": 0.0003966345776384509, "loss": 6.6965, "step": 193 }, { "epoch": 0.06207006878899376, "grad_norm": 8.679105758666992, "learning_rate": 0.00039659780703398895, "loss": 6.482, "step": 194 }, { "epoch": 0.062390017597184454, "grad_norm": 6.715935230255127, "learning_rate": 0.00039656083845625377, "loss": 6.5263, "step": 195 }, { "epoch": 0.06270996640537514, "grad_norm": 6.897395610809326, "learning_rate": 0.0003965236719444675, "loss": 6.7682, "step": 196 }, { "epoch": 0.06302991521356582, "grad_norm": 5.384663105010986, "learning_rate": 0.0003964863075380626, "loss": 6.5211, "step": 197 }, { "epoch": 0.06334986402175652, "grad_norm": 7.211827278137207, "learning_rate": 0.0003964487452766811, "loss": 6.7227, "step": 198 }, { "epoch": 0.06366981282994721, "grad_norm": 6.876810550689697, "learning_rate": 0.0003964109852001753, "loss": 6.4332, "step": 199 }, { "epoch": 0.0639897616381379, "grad_norm": 5.7009453773498535, "learning_rate": 0.0003963730273486072, "loss": 6.5789, "step": 200 }, { "epoch": 0.0639897616381379, "eval_loss": 3.317033290863037, "eval_runtime": 233.4326, "eval_samples_per_second": 5.638, "eval_steps_per_second": 1.409, "step": 200 }, { "epoch": 0.06430971044632859, "grad_norm": 5.751672744750977, "learning_rate": 0.00039633487176224863, "loss": 6.6452, "step": 201 }, { "epoch": 0.06462965925451927, "grad_norm": 7.075699806213379, "learning_rate": 0.0003962965184815812, "loss": 6.6866, "step": 202 }, { "epoch": 0.06494960806270997, "grad_norm": 8.748412132263184, "learning_rate": 0.00039625796754729636, "loss": 6.6261, "step": 203 }, { "epoch": 0.06526955687090065, "grad_norm": 9.29478931427002, "learning_rate": 0.00039621921900029523, "loss": 6.8016, "step": 204 }, { "epoch": 0.06558950567909134, "grad_norm": 6.518805027008057, "learning_rate": 0.00039618027288168843, "loss": 6.6664, "step": 205 }, { "epoch": 0.06590945448728204, "grad_norm": 7.012482166290283, "learning_rate": 0.0003961411292327965, "loss": 6.5855, "step": 206 }, { "epoch": 0.06622940329547272, "grad_norm": 4.071852684020996, "learning_rate": 0.00039610178809514925, "loss": 6.7401, "step": 207 }, { "epoch": 0.06654935210366342, "grad_norm": 6.612091541290283, "learning_rate": 0.0003960622495104862, "loss": 6.456, "step": 208 }, { "epoch": 0.0668693009118541, "grad_norm": 4.622671604156494, "learning_rate": 0.00039602251352075635, "loss": 6.629, "step": 209 }, { "epoch": 0.0671892497200448, "grad_norm": 5.038511276245117, "learning_rate": 0.00039598258016811805, "loss": 6.4263, "step": 210 }, { "epoch": 0.06750919852823548, "grad_norm": 5.86072301864624, "learning_rate": 0.00039594244949493903, "loss": 6.4438, "step": 211 }, { "epoch": 0.06782914733642617, "grad_norm": 5.774945259094238, "learning_rate": 0.0003959021215437965, "loss": 6.8048, "step": 212 }, { "epoch": 0.06814909614461687, "grad_norm": 6.453885555267334, "learning_rate": 0.000395861596357477, "loss": 6.5328, "step": 213 }, { "epoch": 0.06846904495280755, "grad_norm": 9.138883590698242, "learning_rate": 0.000395820873978976, "loss": 6.6628, "step": 214 }, { "epoch": 0.06878899376099824, "grad_norm": 7.3241376876831055, "learning_rate": 0.00039577995445149865, "loss": 6.5668, "step": 215 }, { "epoch": 0.06910894256918892, "grad_norm": 6.592808246612549, "learning_rate": 0.00039573883781845884, "loss": 6.5612, "step": 216 }, { "epoch": 0.06942889137737962, "grad_norm": 4.612181186676025, "learning_rate": 0.00039569752412347994, "loss": 6.5892, "step": 217 }, { "epoch": 0.06974884018557032, "grad_norm": 5.334681034088135, "learning_rate": 0.0003956560134103941, "loss": 6.5731, "step": 218 }, { "epoch": 0.070068788993761, "grad_norm": 9.595178604125977, "learning_rate": 0.0003956143057232427, "loss": 6.5792, "step": 219 }, { "epoch": 0.07038873780195169, "grad_norm": 11.286225318908691, "learning_rate": 0.000395572401106276, "loss": 6.6624, "step": 220 }, { "epoch": 0.07070868661014237, "grad_norm": 7.74772834777832, "learning_rate": 0.00039553029960395325, "loss": 6.7217, "step": 221 }, { "epoch": 0.07102863541833307, "grad_norm": 6.920510768890381, "learning_rate": 0.0003954880012609425, "loss": 6.6113, "step": 222 }, { "epoch": 0.07134858422652375, "grad_norm": 6.8180670738220215, "learning_rate": 0.00039544550612212085, "loss": 6.773, "step": 223 }, { "epoch": 0.07166853303471445, "grad_norm": 8.286237716674805, "learning_rate": 0.00039540281423257394, "loss": 6.6248, "step": 224 }, { "epoch": 0.07198848184290514, "grad_norm": 4.819367408752441, "learning_rate": 0.00039535992563759627, "loss": 6.7728, "step": 225 }, { "epoch": 0.07230843065109582, "grad_norm": 6.927038192749023, "learning_rate": 0.000395316840382691, "loss": 6.7501, "step": 226 }, { "epoch": 0.07262837945928652, "grad_norm": 6.734741687774658, "learning_rate": 0.00039527355851357005, "loss": 6.3144, "step": 227 }, { "epoch": 0.0729483282674772, "grad_norm": 7.074644565582275, "learning_rate": 0.0003952300800761538, "loss": 6.7793, "step": 228 }, { "epoch": 0.0732682770756679, "grad_norm": 15.208846092224121, "learning_rate": 0.00039518640511657126, "loss": 6.4693, "step": 229 }, { "epoch": 0.07358822588385858, "grad_norm": 5.9610090255737305, "learning_rate": 0.00039514253368116, "loss": 6.7493, "step": 230 }, { "epoch": 0.07390817469204927, "grad_norm": 17.66107940673828, "learning_rate": 0.00039509846581646583, "loss": 6.6046, "step": 231 }, { "epoch": 0.07422812350023997, "grad_norm": 6.6399407386779785, "learning_rate": 0.00039505420156924316, "loss": 6.7082, "step": 232 }, { "epoch": 0.07454807230843065, "grad_norm": 5.893923759460449, "learning_rate": 0.00039500974098645463, "loss": 6.4106, "step": 233 }, { "epoch": 0.07486802111662134, "grad_norm": 5.578292369842529, "learning_rate": 0.00039496508411527144, "loss": 6.6654, "step": 234 }, { "epoch": 0.07518796992481203, "grad_norm": 9.475028991699219, "learning_rate": 0.00039492023100307273, "loss": 6.4931, "step": 235 }, { "epoch": 0.07550791873300272, "grad_norm": 8.972395896911621, "learning_rate": 0.00039487518169744587, "loss": 6.3256, "step": 236 }, { "epoch": 0.0758278675411934, "grad_norm": 34.38764953613281, "learning_rate": 0.00039482993624618667, "loss": 6.2344, "step": 237 }, { "epoch": 0.0761478163493841, "grad_norm": 12.006391525268555, "learning_rate": 0.00039478449469729877, "loss": 6.7194, "step": 238 }, { "epoch": 0.0764677651575748, "grad_norm": 13.673998832702637, "learning_rate": 0.00039473885709899394, "loss": 6.4708, "step": 239 }, { "epoch": 0.07678771396576548, "grad_norm": 9.385038375854492, "learning_rate": 0.00039469302349969194, "loss": 6.6304, "step": 240 }, { "epoch": 0.07710766277395617, "grad_norm": 565.4649047851562, "learning_rate": 0.00039464699394802064, "loss": 6.527, "step": 241 }, { "epoch": 0.07742761158214685, "grad_norm": 35.85231399536133, "learning_rate": 0.00039460076849281544, "loss": 7.1016, "step": 242 }, { "epoch": 0.07774756039033755, "grad_norm": 16.34745979309082, "learning_rate": 0.00039455434718311996, "loss": 6.5491, "step": 243 }, { "epoch": 0.07806750919852823, "grad_norm": 10.916836738586426, "learning_rate": 0.0003945077300681855, "loss": 6.5222, "step": 244 }, { "epoch": 0.07838745800671892, "grad_norm": 9.447298049926758, "learning_rate": 0.00039446091719747093, "loss": 6.4075, "step": 245 }, { "epoch": 0.07870740681490962, "grad_norm": 9.539137840270996, "learning_rate": 0.00039441390862064306, "loss": 6.6475, "step": 246 }, { "epoch": 0.0790273556231003, "grad_norm": 7.535435199737549, "learning_rate": 0.0003943667043875762, "loss": 6.2207, "step": 247 }, { "epoch": 0.079347304431291, "grad_norm": 6.822633743286133, "learning_rate": 0.0003943193045483523, "loss": 6.3106, "step": 248 }, { "epoch": 0.07966725323948168, "grad_norm": 7.698075294494629, "learning_rate": 0.0003942717091532607, "loss": 6.6178, "step": 249 }, { "epoch": 0.07998720204767237, "grad_norm": 8.507805824279785, "learning_rate": 0.0003942239182527985, "loss": 6.2366, "step": 250 }, { "epoch": 0.08030715085586307, "grad_norm": 17.260297775268555, "learning_rate": 0.00039417593189766995, "loss": 6.3205, "step": 251 }, { "epoch": 0.08062709966405375, "grad_norm": 9.483721733093262, "learning_rate": 0.00039412775013878673, "loss": 6.5035, "step": 252 }, { "epoch": 0.08094704847224445, "grad_norm": 7.060410499572754, "learning_rate": 0.00039407937302726796, "loss": 6.5481, "step": 253 }, { "epoch": 0.08126699728043513, "grad_norm": 10.322737693786621, "learning_rate": 0.00039403080061443994, "loss": 6.3955, "step": 254 }, { "epoch": 0.08158694608862582, "grad_norm": 12.29690170288086, "learning_rate": 0.0003939820329518361, "loss": 6.6427, "step": 255 }, { "epoch": 0.0819068948968165, "grad_norm": 6.971141338348389, "learning_rate": 0.0003939330700911972, "loss": 6.4136, "step": 256 }, { "epoch": 0.0822268437050072, "grad_norm": 7.917289733886719, "learning_rate": 0.0003938839120844709, "loss": 6.5704, "step": 257 }, { "epoch": 0.0825467925131979, "grad_norm": 6.552984237670898, "learning_rate": 0.00039383455898381206, "loss": 6.3583, "step": 258 }, { "epoch": 0.08286674132138858, "grad_norm": 6.688981056213379, "learning_rate": 0.0003937850108415825, "loss": 6.3685, "step": 259 }, { "epoch": 0.08318669012957927, "grad_norm": 7.041812896728516, "learning_rate": 0.0003937352677103509, "loss": 6.5508, "step": 260 }, { "epoch": 0.08350663893776995, "grad_norm": 10.411174774169922, "learning_rate": 0.0003936853296428928, "loss": 6.4838, "step": 261 }, { "epoch": 0.08382658774596065, "grad_norm": 7.945723533630371, "learning_rate": 0.00039363519669219073, "loss": 6.4606, "step": 262 }, { "epoch": 0.08414653655415133, "grad_norm": 6.014384746551514, "learning_rate": 0.0003935848689114338, "loss": 6.3841, "step": 263 }, { "epoch": 0.08446648536234203, "grad_norm": 4.446471214294434, "learning_rate": 0.00039353434635401786, "loss": 6.4982, "step": 264 }, { "epoch": 0.08478643417053272, "grad_norm": 6.786041259765625, "learning_rate": 0.0003934836290735455, "loss": 6.6322, "step": 265 }, { "epoch": 0.0851063829787234, "grad_norm": 10.881097793579102, "learning_rate": 0.0003934327171238259, "loss": 6.543, "step": 266 }, { "epoch": 0.0854263317869141, "grad_norm": 5.207790851593018, "learning_rate": 0.00039338161055887466, "loss": 6.4335, "step": 267 }, { "epoch": 0.08574628059510478, "grad_norm": 4.667728900909424, "learning_rate": 0.0003933303094329139, "loss": 6.3188, "step": 268 }, { "epoch": 0.08606622940329547, "grad_norm": 8.13780403137207, "learning_rate": 0.0003932788138003723, "loss": 6.4419, "step": 269 }, { "epoch": 0.08638617821148616, "grad_norm": 6.202636241912842, "learning_rate": 0.00039322712371588477, "loss": 6.3797, "step": 270 }, { "epoch": 0.08670612701967685, "grad_norm": 6.082520961761475, "learning_rate": 0.0003931752392342924, "loss": 6.4058, "step": 271 }, { "epoch": 0.08702607582786755, "grad_norm": 8.391554832458496, "learning_rate": 0.00039312316041064283, "loss": 6.6174, "step": 272 }, { "epoch": 0.08734602463605823, "grad_norm": 8.977246284484863, "learning_rate": 0.0003930708873001898, "loss": 6.3543, "step": 273 }, { "epoch": 0.08766597344424892, "grad_norm": 7.828481197357178, "learning_rate": 0.00039301841995839285, "loss": 6.4354, "step": 274 }, { "epoch": 0.0879859222524396, "grad_norm": 6.4253387451171875, "learning_rate": 0.0003929657584409181, "loss": 6.4135, "step": 275 }, { "epoch": 0.0883058710606303, "grad_norm": 6.666152000427246, "learning_rate": 0.0003929129028036374, "loss": 6.157, "step": 276 }, { "epoch": 0.08862581986882098, "grad_norm": 5.554171562194824, "learning_rate": 0.00039285985310262854, "loss": 6.2411, "step": 277 }, { "epoch": 0.08894576867701168, "grad_norm": 5.765629291534424, "learning_rate": 0.0003928066093941753, "loss": 6.3223, "step": 278 }, { "epoch": 0.08926571748520237, "grad_norm": 8.940204620361328, "learning_rate": 0.0003927531717347673, "loss": 6.4208, "step": 279 }, { "epoch": 0.08958566629339305, "grad_norm": 6.127601623535156, "learning_rate": 0.00039269954018109976, "loss": 6.1999, "step": 280 }, { "epoch": 0.08990561510158375, "grad_norm": 9.167810440063477, "learning_rate": 0.00039264571479007387, "loss": 6.4898, "step": 281 }, { "epoch": 0.09022556390977443, "grad_norm": 7.327286720275879, "learning_rate": 0.0003925916956187963, "loss": 6.5342, "step": 282 }, { "epoch": 0.09054551271796513, "grad_norm": 8.352947235107422, "learning_rate": 0.00039253748272457937, "loss": 6.3908, "step": 283 }, { "epoch": 0.09086546152615582, "grad_norm": 6.588724136352539, "learning_rate": 0.00039248307616494095, "loss": 6.5415, "step": 284 }, { "epoch": 0.0911854103343465, "grad_norm": 6.817644119262695, "learning_rate": 0.00039242847599760435, "loss": 6.2451, "step": 285 }, { "epoch": 0.0915053591425372, "grad_norm": 4.902499198913574, "learning_rate": 0.00039237368228049827, "loss": 6.2267, "step": 286 }, { "epoch": 0.09182530795072788, "grad_norm": 6.349551200866699, "learning_rate": 0.0003923186950717569, "loss": 6.3978, "step": 287 }, { "epoch": 0.09214525675891858, "grad_norm": 4.191233158111572, "learning_rate": 0.00039226351442971937, "loss": 6.2194, "step": 288 }, { "epoch": 0.09246520556710926, "grad_norm": 6.8765387535095215, "learning_rate": 0.0003922081404129305, "loss": 6.4729, "step": 289 }, { "epoch": 0.09278515437529995, "grad_norm": 5.389702320098877, "learning_rate": 0.00039215257308013987, "loss": 6.2693, "step": 290 }, { "epoch": 0.09310510318349065, "grad_norm": 5.059742450714111, "learning_rate": 0.0003920968124903025, "loss": 6.3552, "step": 291 }, { "epoch": 0.09342505199168133, "grad_norm": 7.156213283538818, "learning_rate": 0.0003920408587025782, "loss": 6.1788, "step": 292 }, { "epoch": 0.09374500079987202, "grad_norm": 7.247560977935791, "learning_rate": 0.0003919847117763319, "loss": 6.6775, "step": 293 }, { "epoch": 0.0940649496080627, "grad_norm": 5.354501724243164, "learning_rate": 0.0003919283717711332, "loss": 6.5433, "step": 294 }, { "epoch": 0.0943848984162534, "grad_norm": 6.527252674102783, "learning_rate": 0.00039187183874675696, "loss": 6.4697, "step": 295 }, { "epoch": 0.09470484722444408, "grad_norm": 6.278224945068359, "learning_rate": 0.00039181511276318253, "loss": 6.4026, "step": 296 }, { "epoch": 0.09502479603263478, "grad_norm": 9.525894165039062, "learning_rate": 0.00039175819388059395, "loss": 6.3048, "step": 297 }, { "epoch": 0.09534474484082547, "grad_norm": 10.046257019042969, "learning_rate": 0.0003917010821593801, "loss": 6.3838, "step": 298 }, { "epoch": 0.09566469364901616, "grad_norm": 5.030762672424316, "learning_rate": 0.0003916437776601344, "loss": 6.3252, "step": 299 }, { "epoch": 0.09598464245720685, "grad_norm": 9.128808975219727, "learning_rate": 0.00039158628044365476, "loss": 6.4524, "step": 300 }, { "epoch": 0.09630459126539753, "grad_norm": 5.657589912414551, "learning_rate": 0.00039152859057094353, "loss": 6.3005, "step": 301 }, { "epoch": 0.09662454007358823, "grad_norm": 8.320329666137695, "learning_rate": 0.0003914707081032074, "loss": 6.2912, "step": 302 }, { "epoch": 0.09694448888177891, "grad_norm": 4.0212578773498535, "learning_rate": 0.00039141263310185783, "loss": 6.3545, "step": 303 }, { "epoch": 0.0972644376899696, "grad_norm": 5.65709924697876, "learning_rate": 0.00039135436562850984, "loss": 6.2217, "step": 304 }, { "epoch": 0.0975843864981603, "grad_norm": 6.352542400360107, "learning_rate": 0.00039129590574498325, "loss": 6.5012, "step": 305 }, { "epoch": 0.09790433530635098, "grad_norm": 4.846847057342529, "learning_rate": 0.0003912372535133018, "loss": 6.2432, "step": 306 }, { "epoch": 0.09822428411454168, "grad_norm": 4.925963401794434, "learning_rate": 0.0003911784089956931, "loss": 6.3315, "step": 307 }, { "epoch": 0.09854423292273236, "grad_norm": 8.602291107177734, "learning_rate": 0.0003911193722545893, "loss": 6.4698, "step": 308 }, { "epoch": 0.09886418173092305, "grad_norm": 6.059446811676025, "learning_rate": 0.00039106014335262587, "loss": 6.3634, "step": 309 }, { "epoch": 0.09918413053911374, "grad_norm": 5.607949256896973, "learning_rate": 0.0003910007223526427, "loss": 6.1728, "step": 310 }, { "epoch": 0.09950407934730443, "grad_norm": 6.827522277832031, "learning_rate": 0.0003909411093176831, "loss": 6.1584, "step": 311 }, { "epoch": 0.09982402815549513, "grad_norm": 9.145511627197266, "learning_rate": 0.00039088130431099425, "loss": 6.6123, "step": 312 }, { "epoch": 0.10014397696368581, "grad_norm": 8.265061378479004, "learning_rate": 0.00039082130739602716, "loss": 6.5472, "step": 313 }, { "epoch": 0.1004639257718765, "grad_norm": 8.550250053405762, "learning_rate": 0.0003907611186364361, "loss": 6.2075, "step": 314 }, { "epoch": 0.10078387458006718, "grad_norm": 7.309517860412598, "learning_rate": 0.0003907007380960792, "loss": 6.5685, "step": 315 }, { "epoch": 0.10110382338825788, "grad_norm": 6.228018760681152, "learning_rate": 0.00039064016583901796, "loss": 6.3311, "step": 316 }, { "epoch": 0.10142377219644858, "grad_norm": 8.36740493774414, "learning_rate": 0.0003905794019295173, "loss": 6.2873, "step": 317 }, { "epoch": 0.10174372100463926, "grad_norm": 6.596613883972168, "learning_rate": 0.0003905184464320453, "loss": 6.3647, "step": 318 }, { "epoch": 0.10206366981282995, "grad_norm": 7.987593650817871, "learning_rate": 0.00039045729941127364, "loss": 6.2189, "step": 319 }, { "epoch": 0.10238361862102063, "grad_norm": 7.624737739562988, "learning_rate": 0.0003903959609320769, "loss": 6.4019, "step": 320 }, { "epoch": 0.10270356742921133, "grad_norm": 6.667533874511719, "learning_rate": 0.0003903344310595329, "loss": 6.2599, "step": 321 }, { "epoch": 0.10302351623740201, "grad_norm": 9.014090538024902, "learning_rate": 0.0003902727098589226, "loss": 6.4371, "step": 322 }, { "epoch": 0.1033434650455927, "grad_norm": 6.295741558074951, "learning_rate": 0.0003902107973957299, "loss": 6.0999, "step": 323 }, { "epoch": 0.1036634138537834, "grad_norm": 5.217776298522949, "learning_rate": 0.00039014869373564146, "loss": 6.1554, "step": 324 }, { "epoch": 0.10398336266197408, "grad_norm": 8.615114212036133, "learning_rate": 0.0003900863989445471, "loss": 6.4223, "step": 325 }, { "epoch": 0.10430331147016478, "grad_norm": 9.223677635192871, "learning_rate": 0.00039002391308853905, "loss": 6.2714, "step": 326 }, { "epoch": 0.10462326027835546, "grad_norm": 8.394550323486328, "learning_rate": 0.00038996123623391267, "loss": 6.3747, "step": 327 }, { "epoch": 0.10494320908654615, "grad_norm": 8.384407043457031, "learning_rate": 0.00038989836844716564, "loss": 6.2019, "step": 328 }, { "epoch": 0.10526315789473684, "grad_norm": 6.654740333557129, "learning_rate": 0.00038983530979499835, "loss": 6.2863, "step": 329 }, { "epoch": 0.10558310670292753, "grad_norm": 5.682619571685791, "learning_rate": 0.00038977206034431373, "loss": 6.2633, "step": 330 }, { "epoch": 0.10590305551111823, "grad_norm": 6.896644592285156, "learning_rate": 0.0003897086201622169, "loss": 6.3236, "step": 331 }, { "epoch": 0.10622300431930891, "grad_norm": 6.741705894470215, "learning_rate": 0.0003896449893160157, "loss": 6.1388, "step": 332 }, { "epoch": 0.1065429531274996, "grad_norm": 7.0619215965271, "learning_rate": 0.00038958116787321995, "loss": 6.3887, "step": 333 }, { "epoch": 0.10686290193569029, "grad_norm": 5.026659965515137, "learning_rate": 0.0003895171559015418, "loss": 6.2476, "step": 334 }, { "epoch": 0.10718285074388098, "grad_norm": 6.366108417510986, "learning_rate": 0.00038945295346889567, "loss": 6.4004, "step": 335 }, { "epoch": 0.10750279955207166, "grad_norm": 6.229933738708496, "learning_rate": 0.0003893885606433978, "loss": 6.4047, "step": 336 }, { "epoch": 0.10782274836026236, "grad_norm": 5.269782543182373, "learning_rate": 0.0003893239774933667, "loss": 6.2193, "step": 337 }, { "epoch": 0.10814269716845305, "grad_norm": 5.160742282867432, "learning_rate": 0.00038925920408732254, "loss": 6.3671, "step": 338 }, { "epoch": 0.10846264597664373, "grad_norm": 6.694570064544678, "learning_rate": 0.00038919424049398756, "loss": 6.18, "step": 339 }, { "epoch": 0.10878259478483443, "grad_norm": 7.192866325378418, "learning_rate": 0.00038912908678228556, "loss": 6.1159, "step": 340 }, { "epoch": 0.10910254359302511, "grad_norm": 7.5637688636779785, "learning_rate": 0.00038906374302134235, "loss": 6.1341, "step": 341 }, { "epoch": 0.1094224924012158, "grad_norm": 10.56783676147461, "learning_rate": 0.00038899820928048524, "loss": 6.1261, "step": 342 }, { "epoch": 0.10974244120940649, "grad_norm": 6.504526615142822, "learning_rate": 0.00038893248562924286, "loss": 6.157, "step": 343 }, { "epoch": 0.11006239001759718, "grad_norm": 7.0983757972717285, "learning_rate": 0.00038886657213734567, "loss": 6.3047, "step": 344 }, { "epoch": 0.11038233882578788, "grad_norm": 12.745441436767578, "learning_rate": 0.00038880046887472547, "loss": 6.2377, "step": 345 }, { "epoch": 0.11070228763397856, "grad_norm": 9.568143844604492, "learning_rate": 0.0003887341759115151, "loss": 6.3565, "step": 346 }, { "epoch": 0.11102223644216926, "grad_norm": 9.615344047546387, "learning_rate": 0.0003886676933180493, "loss": 6.2896, "step": 347 }, { "epoch": 0.11134218525035994, "grad_norm": 8.619441032409668, "learning_rate": 0.0003886010211648632, "loss": 6.2012, "step": 348 }, { "epoch": 0.11166213405855063, "grad_norm": 13.255605697631836, "learning_rate": 0.00038853415952269367, "loss": 6.4429, "step": 349 }, { "epoch": 0.11198208286674133, "grad_norm": 9.434603691101074, "learning_rate": 0.00038846710846247845, "loss": 6.1695, "step": 350 }, { "epoch": 0.11230203167493201, "grad_norm": 7.766512870788574, "learning_rate": 0.00038839986805535605, "loss": 6.2969, "step": 351 }, { "epoch": 0.1126219804831227, "grad_norm": 5.024114608764648, "learning_rate": 0.00038833243837266615, "loss": 6.3783, "step": 352 }, { "epoch": 0.11294192929131339, "grad_norm": 6.237715721130371, "learning_rate": 0.00038826481948594897, "loss": 6.1483, "step": 353 }, { "epoch": 0.11326187809950408, "grad_norm": 7.264336109161377, "learning_rate": 0.00038819701146694573, "loss": 6.1925, "step": 354 }, { "epoch": 0.11358182690769476, "grad_norm": 7.290820121765137, "learning_rate": 0.0003881290143875981, "loss": 6.3151, "step": 355 }, { "epoch": 0.11390177571588546, "grad_norm": 5.954084873199463, "learning_rate": 0.0003880608283200486, "loss": 6.181, "step": 356 }, { "epoch": 0.11422172452407615, "grad_norm": 7.405073165893555, "learning_rate": 0.0003879924533366399, "loss": 6.2829, "step": 357 }, { "epoch": 0.11454167333226684, "grad_norm": 6.803983688354492, "learning_rate": 0.0003879238895099154, "loss": 6.3297, "step": 358 }, { "epoch": 0.11486162214045753, "grad_norm": 8.649042129516602, "learning_rate": 0.00038785513691261877, "loss": 6.2688, "step": 359 }, { "epoch": 0.11518157094864821, "grad_norm": 6.858957767486572, "learning_rate": 0.0003877861956176938, "loss": 6.2604, "step": 360 }, { "epoch": 0.11550151975683891, "grad_norm": 7.34728479385376, "learning_rate": 0.00038771706569828475, "loss": 6.2437, "step": 361 }, { "epoch": 0.11582146856502959, "grad_norm": 6.574677467346191, "learning_rate": 0.00038764774722773577, "loss": 6.113, "step": 362 }, { "epoch": 0.11614141737322028, "grad_norm": 9.14777660369873, "learning_rate": 0.00038757824027959133, "loss": 6.2634, "step": 363 }, { "epoch": 0.11646136618141098, "grad_norm": 6.340327262878418, "learning_rate": 0.00038750854492759546, "loss": 6.1939, "step": 364 }, { "epoch": 0.11678131498960166, "grad_norm": 6.838183879852295, "learning_rate": 0.0003874386612456925, "loss": 6.1019, "step": 365 }, { "epoch": 0.11710126379779236, "grad_norm": 9.026721954345703, "learning_rate": 0.0003873685893080263, "loss": 6.1435, "step": 366 }, { "epoch": 0.11742121260598304, "grad_norm": 5.471182823181152, "learning_rate": 0.0003872983291889406, "loss": 6.1873, "step": 367 }, { "epoch": 0.11774116141417373, "grad_norm": 44.7470588684082, "learning_rate": 0.0003872278809629788, "loss": 6.318, "step": 368 }, { "epoch": 0.11806111022236442, "grad_norm": 11.135316848754883, "learning_rate": 0.00038715724470488375, "loss": 6.1779, "step": 369 }, { "epoch": 0.11838105903055511, "grad_norm": 6.037003517150879, "learning_rate": 0.00038708642048959787, "loss": 6.1901, "step": 370 }, { "epoch": 0.1187010078387458, "grad_norm": 7.986204147338867, "learning_rate": 0.0003870154083922631, "loss": 6.3507, "step": 371 }, { "epoch": 0.11902095664693649, "grad_norm": 6.852958679199219, "learning_rate": 0.00038694420848822046, "loss": 6.3366, "step": 372 }, { "epoch": 0.11934090545512718, "grad_norm": 7.698732376098633, "learning_rate": 0.0003868728208530105, "loss": 6.2442, "step": 373 }, { "epoch": 0.11966085426331786, "grad_norm": 8.554845809936523, "learning_rate": 0.0003868012455623728, "loss": 6.1877, "step": 374 }, { "epoch": 0.11998080307150856, "grad_norm": 8.99901008605957, "learning_rate": 0.0003867294826922459, "loss": 6.2821, "step": 375 }, { "epoch": 0.12030075187969924, "grad_norm": 8.72900676727295, "learning_rate": 0.0003866575323187678, "loss": 6.1797, "step": 376 }, { "epoch": 0.12062070068788994, "grad_norm": 5.478520393371582, "learning_rate": 0.00038658539451827493, "loss": 6.3879, "step": 377 }, { "epoch": 0.12094064949608063, "grad_norm": 7.450143814086914, "learning_rate": 0.0003865130693673028, "loss": 6.2649, "step": 378 }, { "epoch": 0.12126059830427131, "grad_norm": 9.570409774780273, "learning_rate": 0.00038644055694258585, "loss": 6.2301, "step": 379 }, { "epoch": 0.12158054711246201, "grad_norm": 6.57025146484375, "learning_rate": 0.00038636785732105685, "loss": 6.3892, "step": 380 }, { "epoch": 0.12190049592065269, "grad_norm": 6.987082481384277, "learning_rate": 0.0003862949705798475, "loss": 5.8789, "step": 381 }, { "epoch": 0.12222044472884339, "grad_norm": 7.735273838043213, "learning_rate": 0.00038622189679628774, "loss": 6.2546, "step": 382 }, { "epoch": 0.12254039353703408, "grad_norm": 9.814201354980469, "learning_rate": 0.00038614863604790635, "loss": 6.6222, "step": 383 }, { "epoch": 0.12286034234522476, "grad_norm": 9.477630615234375, "learning_rate": 0.00038607518841243, "loss": 6.457, "step": 384 }, { "epoch": 0.12318029115341546, "grad_norm": 7.4210333824157715, "learning_rate": 0.00038600155396778405, "loss": 6.0636, "step": 385 }, { "epoch": 0.12350023996160614, "grad_norm": 8.321981430053711, "learning_rate": 0.0003859277327920918, "loss": 6.1512, "step": 386 }, { "epoch": 0.12382018876979684, "grad_norm": 7.436471939086914, "learning_rate": 0.0003858537249636748, "loss": 6.255, "step": 387 }, { "epoch": 0.12414013757798752, "grad_norm": 8.560404777526855, "learning_rate": 0.0003857795305610525, "loss": 6.3151, "step": 388 }, { "epoch": 0.12446008638617821, "grad_norm": 10.054532051086426, "learning_rate": 0.00038570514966294243, "loss": 6.2584, "step": 389 }, { "epoch": 0.12478003519436891, "grad_norm": 9.69568157196045, "learning_rate": 0.0003856305823482599, "loss": 6.4013, "step": 390 }, { "epoch": 0.1250999840025596, "grad_norm": 9.335556030273438, "learning_rate": 0.00038555582869611807, "loss": 6.4033, "step": 391 }, { "epoch": 0.12541993281075028, "grad_norm": 8.94521427154541, "learning_rate": 0.00038548088878582776, "loss": 6.4043, "step": 392 }, { "epoch": 0.12573988161894098, "grad_norm": 11.994088172912598, "learning_rate": 0.0003854057626968974, "loss": 6.2114, "step": 393 }, { "epoch": 0.12605983042713165, "grad_norm": 6.300990581512451, "learning_rate": 0.000385330450509033, "loss": 6.2302, "step": 394 }, { "epoch": 0.12637977923532234, "grad_norm": 16.85110092163086, "learning_rate": 0.00038525495230213783, "loss": 6.0428, "step": 395 }, { "epoch": 0.12669972804351304, "grad_norm": 9.76857852935791, "learning_rate": 0.0003851792681563127, "loss": 6.486, "step": 396 }, { "epoch": 0.12701967685170373, "grad_norm": 9.043763160705566, "learning_rate": 0.0003851033981518557, "loss": 6.3237, "step": 397 }, { "epoch": 0.12733962565989443, "grad_norm": 16.929244995117188, "learning_rate": 0.00038502734236926214, "loss": 6.3279, "step": 398 }, { "epoch": 0.1276595744680851, "grad_norm": 19.378320693969727, "learning_rate": 0.00038495110088922413, "loss": 6.2533, "step": 399 }, { "epoch": 0.1279795232762758, "grad_norm": 7.617816925048828, "learning_rate": 0.00038487467379263123, "loss": 6.5729, "step": 400 }, { "epoch": 0.1279795232762758, "eval_loss": 3.206190586090088, "eval_runtime": 233.5688, "eval_samples_per_second": 5.634, "eval_steps_per_second": 1.409, "step": 400 }, { "epoch": 0.1282994720844665, "grad_norm": 15.050670623779297, "learning_rate": 0.0003847980611605696, "loss": 6.5357, "step": 401 }, { "epoch": 0.12861942089265718, "grad_norm": 10.210688591003418, "learning_rate": 0.00038472126307432245, "loss": 6.5739, "step": 402 }, { "epoch": 0.12893936970084788, "grad_norm": 9.43270492553711, "learning_rate": 0.00038464427961536955, "loss": 6.8275, "step": 403 }, { "epoch": 0.12925931850903855, "grad_norm": 8.865079879760742, "learning_rate": 0.0003845671108653876, "loss": 6.6035, "step": 404 }, { "epoch": 0.12957926731722924, "grad_norm": 8.50759506225586, "learning_rate": 0.0003844897569062497, "loss": 6.4124, "step": 405 }, { "epoch": 0.12989921612541994, "grad_norm": 10.129754066467285, "learning_rate": 0.00038441221782002546, "loss": 6.5082, "step": 406 }, { "epoch": 0.13021916493361063, "grad_norm": 8.176865577697754, "learning_rate": 0.000384334493688981, "loss": 6.3945, "step": 407 }, { "epoch": 0.1305391137418013, "grad_norm": 7.027590751647949, "learning_rate": 0.0003842565845955786, "loss": 6.3583, "step": 408 }, { "epoch": 0.130859062549992, "grad_norm": 9.852948188781738, "learning_rate": 0.000384178490622477, "loss": 6.1044, "step": 409 }, { "epoch": 0.1311790113581827, "grad_norm": 8.926599502563477, "learning_rate": 0.0003841002118525309, "loss": 6.2928, "step": 410 }, { "epoch": 0.13149896016637339, "grad_norm": 8.307205200195312, "learning_rate": 0.0003840217483687912, "loss": 6.5742, "step": 411 }, { "epoch": 0.13181890897456408, "grad_norm": 7.831887722015381, "learning_rate": 0.0003839431002545047, "loss": 6.2626, "step": 412 }, { "epoch": 0.13213885778275475, "grad_norm": 7.879708766937256, "learning_rate": 0.0003838642675931141, "loss": 6.5028, "step": 413 }, { "epoch": 0.13245880659094544, "grad_norm": 5.989245891571045, "learning_rate": 0.00038378525046825783, "loss": 6.3706, "step": 414 }, { "epoch": 0.13277875539913614, "grad_norm": 7.094750881195068, "learning_rate": 0.0003837060489637702, "loss": 6.2248, "step": 415 }, { "epoch": 0.13309870420732683, "grad_norm": 7.013360977172852, "learning_rate": 0.00038362666316368095, "loss": 6.6161, "step": 416 }, { "epoch": 0.13341865301551753, "grad_norm": 6.406157970428467, "learning_rate": 0.00038354709315221547, "loss": 6.4046, "step": 417 }, { "epoch": 0.1337386018237082, "grad_norm": 7.2502055168151855, "learning_rate": 0.00038346733901379454, "loss": 6.2165, "step": 418 }, { "epoch": 0.1340585506318989, "grad_norm": 7.486465930938721, "learning_rate": 0.00038338740083303436, "loss": 6.3809, "step": 419 }, { "epoch": 0.1343784994400896, "grad_norm": 4.760215759277344, "learning_rate": 0.0003833072786947463, "loss": 6.2624, "step": 420 }, { "epoch": 0.13469844824828028, "grad_norm": 7.41804838180542, "learning_rate": 0.0003832269726839369, "loss": 6.219, "step": 421 }, { "epoch": 0.13501839705647095, "grad_norm": 5.859868049621582, "learning_rate": 0.00038314648288580795, "loss": 6.2853, "step": 422 }, { "epoch": 0.13533834586466165, "grad_norm": 7.259220123291016, "learning_rate": 0.000383065809385756, "loss": 6.4797, "step": 423 }, { "epoch": 0.13565829467285234, "grad_norm": 5.617652893066406, "learning_rate": 0.0003829849522693727, "loss": 6.1839, "step": 424 }, { "epoch": 0.13597824348104304, "grad_norm": 7.392098426818848, "learning_rate": 0.00038290391162244423, "loss": 6.3839, "step": 425 }, { "epoch": 0.13629819228923373, "grad_norm": 5.7139997482299805, "learning_rate": 0.0003828226875309518, "loss": 6.2123, "step": 426 }, { "epoch": 0.1366181410974244, "grad_norm": 5.242659568786621, "learning_rate": 0.0003827412800810712, "loss": 6.3428, "step": 427 }, { "epoch": 0.1369380899056151, "grad_norm": 5.612354278564453, "learning_rate": 0.00038265968935917254, "loss": 6.3251, "step": 428 }, { "epoch": 0.1372580387138058, "grad_norm": 5.880339622497559, "learning_rate": 0.0003825779154518207, "loss": 6.3662, "step": 429 }, { "epoch": 0.1375779875219965, "grad_norm": 5.417006492614746, "learning_rate": 0.00038249595844577446, "loss": 6.4292, "step": 430 }, { "epoch": 0.13789793633018718, "grad_norm": 8.451333999633789, "learning_rate": 0.0003824138184279874, "loss": 6.3248, "step": 431 }, { "epoch": 0.13821788513837785, "grad_norm": 7.415943622589111, "learning_rate": 0.00038233149548560683, "loss": 6.3606, "step": 432 }, { "epoch": 0.13853783394656854, "grad_norm": 4.678287029266357, "learning_rate": 0.00038224898970597426, "loss": 6.2415, "step": 433 }, { "epoch": 0.13885778275475924, "grad_norm": 7.330494403839111, "learning_rate": 0.0003821663011766254, "loss": 6.4611, "step": 434 }, { "epoch": 0.13917773156294994, "grad_norm": 5.664525985717773, "learning_rate": 0.0003820834299852896, "loss": 6.0102, "step": 435 }, { "epoch": 0.13949768037114063, "grad_norm": 4.248838901519775, "learning_rate": 0.00038200037621989005, "loss": 6.2752, "step": 436 }, { "epoch": 0.1398176291793313, "grad_norm": 7.762814998626709, "learning_rate": 0.0003819171399685437, "loss": 6.0838, "step": 437 }, { "epoch": 0.140137577987522, "grad_norm": 6.642805576324463, "learning_rate": 0.00038183372131956114, "loss": 6.055, "step": 438 }, { "epoch": 0.1404575267957127, "grad_norm": 12.796443939208984, "learning_rate": 0.0003817501203614464, "loss": 6.1023, "step": 439 }, { "epoch": 0.14077747560390338, "grad_norm": 13.292308807373047, "learning_rate": 0.0003816663371828969, "loss": 6.6153, "step": 440 }, { "epoch": 0.14109742441209405, "grad_norm": 5.322185516357422, "learning_rate": 0.00038158237187280343, "loss": 6.0856, "step": 441 }, { "epoch": 0.14141737322028475, "grad_norm": 9.385347366333008, "learning_rate": 0.00038149822452025024, "loss": 6.1498, "step": 442 }, { "epoch": 0.14173732202847544, "grad_norm": 8.704651832580566, "learning_rate": 0.00038141389521451427, "loss": 6.1346, "step": 443 }, { "epoch": 0.14205727083666614, "grad_norm": 6.753723621368408, "learning_rate": 0.0003813293840450659, "loss": 6.3897, "step": 444 }, { "epoch": 0.14237721964485683, "grad_norm": 9.21335220336914, "learning_rate": 0.0003812446911015682, "loss": 6.3132, "step": 445 }, { "epoch": 0.1426971684530475, "grad_norm": 6.543299674987793, "learning_rate": 0.00038115981647387733, "loss": 6.2129, "step": 446 }, { "epoch": 0.1430171172612382, "grad_norm": 21.838804244995117, "learning_rate": 0.000381074760252042, "loss": 6.3327, "step": 447 }, { "epoch": 0.1433370660694289, "grad_norm": 28.30170440673828, "learning_rate": 0.00038098952252630373, "loss": 6.2693, "step": 448 }, { "epoch": 0.1436570148776196, "grad_norm": 20.272245407104492, "learning_rate": 0.0003809041033870965, "loss": 6.173, "step": 449 }, { "epoch": 0.14397696368581028, "grad_norm": 11.198784828186035, "learning_rate": 0.00038081850292504677, "loss": 6.2571, "step": 450 }, { "epoch": 0.14429691249400095, "grad_norm": 6.138251304626465, "learning_rate": 0.00038073272123097354, "loss": 6.2325, "step": 451 }, { "epoch": 0.14461686130219165, "grad_norm": 8.177600860595703, "learning_rate": 0.0003806467583958879, "loss": 6.3511, "step": 452 }, { "epoch": 0.14493681011038234, "grad_norm": 6.5708184242248535, "learning_rate": 0.0003805606145109932, "loss": 6.073, "step": 453 }, { "epoch": 0.14525675891857304, "grad_norm": 7.593567848205566, "learning_rate": 0.00038047428966768486, "loss": 6.5693, "step": 454 }, { "epoch": 0.1455767077267637, "grad_norm": 10.439419746398926, "learning_rate": 0.00038038778395755034, "loss": 6.4299, "step": 455 }, { "epoch": 0.1458966565349544, "grad_norm": 11.682433128356934, "learning_rate": 0.00038030109747236887, "loss": 6.3646, "step": 456 }, { "epoch": 0.1462166053431451, "grad_norm": 6.629786491394043, "learning_rate": 0.00038021423030411164, "loss": 6.2166, "step": 457 }, { "epoch": 0.1465365541513358, "grad_norm": 9.529027938842773, "learning_rate": 0.0003801271825449414, "loss": 6.7073, "step": 458 }, { "epoch": 0.14685650295952649, "grad_norm": 13.9060697555542, "learning_rate": 0.0003800399542872126, "loss": 6.2038, "step": 459 }, { "epoch": 0.14717645176771715, "grad_norm": 7.617699146270752, "learning_rate": 0.0003799525456234712, "loss": 6.247, "step": 460 }, { "epoch": 0.14749640057590785, "grad_norm": 6.537087440490723, "learning_rate": 0.0003798649566464544, "loss": 6.3626, "step": 461 }, { "epoch": 0.14781634938409854, "grad_norm": 5.183995723724365, "learning_rate": 0.00037977718744909084, "loss": 6.332, "step": 462 }, { "epoch": 0.14813629819228924, "grad_norm": 6.627659797668457, "learning_rate": 0.00037968923812450037, "loss": 6.3186, "step": 463 }, { "epoch": 0.14845624700047994, "grad_norm": 5.604552745819092, "learning_rate": 0.00037960110876599396, "loss": 6.3636, "step": 464 }, { "epoch": 0.1487761958086706, "grad_norm": 5.800177097320557, "learning_rate": 0.0003795127994670735, "loss": 6.4762, "step": 465 }, { "epoch": 0.1490961446168613, "grad_norm": 7.276813983917236, "learning_rate": 0.0003794243103214319, "loss": 6.2994, "step": 466 }, { "epoch": 0.149416093425052, "grad_norm": 7.041287422180176, "learning_rate": 0.00037933564142295274, "loss": 6.3795, "step": 467 }, { "epoch": 0.1497360422332427, "grad_norm": 7.782250881195068, "learning_rate": 0.0003792467928657105, "loss": 6.506, "step": 468 }, { "epoch": 0.15005599104143338, "grad_norm": 8.970823287963867, "learning_rate": 0.0003791577647439701, "loss": 6.4383, "step": 469 }, { "epoch": 0.15037593984962405, "grad_norm": 7.6800360679626465, "learning_rate": 0.00037906855715218695, "loss": 6.307, "step": 470 }, { "epoch": 0.15069588865781475, "grad_norm": 10.980112075805664, "learning_rate": 0.0003789791701850072, "loss": 6.4015, "step": 471 }, { "epoch": 0.15101583746600544, "grad_norm": 6.547383785247803, "learning_rate": 0.00037888960393726677, "loss": 6.229, "step": 472 }, { "epoch": 0.15133578627419614, "grad_norm": 8.423219680786133, "learning_rate": 0.0003787998585039923, "loss": 6.2121, "step": 473 }, { "epoch": 0.1516557350823868, "grad_norm": 5.707985877990723, "learning_rate": 0.0003787099339804003, "loss": 6.3282, "step": 474 }, { "epoch": 0.1519756838905775, "grad_norm": 6.869352340698242, "learning_rate": 0.0003786198304618973, "loss": 6.1333, "step": 475 }, { "epoch": 0.1522956326987682, "grad_norm": 6.416158676147461, "learning_rate": 0.00037852954804407974, "loss": 6.37, "step": 476 }, { "epoch": 0.1526155815069589, "grad_norm": 8.767664909362793, "learning_rate": 0.0003784390868227339, "loss": 6.1501, "step": 477 }, { "epoch": 0.1529355303151496, "grad_norm": 10.395624160766602, "learning_rate": 0.0003783484468938356, "loss": 6.5037, "step": 478 }, { "epoch": 0.15325547912334025, "grad_norm": 6.741879940032959, "learning_rate": 0.0003782576283535506, "loss": 6.4512, "step": 479 }, { "epoch": 0.15357542793153095, "grad_norm": 9.098990440368652, "learning_rate": 0.00037816663129823394, "loss": 6.2566, "step": 480 }, { "epoch": 0.15389537673972165, "grad_norm": 10.70953369140625, "learning_rate": 0.0003780754558244299, "loss": 6.1686, "step": 481 }, { "epoch": 0.15421532554791234, "grad_norm": 6.690832138061523, "learning_rate": 0.0003779841020288725, "loss": 6.1794, "step": 482 }, { "epoch": 0.15453527435610304, "grad_norm": 6.798882484436035, "learning_rate": 0.0003778925700084845, "loss": 6.0711, "step": 483 }, { "epoch": 0.1548552231642937, "grad_norm": 11.702363014221191, "learning_rate": 0.00037780085986037796, "loss": 6.6549, "step": 484 }, { "epoch": 0.1551751719724844, "grad_norm": 8.392105102539062, "learning_rate": 0.000377708971681854, "loss": 6.1966, "step": 485 }, { "epoch": 0.1554951207806751, "grad_norm": 9.752997398376465, "learning_rate": 0.0003776169055704025, "loss": 6.8049, "step": 486 }, { "epoch": 0.1558150695888658, "grad_norm": 55.3693733215332, "learning_rate": 0.00037752466162370207, "loss": 6.1044, "step": 487 }, { "epoch": 0.15613501839705646, "grad_norm": 65.37632751464844, "learning_rate": 0.0003774322399396202, "loss": 6.4519, "step": 488 }, { "epoch": 0.15645496720524715, "grad_norm": 7.421685218811035, "learning_rate": 0.0003773396406162127, "loss": 6.3113, "step": 489 }, { "epoch": 0.15677491601343785, "grad_norm": 9.326229095458984, "learning_rate": 0.00037724686375172403, "loss": 6.4404, "step": 490 }, { "epoch": 0.15709486482162854, "grad_norm": 8.919943809509277, "learning_rate": 0.000377153909444587, "loss": 6.3796, "step": 491 }, { "epoch": 0.15741481362981924, "grad_norm": 12.594194412231445, "learning_rate": 0.00037706077779342263, "loss": 6.1395, "step": 492 }, { "epoch": 0.1577347624380099, "grad_norm": 13.710543632507324, "learning_rate": 0.0003769674688970401, "loss": 6.1843, "step": 493 }, { "epoch": 0.1580547112462006, "grad_norm": 7.772939682006836, "learning_rate": 0.0003768739828544366, "loss": 6.2831, "step": 494 }, { "epoch": 0.1583746600543913, "grad_norm": 89.74982452392578, "learning_rate": 0.00037678031976479733, "loss": 6.3358, "step": 495 }, { "epoch": 0.158694608862582, "grad_norm": 9.788384437561035, "learning_rate": 0.0003766864797274954, "loss": 6.4116, "step": 496 }, { "epoch": 0.1590145576707727, "grad_norm": 13.687607765197754, "learning_rate": 0.0003765924628420916, "loss": 6.3342, "step": 497 }, { "epoch": 0.15933450647896336, "grad_norm": 10.730764389038086, "learning_rate": 0.0003764982692083341, "loss": 6.3028, "step": 498 }, { "epoch": 0.15965445528715405, "grad_norm": 7.144099712371826, "learning_rate": 0.00037640389892615897, "loss": 6.1418, "step": 499 }, { "epoch": 0.15997440409534475, "grad_norm": 6.012965202331543, "learning_rate": 0.00037630935209568954, "loss": 6.3804, "step": 500 }, { "epoch": 0.16029435290353544, "grad_norm": 19.48784065246582, "learning_rate": 0.0003762146288172364, "loss": 6.3017, "step": 501 }, { "epoch": 0.16061430171172614, "grad_norm": 10.153864860534668, "learning_rate": 0.0003761197291912974, "loss": 6.2839, "step": 502 }, { "epoch": 0.1609342505199168, "grad_norm": 10.028372764587402, "learning_rate": 0.00037602465331855754, "loss": 6.1487, "step": 503 }, { "epoch": 0.1612541993281075, "grad_norm": 7.426041126251221, "learning_rate": 0.00037592940129988866, "loss": 6.3812, "step": 504 }, { "epoch": 0.1615741481362982, "grad_norm": 5.519888877868652, "learning_rate": 0.00037583397323634964, "loss": 6.1925, "step": 505 }, { "epoch": 0.1618940969444889, "grad_norm": 7.253342628479004, "learning_rate": 0.00037573836922918615, "loss": 6.2379, "step": 506 }, { "epoch": 0.16221404575267956, "grad_norm": 5.459297180175781, "learning_rate": 0.00037564258937983035, "loss": 6.3595, "step": 507 }, { "epoch": 0.16253399456087025, "grad_norm": 4.96316385269165, "learning_rate": 0.000375546633789901, "loss": 6.0939, "step": 508 }, { "epoch": 0.16285394336906095, "grad_norm": 4.441860675811768, "learning_rate": 0.00037545050256120353, "loss": 6.1986, "step": 509 }, { "epoch": 0.16317389217725164, "grad_norm": 9.1715087890625, "learning_rate": 0.0003753541957957295, "loss": 6.2663, "step": 510 }, { "epoch": 0.16349384098544234, "grad_norm": 5.704549789428711, "learning_rate": 0.0003752577135956568, "loss": 6.0975, "step": 511 }, { "epoch": 0.163813789793633, "grad_norm": 6.922268390655518, "learning_rate": 0.00037516105606334943, "loss": 6.4677, "step": 512 }, { "epoch": 0.1641337386018237, "grad_norm": 5.864443778991699, "learning_rate": 0.0003750642233013573, "loss": 6.0617, "step": 513 }, { "epoch": 0.1644536874100144, "grad_norm": 6.080179691314697, "learning_rate": 0.0003749672154124165, "loss": 6.2692, "step": 514 }, { "epoch": 0.1647736362182051, "grad_norm": 7.958470344543457, "learning_rate": 0.00037487003249944863, "loss": 6.3754, "step": 515 }, { "epoch": 0.1650935850263958, "grad_norm": 7.878583908081055, "learning_rate": 0.0003747726746655612, "loss": 6.3376, "step": 516 }, { "epoch": 0.16541353383458646, "grad_norm": 6.210682392120361, "learning_rate": 0.00037467514201404726, "loss": 6.4979, "step": 517 }, { "epoch": 0.16573348264277715, "grad_norm": 13.755587577819824, "learning_rate": 0.0003745774346483851, "loss": 6.4135, "step": 518 }, { "epoch": 0.16605343145096785, "grad_norm": 8.76400375366211, "learning_rate": 0.00037447955267223875, "loss": 6.1651, "step": 519 }, { "epoch": 0.16637338025915854, "grad_norm": 6.616308212280273, "learning_rate": 0.00037438149618945727, "loss": 6.2581, "step": 520 }, { "epoch": 0.1666933290673492, "grad_norm": 8.678804397583008, "learning_rate": 0.0003742832653040748, "loss": 6.3311, "step": 521 }, { "epoch": 0.1670132778755399, "grad_norm": 6.813952445983887, "learning_rate": 0.00037418486012031087, "loss": 6.2027, "step": 522 }, { "epoch": 0.1673332266837306, "grad_norm": 8.717082023620605, "learning_rate": 0.0003740862807425694, "loss": 6.3091, "step": 523 }, { "epoch": 0.1676531754919213, "grad_norm": 7.110479354858398, "learning_rate": 0.00037398752727543967, "loss": 6.3897, "step": 524 }, { "epoch": 0.167973124300112, "grad_norm": 5.483648777008057, "learning_rate": 0.0003738885998236952, "loss": 6.2346, "step": 525 }, { "epoch": 0.16829307310830266, "grad_norm": 6.923319339752197, "learning_rate": 0.00037378949849229445, "loss": 6.2363, "step": 526 }, { "epoch": 0.16861302191649336, "grad_norm": 5.014647960662842, "learning_rate": 0.0003736902233863801, "loss": 6.249, "step": 527 }, { "epoch": 0.16893297072468405, "grad_norm": 8.451704978942871, "learning_rate": 0.00037359077461127945, "loss": 6.3163, "step": 528 }, { "epoch": 0.16925291953287475, "grad_norm": 5.6020073890686035, "learning_rate": 0.00037349115227250373, "loss": 6.3071, "step": 529 }, { "epoch": 0.16957286834106544, "grad_norm": 38.06607437133789, "learning_rate": 0.0003733913564757487, "loss": 6.1327, "step": 530 }, { "epoch": 0.1698928171492561, "grad_norm": 8.072343826293945, "learning_rate": 0.00037329138732689385, "loss": 6.2813, "step": 531 }, { "epoch": 0.1702127659574468, "grad_norm": 14.569067001342773, "learning_rate": 0.0003731912449320027, "loss": 6.14, "step": 532 }, { "epoch": 0.1705327147656375, "grad_norm": 10.168042182922363, "learning_rate": 0.0003730909293973226, "loss": 6.3262, "step": 533 }, { "epoch": 0.1708526635738282, "grad_norm": 10.32205581665039, "learning_rate": 0.00037299044082928455, "loss": 6.1903, "step": 534 }, { "epoch": 0.1711726123820189, "grad_norm": 6.994221210479736, "learning_rate": 0.0003728897793345032, "loss": 6.2946, "step": 535 }, { "epoch": 0.17149256119020956, "grad_norm": 6.205694675445557, "learning_rate": 0.0003727889450197765, "loss": 6.4397, "step": 536 }, { "epoch": 0.17181250999840025, "grad_norm": 6.186336517333984, "learning_rate": 0.000372687937992086, "loss": 6.3172, "step": 537 }, { "epoch": 0.17213245880659095, "grad_norm": 12.758987426757812, "learning_rate": 0.00037258675835859635, "loss": 6.3897, "step": 538 }, { "epoch": 0.17245240761478164, "grad_norm": 11.200096130371094, "learning_rate": 0.0003724854062266553, "loss": 6.0816, "step": 539 }, { "epoch": 0.1727723564229723, "grad_norm": 13.330513954162598, "learning_rate": 0.00037238388170379364, "loss": 6.5373, "step": 540 }, { "epoch": 0.173092305231163, "grad_norm": 6.715458869934082, "learning_rate": 0.00037228218489772515, "loss": 6.1138, "step": 541 }, { "epoch": 0.1734122540393537, "grad_norm": 8.349139213562012, "learning_rate": 0.0003721803159163463, "loss": 6.1373, "step": 542 }, { "epoch": 0.1737322028475444, "grad_norm": 9.18417739868164, "learning_rate": 0.00037207827486773624, "loss": 6.2775, "step": 543 }, { "epoch": 0.1740521516557351, "grad_norm": 8.832735061645508, "learning_rate": 0.0003719760618601567, "loss": 6.2763, "step": 544 }, { "epoch": 0.17437210046392576, "grad_norm": 14.054030418395996, "learning_rate": 0.00037187367700205185, "loss": 6.6554, "step": 545 }, { "epoch": 0.17469204927211646, "grad_norm": 8.51115894317627, "learning_rate": 0.0003717711204020482, "loss": 6.3151, "step": 546 }, { "epoch": 0.17501199808030715, "grad_norm": 10.541391372680664, "learning_rate": 0.00037166839216895445, "loss": 6.1285, "step": 547 }, { "epoch": 0.17533194688849785, "grad_norm": 21.057283401489258, "learning_rate": 0.0003715654924117614, "loss": 6.4187, "step": 548 }, { "epoch": 0.17565189569668854, "grad_norm": 17.178653717041016, "learning_rate": 0.00037146242123964183, "loss": 6.2921, "step": 549 }, { "epoch": 0.1759718445048792, "grad_norm": 10.637142181396484, "learning_rate": 0.00037135917876195037, "loss": 6.168, "step": 550 }, { "epoch": 0.1762917933130699, "grad_norm": 7.821049213409424, "learning_rate": 0.0003712557650882234, "loss": 6.3907, "step": 551 }, { "epoch": 0.1766117421212606, "grad_norm": 9.258932113647461, "learning_rate": 0.000371152180328179, "loss": 6.2813, "step": 552 }, { "epoch": 0.1769316909294513, "grad_norm": 8.969828605651855, "learning_rate": 0.0003710484245917167, "loss": 6.2085, "step": 553 }, { "epoch": 0.17725163973764196, "grad_norm": 10.377073287963867, "learning_rate": 0.0003709444979889174, "loss": 6.3274, "step": 554 }, { "epoch": 0.17757158854583266, "grad_norm": 13.76203727722168, "learning_rate": 0.0003708404006300434, "loss": 6.7071, "step": 555 }, { "epoch": 0.17789153735402335, "grad_norm": 9.469388961791992, "learning_rate": 0.00037073613262553803, "loss": 6.1904, "step": 556 }, { "epoch": 0.17821148616221405, "grad_norm": 7.818362712860107, "learning_rate": 0.00037063169408602586, "loss": 6.3993, "step": 557 }, { "epoch": 0.17853143497040475, "grad_norm": 10.85080337524414, "learning_rate": 0.000370527085122312, "loss": 6.2407, "step": 558 }, { "epoch": 0.1788513837785954, "grad_norm": 7.716219902038574, "learning_rate": 0.0003704223058453829, "loss": 6.3401, "step": 559 }, { "epoch": 0.1791713325867861, "grad_norm": 4.769658088684082, "learning_rate": 0.00037031735636640524, "loss": 6.3418, "step": 560 }, { "epoch": 0.1794912813949768, "grad_norm": 7.051829814910889, "learning_rate": 0.00037021223679672673, "loss": 6.2469, "step": 561 }, { "epoch": 0.1798112302031675, "grad_norm": 9.345251083374023, "learning_rate": 0.000370106947247875, "loss": 6.5334, "step": 562 }, { "epoch": 0.1801311790113582, "grad_norm": 8.516511917114258, "learning_rate": 0.0003700014878315585, "loss": 6.2604, "step": 563 }, { "epoch": 0.18045112781954886, "grad_norm": 8.440566062927246, "learning_rate": 0.0003698958586596657, "loss": 6.2302, "step": 564 }, { "epoch": 0.18077107662773956, "grad_norm": 5.112747669219971, "learning_rate": 0.0003697900598442651, "loss": 6.3096, "step": 565 }, { "epoch": 0.18109102543593025, "grad_norm": 5.404264450073242, "learning_rate": 0.00036968409149760534, "loss": 6.3788, "step": 566 }, { "epoch": 0.18141097424412095, "grad_norm": 8.593727111816406, "learning_rate": 0.0003695779537321149, "loss": 6.0832, "step": 567 }, { "epoch": 0.18173092305231164, "grad_norm": 5.876023769378662, "learning_rate": 0.00036947164666040184, "loss": 5.996, "step": 568 }, { "epoch": 0.1820508718605023, "grad_norm": 5.251745700836182, "learning_rate": 0.0003693651703952541, "loss": 6.1838, "step": 569 }, { "epoch": 0.182370820668693, "grad_norm": 6.7820539474487305, "learning_rate": 0.00036925852504963893, "loss": 6.2634, "step": 570 }, { "epoch": 0.1826907694768837, "grad_norm": 6.269637584686279, "learning_rate": 0.00036915171073670307, "loss": 6.4802, "step": 571 }, { "epoch": 0.1830107182850744, "grad_norm": 23.37257957458496, "learning_rate": 0.0003690447275697725, "loss": 6.1899, "step": 572 }, { "epoch": 0.18333066709326507, "grad_norm": 5.05233097076416, "learning_rate": 0.00036893757566235227, "loss": 6.1816, "step": 573 }, { "epoch": 0.18365061590145576, "grad_norm": 8.209317207336426, "learning_rate": 0.00036883025512812675, "loss": 6.4029, "step": 574 }, { "epoch": 0.18397056470964646, "grad_norm": 10.015338897705078, "learning_rate": 0.00036872276608095874, "loss": 6.1534, "step": 575 }, { "epoch": 0.18429051351783715, "grad_norm": 6.191135883331299, "learning_rate": 0.00036861510863489023, "loss": 6.3621, "step": 576 }, { "epoch": 0.18461046232602785, "grad_norm": 9.056325912475586, "learning_rate": 0.0003685072829041417, "loss": 6.3678, "step": 577 }, { "epoch": 0.18493041113421851, "grad_norm": 11.877366065979004, "learning_rate": 0.00036839928900311223, "loss": 6.2934, "step": 578 }, { "epoch": 0.1852503599424092, "grad_norm": 11.008338928222656, "learning_rate": 0.0003682911270463793, "loss": 6.2898, "step": 579 }, { "epoch": 0.1855703087505999, "grad_norm": 8.387709617614746, "learning_rate": 0.0003681827971486986, "loss": 6.3564, "step": 580 }, { "epoch": 0.1858902575587906, "grad_norm": 5.683431148529053, "learning_rate": 0.0003680742994250042, "loss": 6.358, "step": 581 }, { "epoch": 0.1862102063669813, "grad_norm": 9.834399223327637, "learning_rate": 0.000367965633990408, "loss": 6.4083, "step": 582 }, { "epoch": 0.18653015517517196, "grad_norm": 6.106937408447266, "learning_rate": 0.0003678568009602001, "loss": 6.1561, "step": 583 }, { "epoch": 0.18685010398336266, "grad_norm": 10.490397453308105, "learning_rate": 0.0003677478004498481, "loss": 6.326, "step": 584 }, { "epoch": 0.18717005279155335, "grad_norm": 7.496698379516602, "learning_rate": 0.0003676386325749976, "loss": 6.3687, "step": 585 }, { "epoch": 0.18749000159974405, "grad_norm": 7.097468376159668, "learning_rate": 0.0003675292974514715, "loss": 6.2169, "step": 586 }, { "epoch": 0.18780995040793472, "grad_norm": 14.11148738861084, "learning_rate": 0.0003674197951952704, "loss": 6.1241, "step": 587 }, { "epoch": 0.1881298992161254, "grad_norm": 13.260645866394043, "learning_rate": 0.00036731012592257194, "loss": 6.1671, "step": 588 }, { "epoch": 0.1884498480243161, "grad_norm": 8.077284812927246, "learning_rate": 0.0003672002897497312, "loss": 6.2584, "step": 589 }, { "epoch": 0.1887697968325068, "grad_norm": 5.156368255615234, "learning_rate": 0.00036709028679328013, "loss": 6.2548, "step": 590 }, { "epoch": 0.1890897456406975, "grad_norm": 7.575767517089844, "learning_rate": 0.0003669801171699279, "loss": 6.426, "step": 591 }, { "epoch": 0.18940969444888817, "grad_norm": 8.419235229492188, "learning_rate": 0.0003668697809965602, "loss": 6.2028, "step": 592 }, { "epoch": 0.18972964325707886, "grad_norm": 6.528421401977539, "learning_rate": 0.0003667592783902397, "loss": 6.2471, "step": 593 }, { "epoch": 0.19004959206526956, "grad_norm": 7.039434909820557, "learning_rate": 0.00036664860946820536, "loss": 6.4782, "step": 594 }, { "epoch": 0.19036954087346025, "grad_norm": 5.257431507110596, "learning_rate": 0.00036653777434787294, "loss": 6.1964, "step": 595 }, { "epoch": 0.19068948968165095, "grad_norm": 7.877770900726318, "learning_rate": 0.0003664267731468342, "loss": 6.501, "step": 596 }, { "epoch": 0.19100943848984162, "grad_norm": 6.9174933433532715, "learning_rate": 0.00036631560598285735, "loss": 6.1698, "step": 597 }, { "epoch": 0.1913293872980323, "grad_norm": 5.704330921173096, "learning_rate": 0.00036620427297388646, "loss": 6.1756, "step": 598 }, { "epoch": 0.191649336106223, "grad_norm": 7.497727870941162, "learning_rate": 0.00036609277423804183, "loss": 6.1859, "step": 599 }, { "epoch": 0.1919692849144137, "grad_norm": 10.520272254943848, "learning_rate": 0.0003659811098936193, "loss": 6.2496, "step": 600 }, { "epoch": 0.1919692849144137, "eval_loss": 3.1351470947265625, "eval_runtime": 233.6712, "eval_samples_per_second": 5.632, "eval_steps_per_second": 1.408, "step": 600 }, { "epoch": 0.1922892337226044, "grad_norm": 5.824222087860107, "learning_rate": 0.00036586928005909067, "loss": 6.3228, "step": 601 }, { "epoch": 0.19260918253079506, "grad_norm": 5.025035381317139, "learning_rate": 0.00036575728485310305, "loss": 6.0095, "step": 602 }, { "epoch": 0.19292913133898576, "grad_norm": 7.741343975067139, "learning_rate": 0.0003656451243944793, "loss": 6.4401, "step": 603 }, { "epoch": 0.19324908014717646, "grad_norm": 12.790258407592773, "learning_rate": 0.00036553279880221736, "loss": 6.0217, "step": 604 }, { "epoch": 0.19356902895536715, "grad_norm": 8.1022310256958, "learning_rate": 0.00036542030819549054, "loss": 6.2374, "step": 605 }, { "epoch": 0.19388897776355782, "grad_norm": 10.174038887023926, "learning_rate": 0.0003653076526936471, "loss": 6.1404, "step": 606 }, { "epoch": 0.1942089265717485, "grad_norm": 5.43154239654541, "learning_rate": 0.0003651948324162104, "loss": 6.0869, "step": 607 }, { "epoch": 0.1945288753799392, "grad_norm": 7.523396015167236, "learning_rate": 0.0003650818474828783, "loss": 6.244, "step": 608 }, { "epoch": 0.1948488241881299, "grad_norm": 9.378880500793457, "learning_rate": 0.0003649686980135238, "loss": 6.3951, "step": 609 }, { "epoch": 0.1951687729963206, "grad_norm": 9.061972618103027, "learning_rate": 0.0003648553841281942, "loss": 6.3282, "step": 610 }, { "epoch": 0.19548872180451127, "grad_norm": 6.108553409576416, "learning_rate": 0.0003647419059471112, "loss": 6.2572, "step": 611 }, { "epoch": 0.19580867061270196, "grad_norm": 5.526744842529297, "learning_rate": 0.00036462826359067097, "loss": 6.2983, "step": 612 }, { "epoch": 0.19612861942089266, "grad_norm": 15.063833236694336, "learning_rate": 0.00036451445717944364, "loss": 6.0462, "step": 613 }, { "epoch": 0.19644856822908335, "grad_norm": 8.600305557250977, "learning_rate": 0.00036440048683417374, "loss": 6.0926, "step": 614 }, { "epoch": 0.19676851703727405, "grad_norm": 6.220649242401123, "learning_rate": 0.00036428635267577944, "loss": 6.4756, "step": 615 }, { "epoch": 0.19708846584546472, "grad_norm": 10.122354507446289, "learning_rate": 0.0003641720548253528, "loss": 6.3049, "step": 616 }, { "epoch": 0.1974084146536554, "grad_norm": 8.642034530639648, "learning_rate": 0.00036405759340415956, "loss": 5.996, "step": 617 }, { "epoch": 0.1977283634618461, "grad_norm": 7.122521877288818, "learning_rate": 0.0003639429685336391, "loss": 6.2142, "step": 618 }, { "epoch": 0.1980483122700368, "grad_norm": 13.584242820739746, "learning_rate": 0.0003638281803354039, "loss": 6.2225, "step": 619 }, { "epoch": 0.19836826107822747, "grad_norm": 15.066466331481934, "learning_rate": 0.00036371322893124, "loss": 6.4064, "step": 620 }, { "epoch": 0.19868820988641817, "grad_norm": 13.614229202270508, "learning_rate": 0.00036359811444310665, "loss": 6.5411, "step": 621 }, { "epoch": 0.19900815869460886, "grad_norm": 21.728307723999023, "learning_rate": 0.0003634828369931358, "loss": 6.1478, "step": 622 }, { "epoch": 0.19932810750279956, "grad_norm": 13.957999229431152, "learning_rate": 0.0003633673967036327, "loss": 6.4172, "step": 623 }, { "epoch": 0.19964805631099025, "grad_norm": 8.37833023071289, "learning_rate": 0.0003632517936970751, "loss": 6.4588, "step": 624 }, { "epoch": 0.19996800511918092, "grad_norm": 10.713350296020508, "learning_rate": 0.0003631360280961134, "loss": 6.2626, "step": 625 }, { "epoch": 0.20028795392737161, "grad_norm": 10.606658935546875, "learning_rate": 0.00036302010002357057, "loss": 6.4442, "step": 626 }, { "epoch": 0.2006079027355623, "grad_norm": 6.849085330963135, "learning_rate": 0.00036290400960244204, "loss": 6.1653, "step": 627 }, { "epoch": 0.200927851543753, "grad_norm": 9.328271865844727, "learning_rate": 0.0003627877569558953, "loss": 6.2049, "step": 628 }, { "epoch": 0.2012478003519437, "grad_norm": 20.28767967224121, "learning_rate": 0.00036267134220727, "loss": 6.4698, "step": 629 }, { "epoch": 0.20156774916013437, "grad_norm": 9.141938209533691, "learning_rate": 0.00036255476548007794, "loss": 6.1397, "step": 630 }, { "epoch": 0.20188769796832506, "grad_norm": 7.7828369140625, "learning_rate": 0.00036243802689800257, "loss": 6.0992, "step": 631 }, { "epoch": 0.20220764677651576, "grad_norm": 9.355428695678711, "learning_rate": 0.0003623211265848993, "loss": 6.1, "step": 632 }, { "epoch": 0.20252759558470645, "grad_norm": 6.362244129180908, "learning_rate": 0.0003622040646647947, "loss": 6.1492, "step": 633 }, { "epoch": 0.20284754439289715, "grad_norm": 15.05521011352539, "learning_rate": 0.0003620868412618873, "loss": 6.4308, "step": 634 }, { "epoch": 0.20316749320108782, "grad_norm": 10.501233100891113, "learning_rate": 0.00036196945650054674, "loss": 6.0315, "step": 635 }, { "epoch": 0.2034874420092785, "grad_norm": 10.360550880432129, "learning_rate": 0.00036185191050531367, "loss": 6.0577, "step": 636 }, { "epoch": 0.2038073908174692, "grad_norm": 13.611196517944336, "learning_rate": 0.0003617342034009001, "loss": 6.3088, "step": 637 }, { "epoch": 0.2041273396256599, "grad_norm": 19.629377365112305, "learning_rate": 0.0003616163353121889, "loss": 6.4983, "step": 638 }, { "epoch": 0.20444728843385057, "grad_norm": 9.073545455932617, "learning_rate": 0.0003614983063642336, "loss": 6.2678, "step": 639 }, { "epoch": 0.20476723724204127, "grad_norm": 15.663934707641602, "learning_rate": 0.0003613801166822585, "loss": 6.2223, "step": 640 }, { "epoch": 0.20508718605023196, "grad_norm": 13.69797134399414, "learning_rate": 0.00036126176639165845, "loss": 6.1535, "step": 641 }, { "epoch": 0.20540713485842266, "grad_norm": 10.088582038879395, "learning_rate": 0.0003611432556179986, "loss": 6.0588, "step": 642 }, { "epoch": 0.20572708366661335, "grad_norm": 52.31211853027344, "learning_rate": 0.0003610245844870145, "loss": 6.5132, "step": 643 }, { "epoch": 0.20604703247480402, "grad_norm": 15.596002578735352, "learning_rate": 0.0003609057531246118, "loss": 6.3505, "step": 644 }, { "epoch": 0.20636698128299472, "grad_norm": 14.089200019836426, "learning_rate": 0.00036078676165686603, "loss": 6.1755, "step": 645 }, { "epoch": 0.2066869300911854, "grad_norm": 22.81130599975586, "learning_rate": 0.00036066761021002276, "loss": 6.7358, "step": 646 }, { "epoch": 0.2070068788993761, "grad_norm": 27.079946517944336, "learning_rate": 0.0003605482989104972, "loss": 6.1623, "step": 647 }, { "epoch": 0.2073268277075668, "grad_norm": 90.47372436523438, "learning_rate": 0.00036042882788487403, "loss": 6.2518, "step": 648 }, { "epoch": 0.20764677651575747, "grad_norm": 12.399378776550293, "learning_rate": 0.00036030919725990764, "loss": 6.1738, "step": 649 }, { "epoch": 0.20796672532394817, "grad_norm": 24.64794158935547, "learning_rate": 0.00036018940716252165, "loss": 6.2701, "step": 650 }, { "epoch": 0.20828667413213886, "grad_norm": 31.136415481567383, "learning_rate": 0.0003600694577198088, "loss": 6.4434, "step": 651 }, { "epoch": 0.20860662294032956, "grad_norm": 11.99364948272705, "learning_rate": 0.000359949349059031, "loss": 6.2209, "step": 652 }, { "epoch": 0.20892657174852022, "grad_norm": 22.257827758789062, "learning_rate": 0.000359829081307619, "loss": 6.0483, "step": 653 }, { "epoch": 0.20924652055671092, "grad_norm": 42.29777145385742, "learning_rate": 0.0003597086545931724, "loss": 6.1647, "step": 654 }, { "epoch": 0.20956646936490161, "grad_norm": 22.489221572875977, "learning_rate": 0.00035958806904345936, "loss": 6.2579, "step": 655 }, { "epoch": 0.2098864181730923, "grad_norm": 28.098785400390625, "learning_rate": 0.0003594673247864167, "loss": 6.2493, "step": 656 }, { "epoch": 0.210206366981283, "grad_norm": 13.18166732788086, "learning_rate": 0.00035934642195014954, "loss": 6.3545, "step": 657 }, { "epoch": 0.21052631578947367, "grad_norm": 19.027376174926758, "learning_rate": 0.0003592253606629312, "loss": 6.3131, "step": 658 }, { "epoch": 0.21084626459766437, "grad_norm": 17.17510414123535, "learning_rate": 0.0003591041410532032, "loss": 6.2641, "step": 659 }, { "epoch": 0.21116621340585506, "grad_norm": 24.58960723876953, "learning_rate": 0.000358982763249575, "loss": 6.1744, "step": 660 }, { "epoch": 0.21148616221404576, "grad_norm": 74.80911254882812, "learning_rate": 0.0003588612273808239, "loss": 6.2601, "step": 661 }, { "epoch": 0.21180611102223645, "grad_norm": 24.598215103149414, "learning_rate": 0.0003587395335758949, "loss": 6.0995, "step": 662 }, { "epoch": 0.21212605983042712, "grad_norm": 27.990663528442383, "learning_rate": 0.00035861768196390054, "loss": 6.3451, "step": 663 }, { "epoch": 0.21244600863861782, "grad_norm": 30.33852767944336, "learning_rate": 0.0003584956726741208, "loss": 6.1803, "step": 664 }, { "epoch": 0.2127659574468085, "grad_norm": 21.035871505737305, "learning_rate": 0.00035837350583600296, "loss": 6.0933, "step": 665 }, { "epoch": 0.2130859062549992, "grad_norm": 18.981107711791992, "learning_rate": 0.0003582511815791615, "loss": 6.2382, "step": 666 }, { "epoch": 0.2134058550631899, "grad_norm": 15.4559965133667, "learning_rate": 0.0003581287000333779, "loss": 6.2025, "step": 667 }, { "epoch": 0.21372580387138057, "grad_norm": 42.094425201416016, "learning_rate": 0.00035800606132860036, "loss": 6.2438, "step": 668 }, { "epoch": 0.21404575267957127, "grad_norm": 394.44744873046875, "learning_rate": 0.000357883265594944, "loss": 6.1518, "step": 669 }, { "epoch": 0.21436570148776196, "grad_norm": 23.389293670654297, "learning_rate": 0.00035776031296269053, "loss": 6.2643, "step": 670 }, { "epoch": 0.21468565029595266, "grad_norm": 26.32927703857422, "learning_rate": 0.00035763720356228807, "loss": 6.1562, "step": 671 }, { "epoch": 0.21500559910414332, "grad_norm": 51.09504318237305, "learning_rate": 0.000357513937524351, "loss": 6.1792, "step": 672 }, { "epoch": 0.21532554791233402, "grad_norm": 55.16264724731445, "learning_rate": 0.00035739051497966, "loss": 6.2488, "step": 673 }, { "epoch": 0.21564549672052472, "grad_norm": 13.69837760925293, "learning_rate": 0.00035726693605916184, "loss": 6.0671, "step": 674 }, { "epoch": 0.2159654455287154, "grad_norm": 30.63555908203125, "learning_rate": 0.00035714320089396903, "loss": 6.208, "step": 675 }, { "epoch": 0.2162853943369061, "grad_norm": 13.71101188659668, "learning_rate": 0.00035701930961536, "loss": 6.1028, "step": 676 }, { "epoch": 0.21660534314509677, "grad_norm": 16.373701095581055, "learning_rate": 0.00035689526235477867, "loss": 6.2419, "step": 677 }, { "epoch": 0.21692529195328747, "grad_norm": 19.588153839111328, "learning_rate": 0.0003567710592438346, "loss": 6.1758, "step": 678 }, { "epoch": 0.21724524076147816, "grad_norm": 31.715362548828125, "learning_rate": 0.0003566467004143025, "loss": 6.2205, "step": 679 }, { "epoch": 0.21756518956966886, "grad_norm": 32.14853286743164, "learning_rate": 0.0003565221859981226, "loss": 6.2068, "step": 680 }, { "epoch": 0.21788513837785956, "grad_norm": 28.47088623046875, "learning_rate": 0.0003563975161274, "loss": 6.3612, "step": 681 }, { "epoch": 0.21820508718605022, "grad_norm": 27.772294998168945, "learning_rate": 0.00035627269093440467, "loss": 6.3503, "step": 682 }, { "epoch": 0.21852503599424092, "grad_norm": 27.19485855102539, "learning_rate": 0.00035614771055157143, "loss": 6.222, "step": 683 }, { "epoch": 0.2188449848024316, "grad_norm": 14.713149070739746, "learning_rate": 0.00035602257511149993, "loss": 6.3099, "step": 684 }, { "epoch": 0.2191649336106223, "grad_norm": 10.625816345214844, "learning_rate": 0.00035589728474695403, "loss": 6.1701, "step": 685 }, { "epoch": 0.21948488241881298, "grad_norm": 17.59633445739746, "learning_rate": 0.0003557718395908622, "loss": 6.1059, "step": 686 }, { "epoch": 0.21980483122700367, "grad_norm": 40.79133987426758, "learning_rate": 0.000355646239776317, "loss": 6.2505, "step": 687 }, { "epoch": 0.22012478003519437, "grad_norm": 21.789743423461914, "learning_rate": 0.0003555204854365751, "loss": 6.2508, "step": 688 }, { "epoch": 0.22044472884338506, "grad_norm": 287.25286865234375, "learning_rate": 0.0003553945767050572, "loss": 6.3592, "step": 689 }, { "epoch": 0.22076467765157576, "grad_norm": 122.58528900146484, "learning_rate": 0.0003552685137153477, "loss": 6.5598, "step": 690 }, { "epoch": 0.22108462645976643, "grad_norm": 65.58609771728516, "learning_rate": 0.00035514229660119475, "loss": 6.2963, "step": 691 }, { "epoch": 0.22140457526795712, "grad_norm": 78.26929473876953, "learning_rate": 0.00035501592549650986, "loss": 6.6145, "step": 692 }, { "epoch": 0.22172452407614782, "grad_norm": 468.92840576171875, "learning_rate": 0.0003548894005353682, "loss": 6.5893, "step": 693 }, { "epoch": 0.2220444728843385, "grad_norm": 29858.107421875, "learning_rate": 0.00035476272185200786, "loss": 6.8783, "step": 694 }, { "epoch": 0.2223644216925292, "grad_norm": 567.8423461914062, "learning_rate": 0.00035463588958083023, "loss": 7.506, "step": 695 }, { "epoch": 0.22268437050071987, "grad_norm": 269.0973815917969, "learning_rate": 0.00035450890385639957, "loss": 6.98, "step": 696 }, { "epoch": 0.22300431930891057, "grad_norm": 70.39237976074219, "learning_rate": 0.00035438176481344307, "loss": 6.6005, "step": 697 }, { "epoch": 0.22332426811710127, "grad_norm": 35.886253356933594, "learning_rate": 0.0003542544725868503, "loss": 6.3986, "step": 698 }, { "epoch": 0.22364421692529196, "grad_norm": 132.87338256835938, "learning_rate": 0.0003541270273116737, "loss": 6.4347, "step": 699 }, { "epoch": 0.22396416573348266, "grad_norm": 243.24110412597656, "learning_rate": 0.00035399942912312784, "loss": 6.6825, "step": 700 }, { "epoch": 0.22428411454167332, "grad_norm": 47.076541900634766, "learning_rate": 0.00035387167815658955, "loss": 6.5573, "step": 701 }, { "epoch": 0.22460406334986402, "grad_norm": 47.393280029296875, "learning_rate": 0.0003537437745475981, "loss": 6.5841, "step": 702 }, { "epoch": 0.22492401215805471, "grad_norm": 173.40023803710938, "learning_rate": 0.0003536157184318541, "loss": 6.5547, "step": 703 }, { "epoch": 0.2252439609662454, "grad_norm": 100.88816833496094, "learning_rate": 0.00035348750994522037, "loss": 6.5479, "step": 704 }, { "epoch": 0.22556390977443608, "grad_norm": 708.522216796875, "learning_rate": 0.00035335914922372135, "loss": 6.4429, "step": 705 }, { "epoch": 0.22588385858262677, "grad_norm": 5974.12744140625, "learning_rate": 0.000353230636403543, "loss": 6.8338, "step": 706 }, { "epoch": 0.22620380739081747, "grad_norm": 1240.005615234375, "learning_rate": 0.0003531019716210326, "loss": 7.7215, "step": 707 }, { "epoch": 0.22652375619900816, "grad_norm": 3238.5185546875, "learning_rate": 0.00035297315501269845, "loss": 10.7748, "step": 708 }, { "epoch": 0.22684370500719886, "grad_norm": 6019.43310546875, "learning_rate": 0.00035284418671521035, "loss": 10.382, "step": 709 }, { "epoch": 0.22716365381538953, "grad_norm": 4291.9306640625, "learning_rate": 0.00035271506686539877, "loss": 8.5623, "step": 710 }, { "epoch": 0.22748360262358022, "grad_norm": 1000.52294921875, "learning_rate": 0.00035258579560025504, "loss": 6.9676, "step": 711 }, { "epoch": 0.22780355143177092, "grad_norm": 653.64990234375, "learning_rate": 0.0003524563730569311, "loss": 6.8677, "step": 712 }, { "epoch": 0.2281235002399616, "grad_norm": 226.84967041015625, "learning_rate": 0.0003523267993727394, "loss": 6.5646, "step": 713 }, { "epoch": 0.2284434490481523, "grad_norm": 69.58618927001953, "learning_rate": 0.00035219707468515276, "loss": 6.8388, "step": 714 }, { "epoch": 0.22876339785634298, "grad_norm": 723.3787841796875, "learning_rate": 0.00035206719913180427, "loss": 6.8472, "step": 715 }, { "epoch": 0.22908334666453367, "grad_norm": 97.80406951904297, "learning_rate": 0.00035193717285048694, "loss": 7.5312, "step": 716 }, { "epoch": 0.22940329547272437, "grad_norm": 114.102294921875, "learning_rate": 0.00035180699597915383, "loss": 8.069, "step": 717 }, { "epoch": 0.22972324428091506, "grad_norm": 236.4305419921875, "learning_rate": 0.0003516766686559177, "loss": 8.6465, "step": 718 }, { "epoch": 0.23004319308910573, "grad_norm": 99.95079040527344, "learning_rate": 0.0003515461910190509, "loss": 7.5548, "step": 719 }, { "epoch": 0.23036314189729643, "grad_norm": 84.75758361816406, "learning_rate": 0.0003514155632069854, "loss": 6.442, "step": 720 }, { "epoch": 0.23068309070548712, "grad_norm": 212.29820251464844, "learning_rate": 0.00035128478535831234, "loss": 8.096, "step": 721 }, { "epoch": 0.23100303951367782, "grad_norm": 57.954811096191406, "learning_rate": 0.0003511538576117821, "loss": 6.924, "step": 722 }, { "epoch": 0.2313229883218685, "grad_norm": 51.535804748535156, "learning_rate": 0.0003510227801063042, "loss": 6.6562, "step": 723 }, { "epoch": 0.23164293713005918, "grad_norm": 1651.22705078125, "learning_rate": 0.0003508915529809469, "loss": 7.1549, "step": 724 }, { "epoch": 0.23196288593824987, "grad_norm": 103.06959533691406, "learning_rate": 0.0003507601763749372, "loss": 7.1778, "step": 725 }, { "epoch": 0.23228283474644057, "grad_norm": 127.42375183105469, "learning_rate": 0.00035062865042766086, "loss": 7.0844, "step": 726 }, { "epoch": 0.23260278355463126, "grad_norm": 57.47138214111328, "learning_rate": 0.0003504969752786619, "loss": 6.7034, "step": 727 }, { "epoch": 0.23292273236282196, "grad_norm": 21.57773208618164, "learning_rate": 0.00035036515106764277, "loss": 6.5883, "step": 728 }, { "epoch": 0.23324268117101263, "grad_norm": 43.577693939208984, "learning_rate": 0.00035023317793446396, "loss": 7.1802, "step": 729 }, { "epoch": 0.23356262997920332, "grad_norm": 47.857906341552734, "learning_rate": 0.000350101056019144, "loss": 7.1532, "step": 730 }, { "epoch": 0.23388257878739402, "grad_norm": 41.412353515625, "learning_rate": 0.0003499687854618593, "loss": 6.8486, "step": 731 }, { "epoch": 0.23420252759558471, "grad_norm": 194.344482421875, "learning_rate": 0.000349836366402944, "loss": 6.6898, "step": 732 }, { "epoch": 0.2345224764037754, "grad_norm": 97.9305419921875, "learning_rate": 0.00034970379898288976, "loss": 7.0291, "step": 733 }, { "epoch": 0.23484242521196608, "grad_norm": 617602.9375, "learning_rate": 0.00034957108334234563, "loss": 6.7315, "step": 734 }, { "epoch": 0.23516237402015677, "grad_norm": 82.41645050048828, "learning_rate": 0.00034943821962211784, "loss": 6.4541, "step": 735 }, { "epoch": 0.23548232282834747, "grad_norm": 112.80008697509766, "learning_rate": 0.0003493052079631699, "loss": 6.5458, "step": 736 }, { "epoch": 0.23580227163653816, "grad_norm": 57.59135437011719, "learning_rate": 0.00034917204850662225, "loss": 6.7299, "step": 737 }, { "epoch": 0.23612222044472883, "grad_norm": 183.47340393066406, "learning_rate": 0.0003490387413937519, "loss": 6.6954, "step": 738 }, { "epoch": 0.23644216925291953, "grad_norm": 95.19715118408203, "learning_rate": 0.00034890528676599284, "loss": 6.6315, "step": 739 }, { "epoch": 0.23676211806111022, "grad_norm": 77.85346221923828, "learning_rate": 0.00034877168476493535, "loss": 6.9589, "step": 740 }, { "epoch": 0.23708206686930092, "grad_norm": 208.43527221679688, "learning_rate": 0.00034863793553232614, "loss": 6.5637, "step": 741 }, { "epoch": 0.2374020156774916, "grad_norm": 156.19229125976562, "learning_rate": 0.0003485040392100682, "loss": 7.0049, "step": 742 }, { "epoch": 0.23772196448568228, "grad_norm": 199.3235321044922, "learning_rate": 0.00034836999594022044, "loss": 6.2481, "step": 743 }, { "epoch": 0.23804191329387298, "grad_norm": 67.07147979736328, "learning_rate": 0.0003482358058649978, "loss": 6.3019, "step": 744 }, { "epoch": 0.23836186210206367, "grad_norm": 166.8501434326172, "learning_rate": 0.00034810146912677087, "loss": 6.3432, "step": 745 }, { "epoch": 0.23868181091025437, "grad_norm": 97.20511627197266, "learning_rate": 0.0003479669858680659, "loss": 6.3981, "step": 746 }, { "epoch": 0.23900175971844506, "grad_norm": 57.774749755859375, "learning_rate": 0.0003478323562315646, "loss": 6.3404, "step": 747 }, { "epoch": 0.23932170852663573, "grad_norm": 90.69790649414062, "learning_rate": 0.0003476975803601039, "loss": 6.4574, "step": 748 }, { "epoch": 0.23964165733482642, "grad_norm": 170.02110290527344, "learning_rate": 0.000347562658396676, "loss": 6.6967, "step": 749 }, { "epoch": 0.23996160614301712, "grad_norm": 80.30382537841797, "learning_rate": 0.000347427590484428, "loss": 6.3286, "step": 750 }, { "epoch": 0.24028155495120782, "grad_norm": 78.86782836914062, "learning_rate": 0.00034729237676666194, "loss": 6.2999, "step": 751 }, { "epoch": 0.24060150375939848, "grad_norm": 73.93916320800781, "learning_rate": 0.0003471570173868345, "loss": 6.4426, "step": 752 }, { "epoch": 0.24092145256758918, "grad_norm": 140.0988311767578, "learning_rate": 0.0003470215124885569, "loss": 6.4757, "step": 753 }, { "epoch": 0.24124140137577987, "grad_norm": 86.3206558227539, "learning_rate": 0.00034688586221559467, "loss": 6.3304, "step": 754 }, { "epoch": 0.24156135018397057, "grad_norm": 338.0812072753906, "learning_rate": 0.0003467500667118678, "loss": 6.3679, "step": 755 }, { "epoch": 0.24188129899216126, "grad_norm": 412.3385314941406, "learning_rate": 0.0003466141261214501, "loss": 6.5072, "step": 756 }, { "epoch": 0.24220124780035193, "grad_norm": 205.6704864501953, "learning_rate": 0.0003464780405885695, "loss": 6.3448, "step": 757 }, { "epoch": 0.24252119660854263, "grad_norm": 256.4744873046875, "learning_rate": 0.00034634181025760765, "loss": 6.3772, "step": 758 }, { "epoch": 0.24284114541673332, "grad_norm": 461.74176025390625, "learning_rate": 0.00034620543527309983, "loss": 6.2274, "step": 759 }, { "epoch": 0.24316109422492402, "grad_norm": 165.07127380371094, "learning_rate": 0.00034606891577973475, "loss": 6.3089, "step": 760 }, { "epoch": 0.2434810430331147, "grad_norm": 1090.9415283203125, "learning_rate": 0.00034593225192235447, "loss": 6.5381, "step": 761 }, { "epoch": 0.24380099184130538, "grad_norm": 1298.724365234375, "learning_rate": 0.00034579544384595427, "loss": 6.4875, "step": 762 }, { "epoch": 0.24412094064949608, "grad_norm": 2974.205322265625, "learning_rate": 0.0003456584916956823, "loss": 6.609, "step": 763 }, { "epoch": 0.24444088945768677, "grad_norm": 506.7083435058594, "learning_rate": 0.0003455213956168397, "loss": 6.5206, "step": 764 }, { "epoch": 0.24476083826587747, "grad_norm": 2491.966796875, "learning_rate": 0.0003453841557548802, "loss": 6.3059, "step": 765 }, { "epoch": 0.24508078707406816, "grad_norm": 558.4273681640625, "learning_rate": 0.00034524677225541034, "loss": 6.3385, "step": 766 }, { "epoch": 0.24540073588225883, "grad_norm": 2481.155029296875, "learning_rate": 0.00034510924526418864, "loss": 6.4753, "step": 767 }, { "epoch": 0.24572068469044953, "grad_norm": 6455.59375, "learning_rate": 0.00034497157492712616, "loss": 6.2157, "step": 768 }, { "epoch": 0.24604063349864022, "grad_norm": 4837.32080078125, "learning_rate": 0.000344833761390286, "loss": 6.5173, "step": 769 }, { "epoch": 0.24636058230683092, "grad_norm": 1091.15869140625, "learning_rate": 0.00034469580479988315, "loss": 6.4935, "step": 770 }, { "epoch": 0.24668053111502158, "grad_norm": 2150.258544921875, "learning_rate": 0.0003445577053022843, "loss": 6.5266, "step": 771 }, { "epoch": 0.24700047992321228, "grad_norm": 3155.990478515625, "learning_rate": 0.0003444194630440079, "loss": 6.4712, "step": 772 }, { "epoch": 0.24732042873140297, "grad_norm": 745.5328979492188, "learning_rate": 0.0003442810781717238, "loss": 6.4966, "step": 773 }, { "epoch": 0.24764037753959367, "grad_norm": 2718.595947265625, "learning_rate": 0.00034414255083225315, "loss": 6.3514, "step": 774 }, { "epoch": 0.24796032634778437, "grad_norm": 1184.953369140625, "learning_rate": 0.0003440038811725683, "loss": 6.5564, "step": 775 }, { "epoch": 0.24828027515597503, "grad_norm": 1916.226318359375, "learning_rate": 0.00034386506933979243, "loss": 6.3485, "step": 776 }, { "epoch": 0.24860022396416573, "grad_norm": 2737.73095703125, "learning_rate": 0.0003437261154811997, "loss": 6.4106, "step": 777 }, { "epoch": 0.24892017277235642, "grad_norm": 821.8926391601562, "learning_rate": 0.00034358701974421523, "loss": 6.3447, "step": 778 }, { "epoch": 0.24924012158054712, "grad_norm": 781.9716796875, "learning_rate": 0.000343447782276414, "loss": 6.2201, "step": 779 }, { "epoch": 0.24956007038873781, "grad_norm": 2429.21484375, "learning_rate": 0.0003433084032255219, "loss": 6.4331, "step": 780 }, { "epoch": 0.24988001919692848, "grad_norm": 7888.99853515625, "learning_rate": 0.0003431688827394149, "loss": 6.4502, "step": 781 }, { "epoch": 0.2501999680051192, "grad_norm": 728.1339721679688, "learning_rate": 0.000343029220966119, "loss": 6.5406, "step": 782 }, { "epoch": 0.2505199168133099, "grad_norm": 1479.98095703125, "learning_rate": 0.0003428894180538101, "loss": 6.636, "step": 783 }, { "epoch": 0.25083986562150057, "grad_norm": 322.9638366699219, "learning_rate": 0.0003427494741508138, "loss": 6.4123, "step": 784 }, { "epoch": 0.25115981442969126, "grad_norm": 567.7588500976562, "learning_rate": 0.00034260938940560527, "loss": 6.5927, "step": 785 }, { "epoch": 0.25147976323788196, "grad_norm": 912.3056640625, "learning_rate": 0.00034246916396680934, "loss": 6.3635, "step": 786 }, { "epoch": 0.25179971204607265, "grad_norm": 867.4058227539062, "learning_rate": 0.00034232879798319984, "loss": 6.4766, "step": 787 }, { "epoch": 0.2521196608542633, "grad_norm": 3383.78759765625, "learning_rate": 0.0003421882916036998, "loss": 6.5765, "step": 788 }, { "epoch": 0.252439609662454, "grad_norm": 551.5289916992188, "learning_rate": 0.0003420476449773813, "loss": 6.1571, "step": 789 }, { "epoch": 0.2527595584706447, "grad_norm": 3835.2880859375, "learning_rate": 0.00034190685825346504, "loss": 6.3987, "step": 790 }, { "epoch": 0.2530795072788354, "grad_norm": 299.2125549316406, "learning_rate": 0.00034176593158132055, "loss": 6.3278, "step": 791 }, { "epoch": 0.2533994560870261, "grad_norm": 542.0026245117188, "learning_rate": 0.0003416248651104656, "loss": 6.4289, "step": 792 }, { "epoch": 0.25371940489521677, "grad_norm": 3202.345947265625, "learning_rate": 0.00034148365899056655, "loss": 6.3677, "step": 793 }, { "epoch": 0.25403935370340747, "grad_norm": 2746.37109375, "learning_rate": 0.00034134231337143775, "loss": 6.3684, "step": 794 }, { "epoch": 0.25435930251159816, "grad_norm": 289.69354248046875, "learning_rate": 0.00034120082840304164, "loss": 6.25, "step": 795 }, { "epoch": 0.25467925131978886, "grad_norm": 5020.22607421875, "learning_rate": 0.00034105920423548833, "loss": 6.4192, "step": 796 }, { "epoch": 0.2549992001279795, "grad_norm": 21526.37109375, "learning_rate": 0.00034091744101903595, "loss": 6.7782, "step": 797 }, { "epoch": 0.2553191489361702, "grad_norm": 96381.3203125, "learning_rate": 0.0003407755389040898, "loss": 6.4677, "step": 798 }, { "epoch": 0.2556390977443609, "grad_norm": 383685.59375, "learning_rate": 0.00034063349804120275, "loss": 6.3452, "step": 799 }, { "epoch": 0.2559590465525516, "grad_norm": 1976569.875, "learning_rate": 0.0003404913185810748, "loss": 7.6792, "step": 800 }, { "epoch": 0.2559590465525516, "eval_loss": 9.59782886505127, "eval_runtime": 233.5229, "eval_samples_per_second": 5.635, "eval_steps_per_second": 1.409, "step": 800 }, { "epoch": 0.2562789953607423, "grad_norm": 472340.75, "learning_rate": 0.0003403490006745532, "loss": 18.063, "step": 801 }, { "epoch": 0.256598944168933, "grad_norm": 32471.482421875, "learning_rate": 0.00034020654447263155, "loss": 26.347, "step": 802 }, { "epoch": 0.25691889297712367, "grad_norm": 151727.5, "learning_rate": 0.00034006395012645086, "loss": 24.9178, "step": 803 }, { "epoch": 0.25723884178531436, "grad_norm": 43865.3671875, "learning_rate": 0.00033992121778729826, "loss": 24.8413, "step": 804 }, { "epoch": 0.25755879059350506, "grad_norm": 47757.33984375, "learning_rate": 0.00033977834760660745, "loss": 25.444, "step": 805 }, { "epoch": 0.25787873940169576, "grad_norm": 14128.7724609375, "learning_rate": 0.00033963533973595825, "loss": 25.3313, "step": 806 }, { "epoch": 0.2581986882098864, "grad_norm": 1130996.625, "learning_rate": 0.00033949219432707667, "loss": 26.6114, "step": 807 }, { "epoch": 0.2585186370180771, "grad_norm": 4760.6435546875, "learning_rate": 0.0003393489115318347, "loss": 25.624, "step": 808 }, { "epoch": 0.2588385858262678, "grad_norm": 365082.65625, "learning_rate": 0.00033920549150225, "loss": 26.4853, "step": 809 }, { "epoch": 0.2591585346344585, "grad_norm": 972.4833374023438, "learning_rate": 0.0003390619343904857, "loss": 26.3925, "step": 810 }, { "epoch": 0.2594784834426492, "grad_norm": 56389.19140625, "learning_rate": 0.0003389182403488507, "loss": 25.6556, "step": 811 }, { "epoch": 0.2597984322508399, "grad_norm": 21225.556640625, "learning_rate": 0.0003387744095297988, "loss": 24.6976, "step": 812 }, { "epoch": 0.26011838105903057, "grad_norm": 1668.451416015625, "learning_rate": 0.00033863044208592925, "loss": 22.8956, "step": 813 }, { "epoch": 0.26043832986722126, "grad_norm": 19362.66015625, "learning_rate": 0.00033848633816998603, "loss": 24.3001, "step": 814 }, { "epoch": 0.26075827867541196, "grad_norm": 32938.8125, "learning_rate": 0.000338342097934858, "loss": 23.744, "step": 815 }, { "epoch": 0.2610782274836026, "grad_norm": 2615.0556640625, "learning_rate": 0.00033819772153357875, "loss": 23.1489, "step": 816 }, { "epoch": 0.2613981762917933, "grad_norm": 2801.425048828125, "learning_rate": 0.00033805320911932597, "loss": 21.2644, "step": 817 }, { "epoch": 0.261718125099984, "grad_norm": 26318.5703125, "learning_rate": 0.00033790856084542223, "loss": 20.3282, "step": 818 }, { "epoch": 0.2620380739081747, "grad_norm": 16279.3955078125, "learning_rate": 0.0003377637768653337, "loss": 17.2759, "step": 819 }, { "epoch": 0.2623580227163654, "grad_norm": 13162.205078125, "learning_rate": 0.0003376188573326708, "loss": 16.7004, "step": 820 }, { "epoch": 0.2626779715245561, "grad_norm": 42018.765625, "learning_rate": 0.00033747380240118773, "loss": 16.1286, "step": 821 }, { "epoch": 0.26299792033274677, "grad_norm": 2374.1533203125, "learning_rate": 0.0003373286122247823, "loss": 17.0077, "step": 822 }, { "epoch": 0.26331786914093747, "grad_norm": 2869.451904296875, "learning_rate": 0.000337183286957496, "loss": 16.8362, "step": 823 }, { "epoch": 0.26363781794912816, "grad_norm": 169.99826049804688, "learning_rate": 0.0003370378267535132, "loss": 16.7969, "step": 824 }, { "epoch": 0.2639577667573188, "grad_norm": 118.742919921875, "learning_rate": 0.00033689223176716187, "loss": 13.8806, "step": 825 }, { "epoch": 0.2642777155655095, "grad_norm": 21638.884765625, "learning_rate": 0.00033674650215291296, "loss": 12.1067, "step": 826 }, { "epoch": 0.2645976643737002, "grad_norm": 2294.127197265625, "learning_rate": 0.00033660063806537993, "loss": 12.1123, "step": 827 }, { "epoch": 0.2649176131818909, "grad_norm": 34425.94921875, "learning_rate": 0.0003364546396593192, "loss": 13.1807, "step": 828 }, { "epoch": 0.2652375619900816, "grad_norm": 600.3480224609375, "learning_rate": 0.00033630850708962957, "loss": 12.5817, "step": 829 }, { "epoch": 0.2655575107982723, "grad_norm": 82.4616928100586, "learning_rate": 0.00033616224051135225, "loss": 12.0756, "step": 830 }, { "epoch": 0.265877459606463, "grad_norm": 69.83650207519531, "learning_rate": 0.0003360158400796705, "loss": 10.1748, "step": 831 }, { "epoch": 0.26619740841465367, "grad_norm": 150.00209045410156, "learning_rate": 0.0003358693059499098, "loss": 8.8591, "step": 832 }, { "epoch": 0.26651735722284436, "grad_norm": 30079.291015625, "learning_rate": 0.0003357226382775373, "loss": 8.8002, "step": 833 }, { "epoch": 0.26683730603103506, "grad_norm": 18880.88671875, "learning_rate": 0.00033557583721816194, "loss": 10.3358, "step": 834 }, { "epoch": 0.2671572548392257, "grad_norm": 7514.67333984375, "learning_rate": 0.00033542890292753404, "loss": 12.7088, "step": 835 }, { "epoch": 0.2674772036474164, "grad_norm": 165107.78125, "learning_rate": 0.0003352818355615455, "loss": 15.0574, "step": 836 }, { "epoch": 0.2677971524556071, "grad_norm": 10771.0146484375, "learning_rate": 0.0003351346352762292, "loss": 16.6562, "step": 837 }, { "epoch": 0.2681171012637978, "grad_norm": 32579.517578125, "learning_rate": 0.00033498730222775917, "loss": 16.5778, "step": 838 }, { "epoch": 0.2684370500719885, "grad_norm": 737471.125, "learning_rate": 0.00033483983657245017, "loss": 17.4677, "step": 839 }, { "epoch": 0.2687569988801792, "grad_norm": 1129485.5, "learning_rate": 0.00033469223846675783, "loss": 16.0327, "step": 840 }, { "epoch": 0.26907694768836987, "grad_norm": 21750792.0, "learning_rate": 0.0003345445080672781, "loss": 16.6764, "step": 841 }, { "epoch": 0.26939689649656057, "grad_norm": 626546.5625, "learning_rate": 0.0003343966455307474, "loss": 18.6074, "step": 842 }, { "epoch": 0.26971684530475126, "grad_norm": 1119687.125, "learning_rate": 0.0003342486510140424, "loss": 19.7998, "step": 843 }, { "epoch": 0.2700367941129419, "grad_norm": 4357446.0, "learning_rate": 0.00033410052467417985, "loss": 17.5433, "step": 844 }, { "epoch": 0.2703567429211326, "grad_norm": 617141.25, "learning_rate": 0.0003339522666683159, "loss": 19.1042, "step": 845 }, { "epoch": 0.2706766917293233, "grad_norm": 355878.9375, "learning_rate": 0.000333803877153747, "loss": 20.8072, "step": 846 }, { "epoch": 0.270996640537514, "grad_norm": 624676.4375, "learning_rate": 0.0003336553562879088, "loss": 14.6778, "step": 847 }, { "epoch": 0.2713165893457047, "grad_norm": 330477.0625, "learning_rate": 0.00033350670422837637, "loss": 19.176, "step": 848 }, { "epoch": 0.2716365381538954, "grad_norm": 941151.25, "learning_rate": 0.0003333579211328639, "loss": 21.5224, "step": 849 }, { "epoch": 0.2719564869620861, "grad_norm": 3218129.5, "learning_rate": 0.00033320900715922477, "loss": 23.2502, "step": 850 }, { "epoch": 0.27227643577027677, "grad_norm": 1361448.25, "learning_rate": 0.00033305996246545106, "loss": 23.624, "step": 851 }, { "epoch": 0.27259638457846747, "grad_norm": 282881.1875, "learning_rate": 0.00033291078720967365, "loss": 24.8605, "step": 852 }, { "epoch": 0.27291633338665816, "grad_norm": 652137.875, "learning_rate": 0.0003327614815501618, "loss": 25.1488, "step": 853 }, { "epoch": 0.2732362821948488, "grad_norm": 11083729.0, "learning_rate": 0.0003326120456453232, "loss": 26.0858, "step": 854 }, { "epoch": 0.2735562310030395, "grad_norm": 3168683.75, "learning_rate": 0.0003324624796537039, "loss": 26.6461, "step": 855 }, { "epoch": 0.2738761798112302, "grad_norm": 3187546.0, "learning_rate": 0.00033231278373398773, "loss": 25.3838, "step": 856 }, { "epoch": 0.2741961286194209, "grad_norm": 396312.03125, "learning_rate": 0.00033216295804499636, "loss": 25.4869, "step": 857 }, { "epoch": 0.2745160774276116, "grad_norm": 1157155.25, "learning_rate": 0.00033201300274568934, "loss": 26.773, "step": 858 }, { "epoch": 0.2748360262358023, "grad_norm": 1152320.75, "learning_rate": 0.00033186291799516354, "loss": 26.3161, "step": 859 }, { "epoch": 0.275155975043993, "grad_norm": 3598519.75, "learning_rate": 0.00033171270395265334, "loss": 24.7172, "step": 860 }, { "epoch": 0.27547592385218367, "grad_norm": 4380961.0, "learning_rate": 0.00033156236077753006, "loss": 25.4326, "step": 861 }, { "epoch": 0.27579587266037436, "grad_norm": 219964.46875, "learning_rate": 0.00033141188862930235, "loss": 25.278, "step": 862 }, { "epoch": 0.276115821468565, "grad_norm": 9706468.0, "learning_rate": 0.0003312612876676153, "loss": 24.5966, "step": 863 }, { "epoch": 0.2764357702767557, "grad_norm": 398041.8125, "learning_rate": 0.0003311105580522511, "loss": 23.8814, "step": 864 }, { "epoch": 0.2767557190849464, "grad_norm": 520553.59375, "learning_rate": 0.00033095969994312806, "loss": 22.963, "step": 865 }, { "epoch": 0.2770756678931371, "grad_norm": 210379.578125, "learning_rate": 0.00033080871350030103, "loss": 25.1315, "step": 866 }, { "epoch": 0.2773956167013278, "grad_norm": 1593170.0, "learning_rate": 0.00033065759888396097, "loss": 25.1688, "step": 867 }, { "epoch": 0.2777155655095185, "grad_norm": 714547.0625, "learning_rate": 0.0003305063562544348, "loss": 24.1757, "step": 868 }, { "epoch": 0.2780355143177092, "grad_norm": 1169557.25, "learning_rate": 0.00033035498577218533, "loss": 24.4053, "step": 869 }, { "epoch": 0.27835546312589987, "grad_norm": 957691.8125, "learning_rate": 0.0003302034875978109, "loss": 22.7094, "step": 870 }, { "epoch": 0.27867541193409057, "grad_norm": 261482.71875, "learning_rate": 0.0003300518618920454, "loss": 22.1931, "step": 871 }, { "epoch": 0.27899536074228126, "grad_norm": 193108.375, "learning_rate": 0.00032990010881575805, "loss": 23.5923, "step": 872 }, { "epoch": 0.2793153095504719, "grad_norm": 75381.0078125, "learning_rate": 0.00032974822852995314, "loss": 22.1936, "step": 873 }, { "epoch": 0.2796352583586626, "grad_norm": 101304.7265625, "learning_rate": 0.00032959622119576996, "loss": 23.972, "step": 874 }, { "epoch": 0.2799552071668533, "grad_norm": 797569.8125, "learning_rate": 0.00032944408697448255, "loss": 23.1834, "step": 875 }, { "epoch": 0.280275155975044, "grad_norm": 54563.6953125, "learning_rate": 0.0003292918260274997, "loss": 19.8762, "step": 876 }, { "epoch": 0.2805951047832347, "grad_norm": 765206.625, "learning_rate": 0.0003291394385163645, "loss": 17.3695, "step": 877 }, { "epoch": 0.2809150535914254, "grad_norm": 145896.859375, "learning_rate": 0.00032898692460275446, "loss": 14.1639, "step": 878 }, { "epoch": 0.2812350023996161, "grad_norm": 78758.6640625, "learning_rate": 0.000328834284448481, "loss": 18.8681, "step": 879 }, { "epoch": 0.28155495120780677, "grad_norm": 878149.4375, "learning_rate": 0.00032868151821548966, "loss": 25.91, "step": 880 }, { "epoch": 0.28187490001599746, "grad_norm": 2420195.5, "learning_rate": 0.00032852862606585966, "loss": 30.7789, "step": 881 }, { "epoch": 0.2821948488241881, "grad_norm": 305033.84375, "learning_rate": 0.0003283756081618039, "loss": 32.8377, "step": 882 }, { "epoch": 0.2825147976323788, "grad_norm": 422428.90625, "learning_rate": 0.00032822246466566855, "loss": 32.3465, "step": 883 }, { "epoch": 0.2828347464405695, "grad_norm": 78632.2578125, "learning_rate": 0.0003280691957399332, "loss": 32.9845, "step": 884 }, { "epoch": 0.2831546952487602, "grad_norm": 286373.71875, "learning_rate": 0.00032791580154721035, "loss": 34.3491, "step": 885 }, { "epoch": 0.2834746440569509, "grad_norm": 12756404.0, "learning_rate": 0.0003277622822502455, "loss": 32.8115, "step": 886 }, { "epoch": 0.2837945928651416, "grad_norm": 259764.84375, "learning_rate": 0.00032760863801191696, "loss": 33.0063, "step": 887 }, { "epoch": 0.2841145416733323, "grad_norm": 1352647.25, "learning_rate": 0.0003274548689952354, "loss": 33.3023, "step": 888 }, { "epoch": 0.284434490481523, "grad_norm": 2583883.25, "learning_rate": 0.000327300975363344, "loss": 34.0707, "step": 889 }, { "epoch": 0.28475443928971367, "grad_norm": 768095.625, "learning_rate": 0.0003271469572795181, "loss": 33.3534, "step": 890 }, { "epoch": 0.2850743880979043, "grad_norm": 38086.27734375, "learning_rate": 0.0003269928149071652, "loss": 35.1093, "step": 891 }, { "epoch": 0.285394336906095, "grad_norm": 11101828.0, "learning_rate": 0.00032683854840982453, "loss": 36.15, "step": 892 }, { "epoch": 0.2857142857142857, "grad_norm": 936936.6875, "learning_rate": 0.000326684157951167, "loss": 35.3095, "step": 893 }, { "epoch": 0.2860342345224764, "grad_norm": 1324180.875, "learning_rate": 0.0003265296436949952, "loss": 34.8645, "step": 894 }, { "epoch": 0.2863541833306671, "grad_norm": 108139.625, "learning_rate": 0.00032637500580524285, "loss": 34.3561, "step": 895 }, { "epoch": 0.2866741321388578, "grad_norm": 200987.484375, "learning_rate": 0.00032622024444597497, "loss": 36.2547, "step": 896 }, { "epoch": 0.2869940809470485, "grad_norm": 147518.8125, "learning_rate": 0.00032606535978138763, "loss": 34.7164, "step": 897 }, { "epoch": 0.2873140297552392, "grad_norm": 1045666.5625, "learning_rate": 0.0003259103519758076, "loss": 34.6775, "step": 898 }, { "epoch": 0.28763397856342987, "grad_norm": 728341.4375, "learning_rate": 0.0003257552211936924, "loss": 34.3695, "step": 899 }, { "epoch": 0.28795392737162057, "grad_norm": 47716.6015625, "learning_rate": 0.00032559996759962985, "loss": 34.6398, "step": 900 }, { "epoch": 0.2882738761798112, "grad_norm": 2483745.0, "learning_rate": 0.0003254445913583383, "loss": 34.867, "step": 901 }, { "epoch": 0.2885938249880019, "grad_norm": 4208087.5, "learning_rate": 0.00032528909263466614, "loss": 33.2978, "step": 902 }, { "epoch": 0.2889137737961926, "grad_norm": 668148.4375, "learning_rate": 0.0003251334715935915, "loss": 32.0358, "step": 903 }, { "epoch": 0.2892337226043833, "grad_norm": 382070.53125, "learning_rate": 0.0003249777284002227, "loss": 34.4869, "step": 904 }, { "epoch": 0.289553671412574, "grad_norm": 2095629.125, "learning_rate": 0.00032482186321979717, "loss": 32.9963, "step": 905 }, { "epoch": 0.2898736202207647, "grad_norm": 721848.8125, "learning_rate": 0.00032466587621768226, "loss": 36.1113, "step": 906 }, { "epoch": 0.2901935690289554, "grad_norm": 826675.1875, "learning_rate": 0.0003245097675593743, "loss": 33.8295, "step": 907 }, { "epoch": 0.2905135178371461, "grad_norm": 1047299.1875, "learning_rate": 0.0003243535374104985, "loss": 33.8889, "step": 908 }, { "epoch": 0.29083346664533677, "grad_norm": 83939.1484375, "learning_rate": 0.00032419718593680943, "loss": 34.5913, "step": 909 }, { "epoch": 0.2911534154535274, "grad_norm": 69454.4609375, "learning_rate": 0.00032404071330419, "loss": 34.2654, "step": 910 }, { "epoch": 0.2914733642617181, "grad_norm": 22721304.0, "learning_rate": 0.00032388411967865184, "loss": 33.8081, "step": 911 }, { "epoch": 0.2917933130699088, "grad_norm": 357308.65625, "learning_rate": 0.0003237274052263349, "loss": 32.9188, "step": 912 }, { "epoch": 0.2921132618780995, "grad_norm": 278778.5, "learning_rate": 0.0003235705701135074, "loss": 32.0302, "step": 913 }, { "epoch": 0.2924332106862902, "grad_norm": 4960189.5, "learning_rate": 0.0003234136145065655, "loss": 32.5315, "step": 914 }, { "epoch": 0.2927531594944809, "grad_norm": 55529.8125, "learning_rate": 0.0003232565385720331, "loss": 32.6047, "step": 915 }, { "epoch": 0.2930731083026716, "grad_norm": 1051795.0, "learning_rate": 0.000323099342476562, "loss": 32.6275, "step": 916 }, { "epoch": 0.2933930571108623, "grad_norm": 1545867.0, "learning_rate": 0.0003229420263869313, "loss": 34.8654, "step": 917 }, { "epoch": 0.29371300591905297, "grad_norm": 58457.1015625, "learning_rate": 0.0003227845904700475, "loss": 30.3776, "step": 918 }, { "epoch": 0.29403295472724367, "grad_norm": 1061561.125, "learning_rate": 0.00032262703489294414, "loss": 31.8167, "step": 919 }, { "epoch": 0.2943529035354343, "grad_norm": 1284448.125, "learning_rate": 0.00032246935982278185, "loss": 31.0466, "step": 920 }, { "epoch": 0.294672852343625, "grad_norm": 319359.875, "learning_rate": 0.0003223115654268479, "loss": 29.9253, "step": 921 }, { "epoch": 0.2949928011518157, "grad_norm": 1560787.0, "learning_rate": 0.0003221536518725563, "loss": 30.4316, "step": 922 }, { "epoch": 0.2953127499600064, "grad_norm": 2599579.75, "learning_rate": 0.00032199561932744734, "loss": 29.6555, "step": 923 }, { "epoch": 0.2956326987681971, "grad_norm": 76841.046875, "learning_rate": 0.0003218374679591877, "loss": 29.959, "step": 924 }, { "epoch": 0.2959526475763878, "grad_norm": 34526.2109375, "learning_rate": 0.0003216791979355699, "loss": 29.0153, "step": 925 }, { "epoch": 0.2962725963845785, "grad_norm": 8784803.0, "learning_rate": 0.0003215208094245127, "loss": 30.9955, "step": 926 }, { "epoch": 0.2965925451927692, "grad_norm": 39772.76171875, "learning_rate": 0.00032136230259406016, "loss": 31.03, "step": 927 }, { "epoch": 0.29691249400095987, "grad_norm": 44792.16796875, "learning_rate": 0.00032120367761238227, "loss": 31.4258, "step": 928 }, { "epoch": 0.2972324428091505, "grad_norm": 399403.78125, "learning_rate": 0.00032104493464777404, "loss": 33.6536, "step": 929 }, { "epoch": 0.2975523916173412, "grad_norm": 869917.6875, "learning_rate": 0.0003208860738686558, "loss": 32.0902, "step": 930 }, { "epoch": 0.2978723404255319, "grad_norm": 78028.140625, "learning_rate": 0.00032072709544357306, "loss": 32.4948, "step": 931 }, { "epoch": 0.2981922892337226, "grad_norm": 3609.693603515625, "learning_rate": 0.0003205679995411958, "loss": 32.1421, "step": 932 }, { "epoch": 0.2985122380419133, "grad_norm": 36646.99609375, "learning_rate": 0.00032040878633031884, "loss": 25.8291, "step": 933 }, { "epoch": 0.298832186850104, "grad_norm": 6927.86328125, "learning_rate": 0.0003202494559798614, "loss": 26.697, "step": 934 }, { "epoch": 0.2991521356582947, "grad_norm": 745786.875, "learning_rate": 0.0003200900086588671, "loss": 22.2229, "step": 935 }, { "epoch": 0.2994720844664854, "grad_norm": 208755.265625, "learning_rate": 0.0003199304445365035, "loss": 18.1356, "step": 936 }, { "epoch": 0.2997920332746761, "grad_norm": 7917.80859375, "learning_rate": 0.0003197707637820621, "loss": 16.6657, "step": 937 }, { "epoch": 0.30011198208286677, "grad_norm": 498.3856506347656, "learning_rate": 0.00031961096656495824, "loss": 15.2576, "step": 938 }, { "epoch": 0.3004319308910574, "grad_norm": 245.81466674804688, "learning_rate": 0.00031945105305473075, "loss": 10.8893, "step": 939 }, { "epoch": 0.3007518796992481, "grad_norm": 946.4498291015625, "learning_rate": 0.000319291023421042, "loss": 7.6083, "step": 940 }, { "epoch": 0.3010718285074388, "grad_norm": 136.80340576171875, "learning_rate": 0.0003191308778336772, "loss": 7.4698, "step": 941 }, { "epoch": 0.3013917773156295, "grad_norm": 423.9597473144531, "learning_rate": 0.000318970616462545, "loss": 8.2354, "step": 942 }, { "epoch": 0.3017117261238202, "grad_norm": 12609.2431640625, "learning_rate": 0.00031881023947767656, "loss": 8.5875, "step": 943 }, { "epoch": 0.3020316749320109, "grad_norm": 63.26814651489258, "learning_rate": 0.000318649747049226, "loss": 9.1278, "step": 944 }, { "epoch": 0.3023516237402016, "grad_norm": 1825.032470703125, "learning_rate": 0.0003184891393474696, "loss": 9.1317, "step": 945 }, { "epoch": 0.3026715725483923, "grad_norm": 76.4158706665039, "learning_rate": 0.0003183284165428061, "loss": 8.2171, "step": 946 }, { "epoch": 0.30299152135658297, "grad_norm": 14688.9970703125, "learning_rate": 0.00031816757880575664, "loss": 7.872, "step": 947 }, { "epoch": 0.3033114701647736, "grad_norm": 21862.908203125, "learning_rate": 0.0003180066263069637, "loss": 7.8133, "step": 948 }, { "epoch": 0.3036314189729643, "grad_norm": 132.2907257080078, "learning_rate": 0.000317845559217192, "loss": 7.9487, "step": 949 }, { "epoch": 0.303951367781155, "grad_norm": 125.90247344970703, "learning_rate": 0.00031768437770732775, "loss": 7.259, "step": 950 }, { "epoch": 0.3042713165893457, "grad_norm": 71.32536315917969, "learning_rate": 0.0003175230819483784, "loss": 6.9182, "step": 951 }, { "epoch": 0.3045912653975364, "grad_norm": 315.1258239746094, "learning_rate": 0.0003173616721114726, "loss": 6.9307, "step": 952 }, { "epoch": 0.3049112142057271, "grad_norm": 13234.1328125, "learning_rate": 0.0003172001483678603, "loss": 6.8386, "step": 953 }, { "epoch": 0.3052311630139178, "grad_norm": 37406.86328125, "learning_rate": 0.00031703851088891197, "loss": 7.2485, "step": 954 }, { "epoch": 0.3055511118221085, "grad_norm": 352988.6875, "learning_rate": 0.0003168767598461191, "loss": 6.8438, "step": 955 }, { "epoch": 0.3058710606302992, "grad_norm": 262.562744140625, "learning_rate": 0.00031671489541109326, "loss": 8.3405, "step": 956 }, { "epoch": 0.3061910094384898, "grad_norm": 1149.9610595703125, "learning_rate": 0.0003165529177555668, "loss": 8.3142, "step": 957 }, { "epoch": 0.3065109582466805, "grad_norm": 331.04718017578125, "learning_rate": 0.0003163908270513917, "loss": 8.1344, "step": 958 }, { "epoch": 0.3068309070548712, "grad_norm": 112.65670013427734, "learning_rate": 0.0003162286234705403, "loss": 7.2487, "step": 959 }, { "epoch": 0.3071508558630619, "grad_norm": 115.52394104003906, "learning_rate": 0.00031606630718510433, "loss": 7.4428, "step": 960 }, { "epoch": 0.3074708046712526, "grad_norm": 409.6588134765625, "learning_rate": 0.00031590387836729543, "loss": 6.6182, "step": 961 }, { "epoch": 0.3077907534794433, "grad_norm": 140.50241088867188, "learning_rate": 0.0003157413371894444, "loss": 7.4124, "step": 962 }, { "epoch": 0.308110702287634, "grad_norm": 126.857666015625, "learning_rate": 0.0003155786838240014, "loss": 6.8525, "step": 963 }, { "epoch": 0.3084306510958247, "grad_norm": 4386.052734375, "learning_rate": 0.0003154159184435355, "loss": 7.0282, "step": 964 }, { "epoch": 0.3087505999040154, "grad_norm": 79.79589080810547, "learning_rate": 0.00031525304122073476, "loss": 7.3552, "step": 965 }, { "epoch": 0.3090705487122061, "grad_norm": 207.43475341796875, "learning_rate": 0.0003150900523284056, "loss": 6.9154, "step": 966 }, { "epoch": 0.3093904975203967, "grad_norm": 44.77931594848633, "learning_rate": 0.00031492695193947325, "loss": 6.9198, "step": 967 }, { "epoch": 0.3097104463285874, "grad_norm": 103.20629119873047, "learning_rate": 0.0003147637402269812, "loss": 7.0039, "step": 968 }, { "epoch": 0.3100303951367781, "grad_norm": 31.550006866455078, "learning_rate": 0.0003146004173640908, "loss": 6.7761, "step": 969 }, { "epoch": 0.3103503439449688, "grad_norm": 14.499809265136719, "learning_rate": 0.0003144369835240815, "loss": 6.6403, "step": 970 }, { "epoch": 0.3106702927531595, "grad_norm": 47.8282470703125, "learning_rate": 0.00031427343888035047, "loss": 6.626, "step": 971 }, { "epoch": 0.3109902415613502, "grad_norm": 15.349066734313965, "learning_rate": 0.00031410978360641253, "loss": 7.1983, "step": 972 }, { "epoch": 0.3113101903695409, "grad_norm": 12.059926986694336, "learning_rate": 0.0003139460178758997, "loss": 6.6756, "step": 973 }, { "epoch": 0.3116301391777316, "grad_norm": 42.02625274658203, "learning_rate": 0.0003137821418625613, "loss": 6.7595, "step": 974 }, { "epoch": 0.3119500879859223, "grad_norm": 18.728851318359375, "learning_rate": 0.0003136181557402637, "loss": 6.7694, "step": 975 }, { "epoch": 0.3122700367941129, "grad_norm": 17.670698165893555, "learning_rate": 0.00031345405968298995, "loss": 6.9929, "step": 976 }, { "epoch": 0.3125899856023036, "grad_norm": 12.548014640808105, "learning_rate": 0.00031328985386483974, "loss": 6.7901, "step": 977 }, { "epoch": 0.3129099344104943, "grad_norm": 15.494758605957031, "learning_rate": 0.0003131255384600294, "loss": 6.6275, "step": 978 }, { "epoch": 0.313229883218685, "grad_norm": 22.92428207397461, "learning_rate": 0.00031296111364289143, "loss": 6.7544, "step": 979 }, { "epoch": 0.3135498320268757, "grad_norm": 8.640791893005371, "learning_rate": 0.00031279657958787434, "loss": 6.6713, "step": 980 }, { "epoch": 0.3138697808350664, "grad_norm": 11.602739334106445, "learning_rate": 0.00031263193646954255, "loss": 6.8735, "step": 981 }, { "epoch": 0.3141897296432571, "grad_norm": 7.338200569152832, "learning_rate": 0.0003124671844625763, "loss": 6.8, "step": 982 }, { "epoch": 0.3145096784514478, "grad_norm": 8.646294593811035, "learning_rate": 0.00031230232374177127, "loss": 6.4102, "step": 983 }, { "epoch": 0.3148296272596385, "grad_norm": 8.402074813842773, "learning_rate": 0.0003121373544820385, "loss": 6.7865, "step": 984 }, { "epoch": 0.3151495760678292, "grad_norm": 9.351253509521484, "learning_rate": 0.00031197227685840414, "loss": 6.6593, "step": 985 }, { "epoch": 0.3154695248760198, "grad_norm": 13.015589714050293, "learning_rate": 0.0003118070910460094, "loss": 6.7232, "step": 986 }, { "epoch": 0.3157894736842105, "grad_norm": 10.829553604125977, "learning_rate": 0.0003116417972201102, "loss": 6.6565, "step": 987 }, { "epoch": 0.3161094224924012, "grad_norm": 8.133262634277344, "learning_rate": 0.00031147639555607713, "loss": 6.8315, "step": 988 }, { "epoch": 0.3164293713005919, "grad_norm": 5.855452537536621, "learning_rate": 0.0003113108862293952, "loss": 6.7788, "step": 989 }, { "epoch": 0.3167493201087826, "grad_norm": 8.390470504760742, "learning_rate": 0.0003111452694156634, "loss": 6.6702, "step": 990 }, { "epoch": 0.3170692689169733, "grad_norm": 7.9397969245910645, "learning_rate": 0.0003109795452905952, "loss": 6.8752, "step": 991 }, { "epoch": 0.317389217725164, "grad_norm": 9.200600624084473, "learning_rate": 0.0003108137140300175, "loss": 6.7201, "step": 992 }, { "epoch": 0.3177091665333547, "grad_norm": 6.002403259277344, "learning_rate": 0.0003106477758098712, "loss": 6.5127, "step": 993 }, { "epoch": 0.3180291153415454, "grad_norm": 7.685515880584717, "learning_rate": 0.00031048173080621034, "loss": 6.6819, "step": 994 }, { "epoch": 0.318349064149736, "grad_norm": 6.026432037353516, "learning_rate": 0.0003103155791952026, "loss": 6.6265, "step": 995 }, { "epoch": 0.3186690129579267, "grad_norm": 7.417787551879883, "learning_rate": 0.0003101493211531285, "loss": 6.6986, "step": 996 }, { "epoch": 0.3189889617661174, "grad_norm": 12.300549507141113, "learning_rate": 0.00030998295685638166, "loss": 6.87, "step": 997 }, { "epoch": 0.3193089105743081, "grad_norm": 7.922842025756836, "learning_rate": 0.00030981648648146837, "loss": 6.8495, "step": 998 }, { "epoch": 0.3196288593824988, "grad_norm": 18.15106201171875, "learning_rate": 0.00030964991020500745, "loss": 6.6748, "step": 999 }, { "epoch": 0.3199488081906895, "grad_norm": 7.818212985992432, "learning_rate": 0.0003094832282037299, "loss": 6.6174, "step": 1000 }, { "epoch": 0.3199488081906895, "eval_loss": 3.358996868133545, "eval_runtime": 233.9815, "eval_samples_per_second": 5.624, "eval_steps_per_second": 1.406, "step": 1000 }, { "epoch": 0.3202687569988802, "grad_norm": 9.613029479980469, "learning_rate": 0.0003093164406544793, "loss": 6.5549, "step": 1001 }, { "epoch": 0.3205887058070709, "grad_norm": 6.538544654846191, "learning_rate": 0.0003091495477342108, "loss": 6.6206, "step": 1002 }, { "epoch": 0.3209086546152616, "grad_norm": 6.343434810638428, "learning_rate": 0.00030898254961999173, "loss": 6.7164, "step": 1003 }, { "epoch": 0.3212286034234523, "grad_norm": 6.2874908447265625, "learning_rate": 0.0003088154464890007, "loss": 6.6797, "step": 1004 }, { "epoch": 0.3215485522316429, "grad_norm": 18.32168960571289, "learning_rate": 0.00030864823851852786, "loss": 6.3663, "step": 1005 }, { "epoch": 0.3218685010398336, "grad_norm": 4.97805643081665, "learning_rate": 0.00030848092588597463, "loss": 6.4058, "step": 1006 }, { "epoch": 0.3221884498480243, "grad_norm": 7.486648082733154, "learning_rate": 0.00030831350876885344, "loss": 6.6289, "step": 1007 }, { "epoch": 0.322508398656215, "grad_norm": 7.603539943695068, "learning_rate": 0.0003081459873447875, "loss": 6.5563, "step": 1008 }, { "epoch": 0.3228283474644057, "grad_norm": 9.133359909057617, "learning_rate": 0.00030797836179151085, "loss": 6.5898, "step": 1009 }, { "epoch": 0.3231482962725964, "grad_norm": 8.029521942138672, "learning_rate": 0.0003078106322868679, "loss": 6.8213, "step": 1010 }, { "epoch": 0.3234682450807871, "grad_norm": 5.991820335388184, "learning_rate": 0.0003076427990088133, "loss": 6.5975, "step": 1011 }, { "epoch": 0.3237881938889778, "grad_norm": 4.940555572509766, "learning_rate": 0.0003074748621354119, "loss": 6.628, "step": 1012 }, { "epoch": 0.3241081426971685, "grad_norm": 7.488973140716553, "learning_rate": 0.0003073068218448384, "loss": 6.56, "step": 1013 }, { "epoch": 0.3244280915053591, "grad_norm": 10.482954025268555, "learning_rate": 0.0003071386783153772, "loss": 6.7303, "step": 1014 }, { "epoch": 0.3247480403135498, "grad_norm": 7.40532112121582, "learning_rate": 0.00030697043172542244, "loss": 6.8107, "step": 1015 }, { "epoch": 0.3250679891217405, "grad_norm": 16.26544189453125, "learning_rate": 0.00030680208225347737, "loss": 6.5008, "step": 1016 }, { "epoch": 0.3253879379299312, "grad_norm": 13.164572715759277, "learning_rate": 0.0003066336300781544, "loss": 6.5557, "step": 1017 }, { "epoch": 0.3257078867381219, "grad_norm": 12.598750114440918, "learning_rate": 0.000306465075378175, "loss": 6.7085, "step": 1018 }, { "epoch": 0.3260278355463126, "grad_norm": 6.103877544403076, "learning_rate": 0.00030629641833236935, "loss": 6.6045, "step": 1019 }, { "epoch": 0.3263477843545033, "grad_norm": 6.347523212432861, "learning_rate": 0.0003061276591196764, "loss": 6.8141, "step": 1020 }, { "epoch": 0.326667733162694, "grad_norm": 3.747049570083618, "learning_rate": 0.00030595879791914314, "loss": 6.5425, "step": 1021 }, { "epoch": 0.3269876819708847, "grad_norm": 6.858709335327148, "learning_rate": 0.00030578983490992505, "loss": 6.4573, "step": 1022 }, { "epoch": 0.3273076307790753, "grad_norm": 3.5340542793273926, "learning_rate": 0.00030562077027128555, "loss": 6.506, "step": 1023 }, { "epoch": 0.327627579587266, "grad_norm": 6.104124546051025, "learning_rate": 0.000305451604182596, "loss": 6.5108, "step": 1024 }, { "epoch": 0.3279475283954567, "grad_norm": 7.606841087341309, "learning_rate": 0.000305282336823335, "loss": 6.7387, "step": 1025 }, { "epoch": 0.3282674772036474, "grad_norm": 6.288745880126953, "learning_rate": 0.0003051129683730891, "loss": 6.7012, "step": 1026 }, { "epoch": 0.3285874260118381, "grad_norm": 7.002671241760254, "learning_rate": 0.0003049434990115518, "loss": 6.5738, "step": 1027 }, { "epoch": 0.3289073748200288, "grad_norm": 5.6297478675842285, "learning_rate": 0.00030477392891852374, "loss": 6.6822, "step": 1028 }, { "epoch": 0.3292273236282195, "grad_norm": 9.002836227416992, "learning_rate": 0.0003046042582739124, "loss": 6.7199, "step": 1029 }, { "epoch": 0.3295472724364102, "grad_norm": 4.909145832061768, "learning_rate": 0.000304434487257732, "loss": 6.6323, "step": 1030 }, { "epoch": 0.3298672212446009, "grad_norm": 15.304366111755371, "learning_rate": 0.00030426461605010324, "loss": 6.6586, "step": 1031 }, { "epoch": 0.3301871700527916, "grad_norm": 16.558542251586914, "learning_rate": 0.0003040946448312531, "loss": 6.5575, "step": 1032 }, { "epoch": 0.3305071188609822, "grad_norm": 17.546894073486328, "learning_rate": 0.00030392457378151456, "loss": 6.4663, "step": 1033 }, { "epoch": 0.3308270676691729, "grad_norm": 6.221606254577637, "learning_rate": 0.00030375440308132666, "loss": 6.6658, "step": 1034 }, { "epoch": 0.3311470164773636, "grad_norm": 5.4567999839782715, "learning_rate": 0.0003035841329112343, "loss": 6.5654, "step": 1035 }, { "epoch": 0.3314669652855543, "grad_norm": 17.624677658081055, "learning_rate": 0.0003034137634518875, "loss": 6.582, "step": 1036 }, { "epoch": 0.331786914093745, "grad_norm": 17.85716438293457, "learning_rate": 0.00030324329488404207, "loss": 6.7594, "step": 1037 }, { "epoch": 0.3321068629019357, "grad_norm": 15.087934494018555, "learning_rate": 0.00030307272738855855, "loss": 6.6988, "step": 1038 }, { "epoch": 0.3324268117101264, "grad_norm": 8.74818229675293, "learning_rate": 0.0003029020611464029, "loss": 6.6417, "step": 1039 }, { "epoch": 0.3327467605183171, "grad_norm": 6.144550323486328, "learning_rate": 0.00030273129633864534, "loss": 6.5553, "step": 1040 }, { "epoch": 0.3330667093265078, "grad_norm": 14.558652877807617, "learning_rate": 0.0003025604331464612, "loss": 6.6014, "step": 1041 }, { "epoch": 0.3333866581346984, "grad_norm": 7.271969318389893, "learning_rate": 0.0003023894717511297, "loss": 6.5538, "step": 1042 }, { "epoch": 0.3337066069428891, "grad_norm": 9.705676078796387, "learning_rate": 0.0003022184123340346, "loss": 6.6535, "step": 1043 }, { "epoch": 0.3340265557510798, "grad_norm": 7.595553398132324, "learning_rate": 0.0003020472550766634, "loss": 6.7706, "step": 1044 }, { "epoch": 0.3343465045592705, "grad_norm": 7.8142476081848145, "learning_rate": 0.00030187600016060766, "loss": 6.7936, "step": 1045 }, { "epoch": 0.3346664533674612, "grad_norm": 7.204625129699707, "learning_rate": 0.0003017046477675624, "loss": 6.4859, "step": 1046 }, { "epoch": 0.3349864021756519, "grad_norm": 8.921597480773926, "learning_rate": 0.0003015331980793259, "loss": 6.5621, "step": 1047 }, { "epoch": 0.3353063509838426, "grad_norm": 7.932427406311035, "learning_rate": 0.0003013616512778001, "loss": 6.566, "step": 1048 }, { "epoch": 0.3356262997920333, "grad_norm": 11.233190536499023, "learning_rate": 0.00030119000754498965, "loss": 6.7973, "step": 1049 }, { "epoch": 0.335946248600224, "grad_norm": 7.584317684173584, "learning_rate": 0.00030101826706300193, "loss": 6.5683, "step": 1050 }, { "epoch": 0.3362661974084147, "grad_norm": 8.914484024047852, "learning_rate": 0.0003008464300140474, "loss": 6.6189, "step": 1051 }, { "epoch": 0.3365861462166053, "grad_norm": 11.658357620239258, "learning_rate": 0.0003006744965804385, "loss": 6.6833, "step": 1052 }, { "epoch": 0.336906095024796, "grad_norm": 8.101858139038086, "learning_rate": 0.0003005024669445904, "loss": 6.7025, "step": 1053 }, { "epoch": 0.3372260438329867, "grad_norm": 5.961119651794434, "learning_rate": 0.00030033034128901996, "loss": 6.7396, "step": 1054 }, { "epoch": 0.3375459926411774, "grad_norm": 10.675737380981445, "learning_rate": 0.00030015811979634593, "loss": 6.4708, "step": 1055 }, { "epoch": 0.3378659414493681, "grad_norm": 8.396323204040527, "learning_rate": 0.00029998580264928906, "loss": 6.5822, "step": 1056 }, { "epoch": 0.3381858902575588, "grad_norm": 4.825063228607178, "learning_rate": 0.00029981339003067124, "loss": 6.6281, "step": 1057 }, { "epoch": 0.3385058390657495, "grad_norm": 12.628974914550781, "learning_rate": 0.0002996408821234158, "loss": 6.78, "step": 1058 }, { "epoch": 0.3388257878739402, "grad_norm": 9.515558242797852, "learning_rate": 0.00029946827911054723, "loss": 6.7697, "step": 1059 }, { "epoch": 0.3391457366821309, "grad_norm": 5.6787543296813965, "learning_rate": 0.00029929558117519086, "loss": 6.6566, "step": 1060 }, { "epoch": 0.3394656854903215, "grad_norm": 8.688566207885742, "learning_rate": 0.00029912278850057253, "loss": 6.6127, "step": 1061 }, { "epoch": 0.3397856342985122, "grad_norm": 6.97818660736084, "learning_rate": 0.0002989499012700191, "loss": 6.75, "step": 1062 }, { "epoch": 0.3401055831067029, "grad_norm": 8.22523021697998, "learning_rate": 0.0002987769196669572, "loss": 6.6283, "step": 1063 }, { "epoch": 0.3404255319148936, "grad_norm": 10.774093627929688, "learning_rate": 0.0002986038438749139, "loss": 6.611, "step": 1064 }, { "epoch": 0.3407454807230843, "grad_norm": 8.792933464050293, "learning_rate": 0.00029843067407751606, "loss": 6.5654, "step": 1065 }, { "epoch": 0.341065429531275, "grad_norm": 5.798463344573975, "learning_rate": 0.0002982574104584904, "loss": 6.7735, "step": 1066 }, { "epoch": 0.3413853783394657, "grad_norm": 7.097434997558594, "learning_rate": 0.00029808405320166313, "loss": 6.4257, "step": 1067 }, { "epoch": 0.3417053271476564, "grad_norm": 5.740554332733154, "learning_rate": 0.00029791060249095976, "loss": 6.6911, "step": 1068 }, { "epoch": 0.3420252759558471, "grad_norm": 6.828253746032715, "learning_rate": 0.00029773705851040503, "loss": 6.4999, "step": 1069 }, { "epoch": 0.3423452247640378, "grad_norm": 6.237061977386475, "learning_rate": 0.00029756342144412253, "loss": 6.3495, "step": 1070 }, { "epoch": 0.3426651735722284, "grad_norm": 5.364548206329346, "learning_rate": 0.00029738969147633475, "loss": 6.6146, "step": 1071 }, { "epoch": 0.3429851223804191, "grad_norm": 6.4642815589904785, "learning_rate": 0.0002972158687913626, "loss": 6.6634, "step": 1072 }, { "epoch": 0.3433050711886098, "grad_norm": 8.208625793457031, "learning_rate": 0.00029704195357362545, "loss": 6.5723, "step": 1073 }, { "epoch": 0.3436250199968005, "grad_norm": 10.248831748962402, "learning_rate": 0.00029686794600764085, "loss": 6.6042, "step": 1074 }, { "epoch": 0.3439449688049912, "grad_norm": 7.775475978851318, "learning_rate": 0.0002966938462780242, "loss": 6.3936, "step": 1075 }, { "epoch": 0.3442649176131819, "grad_norm": 7.968334197998047, "learning_rate": 0.000296519654569489, "loss": 6.5476, "step": 1076 }, { "epoch": 0.3445848664213726, "grad_norm": 5.832833766937256, "learning_rate": 0.00029634537106684587, "loss": 6.6565, "step": 1077 }, { "epoch": 0.3449048152295633, "grad_norm": 7.168907165527344, "learning_rate": 0.0002961709959550032, "loss": 6.4957, "step": 1078 }, { "epoch": 0.345224764037754, "grad_norm": 8.945564270019531, "learning_rate": 0.00029599652941896643, "loss": 6.7646, "step": 1079 }, { "epoch": 0.3455447128459446, "grad_norm": 5.535223960876465, "learning_rate": 0.00029582197164383796, "loss": 6.3885, "step": 1080 }, { "epoch": 0.3458646616541353, "grad_norm": 6.619454860687256, "learning_rate": 0.00029564732281481715, "loss": 6.5704, "step": 1081 }, { "epoch": 0.346184610462326, "grad_norm": 8.324727058410645, "learning_rate": 0.00029547258311719973, "loss": 6.4878, "step": 1082 }, { "epoch": 0.3465045592705167, "grad_norm": 6.584716320037842, "learning_rate": 0.00029529775273637803, "loss": 6.5915, "step": 1083 }, { "epoch": 0.3468245080787074, "grad_norm": 4.810291290283203, "learning_rate": 0.00029512283185784046, "loss": 6.3791, "step": 1084 }, { "epoch": 0.3471444568868981, "grad_norm": 29.184206008911133, "learning_rate": 0.0002949478206671717, "loss": 6.7601, "step": 1085 }, { "epoch": 0.3474644056950888, "grad_norm": 9.653064727783203, "learning_rate": 0.0002947727193500518, "loss": 6.6253, "step": 1086 }, { "epoch": 0.3477843545032795, "grad_norm": 13.180891990661621, "learning_rate": 0.0002945975280922569, "loss": 6.5926, "step": 1087 }, { "epoch": 0.3481043033114702, "grad_norm": 9.94357681274414, "learning_rate": 0.0002944222470796582, "loss": 6.6027, "step": 1088 }, { "epoch": 0.3484242521196608, "grad_norm": 9.287586212158203, "learning_rate": 0.0002942468764982223, "loss": 6.7376, "step": 1089 }, { "epoch": 0.3487442009278515, "grad_norm": 17.233806610107422, "learning_rate": 0.000294071416534011, "loss": 6.8934, "step": 1090 }, { "epoch": 0.3490641497360422, "grad_norm": 11.148905754089355, "learning_rate": 0.00029389586737318046, "loss": 6.4543, "step": 1091 }, { "epoch": 0.3493840985442329, "grad_norm": 7.293564319610596, "learning_rate": 0.00029372022920198186, "loss": 6.5114, "step": 1092 }, { "epoch": 0.3497040473524236, "grad_norm": 7.520803928375244, "learning_rate": 0.0002935445022067609, "loss": 6.7753, "step": 1093 }, { "epoch": 0.3500239961606143, "grad_norm": 8.508971214294434, "learning_rate": 0.00029336868657395704, "loss": 6.5503, "step": 1094 }, { "epoch": 0.350343944968805, "grad_norm": 7.20286750793457, "learning_rate": 0.0002931927824901042, "loss": 6.5862, "step": 1095 }, { "epoch": 0.3506638937769957, "grad_norm": 6.576849460601807, "learning_rate": 0.00029301679014183006, "loss": 6.5876, "step": 1096 }, { "epoch": 0.3509838425851864, "grad_norm": 5.83701753616333, "learning_rate": 0.0002928407097158558, "loss": 6.3791, "step": 1097 }, { "epoch": 0.3513037913933771, "grad_norm": 9.229516983032227, "learning_rate": 0.00029266454139899615, "loss": 6.6795, "step": 1098 }, { "epoch": 0.3516237402015677, "grad_norm": 16.494159698486328, "learning_rate": 0.00029248828537815924, "loss": 6.571, "step": 1099 }, { "epoch": 0.3519436890097584, "grad_norm": 10.0813627243042, "learning_rate": 0.00029231194184034596, "loss": 6.4374, "step": 1100 }, { "epoch": 0.3522636378179491, "grad_norm": 7.692905902862549, "learning_rate": 0.0002921355109726502, "loss": 6.6512, "step": 1101 }, { "epoch": 0.3525835866261398, "grad_norm": 15.104799270629883, "learning_rate": 0.00029195899296225846, "loss": 6.4528, "step": 1102 }, { "epoch": 0.3529035354343305, "grad_norm": 18.930479049682617, "learning_rate": 0.00029178238799644983, "loss": 6.4953, "step": 1103 }, { "epoch": 0.3532234842425212, "grad_norm": 10.989952087402344, "learning_rate": 0.00029160569626259555, "loss": 6.5066, "step": 1104 }, { "epoch": 0.3535434330507119, "grad_norm": 8.65145206451416, "learning_rate": 0.00029142891794815877, "loss": 6.7426, "step": 1105 }, { "epoch": 0.3538633818589026, "grad_norm": 4.995960235595703, "learning_rate": 0.00029125205324069477, "loss": 6.4861, "step": 1106 }, { "epoch": 0.3541833306670933, "grad_norm": 6.398393630981445, "learning_rate": 0.00029107510232785036, "loss": 6.3449, "step": 1107 }, { "epoch": 0.35450327947528393, "grad_norm": 5.589430809020996, "learning_rate": 0.00029089806539736384, "loss": 6.5538, "step": 1108 }, { "epoch": 0.3548232282834746, "grad_norm": 4.985332012176514, "learning_rate": 0.0002907209426370647, "loss": 6.544, "step": 1109 }, { "epoch": 0.3551431770916653, "grad_norm": 5.552674293518066, "learning_rate": 0.00029054373423487357, "loss": 6.5413, "step": 1110 }, { "epoch": 0.355463125899856, "grad_norm": 7.634738922119141, "learning_rate": 0.00029036644037880186, "loss": 6.475, "step": 1111 }, { "epoch": 0.3557830747080467, "grad_norm": 11.79802131652832, "learning_rate": 0.0002901890612569518, "loss": 6.8223, "step": 1112 }, { "epoch": 0.3561030235162374, "grad_norm": 13.318756103515625, "learning_rate": 0.000290011597057516, "loss": 6.2933, "step": 1113 }, { "epoch": 0.3564229723244281, "grad_norm": 14.60718822479248, "learning_rate": 0.00028983404796877716, "loss": 6.803, "step": 1114 }, { "epoch": 0.3567429211326188, "grad_norm": 8.220056533813477, "learning_rate": 0.00028965641417910833, "loss": 6.4244, "step": 1115 }, { "epoch": 0.3570628699408095, "grad_norm": 5.330780506134033, "learning_rate": 0.0002894786958769723, "loss": 6.4827, "step": 1116 }, { "epoch": 0.3573828187490002, "grad_norm": 7.304154872894287, "learning_rate": 0.00028930089325092146, "loss": 6.4375, "step": 1117 }, { "epoch": 0.3577027675571908, "grad_norm": 7.895937442779541, "learning_rate": 0.0002891230064895977, "loss": 6.5026, "step": 1118 }, { "epoch": 0.3580227163653815, "grad_norm": 8.539824485778809, "learning_rate": 0.0002889450357817324, "loss": 6.5781, "step": 1119 }, { "epoch": 0.3583426651735722, "grad_norm": 11.084145545959473, "learning_rate": 0.0002887669813161455, "loss": 6.4965, "step": 1120 }, { "epoch": 0.3586626139817629, "grad_norm": 8.87822151184082, "learning_rate": 0.00028858884328174635, "loss": 6.2127, "step": 1121 }, { "epoch": 0.3589825627899536, "grad_norm": 12.374408721923828, "learning_rate": 0.0002884106218675326, "loss": 6.3412, "step": 1122 }, { "epoch": 0.3593025115981443, "grad_norm": 7.8099470138549805, "learning_rate": 0.0002882323172625906, "loss": 6.2976, "step": 1123 }, { "epoch": 0.359622460406335, "grad_norm": 9.879372596740723, "learning_rate": 0.0002880539296560947, "loss": 6.3613, "step": 1124 }, { "epoch": 0.3599424092145257, "grad_norm": 18.98383140563965, "learning_rate": 0.0002878754592373075, "loss": 6.564, "step": 1125 }, { "epoch": 0.3602623580227164, "grad_norm": 27.977420806884766, "learning_rate": 0.00028769690619557945, "loss": 6.2547, "step": 1126 }, { "epoch": 0.36058230683090703, "grad_norm": 9.814420700073242, "learning_rate": 0.00028751827072034876, "loss": 6.447, "step": 1127 }, { "epoch": 0.3609022556390977, "grad_norm": 9.564621925354004, "learning_rate": 0.00028733955300114066, "loss": 6.6715, "step": 1128 }, { "epoch": 0.3612222044472884, "grad_norm": 7.441284656524658, "learning_rate": 0.00028716075322756827, "loss": 6.3489, "step": 1129 }, { "epoch": 0.3615421532554791, "grad_norm": 6.929698944091797, "learning_rate": 0.0002869818715893312, "loss": 6.6691, "step": 1130 }, { "epoch": 0.3618621020636698, "grad_norm": 9.472748756408691, "learning_rate": 0.0002868029082762163, "loss": 6.4778, "step": 1131 }, { "epoch": 0.3621820508718605, "grad_norm": 7.518265724182129, "learning_rate": 0.00028662386347809687, "loss": 6.5739, "step": 1132 }, { "epoch": 0.3625019996800512, "grad_norm": 7.980709552764893, "learning_rate": 0.00028644473738493275, "loss": 6.5246, "step": 1133 }, { "epoch": 0.3628219484882419, "grad_norm": 7.017282009124756, "learning_rate": 0.00028626553018677, "loss": 6.2618, "step": 1134 }, { "epoch": 0.3631418972964326, "grad_norm": 8.980280876159668, "learning_rate": 0.0002860862420737407, "loss": 6.4679, "step": 1135 }, { "epoch": 0.3634618461046233, "grad_norm": 11.132031440734863, "learning_rate": 0.0002859068732360628, "loss": 6.4575, "step": 1136 }, { "epoch": 0.3637817949128139, "grad_norm": 10.001900672912598, "learning_rate": 0.00028572742386404, "loss": 6.4749, "step": 1137 }, { "epoch": 0.3641017437210046, "grad_norm": 8.816960334777832, "learning_rate": 0.0002855478941480613, "loss": 6.6358, "step": 1138 }, { "epoch": 0.3644216925291953, "grad_norm": 7.853034019470215, "learning_rate": 0.0002853682842786009, "loss": 6.4368, "step": 1139 }, { "epoch": 0.364741641337386, "grad_norm": 7.836308479309082, "learning_rate": 0.00028518859444621834, "loss": 6.3961, "step": 1140 }, { "epoch": 0.3650615901455767, "grad_norm": 7.645723819732666, "learning_rate": 0.0002850088248415577, "loss": 6.3327, "step": 1141 }, { "epoch": 0.3653815389537674, "grad_norm": 5.902100563049316, "learning_rate": 0.00028482897565534763, "loss": 6.418, "step": 1142 }, { "epoch": 0.3657014877619581, "grad_norm": 6.9731292724609375, "learning_rate": 0.0002846490470784016, "loss": 6.1619, "step": 1143 }, { "epoch": 0.3660214365701488, "grad_norm": 9.789436340332031, "learning_rate": 0.00028446903930161695, "loss": 6.6124, "step": 1144 }, { "epoch": 0.3663413853783395, "grad_norm": 6.7149577140808105, "learning_rate": 0.0002842889525159753, "loss": 6.5035, "step": 1145 }, { "epoch": 0.36666133418653013, "grad_norm": 7.927312850952148, "learning_rate": 0.00028410878691254173, "loss": 6.3137, "step": 1146 }, { "epoch": 0.3669812829947208, "grad_norm": 6.585173606872559, "learning_rate": 0.00028392854268246545, "loss": 6.3402, "step": 1147 }, { "epoch": 0.3673012318029115, "grad_norm": 7.843646049499512, "learning_rate": 0.0002837482200169787, "loss": 6.4396, "step": 1148 }, { "epoch": 0.3676211806111022, "grad_norm": 6.062797546386719, "learning_rate": 0.0002835678191073971, "loss": 5.9964, "step": 1149 }, { "epoch": 0.3679411294192929, "grad_norm": 7.792223930358887, "learning_rate": 0.0002833873401451192, "loss": 6.4738, "step": 1150 }, { "epoch": 0.3682610782274836, "grad_norm": 21.281097412109375, "learning_rate": 0.00028320678332162646, "loss": 6.2936, "step": 1151 }, { "epoch": 0.3685810270356743, "grad_norm": 33.57594299316406, "learning_rate": 0.0002830261488284829, "loss": 6.4598, "step": 1152 }, { "epoch": 0.368900975843865, "grad_norm": 38.78255844116211, "learning_rate": 0.0002828454368573348, "loss": 6.349, "step": 1153 }, { "epoch": 0.3692209246520557, "grad_norm": 21.15943717956543, "learning_rate": 0.00028266464759991105, "loss": 6.493, "step": 1154 }, { "epoch": 0.36954087346024633, "grad_norm": 870.9165649414062, "learning_rate": 0.00028248378124802204, "loss": 6.4243, "step": 1155 }, { "epoch": 0.36986082226843703, "grad_norm": 67.86760711669922, "learning_rate": 0.00028230283799356024, "loss": 6.4124, "step": 1156 }, { "epoch": 0.3701807710766277, "grad_norm": 148.6990966796875, "learning_rate": 0.00028212181802849973, "loss": 6.3447, "step": 1157 }, { "epoch": 0.3705007198848184, "grad_norm": 224.3986358642578, "learning_rate": 0.0002819407215448958, "loss": 6.8178, "step": 1158 }, { "epoch": 0.3708206686930091, "grad_norm": 116.84649658203125, "learning_rate": 0.0002817595487348851, "loss": 6.6928, "step": 1159 }, { "epoch": 0.3711406175011998, "grad_norm": 5677.07763671875, "learning_rate": 0.0002815782997906852, "loss": 6.7519, "step": 1160 }, { "epoch": 0.3714605663093905, "grad_norm": 1888.601806640625, "learning_rate": 0.0002813969749045943, "loss": 6.6966, "step": 1161 }, { "epoch": 0.3717805151175812, "grad_norm": 8388.8984375, "learning_rate": 0.00028121557426899154, "loss": 6.9312, "step": 1162 }, { "epoch": 0.3721004639257719, "grad_norm": 9415.212890625, "learning_rate": 0.00028103409807633595, "loss": 7.3199, "step": 1163 }, { "epoch": 0.3724204127339626, "grad_norm": 678.4221801757812, "learning_rate": 0.000280852546519167, "loss": 7.6811, "step": 1164 }, { "epoch": 0.37274036154215323, "grad_norm": 33225.8359375, "learning_rate": 0.0002806709197901042, "loss": 7.352, "step": 1165 }, { "epoch": 0.3730603103503439, "grad_norm": 23.9914493560791, "learning_rate": 0.00028048921808184667, "loss": 7.4617, "step": 1166 }, { "epoch": 0.3733802591585346, "grad_norm": 100466.765625, "learning_rate": 0.000280307441587173, "loss": 6.8622, "step": 1167 }, { "epoch": 0.3737002079667253, "grad_norm": 165.09849548339844, "learning_rate": 0.00028012559049894135, "loss": 7.0926, "step": 1168 }, { "epoch": 0.374020156774916, "grad_norm": 143.38955688476562, "learning_rate": 0.0002799436650100889, "loss": 7.2583, "step": 1169 }, { "epoch": 0.3743401055831067, "grad_norm": 3611.675048828125, "learning_rate": 0.0002797616653136316, "loss": 7.6278, "step": 1170 }, { "epoch": 0.3746600543912974, "grad_norm": 1785.349365234375, "learning_rate": 0.0002795795916026645, "loss": 7.9622, "step": 1171 }, { "epoch": 0.3749800031994881, "grad_norm": 273.0044860839844, "learning_rate": 0.0002793974440703608, "loss": 8.2006, "step": 1172 }, { "epoch": 0.3752999520076788, "grad_norm": 1025.5972900390625, "learning_rate": 0.00027921522290997247, "loss": 7.8807, "step": 1173 }, { "epoch": 0.37561990081586943, "grad_norm": 7521.10107421875, "learning_rate": 0.00027903292831482905, "loss": 7.7593, "step": 1174 }, { "epoch": 0.37593984962406013, "grad_norm": 276.4880676269531, "learning_rate": 0.0002788505604783383, "loss": 7.5434, "step": 1175 }, { "epoch": 0.3762597984322508, "grad_norm": 286.2404479980469, "learning_rate": 0.00027866811959398585, "loss": 7.0517, "step": 1176 }, { "epoch": 0.3765797472404415, "grad_norm": 31.163135528564453, "learning_rate": 0.0002784856058553345, "loss": 7.2771, "step": 1177 }, { "epoch": 0.3768996960486322, "grad_norm": 22.811885833740234, "learning_rate": 0.00027830301945602445, "loss": 7.0636, "step": 1178 }, { "epoch": 0.3772196448568229, "grad_norm": 449.5051574707031, "learning_rate": 0.000278120360589773, "loss": 6.9615, "step": 1179 }, { "epoch": 0.3775395936650136, "grad_norm": 16.285924911499023, "learning_rate": 0.0002779376294503745, "loss": 7.1608, "step": 1180 }, { "epoch": 0.3778595424732043, "grad_norm": 16.230222702026367, "learning_rate": 0.00027775482623169977, "loss": 6.7927, "step": 1181 }, { "epoch": 0.378179491281395, "grad_norm": 9.623656272888184, "learning_rate": 0.0002775719511276961, "loss": 6.8221, "step": 1182 }, { "epoch": 0.3784994400895857, "grad_norm": 9.932913780212402, "learning_rate": 0.00027738900433238716, "loss": 6.6573, "step": 1183 }, { "epoch": 0.37881938889777633, "grad_norm": 7.48627233505249, "learning_rate": 0.0002772059860398726, "loss": 6.6814, "step": 1184 }, { "epoch": 0.37913933770596703, "grad_norm": 7.055801868438721, "learning_rate": 0.00027702289644432804, "loss": 6.5999, "step": 1185 }, { "epoch": 0.3794592865141577, "grad_norm": 8.798673629760742, "learning_rate": 0.00027683973574000464, "loss": 6.5275, "step": 1186 }, { "epoch": 0.3797792353223484, "grad_norm": 17.74211883544922, "learning_rate": 0.000276656504121229, "loss": 6.732, "step": 1187 }, { "epoch": 0.3800991841305391, "grad_norm": 11.82168960571289, "learning_rate": 0.000276473201782403, "loss": 6.5822, "step": 1188 }, { "epoch": 0.3804191329387298, "grad_norm": 8.84131908416748, "learning_rate": 0.0002762898289180036, "loss": 6.7404, "step": 1189 }, { "epoch": 0.3807390817469205, "grad_norm": 9.530952453613281, "learning_rate": 0.00027610638572258254, "loss": 6.5298, "step": 1190 }, { "epoch": 0.3810590305551112, "grad_norm": 15.368577003479004, "learning_rate": 0.0002759228723907661, "loss": 6.4386, "step": 1191 }, { "epoch": 0.3813789793633019, "grad_norm": 19.41617202758789, "learning_rate": 0.000275739289117255, "loss": 6.7247, "step": 1192 }, { "epoch": 0.38169892817149254, "grad_norm": 10.758553504943848, "learning_rate": 0.0002755556360968244, "loss": 6.4879, "step": 1193 }, { "epoch": 0.38201887697968323, "grad_norm": 6.776547431945801, "learning_rate": 0.000275371913524323, "loss": 6.4938, "step": 1194 }, { "epoch": 0.3823388257878739, "grad_norm": 9.408045768737793, "learning_rate": 0.0002751881215946738, "loss": 6.5052, "step": 1195 }, { "epoch": 0.3826587745960646, "grad_norm": 9.227543830871582, "learning_rate": 0.00027500426050287293, "loss": 6.4985, "step": 1196 }, { "epoch": 0.3829787234042553, "grad_norm": 114.55646514892578, "learning_rate": 0.0002748203304439903, "loss": 6.5849, "step": 1197 }, { "epoch": 0.383298672212446, "grad_norm": 588.6021728515625, "learning_rate": 0.0002746363316131687, "loss": 6.7832, "step": 1198 }, { "epoch": 0.3836186210206367, "grad_norm": 147.17431640625, "learning_rate": 0.000274452264205624, "loss": 6.6167, "step": 1199 }, { "epoch": 0.3839385698288274, "grad_norm": 28.505613327026367, "learning_rate": 0.00027426812841664487, "loss": 6.6407, "step": 1200 }, { "epoch": 0.3839385698288274, "eval_loss": 3.3596253395080566, "eval_runtime": 233.5546, "eval_samples_per_second": 5.635, "eval_steps_per_second": 1.409, "step": 1200 }, { "epoch": 0.3842585186370181, "grad_norm": 27.103090286254883, "learning_rate": 0.0002740839244415924, "loss": 6.8103, "step": 1201 }, { "epoch": 0.3845784674452088, "grad_norm": 11.38796615600586, "learning_rate": 0.00027389965247590016, "loss": 6.7159, "step": 1202 }, { "epoch": 0.38489841625339943, "grad_norm": 9.166263580322266, "learning_rate": 0.0002737153127150736, "loss": 6.5407, "step": 1203 }, { "epoch": 0.38521836506159013, "grad_norm": 10.899460792541504, "learning_rate": 0.00027353090535469065, "loss": 6.6115, "step": 1204 }, { "epoch": 0.3855383138697808, "grad_norm": 12.029467582702637, "learning_rate": 0.00027334643059040035, "loss": 6.6065, "step": 1205 }, { "epoch": 0.3858582626779715, "grad_norm": 8.318588256835938, "learning_rate": 0.0002731618886179235, "loss": 6.8032, "step": 1206 }, { "epoch": 0.3861782114861622, "grad_norm": 8.223920822143555, "learning_rate": 0.00027297727963305227, "loss": 6.5092, "step": 1207 }, { "epoch": 0.3864981602943529, "grad_norm": 22.681262969970703, "learning_rate": 0.0002727926038316499, "loss": 6.8623, "step": 1208 }, { "epoch": 0.3868181091025436, "grad_norm": 10.376825332641602, "learning_rate": 0.0002726078614096504, "loss": 6.6216, "step": 1209 }, { "epoch": 0.3871380579107343, "grad_norm": 5.749302864074707, "learning_rate": 0.0002724230525630586, "loss": 6.6049, "step": 1210 }, { "epoch": 0.387458006718925, "grad_norm": 8.080632209777832, "learning_rate": 0.00027223817748794985, "loss": 6.6569, "step": 1211 }, { "epoch": 0.38777795552711564, "grad_norm": 12.558454513549805, "learning_rate": 0.00027205323638046947, "loss": 6.3948, "step": 1212 }, { "epoch": 0.38809790433530633, "grad_norm": 9.190141677856445, "learning_rate": 0.0002718682294368331, "loss": 6.6437, "step": 1213 }, { "epoch": 0.388417853143497, "grad_norm": 4.570112228393555, "learning_rate": 0.00027168315685332633, "loss": 6.5194, "step": 1214 }, { "epoch": 0.3887378019516877, "grad_norm": 9.28131103515625, "learning_rate": 0.0002714980188263041, "loss": 6.5532, "step": 1215 }, { "epoch": 0.3890577507598784, "grad_norm": 11.132205963134766, "learning_rate": 0.00027131281555219084, "loss": 6.6146, "step": 1216 }, { "epoch": 0.3893776995680691, "grad_norm": 7.617869853973389, "learning_rate": 0.00027112754722748037, "loss": 6.5418, "step": 1217 }, { "epoch": 0.3896976483762598, "grad_norm": 6.569556713104248, "learning_rate": 0.00027094221404873537, "loss": 6.5742, "step": 1218 }, { "epoch": 0.3900175971844505, "grad_norm": 9.705401420593262, "learning_rate": 0.0002707568162125875, "loss": 6.5249, "step": 1219 }, { "epoch": 0.3903375459926412, "grad_norm": 8.43759822845459, "learning_rate": 0.00027057135391573683, "loss": 6.7944, "step": 1220 }, { "epoch": 0.39065749480083184, "grad_norm": 9.743579864501953, "learning_rate": 0.00027038582735495196, "loss": 6.6869, "step": 1221 }, { "epoch": 0.39097744360902253, "grad_norm": 9.03455638885498, "learning_rate": 0.0002702002367270695, "loss": 6.4196, "step": 1222 }, { "epoch": 0.39129739241721323, "grad_norm": 11.69813060760498, "learning_rate": 0.0002700145822289942, "loss": 6.5824, "step": 1223 }, { "epoch": 0.3916173412254039, "grad_norm": 11.977036476135254, "learning_rate": 0.00026982886405769855, "loss": 6.8119, "step": 1224 }, { "epoch": 0.3919372900335946, "grad_norm": 7.943235874176025, "learning_rate": 0.00026964308241022255, "loss": 6.6036, "step": 1225 }, { "epoch": 0.3922572388417853, "grad_norm": 5.831643581390381, "learning_rate": 0.00026945723748367353, "loss": 6.747, "step": 1226 }, { "epoch": 0.392577187649976, "grad_norm": 13.803399085998535, "learning_rate": 0.00026927132947522604, "loss": 6.6117, "step": 1227 }, { "epoch": 0.3928971364581667, "grad_norm": 8.283905029296875, "learning_rate": 0.0002690853585821214, "loss": 6.6316, "step": 1228 }, { "epoch": 0.3932170852663574, "grad_norm": 8.696572303771973, "learning_rate": 0.00026889932500166785, "loss": 6.6446, "step": 1229 }, { "epoch": 0.3935370340745481, "grad_norm": 4.740365982055664, "learning_rate": 0.00026871322893124, "loss": 6.6858, "step": 1230 }, { "epoch": 0.39385698288273874, "grad_norm": 11.38948917388916, "learning_rate": 0.0002685270705682788, "loss": 6.748, "step": 1231 }, { "epoch": 0.39417693169092943, "grad_norm": 9.747567176818848, "learning_rate": 0.00026834085011029135, "loss": 6.4157, "step": 1232 }, { "epoch": 0.39449688049912013, "grad_norm": 7.497620582580566, "learning_rate": 0.0002681545677548505, "loss": 6.6276, "step": 1233 }, { "epoch": 0.3948168293073108, "grad_norm": 10.829628944396973, "learning_rate": 0.0002679682236995948, "loss": 6.6541, "step": 1234 }, { "epoch": 0.3951367781155015, "grad_norm": 6.678272247314453, "learning_rate": 0.0002677818181422284, "loss": 6.4207, "step": 1235 }, { "epoch": 0.3954567269236922, "grad_norm": 6.341800689697266, "learning_rate": 0.0002675953512805206, "loss": 6.695, "step": 1236 }, { "epoch": 0.3957766757318829, "grad_norm": 7.859994888305664, "learning_rate": 0.0002674088233123056, "loss": 6.3617, "step": 1237 }, { "epoch": 0.3960966245400736, "grad_norm": 10.069356918334961, "learning_rate": 0.0002672222344354828, "loss": 6.7558, "step": 1238 }, { "epoch": 0.3964165733482643, "grad_norm": 14.660593032836914, "learning_rate": 0.0002670355848480158, "loss": 6.6104, "step": 1239 }, { "epoch": 0.39673652215645494, "grad_norm": 5.288352012634277, "learning_rate": 0.000266848874747933, "loss": 6.5524, "step": 1240 }, { "epoch": 0.39705647096464564, "grad_norm": 6.08315372467041, "learning_rate": 0.0002666621043333266, "loss": 6.2126, "step": 1241 }, { "epoch": 0.39737641977283633, "grad_norm": 6.1647162437438965, "learning_rate": 0.00026647527380235314, "loss": 6.6435, "step": 1242 }, { "epoch": 0.397696368581027, "grad_norm": 5.414215087890625, "learning_rate": 0.0002662883833532328, "loss": 6.2672, "step": 1243 }, { "epoch": 0.3980163173892177, "grad_norm": 5.315961837768555, "learning_rate": 0.00026610143318424925, "loss": 6.3607, "step": 1244 }, { "epoch": 0.3983362661974084, "grad_norm": 5.834237098693848, "learning_rate": 0.0002659144234937497, "loss": 6.558, "step": 1245 }, { "epoch": 0.3986562150055991, "grad_norm": 9.148299217224121, "learning_rate": 0.0002657273544801444, "loss": 6.4416, "step": 1246 }, { "epoch": 0.3989761638137898, "grad_norm": 10.394495010375977, "learning_rate": 0.0002655402263419065, "loss": 6.5473, "step": 1247 }, { "epoch": 0.3992961126219805, "grad_norm": 9.015913009643555, "learning_rate": 0.000265353039277572, "loss": 6.4597, "step": 1248 }, { "epoch": 0.3996160614301712, "grad_norm": 10.161762237548828, "learning_rate": 0.00026516579348573934, "loss": 6.5067, "step": 1249 }, { "epoch": 0.39993601023836184, "grad_norm": 6.956129550933838, "learning_rate": 0.00026497848916506926, "loss": 6.4775, "step": 1250 }, { "epoch": 0.40025595904655253, "grad_norm": 5.3303937911987305, "learning_rate": 0.0002647911265142846, "loss": 6.4659, "step": 1251 }, { "epoch": 0.40057590785474323, "grad_norm": 5.570010662078857, "learning_rate": 0.00026460370573217016, "loss": 6.4517, "step": 1252 }, { "epoch": 0.4008958566629339, "grad_norm": 6.759359836578369, "learning_rate": 0.0002644162270175723, "loss": 6.4963, "step": 1253 }, { "epoch": 0.4012158054711246, "grad_norm": 14.08566665649414, "learning_rate": 0.0002642286905693989, "loss": 6.6086, "step": 1254 }, { "epoch": 0.4015357542793153, "grad_norm": 6.671782493591309, "learning_rate": 0.0002640410965866192, "loss": 6.3949, "step": 1255 }, { "epoch": 0.401855703087506, "grad_norm": 9.904322624206543, "learning_rate": 0.0002638534452682632, "loss": 6.5513, "step": 1256 }, { "epoch": 0.4021756518956967, "grad_norm": 11.320886611938477, "learning_rate": 0.00026366573681342213, "loss": 6.4079, "step": 1257 }, { "epoch": 0.4024956007038874, "grad_norm": 5.8289666175842285, "learning_rate": 0.00026347797142124745, "loss": 6.3216, "step": 1258 }, { "epoch": 0.40281554951207804, "grad_norm": 13.474091529846191, "learning_rate": 0.0002632901492909513, "loss": 6.4256, "step": 1259 }, { "epoch": 0.40313549832026874, "grad_norm": 16.653573989868164, "learning_rate": 0.0002631022706218058, "loss": 6.7427, "step": 1260 }, { "epoch": 0.40345544712845943, "grad_norm": 9.67142391204834, "learning_rate": 0.00026291433561314323, "loss": 6.5105, "step": 1261 }, { "epoch": 0.4037753959366501, "grad_norm": 7.529284477233887, "learning_rate": 0.0002627263444643557, "loss": 6.4653, "step": 1262 }, { "epoch": 0.4040953447448408, "grad_norm": 9.487723350524902, "learning_rate": 0.00026253829737489455, "loss": 6.2462, "step": 1263 }, { "epoch": 0.4044152935530315, "grad_norm": 8.2636079788208, "learning_rate": 0.0002623501945442708, "loss": 6.3761, "step": 1264 }, { "epoch": 0.4047352423612222, "grad_norm": 5.9740471839904785, "learning_rate": 0.00026216203617205453, "loss": 6.3738, "step": 1265 }, { "epoch": 0.4050551911694129, "grad_norm": 7.923804759979248, "learning_rate": 0.0002619738224578746, "loss": 6.1306, "step": 1266 }, { "epoch": 0.4053751399776036, "grad_norm": 6.172807216644287, "learning_rate": 0.0002617855536014188, "loss": 6.4058, "step": 1267 }, { "epoch": 0.4056950887857943, "grad_norm": 10.529424667358398, "learning_rate": 0.0002615972298024334, "loss": 6.286, "step": 1268 }, { "epoch": 0.40601503759398494, "grad_norm": 7.2672953605651855, "learning_rate": 0.0002614088512607227, "loss": 6.3429, "step": 1269 }, { "epoch": 0.40633498640217564, "grad_norm": 12.593116760253906, "learning_rate": 0.0002612204181761493, "loss": 6.5201, "step": 1270 }, { "epoch": 0.40665493521036633, "grad_norm": 12.00728988647461, "learning_rate": 0.00026103193074863377, "loss": 6.3686, "step": 1271 }, { "epoch": 0.406974884018557, "grad_norm": 6.9003777503967285, "learning_rate": 0.0002608433891781541, "loss": 6.3091, "step": 1272 }, { "epoch": 0.4072948328267477, "grad_norm": 11.43606948852539, "learning_rate": 0.0002606547936647458, "loss": 6.4177, "step": 1273 }, { "epoch": 0.4076147816349384, "grad_norm": 8.18825912475586, "learning_rate": 0.0002604661444085017, "loss": 6.379, "step": 1274 }, { "epoch": 0.4079347304431291, "grad_norm": 9.621562957763672, "learning_rate": 0.0002602774416095715, "loss": 6.4082, "step": 1275 }, { "epoch": 0.4082546792513198, "grad_norm": 10.449783325195312, "learning_rate": 0.000260088685468162, "loss": 6.2317, "step": 1276 }, { "epoch": 0.4085746280595105, "grad_norm": 14.534072875976562, "learning_rate": 0.0002598998761845361, "loss": 6.5101, "step": 1277 }, { "epoch": 0.40889457686770114, "grad_norm": 19.77684211730957, "learning_rate": 0.0002597110139590135, "loss": 6.4038, "step": 1278 }, { "epoch": 0.40921452567589184, "grad_norm": 9.12231159210205, "learning_rate": 0.00025952209899197, "loss": 6.2977, "step": 1279 }, { "epoch": 0.40953447448408253, "grad_norm": 9.007134437561035, "learning_rate": 0.0002593331314838372, "loss": 6.6574, "step": 1280 }, { "epoch": 0.40985442329227323, "grad_norm": 12.240474700927734, "learning_rate": 0.0002591441116351025, "loss": 6.6026, "step": 1281 }, { "epoch": 0.4101743721004639, "grad_norm": 18.05267333984375, "learning_rate": 0.000258955039646309, "loss": 6.2505, "step": 1282 }, { "epoch": 0.4104943209086546, "grad_norm": 12.290742874145508, "learning_rate": 0.000258765915718055, "loss": 6.5617, "step": 1283 }, { "epoch": 0.4108142697168453, "grad_norm": 15.071090698242188, "learning_rate": 0.0002585767400509937, "loss": 6.5613, "step": 1284 }, { "epoch": 0.411134218525036, "grad_norm": 8.901190757751465, "learning_rate": 0.00025838751284583346, "loss": 6.3584, "step": 1285 }, { "epoch": 0.4114541673332267, "grad_norm": 11.258878707885742, "learning_rate": 0.0002581982343033374, "loss": 6.5876, "step": 1286 }, { "epoch": 0.41177411614141735, "grad_norm": 9.922440528869629, "learning_rate": 0.00025800890462432277, "loss": 6.2798, "step": 1287 }, { "epoch": 0.41209406494960804, "grad_norm": 7.8548150062561035, "learning_rate": 0.0002578195240096614, "loss": 6.4563, "step": 1288 }, { "epoch": 0.41241401375779874, "grad_norm": 9.597755432128906, "learning_rate": 0.0002576300926602788, "loss": 6.2798, "step": 1289 }, { "epoch": 0.41273396256598943, "grad_norm": 5.551302433013916, "learning_rate": 0.0002574406107771548, "loss": 6.3571, "step": 1290 }, { "epoch": 0.4130539113741801, "grad_norm": 9.343033790588379, "learning_rate": 0.0002572510785613225, "loss": 6.375, "step": 1291 }, { "epoch": 0.4133738601823708, "grad_norm": 6.203455924987793, "learning_rate": 0.0002570614962138682, "loss": 6.032, "step": 1292 }, { "epoch": 0.4136938089905615, "grad_norm": 7.929701328277588, "learning_rate": 0.00025687186393593206, "loss": 6.3534, "step": 1293 }, { "epoch": 0.4140137577987522, "grad_norm": 12.085379600524902, "learning_rate": 0.0002566821819287065, "loss": 6.4062, "step": 1294 }, { "epoch": 0.4143337066069429, "grad_norm": 8.04161262512207, "learning_rate": 0.0002564924503934372, "loss": 6.4253, "step": 1295 }, { "epoch": 0.4146536554151336, "grad_norm": 11.36021614074707, "learning_rate": 0.00025630266953142214, "loss": 6.1811, "step": 1296 }, { "epoch": 0.41497360422332424, "grad_norm": 12.349037170410156, "learning_rate": 0.00025611283954401175, "loss": 6.4346, "step": 1297 }, { "epoch": 0.41529355303151494, "grad_norm": 11.792349815368652, "learning_rate": 0.00025592296063260835, "loss": 6.4392, "step": 1298 }, { "epoch": 0.41561350183970563, "grad_norm": 9.208128929138184, "learning_rate": 0.00025573303299866653, "loss": 6.3419, "step": 1299 }, { "epoch": 0.41593345064789633, "grad_norm": 14.85993766784668, "learning_rate": 0.0002555430568436923, "loss": 6.3156, "step": 1300 }, { "epoch": 0.416253399456087, "grad_norm": 14.815731048583984, "learning_rate": 0.0002553530323692432, "loss": 6.3872, "step": 1301 }, { "epoch": 0.4165733482642777, "grad_norm": 9.829910278320312, "learning_rate": 0.0002551629597769282, "loss": 6.4641, "step": 1302 }, { "epoch": 0.4168932970724684, "grad_norm": 16.248035430908203, "learning_rate": 0.000254972839268407, "loss": 6.2669, "step": 1303 }, { "epoch": 0.4172132458806591, "grad_norm": 11.95917797088623, "learning_rate": 0.00025478267104539053, "loss": 6.4028, "step": 1304 }, { "epoch": 0.4175331946888498, "grad_norm": 10.625663757324219, "learning_rate": 0.00025459245530964, "loss": 6.4577, "step": 1305 }, { "epoch": 0.41785314349704045, "grad_norm": 6.940323352813721, "learning_rate": 0.00025440219226296725, "loss": 6.3556, "step": 1306 }, { "epoch": 0.41817309230523114, "grad_norm": 10.21389102935791, "learning_rate": 0.0002542118821072342, "loss": 6.4131, "step": 1307 }, { "epoch": 0.41849304111342184, "grad_norm": 13.550383567810059, "learning_rate": 0.0002540215250443528, "loss": 6.4616, "step": 1308 }, { "epoch": 0.41881298992161253, "grad_norm": 9.647680282592773, "learning_rate": 0.0002538311212762847, "loss": 6.0977, "step": 1309 }, { "epoch": 0.41913293872980323, "grad_norm": 10.69679069519043, "learning_rate": 0.0002536406710050412, "loss": 6.1197, "step": 1310 }, { "epoch": 0.4194528875379939, "grad_norm": 15.03259563446045, "learning_rate": 0.0002534501744326829, "loss": 6.497, "step": 1311 }, { "epoch": 0.4197728363461846, "grad_norm": 14.628292083740234, "learning_rate": 0.00025325963176131946, "loss": 6.3991, "step": 1312 }, { "epoch": 0.4200927851543753, "grad_norm": 13.270496368408203, "learning_rate": 0.0002530690431931096, "loss": 6.7255, "step": 1313 }, { "epoch": 0.420412733962566, "grad_norm": 7.5467143058776855, "learning_rate": 0.00025287840893026064, "loss": 6.0528, "step": 1314 }, { "epoch": 0.4207326827707567, "grad_norm": 9.551958084106445, "learning_rate": 0.0002526877291750283, "loss": 6.1203, "step": 1315 }, { "epoch": 0.42105263157894735, "grad_norm": 9.613594055175781, "learning_rate": 0.0002524970041297166, "loss": 6.3185, "step": 1316 }, { "epoch": 0.42137258038713804, "grad_norm": 16.20762825012207, "learning_rate": 0.00025230623399667777, "loss": 6.2841, "step": 1317 }, { "epoch": 0.42169252919532874, "grad_norm": 19.544544219970703, "learning_rate": 0.0002521154189783118, "loss": 6.2749, "step": 1318 }, { "epoch": 0.42201247800351943, "grad_norm": 30.432388305664062, "learning_rate": 0.00025192455927706617, "loss": 6.417, "step": 1319 }, { "epoch": 0.4223324268117101, "grad_norm": 12.61909294128418, "learning_rate": 0.0002517336550954359, "loss": 6.5085, "step": 1320 }, { "epoch": 0.4226523756199008, "grad_norm": 12.66275691986084, "learning_rate": 0.0002515427066359632, "loss": 6.239, "step": 1321 }, { "epoch": 0.4229723244280915, "grad_norm": 32.47134780883789, "learning_rate": 0.0002513517141012371, "loss": 6.5225, "step": 1322 }, { "epoch": 0.4232922732362822, "grad_norm": 15.030749320983887, "learning_rate": 0.0002511606776938936, "loss": 6.2803, "step": 1323 }, { "epoch": 0.4236122220444729, "grad_norm": 10.074403762817383, "learning_rate": 0.00025096959761661524, "loss": 6.3504, "step": 1324 }, { "epoch": 0.42393217085266355, "grad_norm": 13.217743873596191, "learning_rate": 0.0002507784740721306, "loss": 6.2698, "step": 1325 }, { "epoch": 0.42425211966085424, "grad_norm": 16.90913200378418, "learning_rate": 0.0002505873072632148, "loss": 6.2857, "step": 1326 }, { "epoch": 0.42457206846904494, "grad_norm": 17.0783634185791, "learning_rate": 0.0002503960973926886, "loss": 6.2195, "step": 1327 }, { "epoch": 0.42489201727723563, "grad_norm": 7.002859115600586, "learning_rate": 0.00025020484466341844, "loss": 6.2902, "step": 1328 }, { "epoch": 0.42521196608542633, "grad_norm": 14.14289379119873, "learning_rate": 0.0002500135492783163, "loss": 6.3848, "step": 1329 }, { "epoch": 0.425531914893617, "grad_norm": 9.69235897064209, "learning_rate": 0.0002498222114403395, "loss": 6.1554, "step": 1330 }, { "epoch": 0.4258518637018077, "grad_norm": 30.017566680908203, "learning_rate": 0.0002496308313524902, "loss": 6.3937, "step": 1331 }, { "epoch": 0.4261718125099984, "grad_norm": 13.781780242919922, "learning_rate": 0.00024943940921781557, "loss": 6.1807, "step": 1332 }, { "epoch": 0.4264917613181891, "grad_norm": 10.11452579498291, "learning_rate": 0.0002492479452394072, "loss": 6.385, "step": 1333 }, { "epoch": 0.4268117101263798, "grad_norm": 18.071516036987305, "learning_rate": 0.00024905643962040133, "loss": 6.35, "step": 1334 }, { "epoch": 0.42713165893457045, "grad_norm": 12.902596473693848, "learning_rate": 0.00024886489256397825, "loss": 6.4579, "step": 1335 }, { "epoch": 0.42745160774276114, "grad_norm": 9.532163619995117, "learning_rate": 0.000248673304273362, "loss": 6.2096, "step": 1336 }, { "epoch": 0.42777155655095184, "grad_norm": 20.802745819091797, "learning_rate": 0.0002484816749518207, "loss": 6.4637, "step": 1337 }, { "epoch": 0.42809150535914253, "grad_norm": 11.478161811828613, "learning_rate": 0.00024829000480266594, "loss": 6.3374, "step": 1338 }, { "epoch": 0.4284114541673332, "grad_norm": 8.678462028503418, "learning_rate": 0.0002480982940292524, "loss": 6.2866, "step": 1339 }, { "epoch": 0.4287314029755239, "grad_norm": 9.180252075195312, "learning_rate": 0.0002479065428349782, "loss": 6.3384, "step": 1340 }, { "epoch": 0.4290513517837146, "grad_norm": 11.826372146606445, "learning_rate": 0.00024771475142328406, "loss": 6.3178, "step": 1341 }, { "epoch": 0.4293713005919053, "grad_norm": 8.149714469909668, "learning_rate": 0.00024752291999765344, "loss": 6.4101, "step": 1342 }, { "epoch": 0.429691249400096, "grad_norm": 8.667763710021973, "learning_rate": 0.0002473310487616123, "loss": 6.2569, "step": 1343 }, { "epoch": 0.43001119820828665, "grad_norm": 8.507345199584961, "learning_rate": 0.00024713913791872896, "loss": 6.0874, "step": 1344 }, { "epoch": 0.43033114701647734, "grad_norm": 12.065765380859375, "learning_rate": 0.00024694718767261336, "loss": 6.2609, "step": 1345 }, { "epoch": 0.43065109582466804, "grad_norm": 7.034212112426758, "learning_rate": 0.00024675519822691777, "loss": 6.3743, "step": 1346 }, { "epoch": 0.43097104463285874, "grad_norm": 8.030159950256348, "learning_rate": 0.0002465631697853357, "loss": 6.1686, "step": 1347 }, { "epoch": 0.43129099344104943, "grad_norm": 36.66036605834961, "learning_rate": 0.00024637110255160203, "loss": 6.2742, "step": 1348 }, { "epoch": 0.4316109422492401, "grad_norm": 12.500212669372559, "learning_rate": 0.00024617899672949305, "loss": 6.1638, "step": 1349 }, { "epoch": 0.4319308910574308, "grad_norm": 20.530799865722656, "learning_rate": 0.0002459868525228257, "loss": 6.5203, "step": 1350 }, { "epoch": 0.4322508398656215, "grad_norm": 6.235330104827881, "learning_rate": 0.0002457946701354578, "loss": 6.2691, "step": 1351 }, { "epoch": 0.4325707886738122, "grad_norm": 10.153731346130371, "learning_rate": 0.00024560244977128774, "loss": 6.3439, "step": 1352 }, { "epoch": 0.43289073748200285, "grad_norm": 9.818161964416504, "learning_rate": 0.000245410191634254, "loss": 6.1459, "step": 1353 }, { "epoch": 0.43321068629019355, "grad_norm": 16.71061134338379, "learning_rate": 0.0002452178959283353, "loss": 6.4604, "step": 1354 }, { "epoch": 0.43353063509838424, "grad_norm": 9.193108558654785, "learning_rate": 0.00024502556285755023, "loss": 6.3588, "step": 1355 }, { "epoch": 0.43385058390657494, "grad_norm": 9.433093070983887, "learning_rate": 0.00024483319262595687, "loss": 6.4425, "step": 1356 }, { "epoch": 0.43417053271476563, "grad_norm": 8.100132942199707, "learning_rate": 0.0002446407854376529, "loss": 6.434, "step": 1357 }, { "epoch": 0.43449048152295633, "grad_norm": 8.084160804748535, "learning_rate": 0.00024444834149677506, "loss": 6.4744, "step": 1358 }, { "epoch": 0.434810430331147, "grad_norm": 7.267811298370361, "learning_rate": 0.00024425586100749916, "loss": 6.0862, "step": 1359 }, { "epoch": 0.4351303791393377, "grad_norm": 6.129974365234375, "learning_rate": 0.0002440633441740398, "loss": 6.3388, "step": 1360 }, { "epoch": 0.4354503279475284, "grad_norm": 10.397089004516602, "learning_rate": 0.00024387079120065014, "loss": 6.349, "step": 1361 }, { "epoch": 0.4357702767557191, "grad_norm": 10.434456825256348, "learning_rate": 0.00024367820229162157, "loss": 6.1166, "step": 1362 }, { "epoch": 0.43609022556390975, "grad_norm": 8.7677001953125, "learning_rate": 0.00024348557765128384, "loss": 6.2898, "step": 1363 }, { "epoch": 0.43641017437210045, "grad_norm": 11.238868713378906, "learning_rate": 0.0002432929174840044, "loss": 6.1991, "step": 1364 }, { "epoch": 0.43673012318029114, "grad_norm": 7.583499431610107, "learning_rate": 0.00024310022199418833, "loss": 6.0778, "step": 1365 }, { "epoch": 0.43705007198848184, "grad_norm": 11.78762149810791, "learning_rate": 0.0002429074913862786, "loss": 6.3973, "step": 1366 }, { "epoch": 0.43737002079667253, "grad_norm": 7.4696431159973145, "learning_rate": 0.0002427147258647549, "loss": 6.0472, "step": 1367 }, { "epoch": 0.4376899696048632, "grad_norm": 11.547538757324219, "learning_rate": 0.00024252192563413435, "loss": 6.4374, "step": 1368 }, { "epoch": 0.4380099184130539, "grad_norm": 11.28888988494873, "learning_rate": 0.00024232909089897065, "loss": 6.3143, "step": 1369 }, { "epoch": 0.4383298672212446, "grad_norm": 12.933130264282227, "learning_rate": 0.00024213622186385436, "loss": 6.26, "step": 1370 }, { "epoch": 0.4386498160294353, "grad_norm": 10.536492347717285, "learning_rate": 0.00024194331873341222, "loss": 6.2753, "step": 1371 }, { "epoch": 0.43896976483762595, "grad_norm": 18.023189544677734, "learning_rate": 0.00024175038171230718, "loss": 6.4572, "step": 1372 }, { "epoch": 0.43928971364581665, "grad_norm": 15.396516799926758, "learning_rate": 0.00024155741100523824, "loss": 6.0828, "step": 1373 }, { "epoch": 0.43960966245400734, "grad_norm": 12.426717758178711, "learning_rate": 0.00024136440681694007, "loss": 6.3676, "step": 1374 }, { "epoch": 0.43992961126219804, "grad_norm": 10.206119537353516, "learning_rate": 0.00024117136935218283, "loss": 6.1777, "step": 1375 }, { "epoch": 0.44024956007038873, "grad_norm": 17.221784591674805, "learning_rate": 0.00024097829881577205, "loss": 6.2916, "step": 1376 }, { "epoch": 0.44056950887857943, "grad_norm": 72.50157165527344, "learning_rate": 0.0002407851954125484, "loss": 6.234, "step": 1377 }, { "epoch": 0.4408894576867701, "grad_norm": 7.173483848571777, "learning_rate": 0.0002405920593473872, "loss": 6.2403, "step": 1378 }, { "epoch": 0.4412094064949608, "grad_norm": 7.698493480682373, "learning_rate": 0.0002403988908251988, "loss": 6.2424, "step": 1379 }, { "epoch": 0.4415293553031515, "grad_norm": 11.574590682983398, "learning_rate": 0.00024020569005092749, "loss": 6.2996, "step": 1380 }, { "epoch": 0.4418493041113422, "grad_norm": 10.622515678405762, "learning_rate": 0.00024001245722955216, "loss": 6.158, "step": 1381 }, { "epoch": 0.44216925291953285, "grad_norm": 11.94666576385498, "learning_rate": 0.00023981919256608564, "loss": 6.165, "step": 1382 }, { "epoch": 0.44248920172772355, "grad_norm": 10.147541046142578, "learning_rate": 0.00023962589626557446, "loss": 6.2964, "step": 1383 }, { "epoch": 0.44280915053591424, "grad_norm": 9.275951385498047, "learning_rate": 0.00023943256853309862, "loss": 6.3666, "step": 1384 }, { "epoch": 0.44312909934410494, "grad_norm": 12.353401184082031, "learning_rate": 0.0002392392095737718, "loss": 6.2231, "step": 1385 }, { "epoch": 0.44344904815229563, "grad_norm": 145.6978759765625, "learning_rate": 0.0002390458195927404, "loss": 6.0643, "step": 1386 }, { "epoch": 0.44376899696048633, "grad_norm": 9.182119369506836, "learning_rate": 0.00023885239879518406, "loss": 6.2337, "step": 1387 }, { "epoch": 0.444088945768677, "grad_norm": 10.039813995361328, "learning_rate": 0.000238658947386315, "loss": 6.3979, "step": 1388 }, { "epoch": 0.4444088945768677, "grad_norm": 8.057585716247559, "learning_rate": 0.00023846546557137782, "loss": 6.1908, "step": 1389 }, { "epoch": 0.4447288433850584, "grad_norm": 8.275229454040527, "learning_rate": 0.00023827195355564958, "loss": 6.4919, "step": 1390 }, { "epoch": 0.44504879219324905, "grad_norm": 14.329585075378418, "learning_rate": 0.00023807841154443912, "loss": 6.3344, "step": 1391 }, { "epoch": 0.44536874100143975, "grad_norm": 17.71234893798828, "learning_rate": 0.00023788483974308738, "loss": 6.1686, "step": 1392 }, { "epoch": 0.44568868980963045, "grad_norm": 17.175947189331055, "learning_rate": 0.00023769123835696676, "loss": 6.2478, "step": 1393 }, { "epoch": 0.44600863861782114, "grad_norm": 12.383498191833496, "learning_rate": 0.00023749760759148104, "loss": 6.2889, "step": 1394 }, { "epoch": 0.44632858742601184, "grad_norm": 26.6887149810791, "learning_rate": 0.0002373039476520651, "loss": 6.2837, "step": 1395 }, { "epoch": 0.44664853623420253, "grad_norm": 35.60121536254883, "learning_rate": 0.00023711025874418508, "loss": 6.2465, "step": 1396 }, { "epoch": 0.4469684850423932, "grad_norm": 26.76144790649414, "learning_rate": 0.00023691654107333755, "loss": 6.4341, "step": 1397 }, { "epoch": 0.4472884338505839, "grad_norm": 37.42619323730469, "learning_rate": 0.0002367227948450496, "loss": 6.4073, "step": 1398 }, { "epoch": 0.4476083826587746, "grad_norm": 24.048145294189453, "learning_rate": 0.00023652902026487883, "loss": 6.1317, "step": 1399 }, { "epoch": 0.4479283314669653, "grad_norm": 28.50050163269043, "learning_rate": 0.0002363352175384128, "loss": 6.2892, "step": 1400 }, { "epoch": 0.4479283314669653, "eval_loss": 3.1516573429107666, "eval_runtime": 233.5122, "eval_samples_per_second": 5.636, "eval_steps_per_second": 1.409, "step": 1400 }, { "epoch": 0.44824828027515595, "grad_norm": 12.275089263916016, "learning_rate": 0.00023614138687126887, "loss": 6.384, "step": 1401 }, { "epoch": 0.44856822908334665, "grad_norm": 14.296133995056152, "learning_rate": 0.00023594752846909414, "loss": 6.2824, "step": 1402 }, { "epoch": 0.44888817789153734, "grad_norm": 23.578289031982422, "learning_rate": 0.0002357536425375651, "loss": 6.1043, "step": 1403 }, { "epoch": 0.44920812669972804, "grad_norm": 17.572853088378906, "learning_rate": 0.00023555972928238737, "loss": 6.3525, "step": 1404 }, { "epoch": 0.44952807550791873, "grad_norm": 13.461702346801758, "learning_rate": 0.00023536578890929582, "loss": 6.3522, "step": 1405 }, { "epoch": 0.44984802431610943, "grad_norm": 14.324708938598633, "learning_rate": 0.00023517182162405368, "loss": 6.2278, "step": 1406 }, { "epoch": 0.4501679731243001, "grad_norm": 17.58342933654785, "learning_rate": 0.0002349778276324532, "loss": 6.4268, "step": 1407 }, { "epoch": 0.4504879219324908, "grad_norm": 13.684356689453125, "learning_rate": 0.0002347838071403146, "loss": 6.353, "step": 1408 }, { "epoch": 0.4508078707406815, "grad_norm": 13.513612747192383, "learning_rate": 0.0002345897603534862, "loss": 6.361, "step": 1409 }, { "epoch": 0.45112781954887216, "grad_norm": 12.80638313293457, "learning_rate": 0.0002343956874778447, "loss": 6.3029, "step": 1410 }, { "epoch": 0.45144776835706285, "grad_norm": 16.215721130371094, "learning_rate": 0.00023420158871929393, "loss": 6.3764, "step": 1411 }, { "epoch": 0.45176771716525355, "grad_norm": 25.084028244018555, "learning_rate": 0.00023400746428376538, "loss": 6.2, "step": 1412 }, { "epoch": 0.45208766597344424, "grad_norm": 12.46812915802002, "learning_rate": 0.00023381331437721784, "loss": 6.2141, "step": 1413 }, { "epoch": 0.45240761478163494, "grad_norm": 18.606529235839844, "learning_rate": 0.00023361913920563705, "loss": 6.2929, "step": 1414 }, { "epoch": 0.45272756358982563, "grad_norm": 12.622028350830078, "learning_rate": 0.00023342493897503567, "loss": 6.2309, "step": 1415 }, { "epoch": 0.4530475123980163, "grad_norm": 30.32240867614746, "learning_rate": 0.00023323071389145278, "loss": 6.3812, "step": 1416 }, { "epoch": 0.453367461206207, "grad_norm": 35.68700408935547, "learning_rate": 0.00023303646416095384, "loss": 6.3141, "step": 1417 }, { "epoch": 0.4536874100143977, "grad_norm": 25.01828956604004, "learning_rate": 0.0002328421899896307, "loss": 6.363, "step": 1418 }, { "epoch": 0.45400735882258836, "grad_norm": 16.23634910583496, "learning_rate": 0.00023264789158360086, "loss": 6.3071, "step": 1419 }, { "epoch": 0.45432730763077905, "grad_norm": 15.429924011230469, "learning_rate": 0.0002324535691490076, "loss": 6.3, "step": 1420 }, { "epoch": 0.45464725643896975, "grad_norm": 30.830224990844727, "learning_rate": 0.00023225922289201988, "loss": 6.2047, "step": 1421 }, { "epoch": 0.45496720524716044, "grad_norm": 14.196430206298828, "learning_rate": 0.00023206485301883163, "loss": 6.3659, "step": 1422 }, { "epoch": 0.45528715405535114, "grad_norm": 16.008033752441406, "learning_rate": 0.0002318704597356621, "loss": 6.3481, "step": 1423 }, { "epoch": 0.45560710286354184, "grad_norm": 11.479830741882324, "learning_rate": 0.00023167604324875516, "loss": 6.3219, "step": 1424 }, { "epoch": 0.45592705167173253, "grad_norm": 17.422664642333984, "learning_rate": 0.00023148160376437957, "loss": 6.3007, "step": 1425 }, { "epoch": 0.4562470004799232, "grad_norm": 10.73954963684082, "learning_rate": 0.00023128714148882825, "loss": 6.3275, "step": 1426 }, { "epoch": 0.4565669492881139, "grad_norm": 9.496156692504883, "learning_rate": 0.0002310926566284183, "loss": 6.179, "step": 1427 }, { "epoch": 0.4568868980963046, "grad_norm": 13.453152656555176, "learning_rate": 0.00023089814938949098, "loss": 6.0879, "step": 1428 }, { "epoch": 0.45720684690449526, "grad_norm": 10.89134693145752, "learning_rate": 0.00023070361997841107, "loss": 6.3478, "step": 1429 }, { "epoch": 0.45752679571268595, "grad_norm": 10.230031967163086, "learning_rate": 0.00023050906860156708, "loss": 6.0896, "step": 1430 }, { "epoch": 0.45784674452087665, "grad_norm": 16.458038330078125, "learning_rate": 0.00023031449546537065, "loss": 6.5301, "step": 1431 }, { "epoch": 0.45816669332906734, "grad_norm": 13.3131103515625, "learning_rate": 0.00023011990077625663, "loss": 6.2881, "step": 1432 }, { "epoch": 0.45848664213725804, "grad_norm": 9.168004989624023, "learning_rate": 0.00022992528474068266, "loss": 6.3154, "step": 1433 }, { "epoch": 0.45880659094544873, "grad_norm": 35.18294143676758, "learning_rate": 0.00022973064756512903, "loss": 6.2735, "step": 1434 }, { "epoch": 0.45912653975363943, "grad_norm": 11.018784523010254, "learning_rate": 0.0002295359894560985, "loss": 6.4403, "step": 1435 }, { "epoch": 0.4594464885618301, "grad_norm": 8.920594215393066, "learning_rate": 0.00022934131062011607, "loss": 6.0729, "step": 1436 }, { "epoch": 0.4597664373700208, "grad_norm": 13.392729759216309, "learning_rate": 0.00022914661126372855, "loss": 6.2305, "step": 1437 }, { "epoch": 0.46008638617821146, "grad_norm": 12.815661430358887, "learning_rate": 0.00022895189159350486, "loss": 6.2008, "step": 1438 }, { "epoch": 0.46040633498640215, "grad_norm": 16.973766326904297, "learning_rate": 0.00022875715181603506, "loss": 6.1381, "step": 1439 }, { "epoch": 0.46072628379459285, "grad_norm": 13.373908996582031, "learning_rate": 0.00022856239213793088, "loss": 6.3298, "step": 1440 }, { "epoch": 0.46104623260278355, "grad_norm": 8.596816062927246, "learning_rate": 0.00022836761276582497, "loss": 6.3952, "step": 1441 }, { "epoch": 0.46136618141097424, "grad_norm": 57.45488739013672, "learning_rate": 0.00022817281390637089, "loss": 6.3979, "step": 1442 }, { "epoch": 0.46168613021916494, "grad_norm": 22.90778160095215, "learning_rate": 0.00022797799576624304, "loss": 6.2665, "step": 1443 }, { "epoch": 0.46200607902735563, "grad_norm": 17.244796752929688, "learning_rate": 0.0002277831585521361, "loss": 6.3962, "step": 1444 }, { "epoch": 0.4623260278355463, "grad_norm": 10.941102027893066, "learning_rate": 0.00022758830247076505, "loss": 6.1673, "step": 1445 }, { "epoch": 0.462645976643737, "grad_norm": 15.169724464416504, "learning_rate": 0.00022739342772886488, "loss": 6.5136, "step": 1446 }, { "epoch": 0.4629659254519277, "grad_norm": 9.183232307434082, "learning_rate": 0.0002271985345331904, "loss": 6.1549, "step": 1447 }, { "epoch": 0.46328587426011836, "grad_norm": 10.990023612976074, "learning_rate": 0.00022700362309051593, "loss": 6.2485, "step": 1448 }, { "epoch": 0.46360582306830905, "grad_norm": 10.13391399383545, "learning_rate": 0.00022680869360763528, "loss": 6.3059, "step": 1449 }, { "epoch": 0.46392577187649975, "grad_norm": 18.721927642822266, "learning_rate": 0.00022661374629136125, "loss": 6.3244, "step": 1450 }, { "epoch": 0.46424572068469044, "grad_norm": 7.544312000274658, "learning_rate": 0.00022641878134852558, "loss": 6.2333, "step": 1451 }, { "epoch": 0.46456566949288114, "grad_norm": 11.716202735900879, "learning_rate": 0.00022622379898597897, "loss": 6.0736, "step": 1452 }, { "epoch": 0.46488561830107183, "grad_norm": 8.775861740112305, "learning_rate": 0.00022602879941059013, "loss": 6.2285, "step": 1453 }, { "epoch": 0.46520556710926253, "grad_norm": 8.8626708984375, "learning_rate": 0.0002258337828292464, "loss": 6.3011, "step": 1454 }, { "epoch": 0.4655255159174532, "grad_norm": 15.481707572937012, "learning_rate": 0.00022563874944885317, "loss": 6.1798, "step": 1455 }, { "epoch": 0.4658454647256439, "grad_norm": 12.100132942199707, "learning_rate": 0.0002254436994763334, "loss": 6.1638, "step": 1456 }, { "epoch": 0.46616541353383456, "grad_norm": 10.16502857208252, "learning_rate": 0.00022524863311862783, "loss": 6.4787, "step": 1457 }, { "epoch": 0.46648536234202526, "grad_norm": 10.59926986694336, "learning_rate": 0.00022505355058269455, "loss": 6.2177, "step": 1458 }, { "epoch": 0.46680531115021595, "grad_norm": 10.0380220413208, "learning_rate": 0.00022485845207550882, "loss": 6.2375, "step": 1459 }, { "epoch": 0.46712525995840665, "grad_norm": 10.464765548706055, "learning_rate": 0.00022466333780406283, "loss": 6.1937, "step": 1460 }, { "epoch": 0.46744520876659734, "grad_norm": 10.147469520568848, "learning_rate": 0.00022446820797536555, "loss": 6.36, "step": 1461 }, { "epoch": 0.46776515757478804, "grad_norm": 10.764010429382324, "learning_rate": 0.00022427306279644232, "loss": 6.3815, "step": 1462 }, { "epoch": 0.46808510638297873, "grad_norm": 71.32337188720703, "learning_rate": 0.00022407790247433492, "loss": 6.2424, "step": 1463 }, { "epoch": 0.46840505519116943, "grad_norm": 11.261139869689941, "learning_rate": 0.0002238827272161011, "loss": 6.441, "step": 1464 }, { "epoch": 0.4687250039993601, "grad_norm": 10.223528861999512, "learning_rate": 0.00022368753722881444, "loss": 6.3204, "step": 1465 }, { "epoch": 0.4690449528075508, "grad_norm": 9.939703941345215, "learning_rate": 0.00022349233271956438, "loss": 6.2861, "step": 1466 }, { "epoch": 0.46936490161574146, "grad_norm": 11.785332679748535, "learning_rate": 0.00022329711389545528, "loss": 6.128, "step": 1467 }, { "epoch": 0.46968485042393215, "grad_norm": 6.066036224365234, "learning_rate": 0.00022310188096360726, "loss": 6.2245, "step": 1468 }, { "epoch": 0.47000479923212285, "grad_norm": 12.680907249450684, "learning_rate": 0.00022290663413115507, "loss": 6.3803, "step": 1469 }, { "epoch": 0.47032474804031354, "grad_norm": 9.246479034423828, "learning_rate": 0.00022271137360524822, "loss": 6.426, "step": 1470 }, { "epoch": 0.47064469684850424, "grad_norm": 12.644569396972656, "learning_rate": 0.00022251609959305082, "loss": 6.344, "step": 1471 }, { "epoch": 0.47096464565669494, "grad_norm": 10.939471244812012, "learning_rate": 0.00022232081230174125, "loss": 6.0337, "step": 1472 }, { "epoch": 0.47128459446488563, "grad_norm": 7.102919101715088, "learning_rate": 0.00022212551193851203, "loss": 6.3417, "step": 1473 }, { "epoch": 0.4716045432730763, "grad_norm": 13.819934844970703, "learning_rate": 0.00022193019871056958, "loss": 6.1443, "step": 1474 }, { "epoch": 0.471924492081267, "grad_norm": 7.774622917175293, "learning_rate": 0.0002217348728251338, "loss": 6.2144, "step": 1475 }, { "epoch": 0.47224444088945766, "grad_norm": 12.074188232421875, "learning_rate": 0.00022153953448943815, "loss": 6.1406, "step": 1476 }, { "epoch": 0.47256438969764836, "grad_norm": 8.314803123474121, "learning_rate": 0.00022134418391072937, "loss": 6.1434, "step": 1477 }, { "epoch": 0.47288433850583905, "grad_norm": 11.96036148071289, "learning_rate": 0.00022114882129626695, "loss": 6.2643, "step": 1478 }, { "epoch": 0.47320428731402975, "grad_norm": 15.879840850830078, "learning_rate": 0.00022095344685332338, "loss": 6.2671, "step": 1479 }, { "epoch": 0.47352423612222044, "grad_norm": 13.139908790588379, "learning_rate": 0.00022075806078918363, "loss": 6.0182, "step": 1480 }, { "epoch": 0.47384418493041114, "grad_norm": 13.335620880126953, "learning_rate": 0.00022056266331114494, "loss": 6.137, "step": 1481 }, { "epoch": 0.47416413373860183, "grad_norm": 26.10761260986328, "learning_rate": 0.00022036725462651672, "loss": 6.2739, "step": 1482 }, { "epoch": 0.47448408254679253, "grad_norm": 141.016845703125, "learning_rate": 0.00022017183494262027, "loss": 6.2765, "step": 1483 }, { "epoch": 0.4748040313549832, "grad_norm": 30.745948791503906, "learning_rate": 0.00021997640446678852, "loss": 6.3736, "step": 1484 }, { "epoch": 0.47512398016317386, "grad_norm": 25.929046630859375, "learning_rate": 0.00021978096340636585, "loss": 6.1721, "step": 1485 }, { "epoch": 0.47544392897136456, "grad_norm": 748.2965087890625, "learning_rate": 0.00021958551196870797, "loss": 6.4215, "step": 1486 }, { "epoch": 0.47576387777955526, "grad_norm": 4598.34619140625, "learning_rate": 0.00021939005036118142, "loss": 6.2953, "step": 1487 }, { "epoch": 0.47608382658774595, "grad_norm": 4421.1689453125, "learning_rate": 0.0002191945787911638, "loss": 6.3312, "step": 1488 }, { "epoch": 0.47640377539593665, "grad_norm": 47090.1484375, "learning_rate": 0.00021899909746604294, "loss": 6.2649, "step": 1489 }, { "epoch": 0.47672372420412734, "grad_norm": 3408.970703125, "learning_rate": 0.00021880360659321725, "loss": 6.8902, "step": 1490 }, { "epoch": 0.47704367301231804, "grad_norm": 60290.5234375, "learning_rate": 0.0002186081063800953, "loss": 11.7343, "step": 1491 }, { "epoch": 0.47736362182050873, "grad_norm": 331439.40625, "learning_rate": 0.00021841259703409528, "loss": 19.0283, "step": 1492 }, { "epoch": 0.4776835706286994, "grad_norm": 360846.0, "learning_rate": 0.00021821707876264545, "loss": 20.4589, "step": 1493 }, { "epoch": 0.4780035194368901, "grad_norm": 7549.9189453125, "learning_rate": 0.00021802155177318334, "loss": 23.1843, "step": 1494 }, { "epoch": 0.47832346824508076, "grad_norm": 4522283.0, "learning_rate": 0.00021782601627315566, "loss": 23.6448, "step": 1495 }, { "epoch": 0.47864341705327146, "grad_norm": 7154.123046875, "learning_rate": 0.00021763047247001837, "loss": 25.8975, "step": 1496 }, { "epoch": 0.47896336586146215, "grad_norm": 17986.6015625, "learning_rate": 0.00021743492057123596, "loss": 26.1484, "step": 1497 }, { "epoch": 0.47928331466965285, "grad_norm": 13135.7294921875, "learning_rate": 0.00021723936078428175, "loss": 27.4075, "step": 1498 }, { "epoch": 0.47960326347784354, "grad_norm": 39999.80859375, "learning_rate": 0.0002170437933166374, "loss": 26.3727, "step": 1499 }, { "epoch": 0.47992321228603424, "grad_norm": 6688.4423828125, "learning_rate": 0.00021684821837579245, "loss": 26.3504, "step": 1500 }, { "epoch": 0.48024316109422494, "grad_norm": 20799.302734375, "learning_rate": 0.0002166526361692448, "loss": 26.8786, "step": 1501 }, { "epoch": 0.48056310990241563, "grad_norm": 12096.689453125, "learning_rate": 0.0002164570469044997, "loss": 27.476, "step": 1502 }, { "epoch": 0.4808830587106063, "grad_norm": 13212.9248046875, "learning_rate": 0.00021626145078907006, "loss": 25.8305, "step": 1503 }, { "epoch": 0.48120300751879697, "grad_norm": 25206.7734375, "learning_rate": 0.00021606584803047607, "loss": 27.1057, "step": 1504 }, { "epoch": 0.48152295632698766, "grad_norm": 9155.57421875, "learning_rate": 0.00021587023883624484, "loss": 25.6381, "step": 1505 }, { "epoch": 0.48184290513517836, "grad_norm": 136102.078125, "learning_rate": 0.00021567462341391043, "loss": 26.7886, "step": 1506 }, { "epoch": 0.48216285394336905, "grad_norm": 130571.6015625, "learning_rate": 0.00021547900197101347, "loss": 27.0289, "step": 1507 }, { "epoch": 0.48248280275155975, "grad_norm": 51434.0234375, "learning_rate": 0.00021528337471510094, "loss": 25.754, "step": 1508 }, { "epoch": 0.48280275155975044, "grad_norm": 1381.776611328125, "learning_rate": 0.00021508774185372595, "loss": 26.9629, "step": 1509 }, { "epoch": 0.48312270036794114, "grad_norm": 265793.59375, "learning_rate": 0.00021489210359444788, "loss": 25.9567, "step": 1510 }, { "epoch": 0.48344264917613183, "grad_norm": 845718.25, "learning_rate": 0.00021469646014483137, "loss": 26.3276, "step": 1511 }, { "epoch": 0.48376259798432253, "grad_norm": 225879.328125, "learning_rate": 0.0002145008117124467, "loss": 25.9805, "step": 1512 }, { "epoch": 0.4840825467925132, "grad_norm": 436685.65625, "learning_rate": 0.00021430515850486974, "loss": 25.5142, "step": 1513 }, { "epoch": 0.48440249560070386, "grad_norm": 14073.525390625, "learning_rate": 0.00021410950072968111, "loss": 26.5983, "step": 1514 }, { "epoch": 0.48472244440889456, "grad_norm": 28912.51953125, "learning_rate": 0.00021391383859446634, "loss": 26.5296, "step": 1515 }, { "epoch": 0.48504239321708525, "grad_norm": 229632.765625, "learning_rate": 0.0002137181723068157, "loss": 26.8914, "step": 1516 }, { "epoch": 0.48536234202527595, "grad_norm": 4752278.5, "learning_rate": 0.00021352250207432366, "loss": 26.7629, "step": 1517 }, { "epoch": 0.48568229083346665, "grad_norm": 16323.1162109375, "learning_rate": 0.00021332682810458912, "loss": 26.3109, "step": 1518 }, { "epoch": 0.48600223964165734, "grad_norm": 81523.640625, "learning_rate": 0.00021313115060521472, "loss": 26.0956, "step": 1519 }, { "epoch": 0.48632218844984804, "grad_norm": 651489.125, "learning_rate": 0.000212935469783807, "loss": 23.9349, "step": 1520 }, { "epoch": 0.48664213725803873, "grad_norm": 111789.7890625, "learning_rate": 0.00021273978584797595, "loss": 26.5997, "step": 1521 }, { "epoch": 0.4869620860662294, "grad_norm": 445851.625, "learning_rate": 0.00021254409900533494, "loss": 25.0127, "step": 1522 }, { "epoch": 0.48728203487442007, "grad_norm": 8311.8564453125, "learning_rate": 0.00021234840946350025, "loss": 25.4959, "step": 1523 }, { "epoch": 0.48760198368261076, "grad_norm": 19274.490234375, "learning_rate": 0.00021215271743009128, "loss": 25.6081, "step": 1524 }, { "epoch": 0.48792193249080146, "grad_norm": 4866.87109375, "learning_rate": 0.00021195702311272991, "loss": 25.6694, "step": 1525 }, { "epoch": 0.48824188129899215, "grad_norm": 75604.7421875, "learning_rate": 0.0002117613267190403, "loss": 25.3733, "step": 1526 }, { "epoch": 0.48856183010718285, "grad_norm": 223031.828125, "learning_rate": 0.00021156562845664917, "loss": 25.7078, "step": 1527 }, { "epoch": 0.48888177891537354, "grad_norm": 186668.125, "learning_rate": 0.00021136992853318503, "loss": 24.8685, "step": 1528 }, { "epoch": 0.48920172772356424, "grad_norm": 10703.2900390625, "learning_rate": 0.00021117422715627812, "loss": 25.8425, "step": 1529 }, { "epoch": 0.48952167653175493, "grad_norm": 112749.9765625, "learning_rate": 0.00021097852453356018, "loss": 25.8862, "step": 1530 }, { "epoch": 0.48984162533994563, "grad_norm": 4226.068359375, "learning_rate": 0.0002107828208726644, "loss": 24.3479, "step": 1531 }, { "epoch": 0.4901615741481363, "grad_norm": 18113.078125, "learning_rate": 0.0002105871163812251, "loss": 26.1326, "step": 1532 }, { "epoch": 0.49048152295632697, "grad_norm": 48981.3203125, "learning_rate": 0.0002103914112668774, "loss": 24.9588, "step": 1533 }, { "epoch": 0.49080147176451766, "grad_norm": 295377.0625, "learning_rate": 0.00021019570573725687, "loss": 26.1572, "step": 1534 }, { "epoch": 0.49112142057270836, "grad_norm": 3137.26806640625, "learning_rate": 0.00021, "loss": 25.481, "step": 1535 }, { "epoch": 0.49144136938089905, "grad_norm": 5773.8046875, "learning_rate": 0.00020980429426274312, "loss": 25.5111, "step": 1536 }, { "epoch": 0.49176131818908975, "grad_norm": 813.1864013671875, "learning_rate": 0.00020960858873312268, "loss": 24.7992, "step": 1537 }, { "epoch": 0.49208126699728044, "grad_norm": 91637.9609375, "learning_rate": 0.00020941288361877493, "loss": 25.2568, "step": 1538 }, { "epoch": 0.49240121580547114, "grad_norm": 295463.03125, "learning_rate": 0.0002092171791273356, "loss": 25.5154, "step": 1539 }, { "epoch": 0.49272116461366183, "grad_norm": 172231.9375, "learning_rate": 0.00020902147546643986, "loss": 25.6833, "step": 1540 }, { "epoch": 0.49304111342185253, "grad_norm": 13051.9150390625, "learning_rate": 0.0002088257728437219, "loss": 24.4191, "step": 1541 }, { "epoch": 0.49336106223004317, "grad_norm": 2055.154296875, "learning_rate": 0.000208630071466815, "loss": 25.3225, "step": 1542 }, { "epoch": 0.49368101103823386, "grad_norm": 91596.3203125, "learning_rate": 0.00020843437154335082, "loss": 24.9479, "step": 1543 }, { "epoch": 0.49400095984642456, "grad_norm": 4480.8583984375, "learning_rate": 0.00020823867328095968, "loss": 25.0226, "step": 1544 }, { "epoch": 0.49432090865461525, "grad_norm": 417.2145080566406, "learning_rate": 0.0002080429768872702, "loss": 24.4093, "step": 1545 }, { "epoch": 0.49464085746280595, "grad_norm": 1596.4095458984375, "learning_rate": 0.00020784728256990876, "loss": 25.3089, "step": 1546 }, { "epoch": 0.49496080627099664, "grad_norm": 66.6065444946289, "learning_rate": 0.00020765159053649974, "loss": 24.7633, "step": 1547 }, { "epoch": 0.49528075507918734, "grad_norm": 168.79971313476562, "learning_rate": 0.00020745590099466513, "loss": 23.7519, "step": 1548 }, { "epoch": 0.49560070388737804, "grad_norm": 3104.95068359375, "learning_rate": 0.00020726021415202407, "loss": 23.3702, "step": 1549 }, { "epoch": 0.49592065269556873, "grad_norm": 2233.52978515625, "learning_rate": 0.00020706453021619302, "loss": 21.5469, "step": 1550 }, { "epoch": 0.49624060150375937, "grad_norm": 2162.917236328125, "learning_rate": 0.00020686884939478533, "loss": 20.991, "step": 1551 }, { "epoch": 0.49656055031195007, "grad_norm": 55.1094970703125, "learning_rate": 0.0002066731718954109, "loss": 18.9229, "step": 1552 }, { "epoch": 0.49688049912014076, "grad_norm": 92.81909942626953, "learning_rate": 0.00020647749792567635, "loss": 19.7671, "step": 1553 }, { "epoch": 0.49720044792833146, "grad_norm": 364.9884338378906, "learning_rate": 0.00020628182769318434, "loss": 16.8552, "step": 1554 }, { "epoch": 0.49752039673652215, "grad_norm": 394.3650817871094, "learning_rate": 0.00020608616140553365, "loss": 16.8443, "step": 1555 }, { "epoch": 0.49784034554471285, "grad_norm": 948.4185180664062, "learning_rate": 0.00020589049927031896, "loss": 15.1372, "step": 1556 }, { "epoch": 0.49816029435290354, "grad_norm": 13.938966751098633, "learning_rate": 0.0002056948414951303, "loss": 16.3998, "step": 1557 }, { "epoch": 0.49848024316109424, "grad_norm": 33.71576690673828, "learning_rate": 0.0002054991882875533, "loss": 16.4799, "step": 1558 }, { "epoch": 0.49880019196928493, "grad_norm": 582.8843994140625, "learning_rate": 0.00020530353985516876, "loss": 14.1065, "step": 1559 }, { "epoch": 0.49912014077747563, "grad_norm": 6479.2900390625, "learning_rate": 0.00020510789640555216, "loss": 13.1232, "step": 1560 }, { "epoch": 0.49944008958566627, "grad_norm": 27.09345245361328, "learning_rate": 0.000204912258146274, "loss": 13.4933, "step": 1561 }, { "epoch": 0.49976003839385696, "grad_norm": 38.53468704223633, "learning_rate": 0.00020471662528489913, "loss": 11.0668, "step": 1562 }, { "epoch": 0.5000799872020477, "grad_norm": 3335.210693359375, "learning_rate": 0.00020452099802898658, "loss": 11.4144, "step": 1563 }, { "epoch": 0.5003999360102384, "grad_norm": 26.711124420166016, "learning_rate": 0.00020432537658608962, "loss": 10.6639, "step": 1564 }, { "epoch": 0.5007198848184291, "grad_norm": 17.574525833129883, "learning_rate": 0.0002041297611637552, "loss": 9.7858, "step": 1565 }, { "epoch": 0.5010398336266197, "grad_norm": 64.17845916748047, "learning_rate": 0.00020393415196952392, "loss": 8.1859, "step": 1566 }, { "epoch": 0.5013597824348104, "grad_norm": 23.2316951751709, "learning_rate": 0.00020373854921092996, "loss": 8.0373, "step": 1567 }, { "epoch": 0.5016797312430011, "grad_norm": 12.098379135131836, "learning_rate": 0.00020354295309550033, "loss": 7.2053, "step": 1568 }, { "epoch": 0.5019996800511918, "grad_norm": 18.08388900756836, "learning_rate": 0.0002033473638307552, "loss": 7.1583, "step": 1569 }, { "epoch": 0.5023196288593825, "grad_norm": 24.80642318725586, "learning_rate": 0.00020315178162420757, "loss": 7.4845, "step": 1570 }, { "epoch": 0.5026395776675732, "grad_norm": 211.3668975830078, "learning_rate": 0.00020295620668336266, "loss": 7.5237, "step": 1571 }, { "epoch": 0.5029595264757639, "grad_norm": 168.52293395996094, "learning_rate": 0.00020276063921571824, "loss": 7.6717, "step": 1572 }, { "epoch": 0.5032794752839546, "grad_norm": 45.54515838623047, "learning_rate": 0.00020256507942876409, "loss": 7.8054, "step": 1573 }, { "epoch": 0.5035994240921453, "grad_norm": 218.6697998046875, "learning_rate": 0.00020236952752998168, "loss": 7.4359, "step": 1574 }, { "epoch": 0.503919372900336, "grad_norm": 259.8759765625, "learning_rate": 0.00020217398372684439, "loss": 7.7714, "step": 1575 }, { "epoch": 0.5042393217085266, "grad_norm": 2446.431396484375, "learning_rate": 0.00020197844822681673, "loss": 7.4991, "step": 1576 }, { "epoch": 0.5045592705167173, "grad_norm": 38.335880279541016, "learning_rate": 0.00020178292123735454, "loss": 7.4279, "step": 1577 }, { "epoch": 0.504879219324908, "grad_norm": 64.94971466064453, "learning_rate": 0.00020158740296590474, "loss": 7.4041, "step": 1578 }, { "epoch": 0.5051991681330987, "grad_norm": 106.72026824951172, "learning_rate": 0.00020139189361990476, "loss": 7.404, "step": 1579 }, { "epoch": 0.5055191169412894, "grad_norm": 183.95343017578125, "learning_rate": 0.00020119639340678274, "loss": 7.0569, "step": 1580 }, { "epoch": 0.5058390657494801, "grad_norm": 561.7801513671875, "learning_rate": 0.0002010009025339571, "loss": 7.1187, "step": 1581 }, { "epoch": 0.5061590145576708, "grad_norm": 766.6022338867188, "learning_rate": 0.0002008054212088362, "loss": 7.0652, "step": 1582 }, { "epoch": 0.5064789633658615, "grad_norm": 2744.870361328125, "learning_rate": 0.0002006099496388185, "loss": 6.8775, "step": 1583 }, { "epoch": 0.5067989121740522, "grad_norm": 38.441341400146484, "learning_rate": 0.00020041448803129205, "loss": 7.1471, "step": 1584 }, { "epoch": 0.5071188609822428, "grad_norm": 338.59832763671875, "learning_rate": 0.00020021903659363414, "loss": 6.8759, "step": 1585 }, { "epoch": 0.5074388097904335, "grad_norm": 12.734524726867676, "learning_rate": 0.00020002359553321158, "loss": 7.1117, "step": 1586 }, { "epoch": 0.5077587585986242, "grad_norm": 10831.9697265625, "learning_rate": 0.00019982816505737978, "loss": 7.1011, "step": 1587 }, { "epoch": 0.5080787074068149, "grad_norm": 6180.69482421875, "learning_rate": 0.00019963274537348327, "loss": 6.7045, "step": 1588 }, { "epoch": 0.5083986562150056, "grad_norm": 75.3614730834961, "learning_rate": 0.0001994373366888551, "loss": 6.8817, "step": 1589 }, { "epoch": 0.5087186050231963, "grad_norm": 21362.6875, "learning_rate": 0.0001992419392108164, "loss": 6.5996, "step": 1590 }, { "epoch": 0.509038553831387, "grad_norm": 40.761505126953125, "learning_rate": 0.00019904655314667663, "loss": 6.6685, "step": 1591 }, { "epoch": 0.5093585026395777, "grad_norm": 61433.0625, "learning_rate": 0.0001988511787037331, "loss": 6.6834, "step": 1592 }, { "epoch": 0.5096784514477684, "grad_norm": 76.52178192138672, "learning_rate": 0.00019865581608927068, "loss": 6.7812, "step": 1593 }, { "epoch": 0.509998400255959, "grad_norm": 11.229225158691406, "learning_rate": 0.00019846046551056187, "loss": 6.7718, "step": 1594 }, { "epoch": 0.5103183490641497, "grad_norm": 56.32878112792969, "learning_rate": 0.00019826512717486625, "loss": 6.5842, "step": 1595 }, { "epoch": 0.5106382978723404, "grad_norm": 42.66135025024414, "learning_rate": 0.00019806980128943047, "loss": 6.5896, "step": 1596 }, { "epoch": 0.5109582466805311, "grad_norm": 115.71044158935547, "learning_rate": 0.00019787448806148802, "loss": 6.5618, "step": 1597 }, { "epoch": 0.5112781954887218, "grad_norm": 42.69276428222656, "learning_rate": 0.00019767918769825874, "loss": 6.6004, "step": 1598 }, { "epoch": 0.5115981442969125, "grad_norm": 238.4598846435547, "learning_rate": 0.0001974839004069492, "loss": 6.6235, "step": 1599 }, { "epoch": 0.5119180931051032, "grad_norm": 34.8486213684082, "learning_rate": 0.00019728862639475182, "loss": 6.7821, "step": 1600 }, { "epoch": 0.5119180931051032, "eval_loss": 3.3196933269500732, "eval_runtime": 233.9804, "eval_samples_per_second": 5.624, "eval_steps_per_second": 1.406, "step": 1600 }, { "epoch": 0.5122380419132939, "grad_norm": 56.0380744934082, "learning_rate": 0.00019709336586884495, "loss": 6.848, "step": 1601 }, { "epoch": 0.5125579907214846, "grad_norm": 299.5946044921875, "learning_rate": 0.00019689811903639273, "loss": 6.3987, "step": 1602 }, { "epoch": 0.5128779395296752, "grad_norm": 10.880533218383789, "learning_rate": 0.00019670288610454474, "loss": 6.7342, "step": 1603 }, { "epoch": 0.513197888337866, "grad_norm": 355.7524719238281, "learning_rate": 0.0001965076672804357, "loss": 6.8169, "step": 1604 }, { "epoch": 0.5135178371460566, "grad_norm": 2261.9794921875, "learning_rate": 0.00019631246277118555, "loss": 6.3585, "step": 1605 }, { "epoch": 0.5138377859542473, "grad_norm": 1438.713134765625, "learning_rate": 0.00019611727278389898, "loss": 6.8331, "step": 1606 }, { "epoch": 0.514157734762438, "grad_norm": 745.37744140625, "learning_rate": 0.00019592209752566513, "loss": 6.8245, "step": 1607 }, { "epoch": 0.5144776835706287, "grad_norm": 58.433902740478516, "learning_rate": 0.0001957269372035578, "loss": 6.8176, "step": 1608 }, { "epoch": 0.5147976323788194, "grad_norm": 17.290542602539062, "learning_rate": 0.00019553179202463453, "loss": 6.855, "step": 1609 }, { "epoch": 0.5151175811870101, "grad_norm": 33.13834762573242, "learning_rate": 0.0001953366621959372, "loss": 6.8946, "step": 1610 }, { "epoch": 0.5154375299952008, "grad_norm": 18.5881404876709, "learning_rate": 0.00019514154792449125, "loss": 6.5586, "step": 1611 }, { "epoch": 0.5157574788033915, "grad_norm": 1200.3309326171875, "learning_rate": 0.00019494644941730547, "loss": 6.5655, "step": 1612 }, { "epoch": 0.5160774276115822, "grad_norm": 13.903018951416016, "learning_rate": 0.00019475136688137219, "loss": 6.877, "step": 1613 }, { "epoch": 0.5163973764197728, "grad_norm": 208.99990844726562, "learning_rate": 0.00019455630052366666, "loss": 6.798, "step": 1614 }, { "epoch": 0.5167173252279635, "grad_norm": 31.752573013305664, "learning_rate": 0.00019436125055114688, "loss": 6.5439, "step": 1615 }, { "epoch": 0.5170372740361542, "grad_norm": 2381.63525390625, "learning_rate": 0.00019416621717075356, "loss": 6.5238, "step": 1616 }, { "epoch": 0.5173572228443449, "grad_norm": 2820.570556640625, "learning_rate": 0.0001939712005894099, "loss": 6.6507, "step": 1617 }, { "epoch": 0.5176771716525356, "grad_norm": 2531.220947265625, "learning_rate": 0.00019377620101402113, "loss": 6.8637, "step": 1618 }, { "epoch": 0.5179971204607263, "grad_norm": 14558.5615234375, "learning_rate": 0.00019358121865147446, "loss": 6.5084, "step": 1619 }, { "epoch": 0.518317069268917, "grad_norm": 119.33515167236328, "learning_rate": 0.00019338625370863882, "loss": 6.6764, "step": 1620 }, { "epoch": 0.5186370180771077, "grad_norm": 4097.06103515625, "learning_rate": 0.00019319130639236477, "loss": 6.5498, "step": 1621 }, { "epoch": 0.5189569668852984, "grad_norm": 118.87272644042969, "learning_rate": 0.00019299637690948414, "loss": 6.7413, "step": 1622 }, { "epoch": 0.519276915693489, "grad_norm": 319.0539245605469, "learning_rate": 0.0001928014654668096, "loss": 6.8565, "step": 1623 }, { "epoch": 0.5195968645016797, "grad_norm": 1015.8532104492188, "learning_rate": 0.00019260657227113513, "loss": 6.9909, "step": 1624 }, { "epoch": 0.5199168133098704, "grad_norm": 6110.1357421875, "learning_rate": 0.000192411697529235, "loss": 6.6244, "step": 1625 }, { "epoch": 0.5202367621180611, "grad_norm": 920.1796264648438, "learning_rate": 0.0001922168414478639, "loss": 6.6993, "step": 1626 }, { "epoch": 0.5205567109262518, "grad_norm": 1895.49169921875, "learning_rate": 0.00019202200423375695, "loss": 6.7248, "step": 1627 }, { "epoch": 0.5208766597344425, "grad_norm": 190.3346710205078, "learning_rate": 0.00019182718609362913, "loss": 6.6856, "step": 1628 }, { "epoch": 0.5211966085426332, "grad_norm": 1900.810302734375, "learning_rate": 0.0001916323872341751, "loss": 6.4773, "step": 1629 }, { "epoch": 0.5215165573508239, "grad_norm": 372.069091796875, "learning_rate": 0.00019143760786206922, "loss": 6.8977, "step": 1630 }, { "epoch": 0.5218365061590146, "grad_norm": 470.37896728515625, "learning_rate": 0.00019124284818396498, "loss": 6.7348, "step": 1631 }, { "epoch": 0.5221564549672052, "grad_norm": 345.2624206542969, "learning_rate": 0.00019104810840649518, "loss": 6.5007, "step": 1632 }, { "epoch": 0.522476403775396, "grad_norm": 14990.578125, "learning_rate": 0.0001908533887362715, "loss": 6.7623, "step": 1633 }, { "epoch": 0.5227963525835866, "grad_norm": 8434.7958984375, "learning_rate": 0.00019065868937988398, "loss": 6.4984, "step": 1634 }, { "epoch": 0.5231163013917773, "grad_norm": 393.5131530761719, "learning_rate": 0.0001904640105439015, "loss": 6.7975, "step": 1635 }, { "epoch": 0.523436250199968, "grad_norm": 68.34281921386719, "learning_rate": 0.00019026935243487105, "loss": 6.8138, "step": 1636 }, { "epoch": 0.5237561990081587, "grad_norm": 2655.11767578125, "learning_rate": 0.00019007471525931736, "loss": 6.7472, "step": 1637 }, { "epoch": 0.5240761478163494, "grad_norm": 271.6803894042969, "learning_rate": 0.00018988009922374336, "loss": 6.7314, "step": 1638 }, { "epoch": 0.5243960966245401, "grad_norm": 49.694091796875, "learning_rate": 0.00018968550453462945, "loss": 6.5887, "step": 1639 }, { "epoch": 0.5247160454327308, "grad_norm": 40.31327819824219, "learning_rate": 0.00018949093139843294, "loss": 6.5703, "step": 1640 }, { "epoch": 0.5250359942409214, "grad_norm": 606.8063354492188, "learning_rate": 0.000189296380021589, "loss": 6.6527, "step": 1641 }, { "epoch": 0.5253559430491122, "grad_norm": 328.6357727050781, "learning_rate": 0.0001891018506105091, "loss": 6.9398, "step": 1642 }, { "epoch": 0.5256758918573028, "grad_norm": 14.112878799438477, "learning_rate": 0.00018890734337158172, "loss": 6.6962, "step": 1643 }, { "epoch": 0.5259958406654935, "grad_norm": 42.98262405395508, "learning_rate": 0.00018871285851117188, "loss": 6.6392, "step": 1644 }, { "epoch": 0.5263157894736842, "grad_norm": 71.3088150024414, "learning_rate": 0.00018851839623562047, "loss": 6.5748, "step": 1645 }, { "epoch": 0.5266357382818749, "grad_norm": 28.81409454345703, "learning_rate": 0.0001883239567512448, "loss": 6.6802, "step": 1646 }, { "epoch": 0.5269556870900656, "grad_norm": 8.297423362731934, "learning_rate": 0.000188129540264338, "loss": 6.5324, "step": 1647 }, { "epoch": 0.5272756358982563, "grad_norm": 16.27468490600586, "learning_rate": 0.0001879351469811684, "loss": 6.5321, "step": 1648 }, { "epoch": 0.527595584706447, "grad_norm": 9.677922248840332, "learning_rate": 0.00018774077710798014, "loss": 6.5972, "step": 1649 }, { "epoch": 0.5279155335146376, "grad_norm": 36.90768814086914, "learning_rate": 0.00018754643085099247, "loss": 6.8554, "step": 1650 }, { "epoch": 0.5282354823228284, "grad_norm": 38.28261947631836, "learning_rate": 0.00018735210841639918, "loss": 6.5748, "step": 1651 }, { "epoch": 0.528555431131019, "grad_norm": 8.838459968566895, "learning_rate": 0.00018715781001036938, "loss": 6.6891, "step": 1652 }, { "epoch": 0.5288753799392097, "grad_norm": 9.053256034851074, "learning_rate": 0.0001869635358390462, "loss": 6.6757, "step": 1653 }, { "epoch": 0.5291953287474004, "grad_norm": 13.860633850097656, "learning_rate": 0.00018676928610854727, "loss": 6.7456, "step": 1654 }, { "epoch": 0.5295152775555911, "grad_norm": 6.551642417907715, "learning_rate": 0.00018657506102496443, "loss": 6.4432, "step": 1655 }, { "epoch": 0.5298352263637818, "grad_norm": 6.068846702575684, "learning_rate": 0.000186380860794363, "loss": 6.5114, "step": 1656 }, { "epoch": 0.5301551751719725, "grad_norm": 298.4500732421875, "learning_rate": 0.00018618668562278218, "loss": 6.5394, "step": 1657 }, { "epoch": 0.5304751239801632, "grad_norm": 6.769711971282959, "learning_rate": 0.00018599253571623472, "loss": 6.6384, "step": 1658 }, { "epoch": 0.5307950727883539, "grad_norm": 22.864620208740234, "learning_rate": 0.0001857984112807061, "loss": 6.7885, "step": 1659 }, { "epoch": 0.5311150215965446, "grad_norm": 5.3118720054626465, "learning_rate": 0.00018560431252215528, "loss": 6.6045, "step": 1660 }, { "epoch": 0.5314349704047352, "grad_norm": 60.282066345214844, "learning_rate": 0.00018541023964651382, "loss": 6.6525, "step": 1661 }, { "epoch": 0.531754919212926, "grad_norm": 6.3130035400390625, "learning_rate": 0.00018521619285968545, "loss": 6.8046, "step": 1662 }, { "epoch": 0.5320748680211166, "grad_norm": 7.7585954666137695, "learning_rate": 0.00018502217236754682, "loss": 6.9111, "step": 1663 }, { "epoch": 0.5323948168293073, "grad_norm": 14.32033920288086, "learning_rate": 0.00018482817837594634, "loss": 6.5773, "step": 1664 }, { "epoch": 0.532714765637498, "grad_norm": 54.441688537597656, "learning_rate": 0.00018463421109070422, "loss": 6.4621, "step": 1665 }, { "epoch": 0.5330347144456887, "grad_norm": 8.818940162658691, "learning_rate": 0.00018444027071761268, "loss": 6.6185, "step": 1666 }, { "epoch": 0.5333546632538794, "grad_norm": 8.330538749694824, "learning_rate": 0.000184246357462435, "loss": 6.7875, "step": 1667 }, { "epoch": 0.5336746120620701, "grad_norm": 8.418845176696777, "learning_rate": 0.00018405247153090588, "loss": 6.6936, "step": 1668 }, { "epoch": 0.5339945608702608, "grad_norm": 4.972550392150879, "learning_rate": 0.0001838586131287312, "loss": 6.6284, "step": 1669 }, { "epoch": 0.5343145096784514, "grad_norm": 4.819149971008301, "learning_rate": 0.00018366478246158722, "loss": 6.6123, "step": 1670 }, { "epoch": 0.5346344584866422, "grad_norm": 5.511754512786865, "learning_rate": 0.0001834709797351211, "loss": 6.5928, "step": 1671 }, { "epoch": 0.5349544072948328, "grad_norm": 4.756268501281738, "learning_rate": 0.00018327720515495043, "loss": 6.719, "step": 1672 }, { "epoch": 0.5352743561030235, "grad_norm": 16.621448516845703, "learning_rate": 0.00018308345892666247, "loss": 6.6068, "step": 1673 }, { "epoch": 0.5355943049112142, "grad_norm": 7.359424591064453, "learning_rate": 0.0001828897412558149, "loss": 6.4707, "step": 1674 }, { "epoch": 0.5359142537194049, "grad_norm": 5.423340320587158, "learning_rate": 0.00018269605234793492, "loss": 6.5131, "step": 1675 }, { "epoch": 0.5362342025275956, "grad_norm": 6.9871320724487305, "learning_rate": 0.00018250239240851898, "loss": 6.4692, "step": 1676 }, { "epoch": 0.5365541513357863, "grad_norm": 204.94081115722656, "learning_rate": 0.00018230876164303334, "loss": 6.6845, "step": 1677 }, { "epoch": 0.536874100143977, "grad_norm": 6.5210700035095215, "learning_rate": 0.00018211516025691267, "loss": 6.634, "step": 1678 }, { "epoch": 0.5371940489521676, "grad_norm": 6.336790084838867, "learning_rate": 0.00018192158845556087, "loss": 6.4995, "step": 1679 }, { "epoch": 0.5375139977603584, "grad_norm": 5.2590718269348145, "learning_rate": 0.00018172804644435054, "loss": 6.472, "step": 1680 }, { "epoch": 0.537833946568549, "grad_norm": 7.329125881195068, "learning_rate": 0.00018153453442862222, "loss": 6.7036, "step": 1681 }, { "epoch": 0.5381538953767397, "grad_norm": 9.183751106262207, "learning_rate": 0.00018134105261368499, "loss": 6.676, "step": 1682 }, { "epoch": 0.5384738441849304, "grad_norm": 10.038751602172852, "learning_rate": 0.00018114760120481598, "loss": 6.6576, "step": 1683 }, { "epoch": 0.5387937929931211, "grad_norm": 8.166698455810547, "learning_rate": 0.00018095418040725965, "loss": 6.5329, "step": 1684 }, { "epoch": 0.5391137418013118, "grad_norm": 8.943130493164062, "learning_rate": 0.0001807607904262282, "loss": 6.8779, "step": 1685 }, { "epoch": 0.5394336906095025, "grad_norm": 11.258163452148438, "learning_rate": 0.00018056743146690143, "loss": 6.6053, "step": 1686 }, { "epoch": 0.5397536394176932, "grad_norm": 7.90877628326416, "learning_rate": 0.00018037410373442558, "loss": 6.5302, "step": 1687 }, { "epoch": 0.5400735882258838, "grad_norm": 10.706826210021973, "learning_rate": 0.0001801808074339144, "loss": 6.5518, "step": 1688 }, { "epoch": 0.5403935370340746, "grad_norm": 5.881041526794434, "learning_rate": 0.00017998754277044786, "loss": 6.8653, "step": 1689 }, { "epoch": 0.5407134858422652, "grad_norm": 7.093472003936768, "learning_rate": 0.00017979430994907253, "loss": 6.8847, "step": 1690 }, { "epoch": 0.541033434650456, "grad_norm": 7.041862964630127, "learning_rate": 0.0001796011091748013, "loss": 6.6155, "step": 1691 }, { "epoch": 0.5413533834586466, "grad_norm": 6.642461776733398, "learning_rate": 0.0001794079406526128, "loss": 6.5458, "step": 1692 }, { "epoch": 0.5416733322668373, "grad_norm": 6.088051795959473, "learning_rate": 0.0001792148045874516, "loss": 6.7226, "step": 1693 }, { "epoch": 0.541993281075028, "grad_norm": 8.008899688720703, "learning_rate": 0.000179021701184228, "loss": 6.7852, "step": 1694 }, { "epoch": 0.5423132298832187, "grad_norm": 5.187869071960449, "learning_rate": 0.00017882863064781721, "loss": 6.6511, "step": 1695 }, { "epoch": 0.5426331786914094, "grad_norm": 47.277042388916016, "learning_rate": 0.00017863559318305992, "loss": 6.5314, "step": 1696 }, { "epoch": 0.5429531274996001, "grad_norm": 8.942819595336914, "learning_rate": 0.00017844258899476183, "loss": 6.6763, "step": 1697 }, { "epoch": 0.5432730763077908, "grad_norm": 9.141765594482422, "learning_rate": 0.00017824961828769286, "loss": 6.5762, "step": 1698 }, { "epoch": 0.5435930251159814, "grad_norm": 7.712893962860107, "learning_rate": 0.00017805668126658785, "loss": 6.6161, "step": 1699 }, { "epoch": 0.5439129739241721, "grad_norm": 6.733684539794922, "learning_rate": 0.0001778637781361457, "loss": 6.7396, "step": 1700 }, { "epoch": 0.5442329227323628, "grad_norm": 7.395744800567627, "learning_rate": 0.0001776709091010293, "loss": 6.694, "step": 1701 }, { "epoch": 0.5445528715405535, "grad_norm": 6.6596150398254395, "learning_rate": 0.00017747807436586574, "loss": 6.505, "step": 1702 }, { "epoch": 0.5448728203487442, "grad_norm": 7.438435077667236, "learning_rate": 0.00017728527413524516, "loss": 6.834, "step": 1703 }, { "epoch": 0.5451927691569349, "grad_norm": 5.447627544403076, "learning_rate": 0.0001770925086137214, "loss": 6.5036, "step": 1704 }, { "epoch": 0.5455127179651256, "grad_norm": 8.212823867797852, "learning_rate": 0.00017689977800581166, "loss": 6.5378, "step": 1705 }, { "epoch": 0.5458326667733163, "grad_norm": 10.185394287109375, "learning_rate": 0.00017670708251599568, "loss": 6.647, "step": 1706 }, { "epoch": 0.546152615581507, "grad_norm": 7.351102352142334, "learning_rate": 0.00017651442234871612, "loss": 6.6004, "step": 1707 }, { "epoch": 0.5464725643896976, "grad_norm": 8.159398078918457, "learning_rate": 0.00017632179770837845, "loss": 6.5938, "step": 1708 }, { "epoch": 0.5467925131978884, "grad_norm": 5.244987487792969, "learning_rate": 0.0001761292087993499, "loss": 6.8191, "step": 1709 }, { "epoch": 0.547112462006079, "grad_norm": 5.631025314331055, "learning_rate": 0.00017593665582596026, "loss": 6.5519, "step": 1710 }, { "epoch": 0.5474324108142697, "grad_norm": 5.977352142333984, "learning_rate": 0.00017574413899250092, "loss": 6.5664, "step": 1711 }, { "epoch": 0.5477523596224604, "grad_norm": 6.352726459503174, "learning_rate": 0.00017555165850322498, "loss": 6.6962, "step": 1712 }, { "epoch": 0.5480723084306511, "grad_norm": 8.479165077209473, "learning_rate": 0.00017535921456234715, "loss": 6.3559, "step": 1713 }, { "epoch": 0.5483922572388418, "grad_norm": 6.653327465057373, "learning_rate": 0.00017516680737404317, "loss": 6.8439, "step": 1714 }, { "epoch": 0.5487122060470325, "grad_norm": 5.953420162200928, "learning_rate": 0.00017497443714244976, "loss": 6.5994, "step": 1715 }, { "epoch": 0.5490321548552232, "grad_norm": 5.354004383087158, "learning_rate": 0.00017478210407166473, "loss": 6.797, "step": 1716 }, { "epoch": 0.5493521036634138, "grad_norm": 8.759894371032715, "learning_rate": 0.00017458980836574604, "loss": 6.7733, "step": 1717 }, { "epoch": 0.5496720524716046, "grad_norm": 6.662622451782227, "learning_rate": 0.00017439755022871227, "loss": 6.6149, "step": 1718 }, { "epoch": 0.5499920012797952, "grad_norm": 5.91054105758667, "learning_rate": 0.00017420532986454226, "loss": 6.402, "step": 1719 }, { "epoch": 0.550311950087986, "grad_norm": 6.375626087188721, "learning_rate": 0.0001740131474771744, "loss": 6.5757, "step": 1720 }, { "epoch": 0.5506318988961766, "grad_norm": 7.947948932647705, "learning_rate": 0.00017382100327050705, "loss": 6.7252, "step": 1721 }, { "epoch": 0.5509518477043673, "grad_norm": 670.0234375, "learning_rate": 0.00017362889744839804, "loss": 6.6807, "step": 1722 }, { "epoch": 0.551271796512558, "grad_norm": 5.699699401855469, "learning_rate": 0.0001734368302146644, "loss": 6.6222, "step": 1723 }, { "epoch": 0.5515917453207487, "grad_norm": 5.992722034454346, "learning_rate": 0.0001732448017730823, "loss": 6.4582, "step": 1724 }, { "epoch": 0.5519116941289394, "grad_norm": 4.542269229888916, "learning_rate": 0.00017305281232738668, "loss": 6.699, "step": 1725 }, { "epoch": 0.55223164293713, "grad_norm": 6.930887222290039, "learning_rate": 0.00017286086208127114, "loss": 6.7525, "step": 1726 }, { "epoch": 0.5525515917453208, "grad_norm": 7.069804668426514, "learning_rate": 0.00017266895123838776, "loss": 6.6047, "step": 1727 }, { "epoch": 0.5528715405535114, "grad_norm": 5.9045915603637695, "learning_rate": 0.00017247708000234663, "loss": 6.5806, "step": 1728 }, { "epoch": 0.5531914893617021, "grad_norm": 7.1390299797058105, "learning_rate": 0.00017228524857671595, "loss": 6.7053, "step": 1729 }, { "epoch": 0.5535114381698928, "grad_norm": 6.806826591491699, "learning_rate": 0.00017209345716502186, "loss": 6.6264, "step": 1730 }, { "epoch": 0.5538313869780835, "grad_norm": 6.230347633361816, "learning_rate": 0.0001719017059707476, "loss": 6.7543, "step": 1731 }, { "epoch": 0.5541513357862742, "grad_norm": 6.432546138763428, "learning_rate": 0.00017170999519733416, "loss": 6.5587, "step": 1732 }, { "epoch": 0.5544712845944649, "grad_norm": 6.226624488830566, "learning_rate": 0.00017151832504817932, "loss": 6.444, "step": 1733 }, { "epoch": 0.5547912334026556, "grad_norm": 8.379231452941895, "learning_rate": 0.00017132669572663808, "loss": 6.3285, "step": 1734 }, { "epoch": 0.5551111822108462, "grad_norm": 6.226011276245117, "learning_rate": 0.00017113510743602188, "loss": 6.7741, "step": 1735 }, { "epoch": 0.555431131019037, "grad_norm": 9.727779388427734, "learning_rate": 0.00017094356037959871, "loss": 6.5921, "step": 1736 }, { "epoch": 0.5557510798272276, "grad_norm": 5.827595233917236, "learning_rate": 0.0001707520547605928, "loss": 6.3784, "step": 1737 }, { "epoch": 0.5560710286354184, "grad_norm": 5.6185994148254395, "learning_rate": 0.0001705605907821845, "loss": 6.5343, "step": 1738 }, { "epoch": 0.556390977443609, "grad_norm": 33.767696380615234, "learning_rate": 0.00017036916864750985, "loss": 6.639, "step": 1739 }, { "epoch": 0.5567109262517997, "grad_norm": 4.321077823638916, "learning_rate": 0.00017017778855966053, "loss": 6.4703, "step": 1740 }, { "epoch": 0.5570308750599904, "grad_norm": 7.326907634735107, "learning_rate": 0.00016998645072168373, "loss": 6.576, "step": 1741 }, { "epoch": 0.5573508238681811, "grad_norm": 7.584223747253418, "learning_rate": 0.0001697951553365816, "loss": 6.6078, "step": 1742 }, { "epoch": 0.5576707726763718, "grad_norm": 7.753310680389404, "learning_rate": 0.0001696039026073115, "loss": 6.4804, "step": 1743 }, { "epoch": 0.5579907214845625, "grad_norm": 6.727494716644287, "learning_rate": 0.00016941269273678525, "loss": 6.4974, "step": 1744 }, { "epoch": 0.5583106702927532, "grad_norm": 7.953418254852295, "learning_rate": 0.00016922152592786945, "loss": 6.7166, "step": 1745 }, { "epoch": 0.5586306191009438, "grad_norm": 6.279244422912598, "learning_rate": 0.00016903040238338489, "loss": 6.6045, "step": 1746 }, { "epoch": 0.5589505679091346, "grad_norm": 4.617511749267578, "learning_rate": 0.00016883932230610647, "loss": 6.5688, "step": 1747 }, { "epoch": 0.5592705167173252, "grad_norm": 5.750264644622803, "learning_rate": 0.00016864828589876297, "loss": 6.5031, "step": 1748 }, { "epoch": 0.559590465525516, "grad_norm": 7.819819450378418, "learning_rate": 0.00016845729336403692, "loss": 6.4559, "step": 1749 }, { "epoch": 0.5599104143337066, "grad_norm": 7.785708904266357, "learning_rate": 0.00016826634490456414, "loss": 6.5744, "step": 1750 }, { "epoch": 0.5602303631418973, "grad_norm": 7.5504560470581055, "learning_rate": 0.00016807544072293388, "loss": 6.6894, "step": 1751 }, { "epoch": 0.560550311950088, "grad_norm": 33.57929992675781, "learning_rate": 0.00016788458102168823, "loss": 6.4001, "step": 1752 }, { "epoch": 0.5608702607582787, "grad_norm": 12.012317657470703, "learning_rate": 0.00016769376600332222, "loss": 6.4613, "step": 1753 }, { "epoch": 0.5611902095664694, "grad_norm": 5.870274543762207, "learning_rate": 0.00016750299587028344, "loss": 6.5538, "step": 1754 }, { "epoch": 0.56151015837466, "grad_norm": 6.509860992431641, "learning_rate": 0.00016731227082497182, "loss": 6.4063, "step": 1755 }, { "epoch": 0.5618301071828508, "grad_norm": 6.6293044090271, "learning_rate": 0.00016712159106973943, "loss": 6.4052, "step": 1756 }, { "epoch": 0.5621500559910414, "grad_norm": 98.93505096435547, "learning_rate": 0.00016693095680689045, "loss": 6.7203, "step": 1757 }, { "epoch": 0.5624700047992321, "grad_norm": 6.476878643035889, "learning_rate": 0.0001667403682386806, "loss": 6.6422, "step": 1758 }, { "epoch": 0.5627899536074228, "grad_norm": 42.62853240966797, "learning_rate": 0.00016654982556731714, "loss": 6.5487, "step": 1759 }, { "epoch": 0.5631099024156135, "grad_norm": 21.046525955200195, "learning_rate": 0.00016635932899495886, "loss": 6.4535, "step": 1760 }, { "epoch": 0.5634298512238042, "grad_norm": 8.30802059173584, "learning_rate": 0.00016616887872371536, "loss": 6.4825, "step": 1761 }, { "epoch": 0.5637498000319949, "grad_norm": 8.385047912597656, "learning_rate": 0.00016597847495564724, "loss": 6.6208, "step": 1762 }, { "epoch": 0.5640697488401856, "grad_norm": 21.346595764160156, "learning_rate": 0.00016578811789276588, "loss": 6.4841, "step": 1763 }, { "epoch": 0.5643896976483762, "grad_norm": 5.010012149810791, "learning_rate": 0.00016559780773703277, "loss": 6.7083, "step": 1764 }, { "epoch": 0.564709646456567, "grad_norm": 4.354966640472412, "learning_rate": 0.00016540754469036005, "loss": 6.5749, "step": 1765 }, { "epoch": 0.5650295952647576, "grad_norm": 8.869613647460938, "learning_rate": 0.0001652173289546095, "loss": 6.4343, "step": 1766 }, { "epoch": 0.5653495440729484, "grad_norm": 12.243342399597168, "learning_rate": 0.00016502716073159298, "loss": 6.479, "step": 1767 }, { "epoch": 0.565669492881139, "grad_norm": 6.065547466278076, "learning_rate": 0.0001648370402230719, "loss": 6.8581, "step": 1768 }, { "epoch": 0.5659894416893297, "grad_norm": 14.517911911010742, "learning_rate": 0.00016464696763075686, "loss": 6.4405, "step": 1769 }, { "epoch": 0.5663093904975204, "grad_norm": 7.9618964195251465, "learning_rate": 0.00016445694315630777, "loss": 6.4442, "step": 1770 }, { "epoch": 0.5666293393057111, "grad_norm": 8.607887268066406, "learning_rate": 0.00016426696700133357, "loss": 6.6554, "step": 1771 }, { "epoch": 0.5669492881139018, "grad_norm": 7.719635486602783, "learning_rate": 0.00016407703936739172, "loss": 6.6127, "step": 1772 }, { "epoch": 0.5672692369220924, "grad_norm": 6.7710371017456055, "learning_rate": 0.00016388716045598832, "loss": 6.702, "step": 1773 }, { "epoch": 0.5675891857302832, "grad_norm": 5.212691307067871, "learning_rate": 0.00016369733046857788, "loss": 6.5324, "step": 1774 }, { "epoch": 0.5679091345384738, "grad_norm": 9.58516788482666, "learning_rate": 0.0001635075496065628, "loss": 6.6931, "step": 1775 }, { "epoch": 0.5682290833466646, "grad_norm": 6.463788986206055, "learning_rate": 0.00016331781807129355, "loss": 6.72, "step": 1776 }, { "epoch": 0.5685490321548552, "grad_norm": 6.6747002601623535, "learning_rate": 0.000163128136064068, "loss": 6.8406, "step": 1777 }, { "epoch": 0.568868980963046, "grad_norm": 22.55068016052246, "learning_rate": 0.00016293850378613177, "loss": 6.4797, "step": 1778 }, { "epoch": 0.5691889297712366, "grad_norm": 4.759416580200195, "learning_rate": 0.0001627489214386776, "loss": 6.6021, "step": 1779 }, { "epoch": 0.5695088785794273, "grad_norm": 6.074831962585449, "learning_rate": 0.0001625593892228452, "loss": 6.6699, "step": 1780 }, { "epoch": 0.569828827387618, "grad_norm": 5.710672855377197, "learning_rate": 0.00016236990733972114, "loss": 6.722, "step": 1781 }, { "epoch": 0.5701487761958086, "grad_norm": 6.164533615112305, "learning_rate": 0.00016218047599033867, "loss": 6.4717, "step": 1782 }, { "epoch": 0.5704687250039994, "grad_norm": 6.661571502685547, "learning_rate": 0.00016199109537567725, "loss": 6.5454, "step": 1783 }, { "epoch": 0.57078867381219, "grad_norm": 5.331099510192871, "learning_rate": 0.00016180176569666264, "loss": 6.4191, "step": 1784 }, { "epoch": 0.5711086226203808, "grad_norm": 36.22768783569336, "learning_rate": 0.00016161248715416656, "loss": 6.8651, "step": 1785 }, { "epoch": 0.5714285714285714, "grad_norm": 6.483286380767822, "learning_rate": 0.00016142325994900636, "loss": 6.5161, "step": 1786 }, { "epoch": 0.5717485202367621, "grad_norm": 7.084466457366943, "learning_rate": 0.00016123408428194512, "loss": 6.63, "step": 1787 }, { "epoch": 0.5720684690449528, "grad_norm": 7.926953315734863, "learning_rate": 0.00016104496035369102, "loss": 6.6144, "step": 1788 }, { "epoch": 0.5723884178531435, "grad_norm": 4.494540214538574, "learning_rate": 0.0001608558883648975, "loss": 6.5005, "step": 1789 }, { "epoch": 0.5727083666613342, "grad_norm": 7.054409503936768, "learning_rate": 0.00016066686851616292, "loss": 6.7112, "step": 1790 }, { "epoch": 0.5730283154695249, "grad_norm": 5.500720500946045, "learning_rate": 0.00016047790100803006, "loss": 6.6917, "step": 1791 }, { "epoch": 0.5733482642777156, "grad_norm": 5.7658891677856445, "learning_rate": 0.0001602889860409865, "loss": 6.4435, "step": 1792 }, { "epoch": 0.5736682130859062, "grad_norm": 5.84588623046875, "learning_rate": 0.00016010012381546397, "loss": 6.4001, "step": 1793 }, { "epoch": 0.573988161894097, "grad_norm": 6.311826229095459, "learning_rate": 0.0001599113145318381, "loss": 6.4495, "step": 1794 }, { "epoch": 0.5743081107022876, "grad_norm": 6.321776390075684, "learning_rate": 0.00015972255839042843, "loss": 6.4484, "step": 1795 }, { "epoch": 0.5746280595104784, "grad_norm": 5.284316062927246, "learning_rate": 0.00015953385559149834, "loss": 6.3737, "step": 1796 }, { "epoch": 0.574948008318669, "grad_norm": 38.006752014160156, "learning_rate": 0.0001593452063352542, "loss": 6.7715, "step": 1797 }, { "epoch": 0.5752679571268597, "grad_norm": 4.2852396965026855, "learning_rate": 0.00015915661082184596, "loss": 6.4685, "step": 1798 }, { "epoch": 0.5755879059350504, "grad_norm": 5.658292770385742, "learning_rate": 0.00015896806925136628, "loss": 6.4279, "step": 1799 }, { "epoch": 0.5759078547432411, "grad_norm": 5.834637641906738, "learning_rate": 0.00015877958182385071, "loss": 6.5669, "step": 1800 }, { "epoch": 0.5759078547432411, "eval_loss": 3.309910297393799, "eval_runtime": 234.1116, "eval_samples_per_second": 5.621, "eval_steps_per_second": 1.405, "step": 1800 } ], "logging_steps": 1, "max_steps": 3060, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 6, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 6 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6635351854481408e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }