{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 35.5, "eval_steps": 100, "global_step": 142000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 0.7927406430244446, "learning_rate": 5.82e-05, "loss": 203.8328, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.901040256023407, "learning_rate": 0.0001182, "loss": 181.6551, "step": 200 }, { "epoch": 0.075, "grad_norm": 0.14473982155323029, "learning_rate": 0.00017819999999999997, "loss": 174.6394, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.13423211872577667, "learning_rate": 0.0002382, "loss": 171.818, "step": 400 }, { "epoch": 0.125, "grad_norm": 0.13924159109592438, "learning_rate": 0.0002982, "loss": 168.1486, "step": 500 }, { "epoch": 0.15, "grad_norm": 0.11850500851869583, "learning_rate": 0.000299996362272642, "loss": 162.8829, "step": 600 }, { "epoch": 0.175, "grad_norm": 0.15106040239334106, "learning_rate": 0.0002999926120382524, "loss": 158.5516, "step": 700 }, { "epoch": 0.2, "grad_norm": 0.11745048314332962, "learning_rate": 0.0002999888618038627, "loss": 154.1395, "step": 800 }, { "epoch": 0.225, "grad_norm": 0.22588345408439636, "learning_rate": 0.00029998511156947307, "loss": 150.8583, "step": 900 }, { "epoch": 0.25, "grad_norm": 0.1475830227136612, "learning_rate": 0.0002999813613350834, "loss": 148.7021, "step": 1000 }, { "epoch": 0.275, "grad_norm": 0.14757394790649414, "learning_rate": 0.00029997761110069375, "loss": 145.111, "step": 1100 }, { "epoch": 0.3, "grad_norm": 0.13360479474067688, "learning_rate": 0.0002999738608663041, "loss": 142.679, "step": 1200 }, { "epoch": 0.325, "grad_norm": 0.11122659593820572, "learning_rate": 0.0002999701106319145, "loss": 140.4614, "step": 1300 }, { "epoch": 0.35, "grad_norm": 0.10133378952741623, "learning_rate": 0.0002999663603975248, "loss": 137.84, "step": 1400 }, { "epoch": 0.375, "grad_norm": 0.12196547538042068, "learning_rate": 0.00029996261016313516, "loss": 136.1062, "step": 1500 }, { "epoch": 0.4, "grad_norm": 0.09694620966911316, "learning_rate": 0.0002999588599287455, "loss": 134.5708, "step": 1600 }, { "epoch": 0.425, "grad_norm": 0.14449502527713776, "learning_rate": 0.0002999551096943559, "loss": 131.672, "step": 1700 }, { "epoch": 0.45, "grad_norm": 0.10163229703903198, "learning_rate": 0.0002999513594599662, "loss": 128.9171, "step": 1800 }, { "epoch": 0.475, "grad_norm": 0.09789746254682541, "learning_rate": 0.00029994760922557657, "loss": 127.3757, "step": 1900 }, { "epoch": 0.5, "grad_norm": 0.0996888279914856, "learning_rate": 0.00029994385899118693, "loss": 124.4876, "step": 2000 }, { "epoch": 0.525, "grad_norm": 0.08484259247779846, "learning_rate": 0.0002999401087567973, "loss": 122.1805, "step": 2100 }, { "epoch": 0.55, "grad_norm": 0.11729967594146729, "learning_rate": 0.0002999363585224076, "loss": 117.8535, "step": 2200 }, { "epoch": 0.575, "grad_norm": 0.1445324867963791, "learning_rate": 0.000299932608288018, "loss": 116.244, "step": 2300 }, { "epoch": 0.6, "grad_norm": 0.11317744106054306, "learning_rate": 0.0002999288580536283, "loss": 113.5543, "step": 2400 }, { "epoch": 0.625, "grad_norm": 0.09375651925802231, "learning_rate": 0.0002999251078192387, "loss": 110.8541, "step": 2500 }, { "epoch": 0.65, "grad_norm": 0.0896710455417633, "learning_rate": 0.000299921357584849, "loss": 110.1387, "step": 2600 }, { "epoch": 0.675, "grad_norm": 0.09820675849914551, "learning_rate": 0.0002999176073504594, "loss": 107.0062, "step": 2700 }, { "epoch": 0.7, "grad_norm": 0.09842734783887863, "learning_rate": 0.0002999138571160697, "loss": 105.1786, "step": 2800 }, { "epoch": 0.725, "grad_norm": 0.09370853751897812, "learning_rate": 0.00029991010688168007, "loss": 103.8245, "step": 2900 }, { "epoch": 0.75, "grad_norm": 0.12121213972568512, "learning_rate": 0.00029990635664729043, "loss": 101.6897, "step": 3000 }, { "epoch": 0.775, "grad_norm": 0.09974240511655807, "learning_rate": 0.0002999026064129008, "loss": 100.3376, "step": 3100 }, { "epoch": 0.8, "grad_norm": 0.09277965873479843, "learning_rate": 0.0002998988561785111, "loss": 99.2098, "step": 3200 }, { "epoch": 0.825, "grad_norm": 0.12521271407604218, "learning_rate": 0.0002998951059441215, "loss": 98.4138, "step": 3300 }, { "epoch": 0.85, "grad_norm": 0.1051282286643982, "learning_rate": 0.00029989135570973184, "loss": 99.5873, "step": 3400 }, { "epoch": 0.875, "grad_norm": 0.13997547328472137, "learning_rate": 0.0002998876054753422, "loss": 97.4617, "step": 3500 }, { "epoch": 0.9, "grad_norm": 0.1003558561205864, "learning_rate": 0.0002998838552409525, "loss": 96.093, "step": 3600 }, { "epoch": 0.925, "grad_norm": 0.09967362880706787, "learning_rate": 0.0002998801050065629, "loss": 93.6796, "step": 3700 }, { "epoch": 0.95, "grad_norm": 0.13389019668102264, "learning_rate": 0.00029987635477217325, "loss": 92.9668, "step": 3800 }, { "epoch": 0.975, "grad_norm": 0.10552455484867096, "learning_rate": 0.0002998726045377836, "loss": 91.9125, "step": 3900 }, { "epoch": 1.0, "grad_norm": 0.10877016931772232, "learning_rate": 0.00029986885430339393, "loss": 91.2492, "step": 4000 }, { "epoch": 1.025, "grad_norm": 0.09188541024923325, "learning_rate": 0.0002998651040690043, "loss": 88.3832, "step": 4100 }, { "epoch": 1.05, "grad_norm": 0.10517989099025726, "learning_rate": 0.0002998613538346146, "loss": 87.4386, "step": 4200 }, { "epoch": 1.075, "grad_norm": 0.08605173230171204, "learning_rate": 0.000299857603600225, "loss": 86.7098, "step": 4300 }, { "epoch": 1.1, "grad_norm": 0.13910797238349915, "learning_rate": 0.00029985385336583534, "loss": 85.1566, "step": 4400 }, { "epoch": 1.125, "grad_norm": 0.08505425602197647, "learning_rate": 0.00029985010313144565, "loss": 86.1376, "step": 4500 }, { "epoch": 1.15, "grad_norm": 0.10330720990896225, "learning_rate": 0.000299846352897056, "loss": 84.9761, "step": 4600 }, { "epoch": 1.175, "grad_norm": 0.1150883138179779, "learning_rate": 0.0002998426026626664, "loss": 83.4733, "step": 4700 }, { "epoch": 1.2, "grad_norm": 0.08464270830154419, "learning_rate": 0.00029983885242827675, "loss": 84.0231, "step": 4800 }, { "epoch": 1.225, "grad_norm": 0.11479545384645462, "learning_rate": 0.00029983510219388707, "loss": 82.2074, "step": 4900 }, { "epoch": 1.25, "grad_norm": 0.10978193581104279, "learning_rate": 0.00029983135195949743, "loss": 81.2586, "step": 5000 }, { "epoch": 1.275, "grad_norm": 0.10087323933839798, "learning_rate": 0.0002998276017251078, "loss": 80.0028, "step": 5100 }, { "epoch": 1.3, "grad_norm": 0.0992458313703537, "learning_rate": 0.00029982385149071816, "loss": 81.4542, "step": 5200 }, { "epoch": 1.325, "grad_norm": 0.08898110687732697, "learning_rate": 0.0002998201012563285, "loss": 80.3485, "step": 5300 }, { "epoch": 1.35, "grad_norm": 0.11424868553876877, "learning_rate": 0.00029981635102193884, "loss": 79.4734, "step": 5400 }, { "epoch": 1.375, "grad_norm": 0.09483993798494339, "learning_rate": 0.0002998126007875492, "loss": 78.8044, "step": 5500 }, { "epoch": 1.4, "grad_norm": 0.08650317788124084, "learning_rate": 0.0002998088505531596, "loss": 78.476, "step": 5600 }, { "epoch": 1.425, "grad_norm": 0.08040408045053482, "learning_rate": 0.0002998051003187699, "loss": 77.8633, "step": 5700 }, { "epoch": 1.45, "grad_norm": 0.08953177183866501, "learning_rate": 0.00029980135008438025, "loss": 76.5257, "step": 5800 }, { "epoch": 1.475, "grad_norm": 0.10908912867307663, "learning_rate": 0.00029979759984999056, "loss": 76.2689, "step": 5900 }, { "epoch": 1.5, "grad_norm": 0.12598766386508942, "learning_rate": 0.00029979384961560093, "loss": 76.7776, "step": 6000 }, { "epoch": 1.525, "grad_norm": 0.0955086201429367, "learning_rate": 0.0002997900993812113, "loss": 76.5905, "step": 6100 }, { "epoch": 1.55, "grad_norm": 0.08597240597009659, "learning_rate": 0.00029978634914682166, "loss": 74.2009, "step": 6200 }, { "epoch": 1.575, "grad_norm": 0.08754386007785797, "learning_rate": 0.000299782598912432, "loss": 74.1175, "step": 6300 }, { "epoch": 1.6, "grad_norm": 0.12214329093694687, "learning_rate": 0.00029977884867804234, "loss": 73.2265, "step": 6400 }, { "epoch": 1.625, "grad_norm": 0.08221092820167542, "learning_rate": 0.0002997750984436527, "loss": 72.1494, "step": 6500 }, { "epoch": 1.65, "grad_norm": 0.1369631290435791, "learning_rate": 0.0002997713482092631, "loss": 73.5853, "step": 6600 }, { "epoch": 1.675, "grad_norm": 0.0787581205368042, "learning_rate": 0.0002997675979748734, "loss": 72.0935, "step": 6700 }, { "epoch": 1.7, "grad_norm": 0.07737889885902405, "learning_rate": 0.00029976384774048375, "loss": 71.3515, "step": 6800 }, { "epoch": 1.725, "grad_norm": 0.11298476159572601, "learning_rate": 0.0002997600975060941, "loss": 71.5356, "step": 6900 }, { "epoch": 1.75, "grad_norm": 0.07955294102430344, "learning_rate": 0.0002997563472717045, "loss": 71.9312, "step": 7000 }, { "epoch": 1.775, "grad_norm": 0.11449731886386871, "learning_rate": 0.0002997525970373148, "loss": 70.1805, "step": 7100 }, { "epoch": 1.8, "grad_norm": 0.07159914076328278, "learning_rate": 0.00029974884680292516, "loss": 70.1074, "step": 7200 }, { "epoch": 1.825, "grad_norm": 0.07785623520612717, "learning_rate": 0.00029974509656853553, "loss": 70.5433, "step": 7300 }, { "epoch": 1.85, "grad_norm": 0.0750761404633522, "learning_rate": 0.0002997413463341459, "loss": 68.6654, "step": 7400 }, { "epoch": 1.875, "grad_norm": 0.0909292995929718, "learning_rate": 0.0002997375960997562, "loss": 69.5312, "step": 7500 }, { "epoch": 1.9, "grad_norm": 0.1320108026266098, "learning_rate": 0.00029973384586536657, "loss": 67.3222, "step": 7600 }, { "epoch": 1.925, "grad_norm": 0.12221457809209824, "learning_rate": 0.0002997300956309769, "loss": 66.3137, "step": 7700 }, { "epoch": 1.95, "grad_norm": 0.11239924281835556, "learning_rate": 0.00029972634539658725, "loss": 67.8054, "step": 7800 }, { "epoch": 1.975, "grad_norm": 0.0858956053853035, "learning_rate": 0.0002997225951621976, "loss": 67.9956, "step": 7900 }, { "epoch": 2.0, "grad_norm": 0.10778280347585678, "learning_rate": 0.000299718844927808, "loss": 66.5141, "step": 8000 }, { "epoch": 2.025, "grad_norm": 0.10166219621896744, "learning_rate": 0.0002997150946934183, "loss": 65.9891, "step": 8100 }, { "epoch": 2.05, "grad_norm": 0.09062575548887253, "learning_rate": 0.00029971134445902866, "loss": 67.5705, "step": 8200 }, { "epoch": 2.075, "grad_norm": 0.0936209186911583, "learning_rate": 0.000299707594224639, "loss": 65.6743, "step": 8300 }, { "epoch": 2.1, "grad_norm": 0.08781470358371735, "learning_rate": 0.00029970384399024934, "loss": 66.3408, "step": 8400 }, { "epoch": 2.125, "grad_norm": 0.18813404440879822, "learning_rate": 0.0002997000937558597, "loss": 65.7238, "step": 8500 }, { "epoch": 2.15, "grad_norm": 0.09089367091655731, "learning_rate": 0.00029969634352147007, "loss": 64.8326, "step": 8600 }, { "epoch": 2.175, "grad_norm": 0.09775424748659134, "learning_rate": 0.00029969259328708044, "loss": 64.9571, "step": 8700 }, { "epoch": 2.2, "grad_norm": 0.07110758870840073, "learning_rate": 0.00029968888055503464, "loss": 64.1227, "step": 8800 }, { "epoch": 2.225, "grad_norm": 0.08944450318813324, "learning_rate": 0.000299685130320645, "loss": 63.0563, "step": 8900 }, { "epoch": 2.25, "grad_norm": 0.0880662053823471, "learning_rate": 0.0002996813800862554, "loss": 63.5158, "step": 9000 }, { "epoch": 2.275, "grad_norm": 0.08363056182861328, "learning_rate": 0.00029967762985186574, "loss": 63.1458, "step": 9100 }, { "epoch": 2.3, "grad_norm": 0.0970577672123909, "learning_rate": 0.00029967387961747605, "loss": 63.6672, "step": 9200 }, { "epoch": 2.325, "grad_norm": 0.07709024846553802, "learning_rate": 0.0002996701293830864, "loss": 62.5691, "step": 9300 }, { "epoch": 2.35, "grad_norm": 0.09662684798240662, "learning_rate": 0.00029966637914869673, "loss": 63.201, "step": 9400 }, { "epoch": 2.375, "grad_norm": 0.09886329621076584, "learning_rate": 0.0002996626289143071, "loss": 61.905, "step": 9500 }, { "epoch": 2.4, "grad_norm": 0.09152296930551529, "learning_rate": 0.00029965887867991746, "loss": 62.0162, "step": 9600 }, { "epoch": 2.425, "grad_norm": 0.08669120818376541, "learning_rate": 0.00029965512844552783, "loss": 61.177, "step": 9700 }, { "epoch": 2.45, "grad_norm": 0.08084509521722794, "learning_rate": 0.00029965137821113814, "loss": 60.4171, "step": 9800 }, { "epoch": 2.475, "grad_norm": 0.07486914098262787, "learning_rate": 0.0002996476279767485, "loss": 60.7016, "step": 9900 }, { "epoch": 2.5, "grad_norm": 0.09742671251296997, "learning_rate": 0.0002996438777423589, "loss": 60.1792, "step": 10000 }, { "epoch": 2.525, "grad_norm": 0.0987100750207901, "learning_rate": 0.00029964012750796924, "loss": 61.4537, "step": 10100 }, { "epoch": 2.55, "grad_norm": 0.06886423379182816, "learning_rate": 0.00029963637727357955, "loss": 61.8643, "step": 10200 }, { "epoch": 2.575, "grad_norm": 0.082525834441185, "learning_rate": 0.0002996326270391899, "loss": 60.4919, "step": 10300 }, { "epoch": 2.6, "grad_norm": 0.08272566646337509, "learning_rate": 0.0002996288768048003, "loss": 60.0661, "step": 10400 }, { "epoch": 2.625, "grad_norm": 0.09038376808166504, "learning_rate": 0.00029962512657041065, "loss": 60.936, "step": 10500 }, { "epoch": 2.65, "grad_norm": 0.07726665586233139, "learning_rate": 0.00029962137633602096, "loss": 59.5663, "step": 10600 }, { "epoch": 2.675, "grad_norm": 0.07424433529376984, "learning_rate": 0.00029961762610163133, "loss": 59.158, "step": 10700 }, { "epoch": 2.7, "grad_norm": 0.07766600698232651, "learning_rate": 0.0002996138758672417, "loss": 60.6268, "step": 10800 }, { "epoch": 2.725, "grad_norm": 0.06614714115858078, "learning_rate": 0.00029961012563285206, "loss": 59.6028, "step": 10900 }, { "epoch": 2.75, "grad_norm": 0.10867344588041306, "learning_rate": 0.0002996063753984624, "loss": 58.8979, "step": 11000 }, { "epoch": 2.775, "grad_norm": 0.08278031647205353, "learning_rate": 0.00029960262516407274, "loss": 58.4585, "step": 11100 }, { "epoch": 2.8, "grad_norm": 0.0777415856719017, "learning_rate": 0.00029959887492968305, "loss": 58.2955, "step": 11200 }, { "epoch": 2.825, "grad_norm": 0.08938944339752197, "learning_rate": 0.0002995951246952934, "loss": 58.4243, "step": 11300 }, { "epoch": 2.85, "grad_norm": 0.07335088402032852, "learning_rate": 0.0002995913744609038, "loss": 58.3433, "step": 11400 }, { "epoch": 2.875, "grad_norm": 0.08737402409315109, "learning_rate": 0.00029958762422651415, "loss": 58.083, "step": 11500 }, { "epoch": 2.9, "grad_norm": 0.08511873334646225, "learning_rate": 0.00029958387399212446, "loss": 57.179, "step": 11600 }, { "epoch": 2.925, "grad_norm": 0.10887938737869263, "learning_rate": 0.00029958012375773483, "loss": 56.4871, "step": 11700 }, { "epoch": 2.95, "grad_norm": 0.06436943262815475, "learning_rate": 0.0002995763735233452, "loss": 56.647, "step": 11800 }, { "epoch": 2.975, "grad_norm": 0.0767776370048523, "learning_rate": 0.00029957262328895556, "loss": 56.8327, "step": 11900 }, { "epoch": 3.0, "grad_norm": 0.07136838138103485, "learning_rate": 0.0002995688730545659, "loss": 56.1021, "step": 12000 }, { "epoch": 3.025, "grad_norm": 0.07126389443874359, "learning_rate": 0.00029956512282017624, "loss": 54.9375, "step": 12100 }, { "epoch": 3.05, "grad_norm": 0.08064913004636765, "learning_rate": 0.0002995613725857866, "loss": 55.8513, "step": 12200 }, { "epoch": 3.075, "grad_norm": 0.09110742062330246, "learning_rate": 0.0002995576223513969, "loss": 55.3327, "step": 12300 }, { "epoch": 3.1, "grad_norm": 0.0769059956073761, "learning_rate": 0.0002995538721170073, "loss": 54.0639, "step": 12400 }, { "epoch": 3.125, "grad_norm": 0.06642630696296692, "learning_rate": 0.0002995501218826176, "loss": 53.5245, "step": 12500 }, { "epoch": 3.15, "grad_norm": 0.07648100703954697, "learning_rate": 0.000299546371648228, "loss": 53.7525, "step": 12600 }, { "epoch": 3.175, "grad_norm": 0.07088977843523026, "learning_rate": 0.00029954262141383833, "loss": 52.302, "step": 12700 }, { "epoch": 3.2, "grad_norm": 0.07282839715480804, "learning_rate": 0.0002995388711794487, "loss": 52.6612, "step": 12800 }, { "epoch": 3.225, "grad_norm": 0.07733161747455597, "learning_rate": 0.000299535120945059, "loss": 51.6131, "step": 12900 }, { "epoch": 3.25, "grad_norm": 0.06774196773767471, "learning_rate": 0.00029953137071066937, "loss": 51.9959, "step": 13000 }, { "epoch": 3.275, "grad_norm": 0.08115985989570618, "learning_rate": 0.00029952762047627974, "loss": 49.8227, "step": 13100 }, { "epoch": 3.3, "grad_norm": 0.0886857658624649, "learning_rate": 0.0002995238702418901, "loss": 50.5718, "step": 13200 }, { "epoch": 3.325, "grad_norm": 0.07071532309055328, "learning_rate": 0.0002995201200075004, "loss": 51.6469, "step": 13300 }, { "epoch": 3.35, "grad_norm": 0.09553579241037369, "learning_rate": 0.0002995163697731108, "loss": 50.2462, "step": 13400 }, { "epoch": 3.375, "grad_norm": 0.07065360993146896, "learning_rate": 0.00029951261953872115, "loss": 49.4932, "step": 13500 }, { "epoch": 3.4, "grad_norm": 0.07770080119371414, "learning_rate": 0.0002995088693043315, "loss": 49.8068, "step": 13600 }, { "epoch": 3.425, "grad_norm": 0.08060113340616226, "learning_rate": 0.0002995051190699418, "loss": 48.4129, "step": 13700 }, { "epoch": 3.45, "grad_norm": 0.07022694498300552, "learning_rate": 0.0002995013688355522, "loss": 48.5766, "step": 13800 }, { "epoch": 3.475, "grad_norm": 0.08857674151659012, "learning_rate": 0.00029949761860116256, "loss": 47.6903, "step": 13900 }, { "epoch": 3.5, "grad_norm": 0.069500632584095, "learning_rate": 0.0002994938683667729, "loss": 48.2677, "step": 14000 }, { "epoch": 3.525, "grad_norm": 0.08871123939752579, "learning_rate": 0.00029949011813238324, "loss": 46.9917, "step": 14100 }, { "epoch": 3.55, "grad_norm": 0.08282507210969925, "learning_rate": 0.0002994863678979936, "loss": 47.6174, "step": 14200 }, { "epoch": 3.575, "grad_norm": 0.07892107963562012, "learning_rate": 0.0002994826176636039, "loss": 47.7429, "step": 14300 }, { "epoch": 3.6, "grad_norm": 0.08358065783977509, "learning_rate": 0.00029947886742921434, "loss": 46.8444, "step": 14400 }, { "epoch": 3.625, "grad_norm": 0.08042451739311218, "learning_rate": 0.00029947511719482465, "loss": 47.1196, "step": 14500 }, { "epoch": 3.65, "grad_norm": 0.07715913653373718, "learning_rate": 0.000299471366960435, "loss": 46.1787, "step": 14600 }, { "epoch": 3.675, "grad_norm": 0.07201175391674042, "learning_rate": 0.0002994676167260453, "loss": 44.82, "step": 14700 }, { "epoch": 3.7, "grad_norm": 0.07503117620944977, "learning_rate": 0.0002994638664916557, "loss": 45.3985, "step": 14800 }, { "epoch": 3.725, "grad_norm": 0.08126576244831085, "learning_rate": 0.00029946011625726606, "loss": 44.4742, "step": 14900 }, { "epoch": 3.75, "grad_norm": 0.07859744131565094, "learning_rate": 0.0002994563660228764, "loss": 44.9098, "step": 15000 }, { "epoch": 3.775, "grad_norm": 0.09183020889759064, "learning_rate": 0.00029945261578848674, "loss": 44.9649, "step": 15100 }, { "epoch": 3.8, "grad_norm": 0.07173748314380646, "learning_rate": 0.0002994488655540971, "loss": 44.2067, "step": 15200 }, { "epoch": 3.825, "grad_norm": 0.07911107689142227, "learning_rate": 0.00029944511531970747, "loss": 43.3721, "step": 15300 }, { "epoch": 3.85, "grad_norm": 0.0707039088010788, "learning_rate": 0.00029944136508531783, "loss": 43.5256, "step": 15400 }, { "epoch": 3.875, "grad_norm": 0.08927769958972931, "learning_rate": 0.00029943761485092815, "loss": 42.8865, "step": 15500 }, { "epoch": 3.9, "grad_norm": 0.0942542776465416, "learning_rate": 0.0002994338646165385, "loss": 43.4099, "step": 15600 }, { "epoch": 3.925, "grad_norm": 0.07037200033664703, "learning_rate": 0.0002994301143821489, "loss": 43.2838, "step": 15700 }, { "epoch": 3.95, "grad_norm": 0.07836440950632095, "learning_rate": 0.00029942636414775924, "loss": 42.6156, "step": 15800 }, { "epoch": 3.975, "grad_norm": 0.1048571839928627, "learning_rate": 0.00029942261391336956, "loss": 41.1921, "step": 15900 }, { "epoch": 4.0, "grad_norm": 0.07439113408327103, "learning_rate": 0.0002994188636789799, "loss": 40.3632, "step": 16000 }, { "epoch": 4.025, "grad_norm": 0.07776340842247009, "learning_rate": 0.00029941511344459023, "loss": 41.4027, "step": 16100 }, { "epoch": 4.05, "grad_norm": 0.08847617357969284, "learning_rate": 0.0002994113632102006, "loss": 39.8482, "step": 16200 }, { "epoch": 4.075, "grad_norm": 0.07630669325590134, "learning_rate": 0.00029940761297581097, "loss": 39.8514, "step": 16300 }, { "epoch": 4.1, "grad_norm": 0.09090664237737656, "learning_rate": 0.0002994038627414213, "loss": 39.827, "step": 16400 }, { "epoch": 4.125, "grad_norm": 0.07954572886228561, "learning_rate": 0.00029940011250703164, "loss": 39.1342, "step": 16500 }, { "epoch": 4.15, "grad_norm": 0.09102310240268707, "learning_rate": 0.000299396362272642, "loss": 39.2371, "step": 16600 }, { "epoch": 4.175, "grad_norm": 0.08122776448726654, "learning_rate": 0.0002993926120382524, "loss": 38.2627, "step": 16700 }, { "epoch": 4.2, "grad_norm": 0.0793018564581871, "learning_rate": 0.0002993888618038627, "loss": 37.7778, "step": 16800 }, { "epoch": 4.225, "grad_norm": 0.08967263251543045, "learning_rate": 0.00029938511156947306, "loss": 37.3333, "step": 16900 }, { "epoch": 4.25, "grad_norm": 0.08178253471851349, "learning_rate": 0.0002993813613350834, "loss": 37.0271, "step": 17000 }, { "epoch": 4.275, "grad_norm": 0.07139851152896881, "learning_rate": 0.0002993776111006938, "loss": 36.2547, "step": 17100 }, { "epoch": 4.3, "grad_norm": 0.0816299095749855, "learning_rate": 0.0002993738608663041, "loss": 35.7427, "step": 17200 }, { "epoch": 4.325, "grad_norm": 0.08794036507606506, "learning_rate": 0.00029937011063191447, "loss": 36.1878, "step": 17300 }, { "epoch": 4.35, "grad_norm": 0.07489024847745895, "learning_rate": 0.00029936636039752483, "loss": 35.8839, "step": 17400 }, { "epoch": 4.375, "grad_norm": 0.07704652100801468, "learning_rate": 0.0002993626101631352, "loss": 34.6569, "step": 17500 }, { "epoch": 4.4, "grad_norm": 0.08644381910562515, "learning_rate": 0.0002993588974310894, "loss": 36.0711, "step": 17600 }, { "epoch": 4.425, "grad_norm": 0.0718245580792427, "learning_rate": 0.00029935514719669977, "loss": 34.2787, "step": 17700 }, { "epoch": 4.45, "grad_norm": 0.06881660968065262, "learning_rate": 0.0002993513969623101, "loss": 34.3262, "step": 17800 }, { "epoch": 4.475, "grad_norm": 0.09241487085819244, "learning_rate": 0.00029934764672792045, "loss": 32.8671, "step": 17900 }, { "epoch": 4.5, "grad_norm": 0.10901615768671036, "learning_rate": 0.0002993438964935308, "loss": 32.8513, "step": 18000 }, { "epoch": 4.525, "grad_norm": 0.10043422877788544, "learning_rate": 0.0002993401462591412, "loss": 33.2156, "step": 18100 }, { "epoch": 4.55, "grad_norm": 0.0931539386510849, "learning_rate": 0.0002993363960247515, "loss": 32.9817, "step": 18200 }, { "epoch": 4.575, "grad_norm": 0.07910791784524918, "learning_rate": 0.00029933264579036186, "loss": 32.266, "step": 18300 }, { "epoch": 4.6, "grad_norm": 0.07403460144996643, "learning_rate": 0.0002993288955559722, "loss": 32.3611, "step": 18400 }, { "epoch": 4.625, "grad_norm": 0.0901438444852829, "learning_rate": 0.0002993251453215826, "loss": 31.6647, "step": 18500 }, { "epoch": 4.65, "grad_norm": 0.08572247624397278, "learning_rate": 0.0002993213950871929, "loss": 31.4374, "step": 18600 }, { "epoch": 4.675, "grad_norm": 0.10135528445243835, "learning_rate": 0.00029931764485280327, "loss": 30.899, "step": 18700 }, { "epoch": 4.7, "grad_norm": 0.07215873152017593, "learning_rate": 0.00029931389461841364, "loss": 30.9789, "step": 18800 }, { "epoch": 4.725, "grad_norm": 0.08922874182462692, "learning_rate": 0.000299310144384024, "loss": 30.7143, "step": 18900 }, { "epoch": 4.75, "grad_norm": 0.08180548250675201, "learning_rate": 0.0002993063941496343, "loss": 30.1035, "step": 19000 }, { "epoch": 4.775, "grad_norm": 0.07757364213466644, "learning_rate": 0.0002993026439152447, "loss": 29.8003, "step": 19100 }, { "epoch": 4.8, "grad_norm": 0.09399455040693283, "learning_rate": 0.00029929889368085505, "loss": 29.8595, "step": 19200 }, { "epoch": 4.825, "grad_norm": 0.08426772803068161, "learning_rate": 0.0002992951434464654, "loss": 29.8153, "step": 19300 }, { "epoch": 4.85, "grad_norm": 0.08488670736551285, "learning_rate": 0.0002992913932120757, "loss": 29.5577, "step": 19400 }, { "epoch": 4.875, "grad_norm": 0.06904991716146469, "learning_rate": 0.0002992876429776861, "loss": 28.5755, "step": 19500 }, { "epoch": 4.9, "grad_norm": 0.11179706454277039, "learning_rate": 0.0002992838927432964, "loss": 28.8428, "step": 19600 }, { "epoch": 4.925, "grad_norm": 0.0724404975771904, "learning_rate": 0.00029928014250890677, "loss": 28.2313, "step": 19700 }, { "epoch": 4.95, "grad_norm": 0.08049552142620087, "learning_rate": 0.00029927639227451714, "loss": 27.1596, "step": 19800 }, { "epoch": 4.975, "grad_norm": 0.07410436868667603, "learning_rate": 0.0002992726420401275, "loss": 26.9374, "step": 19900 }, { "epoch": 5.0, "grad_norm": 0.0729108527302742, "learning_rate": 0.0002992688918057378, "loss": 27.3767, "step": 20000 }, { "epoch": 5.025, "grad_norm": 0.0834740698337555, "learning_rate": 0.0002992651790736921, "loss": 26.5892, "step": 20100 }, { "epoch": 5.05, "grad_norm": 0.07734266668558121, "learning_rate": 0.00029926142883930244, "loss": 26.4578, "step": 20200 }, { "epoch": 5.075, "grad_norm": 0.07236121594905853, "learning_rate": 0.00029925767860491275, "loss": 27.4309, "step": 20300 }, { "epoch": 5.1, "grad_norm": 0.07896186411380768, "learning_rate": 0.0002992539283705231, "loss": 26.7645, "step": 20400 }, { "epoch": 5.125, "grad_norm": 0.09544118493795395, "learning_rate": 0.0002992501781361335, "loss": 26.149, "step": 20500 }, { "epoch": 5.15, "grad_norm": 0.07782524079084396, "learning_rate": 0.00029924642790174385, "loss": 25.7688, "step": 20600 }, { "epoch": 5.175, "grad_norm": 0.07927709072828293, "learning_rate": 0.00029924267766735416, "loss": 25.8487, "step": 20700 }, { "epoch": 5.2, "grad_norm": 0.07417237758636475, "learning_rate": 0.00029923892743296453, "loss": 25.6094, "step": 20800 }, { "epoch": 5.225, "grad_norm": 0.09987534582614899, "learning_rate": 0.0002992352147009188, "loss": 25.3336, "step": 20900 }, { "epoch": 5.25, "grad_norm": 0.08160518109798431, "learning_rate": 0.00029923146446652916, "loss": 25.2813, "step": 21000 }, { "epoch": 5.275, "grad_norm": 0.07650009542703629, "learning_rate": 0.00029922771423213947, "loss": 25.0793, "step": 21100 }, { "epoch": 5.3, "grad_norm": 0.07089775055646896, "learning_rate": 0.00029922396399774983, "loss": 24.9184, "step": 21200 }, { "epoch": 5.325, "grad_norm": 0.10953019559383392, "learning_rate": 0.00029922021376336015, "loss": 24.5976, "step": 21300 }, { "epoch": 5.35, "grad_norm": 0.07163265347480774, "learning_rate": 0.00029921646352897057, "loss": 24.3399, "step": 21400 }, { "epoch": 5.375, "grad_norm": 0.08414668589830399, "learning_rate": 0.0002992127132945809, "loss": 23.6757, "step": 21500 }, { "epoch": 5.4, "grad_norm": 0.07715445011854172, "learning_rate": 0.00029920896306019125, "loss": 24.2548, "step": 21600 }, { "epoch": 5.425, "grad_norm": 0.1033063754439354, "learning_rate": 0.00029920521282580156, "loss": 23.3908, "step": 21700 }, { "epoch": 5.45, "grad_norm": 0.0769144669175148, "learning_rate": 0.0002992014625914119, "loss": 23.693, "step": 21800 }, { "epoch": 5.475, "grad_norm": 0.07799799740314484, "learning_rate": 0.0002991977123570223, "loss": 23.9314, "step": 21900 }, { "epoch": 5.5, "grad_norm": 0.07105720043182373, "learning_rate": 0.00029919396212263266, "loss": 23.2387, "step": 22000 }, { "epoch": 5.525, "grad_norm": 0.0878797098994255, "learning_rate": 0.00029919021188824297, "loss": 22.7268, "step": 22100 }, { "epoch": 5.55, "grad_norm": 0.0924353376030922, "learning_rate": 0.00029918646165385333, "loss": 23.1994, "step": 22200 }, { "epoch": 5.575, "grad_norm": 0.09924343973398209, "learning_rate": 0.0002991827114194637, "loss": 22.7976, "step": 22300 }, { "epoch": 5.6, "grad_norm": 0.0845380574464798, "learning_rate": 0.00029917896118507407, "loss": 22.6053, "step": 22400 }, { "epoch": 5.625, "grad_norm": 0.09131123870611191, "learning_rate": 0.0002991752109506844, "loss": 22.813, "step": 22500 }, { "epoch": 5.65, "grad_norm": 0.08501371741294861, "learning_rate": 0.00029917146071629474, "loss": 22.3981, "step": 22600 }, { "epoch": 5.675, "grad_norm": 0.10916517674922943, "learning_rate": 0.0002991677104819051, "loss": 21.6828, "step": 22700 }, { "epoch": 5.7, "grad_norm": 0.08462018519639969, "learning_rate": 0.0002991639602475155, "loss": 22.0131, "step": 22800 }, { "epoch": 5.725, "grad_norm": 0.09394313395023346, "learning_rate": 0.0002991602100131258, "loss": 21.7932, "step": 22900 }, { "epoch": 5.75, "grad_norm": 0.08408233523368835, "learning_rate": 0.00029915645977873615, "loss": 21.8634, "step": 23000 }, { "epoch": 5.775, "grad_norm": 0.0706961527466774, "learning_rate": 0.00029915270954434647, "loss": 21.6353, "step": 23100 }, { "epoch": 5.8, "grad_norm": 0.08162959665060043, "learning_rate": 0.00029914895930995683, "loss": 21.356, "step": 23200 }, { "epoch": 5.825, "grad_norm": 0.08196116983890533, "learning_rate": 0.0002991452090755672, "loss": 21.3074, "step": 23300 }, { "epoch": 5.85, "grad_norm": 0.07449360191822052, "learning_rate": 0.00029914145884117756, "loss": 21.2129, "step": 23400 }, { "epoch": 5.875, "grad_norm": 0.08260208368301392, "learning_rate": 0.0002991377086067879, "loss": 20.7806, "step": 23500 }, { "epoch": 5.9, "grad_norm": 0.07383255660533905, "learning_rate": 0.00029913395837239824, "loss": 20.9318, "step": 23600 }, { "epoch": 5.925, "grad_norm": 0.08240984380245209, "learning_rate": 0.0002991302081380086, "loss": 20.5751, "step": 23700 }, { "epoch": 5.95, "grad_norm": 0.06921262294054031, "learning_rate": 0.000299126457903619, "loss": 20.9214, "step": 23800 }, { "epoch": 5.975, "grad_norm": 0.07990318536758423, "learning_rate": 0.0002991227076692293, "loss": 20.6422, "step": 23900 }, { "epoch": 6.0, "grad_norm": 0.083002008497715, "learning_rate": 0.00029911895743483965, "loss": 19.8315, "step": 24000 }, { "epoch": 6.025, "grad_norm": 0.08495783805847168, "learning_rate": 0.00029911520720045, "loss": 20.1271, "step": 24100 }, { "epoch": 6.05, "grad_norm": 0.1061740592122078, "learning_rate": 0.00029911145696606033, "loss": 20.1241, "step": 24200 }, { "epoch": 6.075, "grad_norm": 0.08326783776283264, "learning_rate": 0.0002991077067316707, "loss": 19.5344, "step": 24300 }, { "epoch": 6.1, "grad_norm": 0.08668112009763718, "learning_rate": 0.00029910395649728106, "loss": 19.8691, "step": 24400 }, { "epoch": 6.125, "grad_norm": 0.07595008611679077, "learning_rate": 0.00029910020626289143, "loss": 19.5726, "step": 24500 }, { "epoch": 6.15, "grad_norm": 0.09996142983436584, "learning_rate": 0.00029909645602850174, "loss": 19.3215, "step": 24600 }, { "epoch": 6.175, "grad_norm": 0.07515228539705276, "learning_rate": 0.0002990927057941121, "loss": 19.5642, "step": 24700 }, { "epoch": 6.2, "grad_norm": 0.06983605772256851, "learning_rate": 0.0002990889555597224, "loss": 19.1783, "step": 24800 }, { "epoch": 6.225, "grad_norm": 0.07114838808774948, "learning_rate": 0.0002990852053253328, "loss": 19.0791, "step": 24900 }, { "epoch": 6.25, "grad_norm": 0.08623602986335754, "learning_rate": 0.00029908145509094315, "loss": 19.5374, "step": 25000 }, { "epoch": 6.275, "grad_norm": 0.09096742421388626, "learning_rate": 0.0002990777048565535, "loss": 18.8189, "step": 25100 }, { "epoch": 6.3, "grad_norm": 0.08167672157287598, "learning_rate": 0.00029907395462216383, "loss": 18.4164, "step": 25200 }, { "epoch": 6.325, "grad_norm": 0.08562010526657104, "learning_rate": 0.0002990702043877742, "loss": 18.3827, "step": 25300 }, { "epoch": 6.35, "grad_norm": 0.08020398765802383, "learning_rate": 0.00029906645415338456, "loss": 18.8151, "step": 25400 }, { "epoch": 6.375, "grad_norm": 0.08050194382667542, "learning_rate": 0.00029906270391899493, "loss": 17.9696, "step": 25500 }, { "epoch": 6.4, "grad_norm": 0.09030721336603165, "learning_rate": 0.00029905895368460524, "loss": 17.9795, "step": 25600 }, { "epoch": 6.425, "grad_norm": 0.09238829463720322, "learning_rate": 0.0002990552034502156, "loss": 17.8095, "step": 25700 }, { "epoch": 6.45, "grad_norm": 0.08500493317842484, "learning_rate": 0.000299051453215826, "loss": 18.3223, "step": 25800 }, { "epoch": 6.475, "grad_norm": 0.08180621266365051, "learning_rate": 0.00029904770298143634, "loss": 17.7836, "step": 25900 }, { "epoch": 6.5, "grad_norm": 0.09796881675720215, "learning_rate": 0.00029904395274704665, "loss": 17.7483, "step": 26000 }, { "epoch": 6.525, "grad_norm": 0.08432163298130035, "learning_rate": 0.000299040202512657, "loss": 18.2479, "step": 26100 }, { "epoch": 6.55, "grad_norm": 0.08197837322950363, "learning_rate": 0.0002990364897806113, "loss": 17.7703, "step": 26200 }, { "epoch": 6.575, "grad_norm": 0.07721620053052902, "learning_rate": 0.00029903273954622164, "loss": 17.1537, "step": 26300 }, { "epoch": 6.6, "grad_norm": 0.0785108208656311, "learning_rate": 0.00029902898931183196, "loss": 17.5139, "step": 26400 }, { "epoch": 6.625, "grad_norm": 0.08640828728675842, "learning_rate": 0.0002990252390774423, "loss": 16.7445, "step": 26500 }, { "epoch": 6.65, "grad_norm": 0.09119407832622528, "learning_rate": 0.00029902148884305263, "loss": 17.1573, "step": 26600 }, { "epoch": 6.675, "grad_norm": 0.07212173193693161, "learning_rate": 0.000299017738608663, "loss": 17.0759, "step": 26700 }, { "epoch": 6.7, "grad_norm": 0.08220189809799194, "learning_rate": 0.00029901398837427337, "loss": 16.9119, "step": 26800 }, { "epoch": 6.725, "grad_norm": 0.10024359822273254, "learning_rate": 0.00029901023813988373, "loss": 16.7596, "step": 26900 }, { "epoch": 6.75, "grad_norm": 0.0850207731127739, "learning_rate": 0.00029900648790549405, "loss": 16.9184, "step": 27000 }, { "epoch": 6.775, "grad_norm": 0.07585939019918442, "learning_rate": 0.0002990027376711044, "loss": 16.4899, "step": 27100 }, { "epoch": 6.8, "grad_norm": 0.08519823104143143, "learning_rate": 0.0002989989874367148, "loss": 16.8922, "step": 27200 }, { "epoch": 6.825, "grad_norm": 0.08368838578462601, "learning_rate": 0.00029899523720232514, "loss": 16.8136, "step": 27300 }, { "epoch": 6.85, "grad_norm": 0.08928319811820984, "learning_rate": 0.00029899148696793546, "loss": 16.2412, "step": 27400 }, { "epoch": 6.875, "grad_norm": 0.08436159044504166, "learning_rate": 0.0002989877367335458, "loss": 16.6282, "step": 27500 }, { "epoch": 6.9, "grad_norm": 0.0907684713602066, "learning_rate": 0.0002989839864991562, "loss": 16.3234, "step": 27600 }, { "epoch": 6.925, "grad_norm": 0.08816706389188766, "learning_rate": 0.00029898023626476655, "loss": 16.164, "step": 27700 }, { "epoch": 6.95, "grad_norm": 0.08335541933774948, "learning_rate": 0.00029897648603037687, "loss": 16.1988, "step": 27800 }, { "epoch": 6.975, "grad_norm": 0.07165244221687317, "learning_rate": 0.00029897273579598723, "loss": 16.1657, "step": 27900 }, { "epoch": 7.0, "grad_norm": 0.0803430899977684, "learning_rate": 0.0002989689855615976, "loss": 15.7038, "step": 28000 }, { "epoch": 7.025, "grad_norm": 0.0674068033695221, "learning_rate": 0.0002989652353272079, "loss": 15.5932, "step": 28100 }, { "epoch": 7.05, "grad_norm": 0.07914315909147263, "learning_rate": 0.0002989614850928183, "loss": 16.1827, "step": 28200 }, { "epoch": 7.075, "grad_norm": 0.0919245108962059, "learning_rate": 0.0002989577348584286, "loss": 15.7686, "step": 28300 }, { "epoch": 7.1, "grad_norm": 0.09044385701417923, "learning_rate": 0.00029895398462403895, "loss": 15.6737, "step": 28400 }, { "epoch": 7.125, "grad_norm": 0.08890822529792786, "learning_rate": 0.0002989502343896493, "loss": 15.8661, "step": 28500 }, { "epoch": 7.15, "grad_norm": 0.08436182141304016, "learning_rate": 0.0002989464841552597, "loss": 15.5255, "step": 28600 }, { "epoch": 7.175, "grad_norm": 0.08775323629379272, "learning_rate": 0.00029894273392087, "loss": 15.4992, "step": 28700 }, { "epoch": 7.2, "grad_norm": 0.09018935263156891, "learning_rate": 0.00029893898368648036, "loss": 15.3418, "step": 28800 }, { "epoch": 7.225, "grad_norm": 0.08356596529483795, "learning_rate": 0.00029893523345209073, "loss": 15.0965, "step": 28900 }, { "epoch": 7.25, "grad_norm": 0.09058874845504761, "learning_rate": 0.0002989314832177011, "loss": 15.0762, "step": 29000 }, { "epoch": 7.275, "grad_norm": 0.07803665101528168, "learning_rate": 0.0002989277329833114, "loss": 14.6331, "step": 29100 }, { "epoch": 7.3, "grad_norm": 0.08148869127035141, "learning_rate": 0.0002989239827489218, "loss": 14.8405, "step": 29200 }, { "epoch": 7.325, "grad_norm": 0.08294442296028137, "learning_rate": 0.00029892023251453214, "loss": 15.2037, "step": 29300 }, { "epoch": 7.35, "grad_norm": 0.0803549587726593, "learning_rate": 0.0002989164822801425, "loss": 14.8633, "step": 29400 }, { "epoch": 7.375, "grad_norm": 0.08180885016918182, "learning_rate": 0.0002989127320457528, "loss": 14.8036, "step": 29500 }, { "epoch": 7.4, "grad_norm": 0.08756575733423233, "learning_rate": 0.0002989089818113632, "loss": 14.2077, "step": 29600 }, { "epoch": 7.425, "grad_norm": 0.0851132944226265, "learning_rate": 0.00029890523157697355, "loss": 14.569, "step": 29700 }, { "epoch": 7.45, "grad_norm": 0.08879829198122025, "learning_rate": 0.0002989014813425839, "loss": 14.5104, "step": 29800 }, { "epoch": 7.475, "grad_norm": 0.0918511152267456, "learning_rate": 0.00029889773110819423, "loss": 14.3482, "step": 29900 }, { "epoch": 7.5, "grad_norm": 0.07251127064228058, "learning_rate": 0.0002988939808738046, "loss": 14.2309, "step": 30000 }, { "epoch": 7.525, "grad_norm": 0.07517971098423004, "learning_rate": 0.0002988902306394149, "loss": 14.0291, "step": 30100 }, { "epoch": 7.55, "grad_norm": 0.08854610472917557, "learning_rate": 0.00029888651790736917, "loss": 14.1938, "step": 30200 }, { "epoch": 7.575, "grad_norm": 0.0849192887544632, "learning_rate": 0.00029888276767297954, "loss": 14.4531, "step": 30300 }, { "epoch": 7.6, "grad_norm": 0.08010224252939224, "learning_rate": 0.0002988790174385899, "loss": 14.2434, "step": 30400 }, { "epoch": 7.625, "grad_norm": 0.09017332643270493, "learning_rate": 0.0002988752672042002, "loss": 14.2892, "step": 30500 }, { "epoch": 7.65, "grad_norm": 0.08440462499856949, "learning_rate": 0.0002988715544721545, "loss": 13.8386, "step": 30600 }, { "epoch": 7.675, "grad_norm": 0.08667606860399246, "learning_rate": 0.00029886780423776484, "loss": 13.9581, "step": 30700 }, { "epoch": 7.7, "grad_norm": 0.08237945288419724, "learning_rate": 0.0002988640540033752, "loss": 13.8813, "step": 30800 }, { "epoch": 7.725, "grad_norm": 0.09895262122154236, "learning_rate": 0.0002988603037689855, "loss": 13.7951, "step": 30900 }, { "epoch": 7.75, "grad_norm": 0.07596876472234726, "learning_rate": 0.0002988565535345959, "loss": 13.7703, "step": 31000 }, { "epoch": 7.775, "grad_norm": 0.07925312221050262, "learning_rate": 0.00029885280330020625, "loss": 13.4507, "step": 31100 }, { "epoch": 7.8, "grad_norm": 0.06997061520814896, "learning_rate": 0.0002988490530658166, "loss": 13.2481, "step": 31200 }, { "epoch": 7.825, "grad_norm": 0.07986485958099365, "learning_rate": 0.00029884530283142693, "loss": 13.3403, "step": 31300 }, { "epoch": 7.85, "grad_norm": 0.0819752886891365, "learning_rate": 0.0002988415525970373, "loss": 13.5279, "step": 31400 }, { "epoch": 7.875, "grad_norm": 0.08534371107816696, "learning_rate": 0.00029883780236264766, "loss": 13.528, "step": 31500 }, { "epoch": 7.9, "grad_norm": 0.06895570456981659, "learning_rate": 0.00029883405212825803, "loss": 13.0555, "step": 31600 }, { "epoch": 7.925, "grad_norm": 0.07396534085273743, "learning_rate": 0.00029883030189386834, "loss": 13.1404, "step": 31700 }, { "epoch": 7.95, "grad_norm": 0.0788232609629631, "learning_rate": 0.00029882655165947865, "loss": 13.1032, "step": 31800 }, { "epoch": 7.975, "grad_norm": 0.0716477558016777, "learning_rate": 0.000298822801425089, "loss": 13.4664, "step": 31900 }, { "epoch": 8.0, "grad_norm": 0.07852466404438019, "learning_rate": 0.0002988190511906994, "loss": 13.006, "step": 32000 }, { "epoch": 8.025, "grad_norm": 0.1100274920463562, "learning_rate": 0.00029881530095630975, "loss": 13.0427, "step": 32100 }, { "epoch": 8.05, "grad_norm": 0.07130661606788635, "learning_rate": 0.00029881155072192006, "loss": 12.7575, "step": 32200 }, { "epoch": 8.075, "grad_norm": 0.0846419557929039, "learning_rate": 0.00029880780048753043, "loss": 12.788, "step": 32300 }, { "epoch": 8.1, "grad_norm": 0.07769067585468292, "learning_rate": 0.0002988040502531408, "loss": 12.8833, "step": 32400 }, { "epoch": 8.125, "grad_norm": 0.06623586267232895, "learning_rate": 0.00029880030001875116, "loss": 12.5255, "step": 32500 }, { "epoch": 8.15, "grad_norm": 0.0744013637304306, "learning_rate": 0.00029879654978436147, "loss": 12.7006, "step": 32600 }, { "epoch": 8.175, "grad_norm": 0.07793931663036346, "learning_rate": 0.00029879279954997184, "loss": 12.2209, "step": 32700 }, { "epoch": 8.2, "grad_norm": 0.07592390477657318, "learning_rate": 0.0002987890493155822, "loss": 12.2655, "step": 32800 }, { "epoch": 8.225, "grad_norm": 0.07824064791202545, "learning_rate": 0.00029878529908119257, "loss": 12.3666, "step": 32900 }, { "epoch": 8.25, "grad_norm": 0.06895022094249725, "learning_rate": 0.0002987815488468029, "loss": 12.3957, "step": 33000 }, { "epoch": 8.275, "grad_norm": 0.08005383610725403, "learning_rate": 0.00029877779861241325, "loss": 12.3892, "step": 33100 }, { "epoch": 8.3, "grad_norm": 0.0835549384355545, "learning_rate": 0.0002987740483780236, "loss": 12.1796, "step": 33200 }, { "epoch": 8.325, "grad_norm": 0.08501383662223816, "learning_rate": 0.000298770298143634, "loss": 11.9921, "step": 33300 }, { "epoch": 8.35, "grad_norm": 0.08822602778673172, "learning_rate": 0.0002987665479092443, "loss": 12.4392, "step": 33400 }, { "epoch": 8.375, "grad_norm": 0.07659414410591125, "learning_rate": 0.00029876279767485466, "loss": 12.0612, "step": 33500 }, { "epoch": 8.4, "grad_norm": 0.08337811380624771, "learning_rate": 0.00029875904744046497, "loss": 12.0035, "step": 33600 }, { "epoch": 8.425, "grad_norm": 0.07944267988204956, "learning_rate": 0.00029875529720607534, "loss": 11.8415, "step": 33700 }, { "epoch": 8.45, "grad_norm": 0.0773790031671524, "learning_rate": 0.0002987515469716857, "loss": 12.1775, "step": 33800 }, { "epoch": 8.475, "grad_norm": 0.08871705085039139, "learning_rate": 0.00029874779673729607, "loss": 12.208, "step": 33900 }, { "epoch": 8.5, "grad_norm": 0.07573138922452927, "learning_rate": 0.0002987440465029064, "loss": 11.6756, "step": 34000 }, { "epoch": 8.525, "grad_norm": 0.07265728712081909, "learning_rate": 0.00029874029626851675, "loss": 11.4454, "step": 34100 }, { "epoch": 8.55, "grad_norm": 0.0791819617152214, "learning_rate": 0.0002987365460341271, "loss": 11.9128, "step": 34200 }, { "epoch": 8.575, "grad_norm": 0.07876613736152649, "learning_rate": 0.0002987327957997375, "loss": 11.7746, "step": 34300 }, { "epoch": 8.6, "grad_norm": 0.08273490518331528, "learning_rate": 0.0002987290455653478, "loss": 11.6367, "step": 34400 }, { "epoch": 8.625, "grad_norm": 0.07402598857879639, "learning_rate": 0.00029872529533095816, "loss": 11.6052, "step": 34500 }, { "epoch": 8.65, "grad_norm": 0.06618580222129822, "learning_rate": 0.0002987215825989124, "loss": 11.7364, "step": 34600 }, { "epoch": 8.675, "grad_norm": 0.07777924090623856, "learning_rate": 0.0002987178323645228, "loss": 11.3839, "step": 34700 }, { "epoch": 8.7, "grad_norm": 0.09256916493177414, "learning_rate": 0.0002987140821301331, "loss": 11.4444, "step": 34800 }, { "epoch": 8.725, "grad_norm": 0.08080556988716125, "learning_rate": 0.00029871033189574346, "loss": 11.5891, "step": 34900 }, { "epoch": 8.75, "grad_norm": 0.08270179480314255, "learning_rate": 0.00029870658166135383, "loss": 11.3784, "step": 35000 }, { "epoch": 8.775, "grad_norm": 0.08168449997901917, "learning_rate": 0.0002987028314269642, "loss": 11.1576, "step": 35100 }, { "epoch": 8.8, "grad_norm": 0.07069560140371323, "learning_rate": 0.0002986990811925745, "loss": 11.2748, "step": 35200 }, { "epoch": 8.825, "grad_norm": 0.07771777361631393, "learning_rate": 0.0002986953309581849, "loss": 11.2124, "step": 35300 }, { "epoch": 8.85, "grad_norm": 0.0844758003950119, "learning_rate": 0.0002986915807237952, "loss": 10.9886, "step": 35400 }, { "epoch": 8.875, "grad_norm": 0.07531385868787766, "learning_rate": 0.00029868783048940555, "loss": 11.4722, "step": 35500 }, { "epoch": 8.9, "grad_norm": 0.08248105645179749, "learning_rate": 0.0002986840802550159, "loss": 11.1052, "step": 35600 }, { "epoch": 8.925, "grad_norm": 0.08126658946275711, "learning_rate": 0.0002986803300206263, "loss": 11.0637, "step": 35700 }, { "epoch": 8.95, "grad_norm": 0.07933900505304337, "learning_rate": 0.0002986765797862366, "loss": 10.6369, "step": 35800 }, { "epoch": 8.975, "grad_norm": 0.07628486305475235, "learning_rate": 0.00029867282955184696, "loss": 10.8511, "step": 35900 }, { "epoch": 9.0, "grad_norm": 0.07509356737136841, "learning_rate": 0.00029866907931745733, "loss": 10.9576, "step": 36000 }, { "epoch": 9.025, "grad_norm": 0.085249163210392, "learning_rate": 0.00029866532908306764, "loss": 10.9181, "step": 36100 }, { "epoch": 9.05, "grad_norm": 0.08377708494663239, "learning_rate": 0.000298661578848678, "loss": 10.7095, "step": 36200 }, { "epoch": 9.075, "grad_norm": 0.06539880484342575, "learning_rate": 0.00029865786611663227, "loss": 10.4937, "step": 36300 }, { "epoch": 9.1, "grad_norm": 0.08634931594133377, "learning_rate": 0.00029865411588224263, "loss": 11.043, "step": 36400 }, { "epoch": 9.125, "grad_norm": 0.06905148923397064, "learning_rate": 0.00029865036564785295, "loss": 11.0456, "step": 36500 }, { "epoch": 9.15, "grad_norm": 0.07896845042705536, "learning_rate": 0.0002986466154134633, "loss": 10.5105, "step": 36600 }, { "epoch": 9.175, "grad_norm": 0.07206033915281296, "learning_rate": 0.0002986428651790737, "loss": 10.7025, "step": 36700 }, { "epoch": 9.2, "grad_norm": 0.06719633936882019, "learning_rate": 0.00029863911494468405, "loss": 10.3498, "step": 36800 }, { "epoch": 9.225, "grad_norm": 0.07648395001888275, "learning_rate": 0.00029863536471029436, "loss": 10.4292, "step": 36900 }, { "epoch": 9.25, "grad_norm": 0.08475750684738159, "learning_rate": 0.0002986316144759047, "loss": 10.5922, "step": 37000 }, { "epoch": 9.275, "grad_norm": 0.09004350751638412, "learning_rate": 0.00029862786424151504, "loss": 10.3239, "step": 37100 }, { "epoch": 9.3, "grad_norm": 0.06373389810323715, "learning_rate": 0.0002986241140071254, "loss": 10.2006, "step": 37200 }, { "epoch": 9.325, "grad_norm": 0.07837036997079849, "learning_rate": 0.00029862036377273577, "loss": 10.193, "step": 37300 }, { "epoch": 9.35, "grad_norm": 0.07210332155227661, "learning_rate": 0.00029861661353834613, "loss": 10.2084, "step": 37400 }, { "epoch": 9.375, "grad_norm": 0.07254429906606674, "learning_rate": 0.00029861286330395645, "loss": 10.2551, "step": 37500 }, { "epoch": 9.4, "grad_norm": 0.06640215963125229, "learning_rate": 0.0002986091130695668, "loss": 10.2847, "step": 37600 }, { "epoch": 9.425, "grad_norm": 0.07777173817157745, "learning_rate": 0.0002986053628351772, "loss": 10.2434, "step": 37700 }, { "epoch": 9.45, "grad_norm": 0.07829392701387405, "learning_rate": 0.00029860161260078754, "loss": 10.0319, "step": 37800 }, { "epoch": 9.475, "grad_norm": 0.07961380481719971, "learning_rate": 0.00029859786236639786, "loss": 10.1739, "step": 37900 }, { "epoch": 9.5, "grad_norm": 0.07749368995428085, "learning_rate": 0.0002985941121320082, "loss": 9.6391, "step": 38000 }, { "epoch": 9.525, "grad_norm": 0.0826738029718399, "learning_rate": 0.0002985903618976186, "loss": 10.4704, "step": 38100 }, { "epoch": 9.55, "grad_norm": 0.06573819369077682, "learning_rate": 0.00029858661166322895, "loss": 9.7767, "step": 38200 }, { "epoch": 9.575, "grad_norm": 0.08020669966936111, "learning_rate": 0.00029858286142883927, "loss": 9.7305, "step": 38300 }, { "epoch": 9.6, "grad_norm": 0.06815823167562485, "learning_rate": 0.00029857911119444963, "loss": 9.597, "step": 38400 }, { "epoch": 9.625, "grad_norm": 0.07290255278348923, "learning_rate": 0.0002985753984624039, "loss": 9.8638, "step": 38500 }, { "epoch": 9.65, "grad_norm": 0.06887535005807877, "learning_rate": 0.00029857164822801426, "loss": 9.6939, "step": 38600 }, { "epoch": 9.675, "grad_norm": 0.08159805834293365, "learning_rate": 0.00029856789799362457, "loss": 9.8011, "step": 38700 }, { "epoch": 9.7, "grad_norm": 0.08071273565292358, "learning_rate": 0.00029856414775923494, "loss": 9.5514, "step": 38800 }, { "epoch": 9.725, "grad_norm": 0.07089462131261826, "learning_rate": 0.00029856039752484525, "loss": 9.8858, "step": 38900 }, { "epoch": 9.75, "grad_norm": 0.08935658633708954, "learning_rate": 0.0002985566472904556, "loss": 9.6155, "step": 39000 }, { "epoch": 9.775, "grad_norm": 0.08028286695480347, "learning_rate": 0.000298552897056066, "loss": 9.6638, "step": 39100 }, { "epoch": 9.8, "grad_norm": 0.07186749577522278, "learning_rate": 0.00029854914682167635, "loss": 9.3091, "step": 39200 }, { "epoch": 9.825, "grad_norm": 0.06545951217412949, "learning_rate": 0.00029854539658728666, "loss": 9.5374, "step": 39300 }, { "epoch": 9.85, "grad_norm": 0.0787624716758728, "learning_rate": 0.000298541646352897, "loss": 9.4178, "step": 39400 }, { "epoch": 9.875, "grad_norm": 0.07585486769676208, "learning_rate": 0.0002985378961185074, "loss": 9.2153, "step": 39500 }, { "epoch": 9.9, "grad_norm": 0.07809693366289139, "learning_rate": 0.0002985341458841177, "loss": 9.2729, "step": 39600 }, { "epoch": 9.925, "grad_norm": 0.12963560223579407, "learning_rate": 0.00029853039564972807, "loss": 9.1279, "step": 39700 }, { "epoch": 9.95, "grad_norm": 0.06803625822067261, "learning_rate": 0.00029852664541533844, "loss": 9.3529, "step": 39800 }, { "epoch": 9.975, "grad_norm": 0.07478567957878113, "learning_rate": 0.0002985228951809488, "loss": 9.1627, "step": 39900 }, { "epoch": 10.0, "grad_norm": 0.07844047993421555, "learning_rate": 0.0002985191449465591, "loss": 9.0775, "step": 40000 }, { "epoch": 10.025, "grad_norm": 0.07982715219259262, "learning_rate": 0.0002985153947121695, "loss": 9.4258, "step": 40100 }, { "epoch": 10.05, "grad_norm": 0.0806502029299736, "learning_rate": 0.00029851164447777985, "loss": 9.3455, "step": 40200 }, { "epoch": 10.075, "grad_norm": 0.06514900177717209, "learning_rate": 0.0002985078942433902, "loss": 8.9195, "step": 40300 }, { "epoch": 10.1, "grad_norm": 0.08182831853628159, "learning_rate": 0.0002985041440090005, "loss": 8.9772, "step": 40400 }, { "epoch": 10.125, "grad_norm": 0.07242997735738754, "learning_rate": 0.0002985003937746109, "loss": 9.3286, "step": 40500 }, { "epoch": 10.15, "grad_norm": 0.07168876379728317, "learning_rate": 0.0002984966435402212, "loss": 8.8118, "step": 40600 }, { "epoch": 10.175, "grad_norm": 0.07878579944372177, "learning_rate": 0.00029849289330583157, "loss": 9.0127, "step": 40700 }, { "epoch": 10.2, "grad_norm": 0.06614303588867188, "learning_rate": 0.00029848914307144194, "loss": 8.8964, "step": 40800 }, { "epoch": 10.225, "grad_norm": 0.07991635799407959, "learning_rate": 0.0002984853928370523, "loss": 8.7963, "step": 40900 }, { "epoch": 10.25, "grad_norm": 0.07721689343452454, "learning_rate": 0.0002984816426026626, "loss": 8.797, "step": 41000 }, { "epoch": 10.275, "grad_norm": 0.07666311413049698, "learning_rate": 0.000298477892368273, "loss": 8.6722, "step": 41100 }, { "epoch": 10.3, "grad_norm": 0.0791340246796608, "learning_rate": 0.00029847414213388335, "loss": 8.6547, "step": 41200 }, { "epoch": 10.325, "grad_norm": 0.0760653093457222, "learning_rate": 0.0002984703918994937, "loss": 8.696, "step": 41300 }, { "epoch": 10.35, "grad_norm": 0.06864143908023834, "learning_rate": 0.000298466641665104, "loss": 8.8221, "step": 41400 }, { "epoch": 10.375, "grad_norm": 0.07417836040258408, "learning_rate": 0.0002984628914307144, "loss": 8.5974, "step": 41500 }, { "epoch": 10.4, "grad_norm": 0.073348268866539, "learning_rate": 0.00029845914119632476, "loss": 8.309, "step": 41600 }, { "epoch": 10.425, "grad_norm": 0.0775461494922638, "learning_rate": 0.0002984553909619351, "loss": 8.6313, "step": 41700 }, { "epoch": 10.45, "grad_norm": 0.07109999656677246, "learning_rate": 0.00029845164072754543, "loss": 8.3238, "step": 41800 }, { "epoch": 10.475, "grad_norm": 0.06957342475652695, "learning_rate": 0.0002984478904931558, "loss": 8.3179, "step": 41900 }, { "epoch": 10.5, "grad_norm": 0.07247728109359741, "learning_rate": 0.00029844414025876617, "loss": 8.3806, "step": 42000 }, { "epoch": 10.525, "grad_norm": 0.08276287466287613, "learning_rate": 0.00029844039002437653, "loss": 8.495, "step": 42100 }, { "epoch": 10.55, "grad_norm": 0.07794822007417679, "learning_rate": 0.00029843663978998685, "loss": 8.0454, "step": 42200 }, { "epoch": 10.575, "grad_norm": 0.07254128903150558, "learning_rate": 0.0002984328895555972, "loss": 8.5174, "step": 42300 }, { "epoch": 10.6, "grad_norm": 0.08386515080928802, "learning_rate": 0.0002984291393212075, "loss": 8.5586, "step": 42400 }, { "epoch": 10.625, "grad_norm": 0.0731733962893486, "learning_rate": 0.0002984254265891618, "loss": 8.1163, "step": 42500 }, { "epoch": 10.65, "grad_norm": 0.07960132509469986, "learning_rate": 0.00029842167635477215, "loss": 8.0072, "step": 42600 }, { "epoch": 10.675, "grad_norm": 0.07048605382442474, "learning_rate": 0.0002984179261203825, "loss": 8.3243, "step": 42700 }, { "epoch": 10.7, "grad_norm": 0.07215945422649384, "learning_rate": 0.00029841417588599283, "loss": 8.2795, "step": 42800 }, { "epoch": 10.725, "grad_norm": 0.07723450660705566, "learning_rate": 0.0002984104256516032, "loss": 8.261, "step": 42900 }, { "epoch": 10.75, "grad_norm": 0.06688930839300156, "learning_rate": 0.00029840667541721356, "loss": 8.1896, "step": 43000 }, { "epoch": 10.775, "grad_norm": 0.07152280956506729, "learning_rate": 0.00029840292518282393, "loss": 7.8468, "step": 43100 }, { "epoch": 10.8, "grad_norm": 0.0700908899307251, "learning_rate": 0.00029839917494843424, "loss": 8.2157, "step": 43200 }, { "epoch": 10.825, "grad_norm": 0.08827432245016098, "learning_rate": 0.0002983954247140446, "loss": 8.2091, "step": 43300 }, { "epoch": 10.85, "grad_norm": 0.07007287442684174, "learning_rate": 0.00029839167447965497, "loss": 8.2475, "step": 43400 }, { "epoch": 10.875, "grad_norm": 0.07239579409360886, "learning_rate": 0.0002983879242452653, "loss": 7.9446, "step": 43500 }, { "epoch": 10.9, "grad_norm": 0.06851651519536972, "learning_rate": 0.00029838417401087565, "loss": 7.9521, "step": 43600 }, { "epoch": 10.925, "grad_norm": 0.07283764332532883, "learning_rate": 0.00029838042377648596, "loss": 7.9522, "step": 43700 }, { "epoch": 10.95, "grad_norm": 0.06353294104337692, "learning_rate": 0.0002983766735420964, "loss": 7.9084, "step": 43800 }, { "epoch": 10.975, "grad_norm": 0.07374967634677887, "learning_rate": 0.0002983729608100506, "loss": 7.6851, "step": 43900 }, { "epoch": 11.0, "grad_norm": 0.08643588423728943, "learning_rate": 0.00029836921057566096, "loss": 7.7639, "step": 44000 }, { "epoch": 11.025, "grad_norm": 0.06952405720949173, "learning_rate": 0.00029836546034127127, "loss": 7.8923, "step": 44100 }, { "epoch": 11.05, "grad_norm": 0.0842747688293457, "learning_rate": 0.00029836171010688163, "loss": 7.7411, "step": 44200 }, { "epoch": 11.075, "grad_norm": 0.07051684707403183, "learning_rate": 0.000298357959872492, "loss": 7.7914, "step": 44300 }, { "epoch": 11.1, "grad_norm": 0.07264287769794464, "learning_rate": 0.00029835420963810237, "loss": 7.7216, "step": 44400 }, { "epoch": 11.125, "grad_norm": 0.07382502406835556, "learning_rate": 0.0002983504594037127, "loss": 7.8505, "step": 44500 }, { "epoch": 11.15, "grad_norm": 0.07358778268098831, "learning_rate": 0.00029834670916932304, "loss": 7.7822, "step": 44600 }, { "epoch": 11.175, "grad_norm": 0.07758370041847229, "learning_rate": 0.0002983429589349334, "loss": 8.0006, "step": 44700 }, { "epoch": 11.2, "grad_norm": 0.07674399763345718, "learning_rate": 0.0002983392087005438, "loss": 7.2497, "step": 44800 }, { "epoch": 11.225, "grad_norm": 0.06659264862537384, "learning_rate": 0.0002983354584661541, "loss": 7.5115, "step": 44900 }, { "epoch": 11.25, "grad_norm": 0.0640081837773323, "learning_rate": 0.00029833170823176445, "loss": 7.4374, "step": 45000 }, { "epoch": 11.275, "grad_norm": 0.07784521579742432, "learning_rate": 0.0002983279579973748, "loss": 7.6097, "step": 45100 }, { "epoch": 11.3, "grad_norm": 0.08755332231521606, "learning_rate": 0.0002983242077629852, "loss": 7.5832, "step": 45200 }, { "epoch": 11.325, "grad_norm": 0.06300461292266846, "learning_rate": 0.0002983204575285955, "loss": 7.281, "step": 45300 }, { "epoch": 11.35, "grad_norm": 0.06807196140289307, "learning_rate": 0.00029831670729420586, "loss": 7.2347, "step": 45400 }, { "epoch": 11.375, "grad_norm": 0.07403436303138733, "learning_rate": 0.00029831295705981623, "loss": 7.0346, "step": 45500 }, { "epoch": 11.4, "grad_norm": 0.07038521021604538, "learning_rate": 0.0002983092068254266, "loss": 7.6505, "step": 45600 }, { "epoch": 11.425, "grad_norm": 0.08596746623516083, "learning_rate": 0.0002983054565910369, "loss": 7.2829, "step": 45700 }, { "epoch": 11.45, "grad_norm": 0.06901860982179642, "learning_rate": 0.0002983017063566473, "loss": 7.4822, "step": 45800 }, { "epoch": 11.475, "grad_norm": 0.07062174379825592, "learning_rate": 0.0002982979561222576, "loss": 7.2426, "step": 45900 }, { "epoch": 11.5, "grad_norm": 0.06718676537275314, "learning_rate": 0.00029829420588786795, "loss": 7.2257, "step": 46000 }, { "epoch": 11.525, "grad_norm": 0.10105819255113602, "learning_rate": 0.0002982904556534783, "loss": 7.1366, "step": 46100 }, { "epoch": 11.55, "grad_norm": 0.06286392360925674, "learning_rate": 0.0002982867054190887, "loss": 7.4181, "step": 46200 }, { "epoch": 11.575, "grad_norm": 0.09307048469781876, "learning_rate": 0.000298282955184699, "loss": 7.4101, "step": 46300 }, { "epoch": 11.6, "grad_norm": 0.06440640985965729, "learning_rate": 0.00029827920495030936, "loss": 7.3866, "step": 46400 }, { "epoch": 11.625, "grad_norm": 0.06852256506681442, "learning_rate": 0.00029827545471591973, "loss": 7.084, "step": 46500 }, { "epoch": 11.65, "grad_norm": 0.06919901072978973, "learning_rate": 0.0002982717044815301, "loss": 6.9507, "step": 46600 }, { "epoch": 11.675, "grad_norm": 0.0683809369802475, "learning_rate": 0.0002982679542471404, "loss": 7.1805, "step": 46700 }, { "epoch": 11.7, "grad_norm": 0.06878841668367386, "learning_rate": 0.0002982642040127508, "loss": 7.2514, "step": 46800 }, { "epoch": 11.725, "grad_norm": 0.06913451850414276, "learning_rate": 0.00029826045377836114, "loss": 6.9969, "step": 46900 }, { "epoch": 11.75, "grad_norm": 0.06999741494655609, "learning_rate": 0.0002982567035439715, "loss": 6.8401, "step": 47000 }, { "epoch": 11.775, "grad_norm": 0.07473236322402954, "learning_rate": 0.0002982529533095818, "loss": 6.8587, "step": 47100 }, { "epoch": 11.8, "grad_norm": 0.07786587625741959, "learning_rate": 0.0002982492030751922, "loss": 7.0751, "step": 47200 }, { "epoch": 11.825, "grad_norm": 0.0667233094573021, "learning_rate": 0.00029824545284080255, "loss": 6.9344, "step": 47300 }, { "epoch": 11.85, "grad_norm": 0.07131955772638321, "learning_rate": 0.0002982417026064129, "loss": 7.0165, "step": 47400 }, { "epoch": 11.875, "grad_norm": 0.08371793478727341, "learning_rate": 0.00029823795237202323, "loss": 6.7392, "step": 47500 }, { "epoch": 11.9, "grad_norm": 0.07992976158857346, "learning_rate": 0.00029823420213763354, "loss": 6.7678, "step": 47600 }, { "epoch": 11.925, "grad_norm": 0.07361280173063278, "learning_rate": 0.0002982304519032439, "loss": 6.5933, "step": 47700 }, { "epoch": 11.95, "grad_norm": 0.0853012353181839, "learning_rate": 0.00029822670166885427, "loss": 6.6292, "step": 47800 }, { "epoch": 11.975, "grad_norm": 0.07077699154615402, "learning_rate": 0.00029822298893680853, "loss": 7.045, "step": 47900 }, { "epoch": 12.0, "grad_norm": 0.06884802132844925, "learning_rate": 0.00029821923870241885, "loss": 6.7302, "step": 48000 }, { "epoch": 12.025, "grad_norm": 0.07187984138727188, "learning_rate": 0.0002982154884680292, "loss": 6.7884, "step": 48100 }, { "epoch": 12.05, "grad_norm": 0.06950085610151291, "learning_rate": 0.0002982117382336396, "loss": 6.6858, "step": 48200 }, { "epoch": 12.075, "grad_norm": 0.06879769265651703, "learning_rate": 0.00029820798799924994, "loss": 6.4815, "step": 48300 }, { "epoch": 12.1, "grad_norm": 0.07400238513946533, "learning_rate": 0.00029820423776486026, "loss": 6.7837, "step": 48400 }, { "epoch": 12.125, "grad_norm": 0.0689275860786438, "learning_rate": 0.0002982004875304706, "loss": 6.3745, "step": 48500 }, { "epoch": 12.15, "grad_norm": 0.07304348796606064, "learning_rate": 0.000298196737296081, "loss": 6.7639, "step": 48600 }, { "epoch": 12.175, "grad_norm": 0.07872481644153595, "learning_rate": 0.00029819298706169135, "loss": 6.5761, "step": 48700 }, { "epoch": 12.2, "grad_norm": 0.06597219407558441, "learning_rate": 0.00029818923682730167, "loss": 6.6663, "step": 48800 }, { "epoch": 12.225, "grad_norm": 0.060123708099126816, "learning_rate": 0.00029818548659291203, "loss": 6.5317, "step": 48900 }, { "epoch": 12.25, "grad_norm": 0.07376055419445038, "learning_rate": 0.00029818173635852234, "loss": 6.4394, "step": 49000 }, { "epoch": 12.275, "grad_norm": 0.06217016279697418, "learning_rate": 0.00029817798612413277, "loss": 6.4522, "step": 49100 }, { "epoch": 12.3, "grad_norm": 0.06492452323436737, "learning_rate": 0.0002981742358897431, "loss": 6.5623, "step": 49200 }, { "epoch": 12.325, "grad_norm": 0.08026625216007233, "learning_rate": 0.00029817048565535344, "loss": 6.3981, "step": 49300 }, { "epoch": 12.35, "grad_norm": 0.07046521455049515, "learning_rate": 0.00029816673542096376, "loss": 6.4173, "step": 49400 }, { "epoch": 12.375, "grad_norm": 0.07843586057424545, "learning_rate": 0.0002981629851865741, "loss": 6.499, "step": 49500 }, { "epoch": 12.4, "grad_norm": 0.06976750493049622, "learning_rate": 0.0002981592349521845, "loss": 6.4019, "step": 49600 }, { "epoch": 12.425, "grad_norm": 0.06601151078939438, "learning_rate": 0.00029815548471779485, "loss": 6.3474, "step": 49700 }, { "epoch": 12.45, "grad_norm": 0.07471803575754166, "learning_rate": 0.00029815173448340517, "loss": 6.1884, "step": 49800 }, { "epoch": 12.475, "grad_norm": 0.06310160458087921, "learning_rate": 0.0002981480217513594, "loss": 6.2996, "step": 49900 }, { "epoch": 12.5, "grad_norm": 0.060027483850717545, "learning_rate": 0.0002981442715169698, "loss": 6.2398, "step": 50000 }, { "epoch": 12.525, "grad_norm": 0.07511355727910995, "learning_rate": 0.00029814052128258016, "loss": 6.0126, "step": 50100 }, { "epoch": 12.55, "grad_norm": 0.09251129627227783, "learning_rate": 0.00029813677104819047, "loss": 6.1201, "step": 50200 }, { "epoch": 12.575, "grad_norm": 0.06512793153524399, "learning_rate": 0.00029813302081380084, "loss": 6.2464, "step": 50300 }, { "epoch": 12.6, "grad_norm": 0.06275767832994461, "learning_rate": 0.0002981292705794112, "loss": 6.215, "step": 50400 }, { "epoch": 12.625, "grad_norm": 0.07693471014499664, "learning_rate": 0.00029812552034502157, "loss": 6.1931, "step": 50500 }, { "epoch": 12.65, "grad_norm": 0.06782624125480652, "learning_rate": 0.0002981217701106319, "loss": 6.3334, "step": 50600 }, { "epoch": 12.675, "grad_norm": 0.06484679132699966, "learning_rate": 0.00029811801987624225, "loss": 5.9756, "step": 50700 }, { "epoch": 12.7, "grad_norm": 0.07431244850158691, "learning_rate": 0.0002981142696418526, "loss": 6.2173, "step": 50800 }, { "epoch": 12.725, "grad_norm": 0.07316889613866806, "learning_rate": 0.000298110519407463, "loss": 6.0987, "step": 50900 }, { "epoch": 12.75, "grad_norm": 0.06565624475479126, "learning_rate": 0.0002981067691730733, "loss": 6.0928, "step": 51000 }, { "epoch": 12.775, "grad_norm": 0.07335751503705978, "learning_rate": 0.00029810301893868366, "loss": 6.1505, "step": 51100 }, { "epoch": 12.8, "grad_norm": 0.0684492215514183, "learning_rate": 0.00029809926870429397, "loss": 5.9197, "step": 51200 }, { "epoch": 12.825, "grad_norm": 0.06604496389627457, "learning_rate": 0.00029809551846990434, "loss": 6.2255, "step": 51300 }, { "epoch": 12.85, "grad_norm": 0.06465475261211395, "learning_rate": 0.0002980917682355147, "loss": 5.8412, "step": 51400 }, { "epoch": 12.875, "grad_norm": 0.06663598865270615, "learning_rate": 0.000298088018001125, "loss": 5.7792, "step": 51500 }, { "epoch": 12.9, "grad_norm": 0.06258101016283035, "learning_rate": 0.0002980842677667354, "loss": 5.7024, "step": 51600 }, { "epoch": 12.925, "grad_norm": 0.06694167107343674, "learning_rate": 0.00029808051753234575, "loss": 5.9832, "step": 51700 }, { "epoch": 12.95, "grad_norm": 0.06682337820529938, "learning_rate": 0.0002980767672979561, "loss": 5.8905, "step": 51800 }, { "epoch": 12.975, "grad_norm": 0.07507793605327606, "learning_rate": 0.0002980730545659103, "loss": 5.8869, "step": 51900 }, { "epoch": 13.0, "grad_norm": 0.0638195350766182, "learning_rate": 0.0002980693043315207, "loss": 5.9508, "step": 52000 }, { "epoch": 13.025, "grad_norm": 0.089790940284729, "learning_rate": 0.00029806555409713105, "loss": 5.807, "step": 52100 }, { "epoch": 13.05, "grad_norm": 0.06941410899162292, "learning_rate": 0.0002980618038627414, "loss": 5.8974, "step": 52200 }, { "epoch": 13.075, "grad_norm": 0.06374108046293259, "learning_rate": 0.00029805805362835173, "loss": 6.02, "step": 52300 }, { "epoch": 13.1, "grad_norm": 0.06581106036901474, "learning_rate": 0.0002980543033939621, "loss": 5.8285, "step": 52400 }, { "epoch": 13.125, "grad_norm": 0.062402479350566864, "learning_rate": 0.00029805055315957246, "loss": 5.9327, "step": 52500 }, { "epoch": 13.15, "grad_norm": 0.0768311470746994, "learning_rate": 0.00029804680292518283, "loss": 5.7586, "step": 52600 }, { "epoch": 13.175, "grad_norm": 0.09206507354974747, "learning_rate": 0.00029804305269079314, "loss": 5.7239, "step": 52700 }, { "epoch": 13.2, "grad_norm": 0.09109029918909073, "learning_rate": 0.0002980393024564035, "loss": 5.8506, "step": 52800 }, { "epoch": 13.225, "grad_norm": 0.06463731825351715, "learning_rate": 0.0002980355522220138, "loss": 5.8716, "step": 52900 }, { "epoch": 13.25, "grad_norm": 0.07239048928022385, "learning_rate": 0.0002980318019876242, "loss": 5.515, "step": 53000 }, { "epoch": 13.275, "grad_norm": 0.06180089712142944, "learning_rate": 0.00029802805175323455, "loss": 5.4248, "step": 53100 }, { "epoch": 13.3, "grad_norm": 0.05961550027132034, "learning_rate": 0.0002980243015188449, "loss": 5.8408, "step": 53200 }, { "epoch": 13.325, "grad_norm": 0.06609106063842773, "learning_rate": 0.00029802055128445523, "loss": 5.5214, "step": 53300 }, { "epoch": 13.35, "grad_norm": 0.07037625461816788, "learning_rate": 0.0002980168010500656, "loss": 5.6422, "step": 53400 }, { "epoch": 13.375, "grad_norm": 0.05968979373574257, "learning_rate": 0.00029801305081567596, "loss": 5.4027, "step": 53500 }, { "epoch": 13.4, "grad_norm": 0.06201528012752533, "learning_rate": 0.00029800930058128633, "loss": 5.5331, "step": 53600 }, { "epoch": 13.425, "grad_norm": 0.07820463925600052, "learning_rate": 0.00029800555034689664, "loss": 5.6112, "step": 53700 }, { "epoch": 13.45, "grad_norm": 0.07531889528036118, "learning_rate": 0.000298001800112507, "loss": 5.5128, "step": 53800 }, { "epoch": 13.475, "grad_norm": 0.06690291315317154, "learning_rate": 0.00029799808738046127, "loss": 5.443, "step": 53900 }, { "epoch": 13.5, "grad_norm": 0.08288581669330597, "learning_rate": 0.00029799433714607163, "loss": 5.5471, "step": 54000 }, { "epoch": 13.525, "grad_norm": 0.06512220948934555, "learning_rate": 0.00029799058691168195, "loss": 5.475, "step": 54100 }, { "epoch": 13.55, "grad_norm": 0.07862843573093414, "learning_rate": 0.0002979868366772923, "loss": 5.6017, "step": 54200 }, { "epoch": 13.575, "grad_norm": 0.06599980592727661, "learning_rate": 0.0002979830864429027, "loss": 5.4367, "step": 54300 }, { "epoch": 13.6, "grad_norm": 0.07014311850070953, "learning_rate": 0.00029797933620851304, "loss": 5.3765, "step": 54400 }, { "epoch": 13.625, "grad_norm": 0.09498297423124313, "learning_rate": 0.00029797558597412336, "loss": 5.3329, "step": 54500 }, { "epoch": 13.65, "grad_norm": 0.06557220965623856, "learning_rate": 0.0002979718357397337, "loss": 5.4082, "step": 54600 }, { "epoch": 13.675, "grad_norm": 0.06320352107286453, "learning_rate": 0.00029796808550534403, "loss": 5.3671, "step": 54700 }, { "epoch": 13.7, "grad_norm": 0.07630398869514465, "learning_rate": 0.0002979643352709544, "loss": 5.4613, "step": 54800 }, { "epoch": 13.725, "grad_norm": 0.07285916805267334, "learning_rate": 0.00029796058503656477, "loss": 5.0222, "step": 54900 }, { "epoch": 13.75, "grad_norm": 0.07314100861549377, "learning_rate": 0.00029795683480217513, "loss": 5.1593, "step": 55000 }, { "epoch": 13.775, "grad_norm": 0.0632672905921936, "learning_rate": 0.00029795308456778544, "loss": 5.2524, "step": 55100 }, { "epoch": 13.8, "grad_norm": 0.06146818399429321, "learning_rate": 0.0002979493343333958, "loss": 5.2068, "step": 55200 }, { "epoch": 13.825, "grad_norm": 0.08438315987586975, "learning_rate": 0.0002979455840990062, "loss": 5.1854, "step": 55300 }, { "epoch": 13.85, "grad_norm": 0.06263713538646698, "learning_rate": 0.0002979418338646165, "loss": 5.1888, "step": 55400 }, { "epoch": 13.875, "grad_norm": 0.06485722959041595, "learning_rate": 0.00029793808363022685, "loss": 5.3774, "step": 55500 }, { "epoch": 13.9, "grad_norm": 0.09563236683607101, "learning_rate": 0.0002979343333958372, "loss": 5.201, "step": 55600 }, { "epoch": 13.925, "grad_norm": 0.06357564777135849, "learning_rate": 0.0002979305831614476, "loss": 5.1221, "step": 55700 }, { "epoch": 13.95, "grad_norm": 0.06070085987448692, "learning_rate": 0.0002979268329270579, "loss": 5.1584, "step": 55800 }, { "epoch": 13.975, "grad_norm": 0.0757615715265274, "learning_rate": 0.00029792312019501216, "loss": 5.0797, "step": 55900 }, { "epoch": 14.0, "grad_norm": 0.07182688266038895, "learning_rate": 0.0002979193699606225, "loss": 5.2988, "step": 56000 }, { "epoch": 14.025, "grad_norm": 0.06348109245300293, "learning_rate": 0.0002979156197262329, "loss": 4.992, "step": 56100 }, { "epoch": 14.05, "grad_norm": 0.07352128624916077, "learning_rate": 0.0002979118694918432, "loss": 4.9483, "step": 56200 }, { "epoch": 14.075, "grad_norm": 0.0681919977068901, "learning_rate": 0.00029790811925745357, "loss": 5.1792, "step": 56300 }, { "epoch": 14.1, "grad_norm": 0.06682088226079941, "learning_rate": 0.0002979043690230639, "loss": 4.8559, "step": 56400 }, { "epoch": 14.125, "grad_norm": 0.06291857361793518, "learning_rate": 0.00029790061878867425, "loss": 4.9382, "step": 56500 }, { "epoch": 14.15, "grad_norm": 0.07243198156356812, "learning_rate": 0.0002978968685542846, "loss": 5.0399, "step": 56600 }, { "epoch": 14.175, "grad_norm": 0.06961022317409515, "learning_rate": 0.000297893118319895, "loss": 5.0745, "step": 56700 }, { "epoch": 14.2, "grad_norm": 0.06203046441078186, "learning_rate": 0.0002978893680855053, "loss": 5.1403, "step": 56800 }, { "epoch": 14.225, "grad_norm": 0.06188129261136055, "learning_rate": 0.00029788561785111566, "loss": 4.9122, "step": 56900 }, { "epoch": 14.25, "grad_norm": 0.05759645998477936, "learning_rate": 0.000297881867616726, "loss": 5.0696, "step": 57000 }, { "epoch": 14.275, "grad_norm": 0.0592036135494709, "learning_rate": 0.0002978781173823364, "loss": 5.1164, "step": 57100 }, { "epoch": 14.3, "grad_norm": 0.06267797201871872, "learning_rate": 0.0002978743671479467, "loss": 5.0722, "step": 57200 }, { "epoch": 14.325, "grad_norm": 0.07611776143312454, "learning_rate": 0.00029787061691355707, "loss": 4.9118, "step": 57300 }, { "epoch": 14.35, "grad_norm": 0.061794403940439224, "learning_rate": 0.00029786686667916744, "loss": 5.013, "step": 57400 }, { "epoch": 14.375, "grad_norm": 0.2047680765390396, "learning_rate": 0.0002978631164447778, "loss": 4.7667, "step": 57500 }, { "epoch": 14.4, "grad_norm": 0.0633254125714302, "learning_rate": 0.0002978593662103881, "loss": 4.8633, "step": 57600 }, { "epoch": 14.425, "grad_norm": 0.06651504337787628, "learning_rate": 0.0002978556159759985, "loss": 4.9452, "step": 57700 }, { "epoch": 14.45, "grad_norm": 0.07252359390258789, "learning_rate": 0.00029785186574160885, "loss": 4.8268, "step": 57800 }, { "epoch": 14.475, "grad_norm": 0.07088153064250946, "learning_rate": 0.0002978481530095631, "loss": 4.7381, "step": 57900 }, { "epoch": 14.5, "grad_norm": 0.06644707918167114, "learning_rate": 0.0002978444027751734, "loss": 4.6716, "step": 58000 }, { "epoch": 14.525, "grad_norm": 0.06577486544847488, "learning_rate": 0.0002978406525407838, "loss": 4.8125, "step": 58100 }, { "epoch": 14.55, "grad_norm": 0.06577962636947632, "learning_rate": 0.0002978369023063941, "loss": 4.6842, "step": 58200 }, { "epoch": 14.575, "grad_norm": 0.060136351734399796, "learning_rate": 0.00029783315207200446, "loss": 4.6219, "step": 58300 }, { "epoch": 14.6, "grad_norm": 0.06826278567314148, "learning_rate": 0.00029782940183761483, "loss": 4.7876, "step": 58400 }, { "epoch": 14.625, "grad_norm": 0.06896788626909256, "learning_rate": 0.0002978256516032252, "loss": 4.8651, "step": 58500 }, { "epoch": 14.65, "grad_norm": 0.06548253446817398, "learning_rate": 0.0002978219013688355, "loss": 4.9228, "step": 58600 }, { "epoch": 14.675, "grad_norm": 0.08236391097307205, "learning_rate": 0.0002978181511344459, "loss": 4.7074, "step": 58700 }, { "epoch": 14.7, "grad_norm": 0.06781431287527084, "learning_rate": 0.00029781440090005624, "loss": 5.0659, "step": 58800 }, { "epoch": 14.725, "grad_norm": 0.06290601193904877, "learning_rate": 0.0002978106506656666, "loss": 4.8844, "step": 58900 }, { "epoch": 14.75, "grad_norm": 0.0578296072781086, "learning_rate": 0.0002978069004312769, "loss": 4.7095, "step": 59000 }, { "epoch": 14.775, "grad_norm": 0.05320196598768234, "learning_rate": 0.0002978031501968873, "loss": 4.6838, "step": 59100 }, { "epoch": 14.8, "grad_norm": 0.07847319543361664, "learning_rate": 0.00029779939996249765, "loss": 4.7263, "step": 59200 }, { "epoch": 14.825, "grad_norm": 0.07580792158842087, "learning_rate": 0.00029779564972810796, "loss": 4.5927, "step": 59300 }, { "epoch": 14.85, "grad_norm": 0.06336116045713425, "learning_rate": 0.00029779189949371833, "loss": 4.6524, "step": 59400 }, { "epoch": 14.875, "grad_norm": 0.0706322193145752, "learning_rate": 0.0002977881492593287, "loss": 4.5591, "step": 59500 }, { "epoch": 14.9, "grad_norm": 0.09078390896320343, "learning_rate": 0.00029778439902493906, "loss": 4.6377, "step": 59600 }, { "epoch": 14.925, "grad_norm": 0.07508181035518646, "learning_rate": 0.00029778064879054937, "loss": 4.4043, "step": 59700 }, { "epoch": 14.95, "grad_norm": 0.06288613379001617, "learning_rate": 0.00029777689855615974, "loss": 4.5363, "step": 59800 }, { "epoch": 14.975, "grad_norm": 0.0686824843287468, "learning_rate": 0.00029777318582411395, "loss": 4.6031, "step": 59900 }, { "epoch": 15.0, "grad_norm": 0.0657496452331543, "learning_rate": 0.0002977694355897243, "loss": 4.4645, "step": 60000 }, { "epoch": 15.025, "grad_norm": 0.0680643618106842, "learning_rate": 0.0002977656853553347, "loss": 4.6015, "step": 60100 }, { "epoch": 15.05, "grad_norm": 0.06540867686271667, "learning_rate": 0.00029776193512094504, "loss": 4.4411, "step": 60200 }, { "epoch": 15.075, "grad_norm": 0.060959845781326294, "learning_rate": 0.00029775818488655536, "loss": 4.2446, "step": 60300 }, { "epoch": 15.1, "grad_norm": 0.07395045459270477, "learning_rate": 0.0002977544346521657, "loss": 4.4593, "step": 60400 }, { "epoch": 15.125, "grad_norm": 0.0660228282213211, "learning_rate": 0.0002977506844177761, "loss": 4.2359, "step": 60500 }, { "epoch": 15.15, "grad_norm": 0.06423047930002213, "learning_rate": 0.00029774693418338645, "loss": 4.4333, "step": 60600 }, { "epoch": 15.175, "grad_norm": 0.07680130749940872, "learning_rate": 0.00029774318394899677, "loss": 4.4737, "step": 60700 }, { "epoch": 15.2, "grad_norm": 0.0686013400554657, "learning_rate": 0.00029773943371460713, "loss": 4.301, "step": 60800 }, { "epoch": 15.225, "grad_norm": 0.0519595630466938, "learning_rate": 0.0002977356834802175, "loss": 4.4112, "step": 60900 }, { "epoch": 15.25, "grad_norm": 0.06710193306207657, "learning_rate": 0.00029773193324582787, "loss": 4.3652, "step": 61000 }, { "epoch": 15.275, "grad_norm": 0.07808689773082733, "learning_rate": 0.0002977281830114382, "loss": 4.3473, "step": 61100 }, { "epoch": 15.3, "grad_norm": 0.0767969936132431, "learning_rate": 0.00029772443277704854, "loss": 4.3302, "step": 61200 }, { "epoch": 15.325, "grad_norm": 0.06145559623837471, "learning_rate": 0.0002977206825426589, "loss": 4.2091, "step": 61300 }, { "epoch": 15.35, "grad_norm": 0.09096598625183105, "learning_rate": 0.0002977169323082693, "loss": 4.4397, "step": 61400 }, { "epoch": 15.375, "grad_norm": 0.06596633046865463, "learning_rate": 0.0002977131820738796, "loss": 4.1544, "step": 61500 }, { "epoch": 15.4, "grad_norm": 0.0632476657629013, "learning_rate": 0.00029770943183948995, "loss": 4.1507, "step": 61600 }, { "epoch": 15.425, "grad_norm": 0.05707848072052002, "learning_rate": 0.00029770568160510027, "loss": 4.5147, "step": 61700 }, { "epoch": 15.45, "grad_norm": 0.06603705137968063, "learning_rate": 0.00029770193137071063, "loss": 4.3091, "step": 61800 }, { "epoch": 15.475, "grad_norm": 0.08647535741329193, "learning_rate": 0.000297698181136321, "loss": 4.4759, "step": 61900 }, { "epoch": 15.5, "grad_norm": 0.0747227743268013, "learning_rate": 0.00029769443090193136, "loss": 4.3265, "step": 62000 }, { "epoch": 15.525, "grad_norm": 0.06563801318407059, "learning_rate": 0.0002976906806675417, "loss": 4.5796, "step": 62100 }, { "epoch": 15.55, "grad_norm": 0.06297031790018082, "learning_rate": 0.00029768693043315204, "loss": 4.2309, "step": 62200 }, { "epoch": 15.575, "grad_norm": 0.05998208001255989, "learning_rate": 0.0002976831801987624, "loss": 4.244, "step": 62300 }, { "epoch": 15.6, "grad_norm": 0.057426031678915024, "learning_rate": 0.0002976794299643728, "loss": 4.4331, "step": 62400 }, { "epoch": 15.625, "grad_norm": 0.06295296549797058, "learning_rate": 0.0002976756797299831, "loss": 3.9931, "step": 62500 }, { "epoch": 15.65, "grad_norm": 0.07305531948804855, "learning_rate": 0.00029767192949559345, "loss": 4.1127, "step": 62600 }, { "epoch": 15.675, "grad_norm": 0.057404179126024246, "learning_rate": 0.0002976681792612038, "loss": 4.1888, "step": 62700 }, { "epoch": 15.7, "grad_norm": 0.05540831759572029, "learning_rate": 0.0002976644290268142, "loss": 4.3955, "step": 62800 }, { "epoch": 15.725, "grad_norm": 0.05315635725855827, "learning_rate": 0.0002976606787924245, "loss": 4.4189, "step": 62900 }, { "epoch": 15.75, "grad_norm": 0.06974928081035614, "learning_rate": 0.0002976569285580348, "loss": 4.2076, "step": 63000 }, { "epoch": 15.775, "grad_norm": 0.06797333806753159, "learning_rate": 0.00029765317832364523, "loss": 4.0685, "step": 63100 }, { "epoch": 15.8, "grad_norm": 0.07094912976026535, "learning_rate": 0.00029764942808925554, "loss": 4.0277, "step": 63200 }, { "epoch": 15.825, "grad_norm": 0.0728229507803917, "learning_rate": 0.0002976456778548659, "loss": 4.2609, "step": 63300 }, { "epoch": 15.85, "grad_norm": 0.05918316915631294, "learning_rate": 0.0002976419276204762, "loss": 4.2609, "step": 63400 }, { "epoch": 15.875, "grad_norm": 0.06454843282699585, "learning_rate": 0.0002976381773860866, "loss": 4.0982, "step": 63500 }, { "epoch": 15.9, "grad_norm": 0.07737816870212555, "learning_rate": 0.00029763442715169695, "loss": 4.0363, "step": 63600 }, { "epoch": 15.925, "grad_norm": 0.06324774026870728, "learning_rate": 0.0002976306769173073, "loss": 3.8008, "step": 63700 }, { "epoch": 15.95, "grad_norm": 0.05786865949630737, "learning_rate": 0.00029762692668291763, "loss": 3.8747, "step": 63800 }, { "epoch": 15.975, "grad_norm": 0.06020934507250786, "learning_rate": 0.0002976232139508719, "loss": 3.9662, "step": 63900 }, { "epoch": 16.0, "grad_norm": 0.06533800065517426, "learning_rate": 0.00029761946371648226, "loss": 4.0263, "step": 64000 }, { "epoch": 16.025, "grad_norm": 0.05861624330282211, "learning_rate": 0.0002976157134820926, "loss": 4.0456, "step": 64100 }, { "epoch": 16.05, "grad_norm": 0.06453926116228104, "learning_rate": 0.00029761196324770294, "loss": 3.9041, "step": 64200 }, { "epoch": 16.075, "grad_norm": 0.06458089500665665, "learning_rate": 0.0002976082130133133, "loss": 3.7986, "step": 64300 }, { "epoch": 16.1, "grad_norm": 0.05067475885152817, "learning_rate": 0.00029760446277892367, "loss": 3.9836, "step": 64400 }, { "epoch": 16.125, "grad_norm": 0.0557921938598156, "learning_rate": 0.00029760071254453403, "loss": 3.958, "step": 64500 }, { "epoch": 16.15, "grad_norm": 0.05821559205651283, "learning_rate": 0.00029759696231014435, "loss": 4.0563, "step": 64600 }, { "epoch": 16.175, "grad_norm": 0.06078817695379257, "learning_rate": 0.0002975932120757547, "loss": 4.0017, "step": 64700 }, { "epoch": 16.2, "grad_norm": 0.07187299430370331, "learning_rate": 0.0002975894618413651, "loss": 3.7798, "step": 64800 }, { "epoch": 16.225, "grad_norm": 0.05477326363325119, "learning_rate": 0.00029758571160697544, "loss": 3.7864, "step": 64900 }, { "epoch": 16.25, "grad_norm": 0.06654859334230423, "learning_rate": 0.00029758196137258576, "loss": 3.9514, "step": 65000 }, { "epoch": 16.275, "grad_norm": 0.0737365186214447, "learning_rate": 0.0002975782111381961, "loss": 3.9058, "step": 65100 }, { "epoch": 16.3, "grad_norm": 0.06597916781902313, "learning_rate": 0.00029757446090380643, "loss": 3.9946, "step": 65200 }, { "epoch": 16.325, "grad_norm": 0.05861925333738327, "learning_rate": 0.0002975707106694168, "loss": 3.9009, "step": 65300 }, { "epoch": 16.35, "grad_norm": 0.06207166984677315, "learning_rate": 0.00029756696043502717, "loss": 3.9892, "step": 65400 }, { "epoch": 16.375, "grad_norm": 0.07432432472705841, "learning_rate": 0.00029756321020063753, "loss": 3.7083, "step": 65500 }, { "epoch": 16.4, "grad_norm": 0.05656394734978676, "learning_rate": 0.00029755945996624784, "loss": 3.8139, "step": 65600 }, { "epoch": 16.425, "grad_norm": 0.07284687459468842, "learning_rate": 0.0002975557097318582, "loss": 3.8091, "step": 65700 }, { "epoch": 16.45, "grad_norm": 0.06415148079395294, "learning_rate": 0.0002975519594974686, "loss": 3.8954, "step": 65800 }, { "epoch": 16.475, "grad_norm": 0.06300424784421921, "learning_rate": 0.00029754824676542284, "loss": 3.5919, "step": 65900 }, { "epoch": 16.5, "grad_norm": 0.06578180938959122, "learning_rate": 0.00029754449653103315, "loss": 3.7936, "step": 66000 }, { "epoch": 16.525, "grad_norm": 0.07465810328722, "learning_rate": 0.0002975407462966435, "loss": 3.6781, "step": 66100 }, { "epoch": 16.55, "grad_norm": 0.05531006306409836, "learning_rate": 0.0002975369960622539, "loss": 3.8176, "step": 66200 }, { "epoch": 16.575, "grad_norm": 0.057088643312454224, "learning_rate": 0.00029753324582786425, "loss": 3.8375, "step": 66300 }, { "epoch": 16.6, "grad_norm": 0.06409061700105667, "learning_rate": 0.00029752949559347456, "loss": 3.6946, "step": 66400 }, { "epoch": 16.625, "grad_norm": 0.06034286320209503, "learning_rate": 0.0002975257453590849, "loss": 3.7127, "step": 66500 }, { "epoch": 16.65, "grad_norm": 0.06990322470664978, "learning_rate": 0.0002975219951246953, "loss": 3.7908, "step": 66600 }, { "epoch": 16.675, "grad_norm": 0.07301350682973862, "learning_rate": 0.0002975182448903056, "loss": 3.6067, "step": 66700 }, { "epoch": 16.7, "grad_norm": 0.06309019029140472, "learning_rate": 0.00029751449465591597, "loss": 3.6552, "step": 66800 }, { "epoch": 16.725, "grad_norm": 0.07269258797168732, "learning_rate": 0.0002975107444215263, "loss": 3.6489, "step": 66900 }, { "epoch": 16.75, "grad_norm": 0.07549503445625305, "learning_rate": 0.00029750699418713665, "loss": 3.6146, "step": 67000 }, { "epoch": 16.775, "grad_norm": 0.06944973766803741, "learning_rate": 0.000297503243952747, "loss": 3.592, "step": 67100 }, { "epoch": 16.8, "grad_norm": 0.05656867474317551, "learning_rate": 0.0002974994937183574, "loss": 3.7087, "step": 67200 }, { "epoch": 16.825, "grad_norm": 0.06444111466407776, "learning_rate": 0.0002974957434839677, "loss": 3.5458, "step": 67300 }, { "epoch": 16.85, "grad_norm": 0.05399918928742409, "learning_rate": 0.00029749199324957806, "loss": 3.6962, "step": 67400 }, { "epoch": 16.875, "grad_norm": 0.06424950808286667, "learning_rate": 0.0002974882430151884, "loss": 3.5515, "step": 67500 }, { "epoch": 16.9, "grad_norm": 0.05898202210664749, "learning_rate": 0.0002974844927807988, "loss": 3.6593, "step": 67600 }, { "epoch": 16.925, "grad_norm": 0.06607525050640106, "learning_rate": 0.0002974807425464091, "loss": 3.5478, "step": 67700 }, { "epoch": 16.95, "grad_norm": 0.06299087405204773, "learning_rate": 0.00029747699231201947, "loss": 3.7256, "step": 67800 }, { "epoch": 16.975, "grad_norm": 0.063835009932518, "learning_rate": 0.00029747327957997373, "loss": 3.538, "step": 67900 }, { "epoch": 17.0, "grad_norm": 0.05786048248410225, "learning_rate": 0.0002974695293455841, "loss": 3.7246, "step": 68000 }, { "epoch": 17.025, "grad_norm": 0.05804240703582764, "learning_rate": 0.0002974657791111944, "loss": 3.5207, "step": 68100 }, { "epoch": 17.05, "grad_norm": 0.06179894134402275, "learning_rate": 0.0002974620288768048, "loss": 3.5634, "step": 68200 }, { "epoch": 17.075, "grad_norm": 0.05166739225387573, "learning_rate": 0.00029745827864241514, "loss": 3.594, "step": 68300 }, { "epoch": 17.1, "grad_norm": 0.05808790773153305, "learning_rate": 0.0002974545284080255, "loss": 3.4721, "step": 68400 }, { "epoch": 17.125, "grad_norm": 0.058479100465774536, "learning_rate": 0.0002974507781736358, "loss": 3.4991, "step": 68500 }, { "epoch": 17.15, "grad_norm": 0.06585648655891418, "learning_rate": 0.0002974470279392462, "loss": 3.4487, "step": 68600 }, { "epoch": 17.175, "grad_norm": 0.07367991656064987, "learning_rate": 0.0002974432777048565, "loss": 3.715, "step": 68700 }, { "epoch": 17.2, "grad_norm": 0.06693430244922638, "learning_rate": 0.00029743952747046686, "loss": 3.4574, "step": 68800 }, { "epoch": 17.225, "grad_norm": 0.06379226595163345, "learning_rate": 0.00029743577723607723, "loss": 3.6117, "step": 68900 }, { "epoch": 17.25, "grad_norm": 0.0511956624686718, "learning_rate": 0.0002974320270016876, "loss": 3.7448, "step": 69000 }, { "epoch": 17.275, "grad_norm": 0.07336433976888657, "learning_rate": 0.0002974282767672979, "loss": 3.3539, "step": 69100 }, { "epoch": 17.3, "grad_norm": 0.0531037300825119, "learning_rate": 0.0002974245265329083, "loss": 3.4722, "step": 69200 }, { "epoch": 17.325, "grad_norm": 0.0836392492055893, "learning_rate": 0.00029742077629851864, "loss": 3.4829, "step": 69300 }, { "epoch": 17.35, "grad_norm": 0.0543275885283947, "learning_rate": 0.000297417026064129, "loss": 3.3048, "step": 69400 }, { "epoch": 17.375, "grad_norm": 0.05712301284074783, "learning_rate": 0.0002974132758297393, "loss": 3.3524, "step": 69500 }, { "epoch": 17.4, "grad_norm": 0.07685862481594086, "learning_rate": 0.0002974095255953497, "loss": 3.4212, "step": 69600 }, { "epoch": 17.425, "grad_norm": 0.06631585955619812, "learning_rate": 0.00029740577536096005, "loss": 3.3931, "step": 69700 }, { "epoch": 17.45, "grad_norm": 0.05916072428226471, "learning_rate": 0.0002974020251265704, "loss": 3.4396, "step": 69800 }, { "epoch": 17.475, "grad_norm": 0.06266429275274277, "learning_rate": 0.0002973983123945246, "loss": 3.618, "step": 69900 }, { "epoch": 17.5, "grad_norm": 0.07458827644586563, "learning_rate": 0.000297394562160135, "loss": 3.3892, "step": 70000 }, { "epoch": 17.525, "grad_norm": 0.05758730694651604, "learning_rate": 0.00029739081192574536, "loss": 3.3696, "step": 70100 }, { "epoch": 17.55, "grad_norm": 0.061953071504831314, "learning_rate": 0.0002973870616913557, "loss": 3.2163, "step": 70200 }, { "epoch": 17.575, "grad_norm": 0.06715140491724014, "learning_rate": 0.00029738331145696603, "loss": 3.5115, "step": 70300 }, { "epoch": 17.6, "grad_norm": 0.06628040969371796, "learning_rate": 0.0002973795612225764, "loss": 3.4019, "step": 70400 }, { "epoch": 17.625, "grad_norm": 0.06109810248017311, "learning_rate": 0.0002973758109881867, "loss": 3.327, "step": 70500 }, { "epoch": 17.65, "grad_norm": 0.05486061051487923, "learning_rate": 0.0002973720607537971, "loss": 3.4603, "step": 70600 }, { "epoch": 17.675, "grad_norm": 0.058648984879255295, "learning_rate": 0.00029736831051940744, "loss": 3.5709, "step": 70700 }, { "epoch": 17.7, "grad_norm": 0.06253077834844589, "learning_rate": 0.00029736456028501776, "loss": 3.1556, "step": 70800 }, { "epoch": 17.725, "grad_norm": 0.05633246898651123, "learning_rate": 0.0002973608100506281, "loss": 3.2396, "step": 70900 }, { "epoch": 17.75, "grad_norm": 0.07230902463197708, "learning_rate": 0.0002973570598162385, "loss": 3.2617, "step": 71000 }, { "epoch": 17.775, "grad_norm": 0.06703296303749084, "learning_rate": 0.00029735330958184886, "loss": 3.3253, "step": 71100 }, { "epoch": 17.8, "grad_norm": 0.05392139405012131, "learning_rate": 0.00029734955934745917, "loss": 3.1445, "step": 71200 }, { "epoch": 17.825, "grad_norm": 0.059445902705192566, "learning_rate": 0.00029734580911306953, "loss": 3.2005, "step": 71300 }, { "epoch": 17.85, "grad_norm": 0.05022546648979187, "learning_rate": 0.0002973420588786799, "loss": 3.2086, "step": 71400 }, { "epoch": 17.875, "grad_norm": 0.05383516103029251, "learning_rate": 0.00029733830864429027, "loss": 3.0877, "step": 71500 }, { "epoch": 17.9, "grad_norm": 0.055024441331624985, "learning_rate": 0.0002973345584099006, "loss": 3.27, "step": 71600 }, { "epoch": 17.925, "grad_norm": 0.0565604642033577, "learning_rate": 0.00029733080817551094, "loss": 3.4085, "step": 71700 }, { "epoch": 17.95, "grad_norm": 0.056899093091487885, "learning_rate": 0.0002973270579411213, "loss": 3.3568, "step": 71800 }, { "epoch": 17.975, "grad_norm": 0.06129912659525871, "learning_rate": 0.00029732334520907557, "loss": 3.1591, "step": 71900 }, { "epoch": 18.0, "grad_norm": 0.06037045270204544, "learning_rate": 0.0002973195949746859, "loss": 3.3884, "step": 72000 }, { "epoch": 18.025, "grad_norm": 0.059694815427064896, "learning_rate": 0.00029731584474029625, "loss": 3.0768, "step": 72100 }, { "epoch": 18.05, "grad_norm": 0.06282085925340652, "learning_rate": 0.00029731209450590656, "loss": 3.3816, "step": 72200 }, { "epoch": 18.075, "grad_norm": 0.05453978106379509, "learning_rate": 0.00029730834427151693, "loss": 3.1041, "step": 72300 }, { "epoch": 18.1, "grad_norm": 0.0587979331612587, "learning_rate": 0.0002973045940371273, "loss": 3.1357, "step": 72400 }, { "epoch": 18.125, "grad_norm": 0.05731925368309021, "learning_rate": 0.00029730084380273766, "loss": 3.0224, "step": 72500 }, { "epoch": 18.15, "grad_norm": 0.05748147889971733, "learning_rate": 0.00029729709356834797, "loss": 3.1868, "step": 72600 }, { "epoch": 18.175, "grad_norm": 0.2291877716779709, "learning_rate": 0.00029729334333395834, "loss": 3.5219, "step": 72700 }, { "epoch": 18.2, "grad_norm": 0.05291415750980377, "learning_rate": 0.0002972895930995687, "loss": 3.0433, "step": 72800 }, { "epoch": 18.225, "grad_norm": 0.05900726094841957, "learning_rate": 0.00029728584286517907, "loss": 3.2394, "step": 72900 }, { "epoch": 18.25, "grad_norm": 0.05879193916916847, "learning_rate": 0.0002972820926307894, "loss": 3.2482, "step": 73000 }, { "epoch": 18.275, "grad_norm": 0.061925821006298065, "learning_rate": 0.00029727834239639975, "loss": 3.1974, "step": 73100 }, { "epoch": 18.3, "grad_norm": 0.07049068808555603, "learning_rate": 0.0002972745921620101, "loss": 3.2512, "step": 73200 }, { "epoch": 18.325, "grad_norm": 0.06102385371923447, "learning_rate": 0.0002972708419276205, "loss": 3.1982, "step": 73300 }, { "epoch": 18.35, "grad_norm": 0.05520262196660042, "learning_rate": 0.0002972670916932308, "loss": 3.054, "step": 73400 }, { "epoch": 18.375, "grad_norm": 0.05517415702342987, "learning_rate": 0.00029726334145884116, "loss": 3.0914, "step": 73500 }, { "epoch": 18.4, "grad_norm": 0.06400242447853088, "learning_rate": 0.0002972595912244515, "loss": 3.1063, "step": 73600 }, { "epoch": 18.425, "grad_norm": 0.061084117740392685, "learning_rate": 0.0002972558409900619, "loss": 3.1149, "step": 73700 }, { "epoch": 18.45, "grad_norm": 0.09352370351552963, "learning_rate": 0.0002972520907556722, "loss": 3.0725, "step": 73800 }, { "epoch": 18.475, "grad_norm": 0.059218719601631165, "learning_rate": 0.00029724837802362646, "loss": 3.0702, "step": 73900 }, { "epoch": 18.5, "grad_norm": 0.06091728433966637, "learning_rate": 0.0002972446277892368, "loss": 2.8734, "step": 74000 }, { "epoch": 18.525, "grad_norm": 0.056753043085336685, "learning_rate": 0.00029724087755484714, "loss": 3.0829, "step": 74100 }, { "epoch": 18.55, "grad_norm": 0.053419552743434906, "learning_rate": 0.0002972371273204575, "loss": 3.1694, "step": 74200 }, { "epoch": 18.575, "grad_norm": 0.054798588156700134, "learning_rate": 0.0002972333770860679, "loss": 3.1102, "step": 74300 }, { "epoch": 18.6, "grad_norm": 0.058476317673921585, "learning_rate": 0.0002972296268516782, "loss": 3.1152, "step": 74400 }, { "epoch": 18.625, "grad_norm": 0.059114113450050354, "learning_rate": 0.00029722587661728855, "loss": 3.1364, "step": 74500 }, { "epoch": 18.65, "grad_norm": 0.06834947317838669, "learning_rate": 0.0002972221263828989, "loss": 3.126, "step": 74600 }, { "epoch": 18.675, "grad_norm": 0.05191313102841377, "learning_rate": 0.00029721837614850923, "loss": 3.0284, "step": 74700 }, { "epoch": 18.7, "grad_norm": 0.07164154201745987, "learning_rate": 0.0002972146259141196, "loss": 2.9633, "step": 74800 }, { "epoch": 18.725, "grad_norm": 0.05095268040895462, "learning_rate": 0.00029721087567972996, "loss": 3.0032, "step": 74900 }, { "epoch": 18.75, "grad_norm": 0.05199890211224556, "learning_rate": 0.00029720712544534033, "loss": 3.0957, "step": 75000 }, { "epoch": 18.775, "grad_norm": 0.08117477595806122, "learning_rate": 0.00029720337521095064, "loss": 3.0001, "step": 75100 }, { "epoch": 18.8, "grad_norm": 0.05241430178284645, "learning_rate": 0.000297199624976561, "loss": 2.9402, "step": 75200 }, { "epoch": 18.825, "grad_norm": 0.05886770412325859, "learning_rate": 0.0002971958747421714, "loss": 3.095, "step": 75300 }, { "epoch": 18.85, "grad_norm": 0.05727067589759827, "learning_rate": 0.00029719212450778174, "loss": 2.9662, "step": 75400 }, { "epoch": 18.875, "grad_norm": 0.0689665749669075, "learning_rate": 0.00029718837427339205, "loss": 2.7757, "step": 75500 }, { "epoch": 18.9, "grad_norm": 0.05945652350783348, "learning_rate": 0.0002971846240390024, "loss": 2.822, "step": 75600 }, { "epoch": 18.925, "grad_norm": 0.05478528141975403, "learning_rate": 0.00029718087380461273, "loss": 3.0564, "step": 75700 }, { "epoch": 18.95, "grad_norm": 0.05541827157139778, "learning_rate": 0.0002971771235702231, "loss": 2.9453, "step": 75800 }, { "epoch": 18.975, "grad_norm": 0.05722896754741669, "learning_rate": 0.00029717341083817736, "loss": 2.9093, "step": 75900 }, { "epoch": 19.0, "grad_norm": 0.055735573172569275, "learning_rate": 0.0002971696606037877, "loss": 2.9792, "step": 76000 }, { "epoch": 19.025, "grad_norm": 0.05422914773225784, "learning_rate": 0.00029716591036939804, "loss": 2.847, "step": 76100 }, { "epoch": 19.05, "grad_norm": 0.059790875762701035, "learning_rate": 0.0002971621601350084, "loss": 2.9788, "step": 76200 }, { "epoch": 19.075, "grad_norm": 0.07695723325014114, "learning_rate": 0.00029715840990061877, "loss": 2.9956, "step": 76300 }, { "epoch": 19.1, "grad_norm": 0.0579293929040432, "learning_rate": 0.00029715465966622913, "loss": 3.2976, "step": 76400 }, { "epoch": 19.125, "grad_norm": 0.05396733060479164, "learning_rate": 0.00029715090943183945, "loss": 2.9022, "step": 76500 }, { "epoch": 19.15, "grad_norm": 0.056989822536706924, "learning_rate": 0.0002971471591974498, "loss": 2.7142, "step": 76600 }, { "epoch": 19.175, "grad_norm": 0.05296149477362633, "learning_rate": 0.0002971434089630602, "loss": 2.8858, "step": 76700 }, { "epoch": 19.2, "grad_norm": 0.061122532933950424, "learning_rate": 0.00029713965872867054, "loss": 2.7446, "step": 76800 }, { "epoch": 19.225, "grad_norm": 0.05955662950873375, "learning_rate": 0.00029713590849428086, "loss": 2.6995, "step": 76900 }, { "epoch": 19.25, "grad_norm": 0.0610017292201519, "learning_rate": 0.0002971321582598912, "loss": 2.7667, "step": 77000 }, { "epoch": 19.275, "grad_norm": 0.05846131220459938, "learning_rate": 0.0002971284080255016, "loss": 2.7781, "step": 77100 }, { "epoch": 19.3, "grad_norm": 0.05651117116212845, "learning_rate": 0.00029712465779111195, "loss": 2.814, "step": 77200 }, { "epoch": 19.325, "grad_norm": 0.05765095725655556, "learning_rate": 0.00029712090755672227, "loss": 2.7334, "step": 77300 }, { "epoch": 19.35, "grad_norm": 0.0659993514418602, "learning_rate": 0.00029711715732233263, "loss": 2.8981, "step": 77400 }, { "epoch": 19.375, "grad_norm": 0.0573100671172142, "learning_rate": 0.00029711340708794294, "loss": 2.8433, "step": 77500 }, { "epoch": 19.4, "grad_norm": 0.06855395436286926, "learning_rate": 0.0002971096568535533, "loss": 2.9283, "step": 77600 }, { "epoch": 19.425, "grad_norm": 0.05601441487669945, "learning_rate": 0.0002971059066191637, "loss": 2.8565, "step": 77700 }, { "epoch": 19.45, "grad_norm": 0.07347328960895538, "learning_rate": 0.00029710215638477404, "loss": 2.7694, "step": 77800 }, { "epoch": 19.475, "grad_norm": 0.05399454012513161, "learning_rate": 0.00029709844365272825, "loss": 2.8265, "step": 77900 }, { "epoch": 19.5, "grad_norm": 0.05960391089320183, "learning_rate": 0.0002970946934183386, "loss": 2.86, "step": 78000 }, { "epoch": 19.525, "grad_norm": 0.050205573439598083, "learning_rate": 0.000297090943183949, "loss": 2.7896, "step": 78100 }, { "epoch": 19.55, "grad_norm": 0.061351437121629715, "learning_rate": 0.0002970871929495593, "loss": 2.7925, "step": 78200 }, { "epoch": 19.575, "grad_norm": 0.05008727312088013, "learning_rate": 0.00029708344271516966, "loss": 2.745, "step": 78300 }, { "epoch": 19.6, "grad_norm": 0.05771077796816826, "learning_rate": 0.0002970797299831239, "loss": 2.8652, "step": 78400 }, { "epoch": 19.625, "grad_norm": 0.053159620612859726, "learning_rate": 0.0002970759797487343, "loss": 2.8612, "step": 78500 }, { "epoch": 19.65, "grad_norm": 0.05607482045888901, "learning_rate": 0.0002970722295143446, "loss": 2.8825, "step": 78600 }, { "epoch": 19.675, "grad_norm": 0.05175361409783363, "learning_rate": 0.00029706847927995497, "loss": 2.9037, "step": 78700 }, { "epoch": 19.7, "grad_norm": 0.059691160917282104, "learning_rate": 0.00029706472904556533, "loss": 2.745, "step": 78800 }, { "epoch": 19.725, "grad_norm": 0.062432222068309784, "learning_rate": 0.0002970609788111757, "loss": 2.6383, "step": 78900 }, { "epoch": 19.75, "grad_norm": 0.06708359718322754, "learning_rate": 0.000297057228576786, "loss": 2.5807, "step": 79000 }, { "epoch": 19.775, "grad_norm": 0.060443244874477386, "learning_rate": 0.0002970534783423964, "loss": 2.7167, "step": 79100 }, { "epoch": 19.8, "grad_norm": 0.060145530849695206, "learning_rate": 0.0002970497281080067, "loss": 2.7283, "step": 79200 }, { "epoch": 19.825, "grad_norm": 0.06600401550531387, "learning_rate": 0.00029704597787361705, "loss": 2.8868, "step": 79300 }, { "epoch": 19.85, "grad_norm": 0.0514482781291008, "learning_rate": 0.0002970422276392274, "loss": 2.59, "step": 79400 }, { "epoch": 19.875, "grad_norm": 0.07618112862110138, "learning_rate": 0.0002970384774048378, "loss": 2.5086, "step": 79500 }, { "epoch": 19.9, "grad_norm": 0.05636357143521309, "learning_rate": 0.0002970347271704481, "loss": 2.7034, "step": 79600 }, { "epoch": 19.925, "grad_norm": 0.056812651455402374, "learning_rate": 0.00029703097693605847, "loss": 2.6996, "step": 79700 }, { "epoch": 19.95, "grad_norm": 0.07078476995229721, "learning_rate": 0.00029702722670166883, "loss": 2.8029, "step": 79800 }, { "epoch": 19.975, "grad_norm": 0.055067744106054306, "learning_rate": 0.0002970234764672792, "loss": 2.8455, "step": 79900 }, { "epoch": 20.0, "grad_norm": 0.054148148745298386, "learning_rate": 0.0002970197262328895, "loss": 2.7438, "step": 80000 }, { "epoch": 20.025, "grad_norm": 0.0576615035533905, "learning_rate": 0.0002970159759984999, "loss": 2.7244, "step": 80100 }, { "epoch": 20.05, "grad_norm": 0.05849044770002365, "learning_rate": 0.00029701222576411024, "loss": 2.6015, "step": 80200 }, { "epoch": 20.075, "grad_norm": 0.05542527511715889, "learning_rate": 0.0002970084755297206, "loss": 2.6276, "step": 80300 }, { "epoch": 20.1, "grad_norm": 0.06275394558906555, "learning_rate": 0.0002970047252953309, "loss": 2.601, "step": 80400 }, { "epoch": 20.125, "grad_norm": 0.05756799504160881, "learning_rate": 0.0002970009750609413, "loss": 2.6095, "step": 80500 }, { "epoch": 20.15, "grad_norm": 0.05315446853637695, "learning_rate": 0.00029699722482655165, "loss": 2.8117, "step": 80600 }, { "epoch": 20.175, "grad_norm": 0.06292139738798141, "learning_rate": 0.000296993474592162, "loss": 2.5364, "step": 80700 }, { "epoch": 20.2, "grad_norm": 0.05451088026165962, "learning_rate": 0.00029698972435777233, "loss": 2.6838, "step": 80800 }, { "epoch": 20.225, "grad_norm": 0.05063945800065994, "learning_rate": 0.0002969859741233827, "loss": 2.573, "step": 80900 }, { "epoch": 20.25, "grad_norm": 0.058889806270599365, "learning_rate": 0.000296982223888993, "loss": 2.4947, "step": 81000 }, { "epoch": 20.275, "grad_norm": 0.07975181192159653, "learning_rate": 0.0002969784736546034, "loss": 2.5364, "step": 81100 }, { "epoch": 20.3, "grad_norm": 0.05763572081923485, "learning_rate": 0.00029697472342021374, "loss": 2.4907, "step": 81200 }, { "epoch": 20.325, "grad_norm": 0.05867898836731911, "learning_rate": 0.0002969709731858241, "loss": 2.5361, "step": 81300 }, { "epoch": 20.35, "grad_norm": 0.0528886653482914, "learning_rate": 0.0002969672229514344, "loss": 2.6669, "step": 81400 }, { "epoch": 20.375, "grad_norm": 0.060931917279958725, "learning_rate": 0.0002969634727170448, "loss": 2.4697, "step": 81500 }, { "epoch": 20.4, "grad_norm": 0.05871622636914253, "learning_rate": 0.00029695972248265515, "loss": 2.4717, "step": 81600 }, { "epoch": 20.425, "grad_norm": 0.060853052884340286, "learning_rate": 0.0002969559722482655, "loss": 2.5891, "step": 81700 }, { "epoch": 20.45, "grad_norm": 0.052957359701395035, "learning_rate": 0.00029695222201387583, "loss": 2.5919, "step": 81800 }, { "epoch": 20.475, "grad_norm": 0.054768215864896774, "learning_rate": 0.0002969484717794862, "loss": 2.5348, "step": 81900 }, { "epoch": 20.5, "grad_norm": 0.049939971417188644, "learning_rate": 0.00029694472154509656, "loss": 2.6501, "step": 82000 }, { "epoch": 20.525, "grad_norm": 0.056562915444374084, "learning_rate": 0.0002969409713107069, "loss": 2.6031, "step": 82100 }, { "epoch": 20.55, "grad_norm": 0.05061310529708862, "learning_rate": 0.00029693722107631724, "loss": 2.5924, "step": 82200 }, { "epoch": 20.575, "grad_norm": 0.05474073067307472, "learning_rate": 0.0002969334708419276, "loss": 2.7109, "step": 82300 }, { "epoch": 20.6, "grad_norm": 0.062750443816185, "learning_rate": 0.00029692975810988187, "loss": 2.5636, "step": 82400 }, { "epoch": 20.625, "grad_norm": 0.05921516939997673, "learning_rate": 0.0002969260078754922, "loss": 2.4478, "step": 82500 }, { "epoch": 20.65, "grad_norm": 0.06074066460132599, "learning_rate": 0.00029692225764110254, "loss": 2.5207, "step": 82600 }, { "epoch": 20.675, "grad_norm": 0.06394727528095245, "learning_rate": 0.00029691850740671286, "loss": 2.7291, "step": 82700 }, { "epoch": 20.7, "grad_norm": 0.06293661147356033, "learning_rate": 0.0002969147571723232, "loss": 2.5454, "step": 82800 }, { "epoch": 20.725, "grad_norm": 0.049685824662446976, "learning_rate": 0.0002969110069379336, "loss": 2.7017, "step": 82900 }, { "epoch": 20.75, "grad_norm": 0.0517297200858593, "learning_rate": 0.00029690725670354396, "loss": 2.5524, "step": 83000 }, { "epoch": 20.775, "grad_norm": 0.061634670943021774, "learning_rate": 0.00029690350646915427, "loss": 2.4389, "step": 83100 }, { "epoch": 20.8, "grad_norm": 0.06085900962352753, "learning_rate": 0.00029689975623476463, "loss": 2.4254, "step": 83200 }, { "epoch": 20.825, "grad_norm": 0.05363364890217781, "learning_rate": 0.000296896006000375, "loss": 2.3591, "step": 83300 }, { "epoch": 20.85, "grad_norm": 0.051609691232442856, "learning_rate": 0.00029689225576598537, "loss": 2.5282, "step": 83400 }, { "epoch": 20.875, "grad_norm": 0.04989041015505791, "learning_rate": 0.0002968885055315957, "loss": 2.537, "step": 83500 }, { "epoch": 20.9, "grad_norm": 0.053229689598083496, "learning_rate": 0.00029688475529720604, "loss": 2.5949, "step": 83600 }, { "epoch": 20.925, "grad_norm": 0.05230165645480156, "learning_rate": 0.0002968810050628164, "loss": 2.4183, "step": 83700 }, { "epoch": 20.95, "grad_norm": 0.05094073340296745, "learning_rate": 0.0002968772548284268, "loss": 2.6191, "step": 83800 }, { "epoch": 20.975, "grad_norm": 0.05941576883196831, "learning_rate": 0.0002968735045940371, "loss": 2.3788, "step": 83900 }, { "epoch": 21.0, "grad_norm": 0.05283214896917343, "learning_rate": 0.00029686975435964745, "loss": 2.5303, "step": 84000 }, { "epoch": 21.025, "grad_norm": 0.06153716892004013, "learning_rate": 0.0002968660041252578, "loss": 2.4201, "step": 84100 }, { "epoch": 21.05, "grad_norm": 0.05074555054306984, "learning_rate": 0.0002968622538908682, "loss": 2.4179, "step": 84200 }, { "epoch": 21.075, "grad_norm": 0.05797216296195984, "learning_rate": 0.0002968585036564785, "loss": 2.3018, "step": 84300 }, { "epoch": 21.1, "grad_norm": 0.053176261484622955, "learning_rate": 0.00029685475342208886, "loss": 2.4499, "step": 84400 }, { "epoch": 21.125, "grad_norm": 0.0612250491976738, "learning_rate": 0.00029685104069004307, "loss": 2.5186, "step": 84500 }, { "epoch": 21.15, "grad_norm": 0.055981434881687164, "learning_rate": 0.00029684729045565344, "loss": 2.3994, "step": 84600 }, { "epoch": 21.175, "grad_norm": 0.07191935181617737, "learning_rate": 0.0002968435402212638, "loss": 2.4054, "step": 84700 }, { "epoch": 21.2, "grad_norm": 0.05557156354188919, "learning_rate": 0.00029683978998687417, "loss": 2.3924, "step": 84800 }, { "epoch": 21.225, "grad_norm": 0.06246166303753853, "learning_rate": 0.0002968360397524845, "loss": 2.4453, "step": 84900 }, { "epoch": 21.25, "grad_norm": 0.061136774718761444, "learning_rate": 0.00029683228951809485, "loss": 2.3465, "step": 85000 }, { "epoch": 21.275, "grad_norm": 0.06496226042509079, "learning_rate": 0.0002968285392837052, "loss": 2.356, "step": 85100 }, { "epoch": 21.3, "grad_norm": 0.10879474133253098, "learning_rate": 0.0002968247890493156, "loss": 2.3113, "step": 85200 }, { "epoch": 21.325, "grad_norm": 0.07896184921264648, "learning_rate": 0.0002968210388149259, "loss": 2.3167, "step": 85300 }, { "epoch": 21.35, "grad_norm": 0.05807124823331833, "learning_rate": 0.00029681728858053626, "loss": 2.464, "step": 85400 }, { "epoch": 21.375, "grad_norm": 0.05621746554970741, "learning_rate": 0.0002968135383461466, "loss": 2.4666, "step": 85500 }, { "epoch": 21.4, "grad_norm": 0.06423439085483551, "learning_rate": 0.000296809788111757, "loss": 2.4151, "step": 85600 }, { "epoch": 21.425, "grad_norm": 0.053314123302698135, "learning_rate": 0.0002968060378773673, "loss": 2.5222, "step": 85700 }, { "epoch": 21.45, "grad_norm": 0.060538969933986664, "learning_rate": 0.00029680228764297767, "loss": 2.2422, "step": 85800 }, { "epoch": 21.475, "grad_norm": 0.05905874818563461, "learning_rate": 0.00029679853740858804, "loss": 2.2856, "step": 85900 }, { "epoch": 21.5, "grad_norm": 0.05516530200839043, "learning_rate": 0.00029679478717419835, "loss": 2.3191, "step": 86000 }, { "epoch": 21.525, "grad_norm": 0.06160394474864006, "learning_rate": 0.0002967910369398087, "loss": 2.3382, "step": 86100 }, { "epoch": 21.55, "grad_norm": 0.05599430948495865, "learning_rate": 0.000296787286705419, "loss": 2.4985, "step": 86200 }, { "epoch": 21.575, "grad_norm": 0.06205850839614868, "learning_rate": 0.0002967835364710294, "loss": 2.4363, "step": 86300 }, { "epoch": 21.6, "grad_norm": 0.05747246369719505, "learning_rate": 0.00029677978623663976, "loss": 2.3009, "step": 86400 }, { "epoch": 21.625, "grad_norm": 0.05334313213825226, "learning_rate": 0.000296776073504594, "loss": 2.213, "step": 86500 }, { "epoch": 21.65, "grad_norm": 0.05755939334630966, "learning_rate": 0.00029677232327020433, "loss": 2.3473, "step": 86600 }, { "epoch": 21.675, "grad_norm": 0.06077682599425316, "learning_rate": 0.0002967685730358147, "loss": 2.3133, "step": 86700 }, { "epoch": 21.7, "grad_norm": 0.04741760715842247, "learning_rate": 0.00029676482280142506, "loss": 2.2298, "step": 86800 }, { "epoch": 21.725, "grad_norm": 0.05226515606045723, "learning_rate": 0.00029676107256703543, "loss": 2.3709, "step": 86900 }, { "epoch": 21.75, "grad_norm": 0.05925588309764862, "learning_rate": 0.00029675732233264574, "loss": 2.3128, "step": 87000 }, { "epoch": 21.775, "grad_norm": 0.05521254613995552, "learning_rate": 0.0002967535720982561, "loss": 2.1846, "step": 87100 }, { "epoch": 21.8, "grad_norm": 0.058398790657520294, "learning_rate": 0.0002967498218638665, "loss": 2.2529, "step": 87200 }, { "epoch": 21.825, "grad_norm": 0.051581237465143204, "learning_rate": 0.00029674607162947684, "loss": 2.3331, "step": 87300 }, { "epoch": 21.85, "grad_norm": 0.046482495963573456, "learning_rate": 0.00029674232139508715, "loss": 2.3946, "step": 87400 }, { "epoch": 21.875, "grad_norm": 0.053977347910404205, "learning_rate": 0.0002967385711606975, "loss": 2.3074, "step": 87500 }, { "epoch": 21.9, "grad_norm": 0.0516643263399601, "learning_rate": 0.0002967348209263079, "loss": 2.3192, "step": 87600 }, { "epoch": 21.925, "grad_norm": 0.04839833453297615, "learning_rate": 0.00029673107069191825, "loss": 2.2164, "step": 87700 }, { "epoch": 21.95, "grad_norm": 0.05504479259252548, "learning_rate": 0.00029672732045752856, "loss": 2.3114, "step": 87800 }, { "epoch": 21.975, "grad_norm": 0.05117473378777504, "learning_rate": 0.00029672357022313893, "loss": 2.2976, "step": 87900 }, { "epoch": 22.0, "grad_norm": 0.052601177245378494, "learning_rate": 0.00029671981998874924, "loss": 2.4827, "step": 88000 }, { "epoch": 22.025, "grad_norm": 0.04800357297062874, "learning_rate": 0.0002967160697543596, "loss": 2.2798, "step": 88100 }, { "epoch": 22.05, "grad_norm": 0.06387566775083542, "learning_rate": 0.00029671231951996997, "loss": 2.2325, "step": 88200 }, { "epoch": 22.075, "grad_norm": 0.05719434469938278, "learning_rate": 0.00029670856928558034, "loss": 2.2685, "step": 88300 }, { "epoch": 22.1, "grad_norm": 0.05765566602349281, "learning_rate": 0.00029670481905119065, "loss": 2.1859, "step": 88400 }, { "epoch": 22.125, "grad_norm": 0.06396758556365967, "learning_rate": 0.000296701068816801, "loss": 2.4629, "step": 88500 }, { "epoch": 22.15, "grad_norm": 0.04949299618601799, "learning_rate": 0.0002966973185824114, "loss": 2.2405, "step": 88600 }, { "epoch": 22.175, "grad_norm": 0.04977158457040787, "learning_rate": 0.00029669356834802175, "loss": 2.137, "step": 88700 }, { "epoch": 22.2, "grad_norm": 0.06776726990938187, "learning_rate": 0.00029668981811363206, "loss": 2.1948, "step": 88800 }, { "epoch": 22.225, "grad_norm": 0.05846365541219711, "learning_rate": 0.0002966860678792424, "loss": 2.0921, "step": 88900 }, { "epoch": 22.25, "grad_norm": 0.05889894440770149, "learning_rate": 0.0002966823176448528, "loss": 2.3352, "step": 89000 }, { "epoch": 22.275, "grad_norm": 0.04690111055970192, "learning_rate": 0.00029667856741046316, "loss": 2.3157, "step": 89100 }, { "epoch": 22.3, "grad_norm": 0.05615220591425896, "learning_rate": 0.00029667481717607347, "loss": 2.1161, "step": 89200 }, { "epoch": 22.325, "grad_norm": 0.0551600381731987, "learning_rate": 0.00029667106694168384, "loss": 2.125, "step": 89300 }, { "epoch": 22.35, "grad_norm": 0.050111789256334305, "learning_rate": 0.0002966673167072942, "loss": 2.1135, "step": 89400 }, { "epoch": 22.375, "grad_norm": 0.05537761375308037, "learning_rate": 0.00029666356647290457, "loss": 2.1623, "step": 89500 }, { "epoch": 22.4, "grad_norm": 0.0577760748565197, "learning_rate": 0.0002966598162385149, "loss": 2.1871, "step": 89600 }, { "epoch": 22.425, "grad_norm": 0.05141003802418709, "learning_rate": 0.00029665606600412525, "loss": 2.1437, "step": 89700 }, { "epoch": 22.45, "grad_norm": 0.05164093151688576, "learning_rate": 0.00029665231576973556, "loss": 2.2704, "step": 89800 }, { "epoch": 22.475, "grad_norm": 0.051070958375930786, "learning_rate": 0.0002966485655353459, "loss": 2.2791, "step": 89900 }, { "epoch": 22.5, "grad_norm": 0.054080720990896225, "learning_rate": 0.0002966448153009563, "loss": 2.1997, "step": 90000 }, { "epoch": 22.525, "grad_norm": 0.057264506816864014, "learning_rate": 0.0002966410650665666, "loss": 2.1997, "step": 90100 }, { "epoch": 22.55, "grad_norm": 0.0729178935289383, "learning_rate": 0.00029663731483217697, "loss": 2.1692, "step": 90200 }, { "epoch": 22.575, "grad_norm": 0.05248183757066727, "learning_rate": 0.00029663356459778734, "loss": 2.1341, "step": 90300 }, { "epoch": 22.6, "grad_norm": 0.05090828239917755, "learning_rate": 0.0002966298143633977, "loss": 2.2374, "step": 90400 }, { "epoch": 22.625, "grad_norm": 0.12061487883329391, "learning_rate": 0.0002966261016313519, "loss": 2.1671, "step": 90500 }, { "epoch": 22.65, "grad_norm": 0.06009404733777046, "learning_rate": 0.0002966223513969623, "loss": 2.2945, "step": 90600 }, { "epoch": 22.675, "grad_norm": 0.06756783276796341, "learning_rate": 0.00029661860116257264, "loss": 2.2064, "step": 90700 }, { "epoch": 22.7, "grad_norm": 0.04783422127366066, "learning_rate": 0.000296614850928183, "loss": 2.1548, "step": 90800 }, { "epoch": 22.725, "grad_norm": 0.06468702852725983, "learning_rate": 0.0002966111006937933, "loss": 2.0389, "step": 90900 }, { "epoch": 22.75, "grad_norm": 0.05485010892152786, "learning_rate": 0.0002966073504594037, "loss": 2.1214, "step": 91000 }, { "epoch": 22.775, "grad_norm": 0.05827448144555092, "learning_rate": 0.00029660360022501405, "loss": 2.2367, "step": 91100 }, { "epoch": 22.8, "grad_norm": 0.054152172058820724, "learning_rate": 0.0002965998499906244, "loss": 2.1022, "step": 91200 }, { "epoch": 22.825, "grad_norm": 0.04739788547158241, "learning_rate": 0.00029659609975623473, "loss": 2.1672, "step": 91300 }, { "epoch": 22.85, "grad_norm": 0.05551367625594139, "learning_rate": 0.0002965923495218451, "loss": 2.05, "step": 91400 }, { "epoch": 22.875, "grad_norm": 0.05317440256476402, "learning_rate": 0.0002965885992874554, "loss": 2.012, "step": 91500 }, { "epoch": 22.9, "grad_norm": 0.053941987454891205, "learning_rate": 0.00029658488655540967, "loss": 2.1268, "step": 91600 }, { "epoch": 22.925, "grad_norm": 0.05108709633350372, "learning_rate": 0.00029658113632102004, "loss": 2.1342, "step": 91700 }, { "epoch": 22.95, "grad_norm": 0.052761614322662354, "learning_rate": 0.0002965773860866304, "loss": 2.08, "step": 91800 }, { "epoch": 22.975, "grad_norm": 0.05674518644809723, "learning_rate": 0.0002965736358522407, "loss": 2.1533, "step": 91900 }, { "epoch": 23.0, "grad_norm": 0.06261865794658661, "learning_rate": 0.0002965698856178511, "loss": 2.0382, "step": 92000 }, { "epoch": 23.025, "grad_norm": 0.04918836057186127, "learning_rate": 0.00029656613538346145, "loss": 2.0315, "step": 92100 }, { "epoch": 23.05, "grad_norm": 0.04982222989201546, "learning_rate": 0.0002965623851490718, "loss": 2.1285, "step": 92200 }, { "epoch": 23.075, "grad_norm": 0.051534924656152725, "learning_rate": 0.0002965586349146821, "loss": 2.1746, "step": 92300 }, { "epoch": 23.1, "grad_norm": 0.059025805443525314, "learning_rate": 0.0002965548846802925, "loss": 2.1339, "step": 92400 }, { "epoch": 23.125, "grad_norm": 0.05158498138189316, "learning_rate": 0.00029655113444590286, "loss": 2.049, "step": 92500 }, { "epoch": 23.15, "grad_norm": 0.049751464277505875, "learning_rate": 0.0002965473842115132, "loss": 2.0587, "step": 92600 }, { "epoch": 23.175, "grad_norm": 0.05357548967003822, "learning_rate": 0.00029654363397712353, "loss": 2.1765, "step": 92700 }, { "epoch": 23.2, "grad_norm": 0.05639924481511116, "learning_rate": 0.0002965398837427339, "loss": 2.0229, "step": 92800 }, { "epoch": 23.225, "grad_norm": 0.057067710906267166, "learning_rate": 0.00029653613350834427, "loss": 2.1208, "step": 92900 }, { "epoch": 23.25, "grad_norm": 0.056406810879707336, "learning_rate": 0.00029653238327395463, "loss": 2.1044, "step": 93000 }, { "epoch": 23.275, "grad_norm": 0.05794864147901535, "learning_rate": 0.00029652863303956495, "loss": 1.9575, "step": 93100 }, { "epoch": 23.3, "grad_norm": 0.059239715337753296, "learning_rate": 0.0002965248828051753, "loss": 2.1206, "step": 93200 }, { "epoch": 23.325, "grad_norm": 0.05163438990712166, "learning_rate": 0.0002965211325707856, "loss": 1.9799, "step": 93300 }, { "epoch": 23.35, "grad_norm": 0.05853526294231415, "learning_rate": 0.000296517382336396, "loss": 2.0314, "step": 93400 }, { "epoch": 23.375, "grad_norm": 0.04642421007156372, "learning_rate": 0.00029651363210200636, "loss": 2.0252, "step": 93500 }, { "epoch": 23.4, "grad_norm": 0.05934316664934158, "learning_rate": 0.0002965098818676167, "loss": 2.0889, "step": 93600 }, { "epoch": 23.425, "grad_norm": 0.05159417912364006, "learning_rate": 0.00029650613163322703, "loss": 2.0017, "step": 93700 }, { "epoch": 23.45, "grad_norm": 0.04541020095348358, "learning_rate": 0.0002965023813988374, "loss": 2.0592, "step": 93800 }, { "epoch": 23.475, "grad_norm": 0.05421976000070572, "learning_rate": 0.00029649863116444777, "loss": 1.9184, "step": 93900 }, { "epoch": 23.5, "grad_norm": 0.05134705454111099, "learning_rate": 0.0002964948809300581, "loss": 2.2841, "step": 94000 }, { "epoch": 23.525, "grad_norm": 0.050796929746866226, "learning_rate": 0.00029649113069566844, "loss": 1.9773, "step": 94100 }, { "epoch": 23.55, "grad_norm": 0.062260136008262634, "learning_rate": 0.0002964873804612788, "loss": 2.1259, "step": 94200 }, { "epoch": 23.575, "grad_norm": 0.051263660192489624, "learning_rate": 0.0002964836302268892, "loss": 1.996, "step": 94300 }, { "epoch": 23.6, "grad_norm": 0.052974916994571686, "learning_rate": 0.0002964798799924995, "loss": 2.0231, "step": 94400 }, { "epoch": 23.625, "grad_norm": 0.06232937052845955, "learning_rate": 0.00029647612975810985, "loss": 1.9196, "step": 94500 }, { "epoch": 23.65, "grad_norm": 0.05306218937039375, "learning_rate": 0.0002964723795237202, "loss": 1.9388, "step": 94600 }, { "epoch": 23.675, "grad_norm": 0.05512924864888191, "learning_rate": 0.0002964686292893306, "loss": 2.1401, "step": 94700 }, { "epoch": 23.7, "grad_norm": 0.056388285011053085, "learning_rate": 0.0002964648790549409, "loss": 2.0013, "step": 94800 }, { "epoch": 23.725, "grad_norm": 0.05032140389084816, "learning_rate": 0.00029646112882055126, "loss": 1.9568, "step": 94900 }, { "epoch": 23.75, "grad_norm": 0.04757603630423546, "learning_rate": 0.0002964573785861616, "loss": 1.8944, "step": 95000 }, { "epoch": 23.775, "grad_norm": 0.05020546913146973, "learning_rate": 0.00029645362835177194, "loss": 2.0146, "step": 95100 }, { "epoch": 23.8, "grad_norm": 0.056530579924583435, "learning_rate": 0.0002964498781173823, "loss": 1.9345, "step": 95200 }, { "epoch": 23.825, "grad_norm": 0.07894182950258255, "learning_rate": 0.0002964461278829927, "loss": 2.1116, "step": 95300 }, { "epoch": 23.85, "grad_norm": 0.05175475776195526, "learning_rate": 0.000296442377648603, "loss": 2.1331, "step": 95400 }, { "epoch": 23.875, "grad_norm": 0.05405741557478905, "learning_rate": 0.00029643862741421335, "loss": 1.8724, "step": 95500 }, { "epoch": 23.9, "grad_norm": 0.06405475735664368, "learning_rate": 0.0002964349146821676, "loss": 1.8652, "step": 95600 }, { "epoch": 23.925, "grad_norm": 0.0548410564661026, "learning_rate": 0.000296431164447778, "loss": 1.9177, "step": 95700 }, { "epoch": 23.95, "grad_norm": 0.04941118508577347, "learning_rate": 0.0002964274142133883, "loss": 1.981, "step": 95800 }, { "epoch": 23.975, "grad_norm": 0.06233079358935356, "learning_rate": 0.00029642366397899866, "loss": 1.886, "step": 95900 }, { "epoch": 24.0, "grad_norm": 0.06110682711005211, "learning_rate": 0.000296419913744609, "loss": 1.906, "step": 96000 }, { "epoch": 24.025, "grad_norm": 0.056876040995121, "learning_rate": 0.0002964161635102194, "loss": 1.9632, "step": 96100 }, { "epoch": 24.05, "grad_norm": 0.056007348001003265, "learning_rate": 0.0002964124132758297, "loss": 1.8518, "step": 96200 }, { "epoch": 24.075, "grad_norm": 0.052707262337207794, "learning_rate": 0.00029640866304144007, "loss": 2.1039, "step": 96300 }, { "epoch": 24.1, "grad_norm": 0.05575592815876007, "learning_rate": 0.00029640491280705044, "loss": 1.8103, "step": 96400 }, { "epoch": 24.125, "grad_norm": 0.05587482079863548, "learning_rate": 0.0002964011625726608, "loss": 1.9645, "step": 96500 }, { "epoch": 24.15, "grad_norm": 0.08619283139705658, "learning_rate": 0.0002963974123382711, "loss": 1.9429, "step": 96600 }, { "epoch": 24.175, "grad_norm": 0.09571905434131622, "learning_rate": 0.0002963936621038815, "loss": 1.902, "step": 96700 }, { "epoch": 24.2, "grad_norm": 0.050410255789756775, "learning_rate": 0.0002963899118694918, "loss": 2.0446, "step": 96800 }, { "epoch": 24.225, "grad_norm": 0.060695916414260864, "learning_rate": 0.00029638616163510216, "loss": 1.9231, "step": 96900 }, { "epoch": 24.25, "grad_norm": 0.05033661425113678, "learning_rate": 0.0002963824114007125, "loss": 1.9065, "step": 97000 }, { "epoch": 24.275, "grad_norm": 0.05458163470029831, "learning_rate": 0.0002963786611663229, "loss": 1.858, "step": 97100 }, { "epoch": 24.3, "grad_norm": 0.05258990451693535, "learning_rate": 0.0002963749109319332, "loss": 2.0328, "step": 97200 }, { "epoch": 24.325, "grad_norm": 0.04619702324271202, "learning_rate": 0.00029637116069754357, "loss": 1.8548, "step": 97300 }, { "epoch": 24.35, "grad_norm": 0.06743716448545456, "learning_rate": 0.00029636741046315393, "loss": 1.9381, "step": 97400 }, { "epoch": 24.375, "grad_norm": 0.049068696796894073, "learning_rate": 0.0002963636602287643, "loss": 1.9359, "step": 97500 }, { "epoch": 24.4, "grad_norm": 0.061207227408885956, "learning_rate": 0.0002963599474967185, "loss": 1.8927, "step": 97600 }, { "epoch": 24.425, "grad_norm": 0.05484483018517494, "learning_rate": 0.0002963561972623289, "loss": 1.88, "step": 97700 }, { "epoch": 24.45, "grad_norm": 0.057467181235551834, "learning_rate": 0.00029635244702793924, "loss": 1.856, "step": 97800 }, { "epoch": 24.475, "grad_norm": 0.049861736595630646, "learning_rate": 0.00029634869679354955, "loss": 2.0343, "step": 97900 }, { "epoch": 24.5, "grad_norm": 0.049673888832330704, "learning_rate": 0.0002963449465591599, "loss": 1.8138, "step": 98000 }, { "epoch": 24.525, "grad_norm": 0.06320221722126007, "learning_rate": 0.0002963411963247703, "loss": 1.9389, "step": 98100 }, { "epoch": 24.55, "grad_norm": 0.0863277018070221, "learning_rate": 0.00029633744609038065, "loss": 1.9127, "step": 98200 }, { "epoch": 24.575, "grad_norm": 0.04973394796252251, "learning_rate": 0.00029633369585599096, "loss": 1.8468, "step": 98300 }, { "epoch": 24.6, "grad_norm": 0.061264049261808395, "learning_rate": 0.00029632994562160133, "loss": 1.9194, "step": 98400 }, { "epoch": 24.625, "grad_norm": 0.05264371261000633, "learning_rate": 0.00029632619538721164, "loss": 1.8896, "step": 98500 }, { "epoch": 24.65, "grad_norm": 0.054599445313215256, "learning_rate": 0.000296322445152822, "loss": 1.9001, "step": 98600 }, { "epoch": 24.675, "grad_norm": 0.05259576812386513, "learning_rate": 0.00029631869491843237, "loss": 1.8258, "step": 98700 }, { "epoch": 24.7, "grad_norm": 0.05342064052820206, "learning_rate": 0.00029631494468404274, "loss": 1.926, "step": 98800 }, { "epoch": 24.725, "grad_norm": 0.04714656248688698, "learning_rate": 0.00029631119444965305, "loss": 1.8823, "step": 98900 }, { "epoch": 24.75, "grad_norm": 0.050276800990104675, "learning_rate": 0.0002963074442152634, "loss": 1.823, "step": 99000 }, { "epoch": 24.775, "grad_norm": 0.051686566323041916, "learning_rate": 0.0002963036939808738, "loss": 1.8796, "step": 99100 }, { "epoch": 24.8, "grad_norm": 0.051118552684783936, "learning_rate": 0.00029629994374648415, "loss": 1.9002, "step": 99200 }, { "epoch": 24.825, "grad_norm": 0.05065715312957764, "learning_rate": 0.00029629619351209446, "loss": 1.868, "step": 99300 }, { "epoch": 24.85, "grad_norm": 0.043341364711523056, "learning_rate": 0.00029629244327770483, "loss": 1.9614, "step": 99400 }, { "epoch": 24.875, "grad_norm": 0.052784670144319534, "learning_rate": 0.0002962886930433152, "loss": 1.9323, "step": 99500 }, { "epoch": 24.9, "grad_norm": 0.055045951157808304, "learning_rate": 0.00029628494280892556, "loss": 1.8218, "step": 99600 }, { "epoch": 24.925, "grad_norm": 0.058140724897384644, "learning_rate": 0.00029628123007687977, "loss": 1.8894, "step": 99700 }, { "epoch": 24.95, "grad_norm": 0.058738358318805695, "learning_rate": 0.00029627747984249013, "loss": 1.7708, "step": 99800 }, { "epoch": 24.975, "grad_norm": 0.05485925078392029, "learning_rate": 0.0002962737296081005, "loss": 1.9136, "step": 99900 }, { "epoch": 25.0, "grad_norm": 0.05562080442905426, "learning_rate": 0.00029626997937371087, "loss": 1.9072, "step": 100000 }, { "epoch": 25.025, "grad_norm": 0.04997032880783081, "learning_rate": 0.0002962662291393212, "loss": 1.7119, "step": 100100 }, { "epoch": 25.05, "grad_norm": 0.05290250480175018, "learning_rate": 0.00029626247890493154, "loss": 1.706, "step": 100200 }, { "epoch": 25.075, "grad_norm": 0.04861506074666977, "learning_rate": 0.00029625872867054186, "loss": 1.7061, "step": 100300 }, { "epoch": 25.1, "grad_norm": 0.05706246569752693, "learning_rate": 0.0002962549784361522, "loss": 1.9067, "step": 100400 }, { "epoch": 25.125, "grad_norm": 0.055538617074489594, "learning_rate": 0.0002962512282017626, "loss": 1.8622, "step": 100500 }, { "epoch": 25.15, "grad_norm": 0.06384219229221344, "learning_rate": 0.00029624747796737295, "loss": 1.7935, "step": 100600 }, { "epoch": 25.175, "grad_norm": 0.057620443403720856, "learning_rate": 0.00029624372773298327, "loss": 1.8746, "step": 100700 }, { "epoch": 25.2, "grad_norm": 0.05917825549840927, "learning_rate": 0.00029623997749859363, "loss": 1.7152, "step": 100800 }, { "epoch": 25.225, "grad_norm": 0.061573103070259094, "learning_rate": 0.000296236227264204, "loss": 1.8928, "step": 100900 }, { "epoch": 25.25, "grad_norm": 0.04456368088722229, "learning_rate": 0.00029623247702981436, "loss": 1.798, "step": 101000 }, { "epoch": 25.275, "grad_norm": 0.06028895452618599, "learning_rate": 0.0002962287267954247, "loss": 1.8044, "step": 101100 }, { "epoch": 25.3, "grad_norm": 0.0548817440867424, "learning_rate": 0.00029622497656103504, "loss": 1.9204, "step": 101200 }, { "epoch": 25.325, "grad_norm": 0.045852452516555786, "learning_rate": 0.0002962212263266454, "loss": 1.924, "step": 101300 }, { "epoch": 25.35, "grad_norm": 0.04782922565937042, "learning_rate": 0.0002962174760922558, "loss": 1.7096, "step": 101400 }, { "epoch": 25.375, "grad_norm": 0.049990586936473846, "learning_rate": 0.0002962137258578661, "loss": 1.9654, "step": 101500 }, { "epoch": 25.4, "grad_norm": 0.04626760631799698, "learning_rate": 0.0002962099756234764, "loss": 1.7223, "step": 101600 }, { "epoch": 25.425, "grad_norm": 0.054343245923519135, "learning_rate": 0.0002962062253890868, "loss": 1.85, "step": 101700 }, { "epoch": 25.45, "grad_norm": 0.04563869535923004, "learning_rate": 0.000296202512657041, "loss": 1.8011, "step": 101800 }, { "epoch": 25.475, "grad_norm": 0.05334710702300072, "learning_rate": 0.0002961987624226514, "loss": 1.7863, "step": 101900 }, { "epoch": 25.5, "grad_norm": 0.05533549562096596, "learning_rate": 0.0002961950121882617, "loss": 1.7575, "step": 102000 }, { "epoch": 25.525, "grad_norm": 0.05645955726504326, "learning_rate": 0.00029619126195387207, "loss": 1.6948, "step": 102100 }, { "epoch": 25.55, "grad_norm": 0.05024164915084839, "learning_rate": 0.00029618751171948244, "loss": 1.6452, "step": 102200 }, { "epoch": 25.575, "grad_norm": 0.051269952207803726, "learning_rate": 0.0002961837614850928, "loss": 1.7991, "step": 102300 }, { "epoch": 25.6, "grad_norm": 0.05763736367225647, "learning_rate": 0.0002961800112507031, "loss": 1.7634, "step": 102400 }, { "epoch": 25.625, "grad_norm": 0.05718966946005821, "learning_rate": 0.0002961762610163135, "loss": 1.7013, "step": 102500 }, { "epoch": 25.65, "grad_norm": 0.05326114594936371, "learning_rate": 0.00029617251078192385, "loss": 1.6578, "step": 102600 }, { "epoch": 25.675, "grad_norm": 0.05004553496837616, "learning_rate": 0.0002961687605475342, "loss": 1.6707, "step": 102700 }, { "epoch": 25.7, "grad_norm": 0.047597501426935196, "learning_rate": 0.0002961650103131445, "loss": 1.8098, "step": 102800 }, { "epoch": 25.725, "grad_norm": 0.05360327288508415, "learning_rate": 0.0002961612600787549, "loss": 1.8259, "step": 102900 }, { "epoch": 25.75, "grad_norm": 0.04639869183301926, "learning_rate": 0.00029615750984436526, "loss": 1.8487, "step": 103000 }, { "epoch": 25.775, "grad_norm": 0.048653990030288696, "learning_rate": 0.0002961537596099756, "loss": 1.6956, "step": 103100 }, { "epoch": 25.8, "grad_norm": 0.043963368982076645, "learning_rate": 0.00029615000937558594, "loss": 1.6178, "step": 103200 }, { "epoch": 25.825, "grad_norm": 0.05706685408949852, "learning_rate": 0.0002961462591411963, "loss": 1.6809, "step": 103300 }, { "epoch": 25.85, "grad_norm": 0.05852410942316055, "learning_rate": 0.00029614250890680667, "loss": 1.6511, "step": 103400 }, { "epoch": 25.875, "grad_norm": 0.054208237677812576, "learning_rate": 0.00029613875867241703, "loss": 1.8168, "step": 103500 }, { "epoch": 25.9, "grad_norm": 0.05457128956913948, "learning_rate": 0.00029613500843802735, "loss": 1.7456, "step": 103600 }, { "epoch": 25.925, "grad_norm": 0.047613076865673065, "learning_rate": 0.0002961312582036377, "loss": 1.629, "step": 103700 }, { "epoch": 25.95, "grad_norm": 0.05182652920484543, "learning_rate": 0.0002961275454715919, "loss": 1.6386, "step": 103800 }, { "epoch": 25.975, "grad_norm": 0.046905118972063065, "learning_rate": 0.0002961237952372023, "loss": 1.8368, "step": 103900 }, { "epoch": 26.0, "grad_norm": 0.04973314702510834, "learning_rate": 0.00029612004500281265, "loss": 1.8125, "step": 104000 }, { "epoch": 26.025, "grad_norm": 0.048138804733753204, "learning_rate": 0.000296116294768423, "loss": 1.6797, "step": 104100 }, { "epoch": 26.05, "grad_norm": 0.0547357015311718, "learning_rate": 0.00029611254453403333, "loss": 1.67, "step": 104200 }, { "epoch": 26.075, "grad_norm": 0.05443267896771431, "learning_rate": 0.0002961087942996437, "loss": 1.6682, "step": 104300 }, { "epoch": 26.1, "grad_norm": 0.06275078654289246, "learning_rate": 0.00029610504406525406, "loss": 1.7022, "step": 104400 }, { "epoch": 26.125, "grad_norm": 0.05464591458439827, "learning_rate": 0.00029610129383086443, "loss": 1.8136, "step": 104500 }, { "epoch": 26.15, "grad_norm": 0.05352524295449257, "learning_rate": 0.00029609754359647474, "loss": 1.7319, "step": 104600 }, { "epoch": 26.175, "grad_norm": 0.05525488778948784, "learning_rate": 0.0002960937933620851, "loss": 1.766, "step": 104700 }, { "epoch": 26.2, "grad_norm": 0.05569114536046982, "learning_rate": 0.00029609004312769547, "loss": 1.7767, "step": 104800 }, { "epoch": 26.225, "grad_norm": 0.0440787635743618, "learning_rate": 0.00029608629289330584, "loss": 1.6786, "step": 104900 }, { "epoch": 26.25, "grad_norm": 0.05321473628282547, "learning_rate": 0.00029608254265891615, "loss": 1.6904, "step": 105000 }, { "epoch": 26.275, "grad_norm": 0.047589514404535294, "learning_rate": 0.0002960787924245265, "loss": 1.5513, "step": 105100 }, { "epoch": 26.3, "grad_norm": 0.0542590469121933, "learning_rate": 0.0002960750421901369, "loss": 1.8018, "step": 105200 }, { "epoch": 26.325, "grad_norm": 0.052015386521816254, "learning_rate": 0.0002960712919557472, "loss": 1.6334, "step": 105300 }, { "epoch": 26.35, "grad_norm": 0.16159088909626007, "learning_rate": 0.00029606754172135756, "loss": 1.5818, "step": 105400 }, { "epoch": 26.375, "grad_norm": 0.04810553416609764, "learning_rate": 0.00029606379148696787, "loss": 1.6274, "step": 105500 }, { "epoch": 26.4, "grad_norm": 0.053879667073488235, "learning_rate": 0.00029606004125257824, "loss": 1.8122, "step": 105600 }, { "epoch": 26.425, "grad_norm": 0.04980600252747536, "learning_rate": 0.0002960562910181886, "loss": 1.7187, "step": 105700 }, { "epoch": 26.45, "grad_norm": 0.059906307607889175, "learning_rate": 0.00029605257828614287, "loss": 1.7223, "step": 105800 }, { "epoch": 26.475, "grad_norm": 0.04634363576769829, "learning_rate": 0.0002960488280517532, "loss": 1.6282, "step": 105900 }, { "epoch": 26.5, "grad_norm": 0.052842844277620316, "learning_rate": 0.00029604507781736354, "loss": 1.6203, "step": 106000 }, { "epoch": 26.525, "grad_norm": 0.05409262329339981, "learning_rate": 0.0002960413275829739, "loss": 1.7725, "step": 106100 }, { "epoch": 26.55, "grad_norm": 0.04745221883058548, "learning_rate": 0.0002960375773485843, "loss": 1.6498, "step": 106200 }, { "epoch": 26.575, "grad_norm": 0.050988294184207916, "learning_rate": 0.0002960338271141946, "loss": 1.6534, "step": 106300 }, { "epoch": 26.6, "grad_norm": 0.046150580048561096, "learning_rate": 0.00029603007687980495, "loss": 1.7042, "step": 106400 }, { "epoch": 26.625, "grad_norm": 0.05468379706144333, "learning_rate": 0.0002960263266454153, "loss": 1.6467, "step": 106500 }, { "epoch": 26.65, "grad_norm": 0.05112981051206589, "learning_rate": 0.0002960225764110257, "loss": 1.5898, "step": 106600 }, { "epoch": 26.675, "grad_norm": 0.050162170082330704, "learning_rate": 0.000296018826176636, "loss": 1.7128, "step": 106700 }, { "epoch": 26.7, "grad_norm": 0.05202512443065643, "learning_rate": 0.00029601507594224637, "loss": 1.6162, "step": 106800 }, { "epoch": 26.725, "grad_norm": 0.05049065127968788, "learning_rate": 0.00029601132570785673, "loss": 1.7741, "step": 106900 }, { "epoch": 26.75, "grad_norm": 0.05425161495804787, "learning_rate": 0.000296007612975811, "loss": 1.5715, "step": 107000 }, { "epoch": 26.775, "grad_norm": 0.04676578938961029, "learning_rate": 0.0002960038627414213, "loss": 1.4396, "step": 107100 }, { "epoch": 26.8, "grad_norm": 0.04315830394625664, "learning_rate": 0.00029600011250703167, "loss": 1.648, "step": 107200 }, { "epoch": 26.825, "grad_norm": 0.052309952676296234, "learning_rate": 0.000295996362272642, "loss": 1.5737, "step": 107300 }, { "epoch": 26.85, "grad_norm": 0.05186279118061066, "learning_rate": 0.00029599261203825235, "loss": 1.5913, "step": 107400 }, { "epoch": 26.875, "grad_norm": 0.05266883224248886, "learning_rate": 0.0002959888618038627, "loss": 1.567, "step": 107500 }, { "epoch": 26.9, "grad_norm": 0.04454510286450386, "learning_rate": 0.0002959851115694731, "loss": 1.5123, "step": 107600 }, { "epoch": 26.925, "grad_norm": 0.05315356329083443, "learning_rate": 0.0002959813613350834, "loss": 1.6372, "step": 107700 }, { "epoch": 26.95, "grad_norm": 0.04607756808400154, "learning_rate": 0.00029597761110069376, "loss": 1.6074, "step": 107800 }, { "epoch": 26.975, "grad_norm": 0.04452488571405411, "learning_rate": 0.0002959738608663041, "loss": 1.5927, "step": 107900 }, { "epoch": 27.0, "grad_norm": 0.05356653034687042, "learning_rate": 0.0002959701106319145, "loss": 1.6214, "step": 108000 }, { "epoch": 27.025, "grad_norm": 0.04785982891917229, "learning_rate": 0.0002959663603975248, "loss": 1.6273, "step": 108100 }, { "epoch": 27.05, "grad_norm": 0.04626493901014328, "learning_rate": 0.00029596261016313517, "loss": 1.6494, "step": 108200 }, { "epoch": 27.075, "grad_norm": 0.04791727289557457, "learning_rate": 0.00029595885992874554, "loss": 1.5452, "step": 108300 }, { "epoch": 27.1, "grad_norm": 0.06166384369134903, "learning_rate": 0.0002959551096943559, "loss": 1.5749, "step": 108400 }, { "epoch": 27.125, "grad_norm": 0.05195313319563866, "learning_rate": 0.0002959513594599662, "loss": 1.536, "step": 108500 }, { "epoch": 27.15, "grad_norm": 0.0505547821521759, "learning_rate": 0.0002959476092255766, "loss": 1.6606, "step": 108600 }, { "epoch": 27.175, "grad_norm": 0.04837740212678909, "learning_rate": 0.00029594385899118695, "loss": 1.5617, "step": 108700 }, { "epoch": 27.2, "grad_norm": 0.04828809201717377, "learning_rate": 0.0002959401087567973, "loss": 1.7326, "step": 108800 }, { "epoch": 27.225, "grad_norm": 0.06565222144126892, "learning_rate": 0.0002959363585224076, "loss": 1.5621, "step": 108900 }, { "epoch": 27.25, "grad_norm": 0.05221616104245186, "learning_rate": 0.000295932608288018, "loss": 1.7385, "step": 109000 }, { "epoch": 27.275, "grad_norm": 0.05376584827899933, "learning_rate": 0.0002959288580536283, "loss": 1.5078, "step": 109100 }, { "epoch": 27.3, "grad_norm": 0.04505067691206932, "learning_rate": 0.00029592510781923867, "loss": 1.6082, "step": 109200 }, { "epoch": 27.325, "grad_norm": 0.047202132642269135, "learning_rate": 0.00029592135758484903, "loss": 1.5304, "step": 109300 }, { "epoch": 27.35, "grad_norm": 0.06032031401991844, "learning_rate": 0.00029591760735045935, "loss": 1.6035, "step": 109400 }, { "epoch": 27.375, "grad_norm": 0.044648509472608566, "learning_rate": 0.0002959138571160697, "loss": 1.5581, "step": 109500 }, { "epoch": 27.4, "grad_norm": 0.05649425461888313, "learning_rate": 0.0002959101068816801, "loss": 1.5482, "step": 109600 }, { "epoch": 27.425, "grad_norm": 0.05527213215827942, "learning_rate": 0.00029590635664729044, "loss": 1.6155, "step": 109700 }, { "epoch": 27.45, "grad_norm": 0.050836507230997086, "learning_rate": 0.00029590260641290076, "loss": 1.4239, "step": 109800 }, { "epoch": 27.475, "grad_norm": 0.06156973913311958, "learning_rate": 0.0002958988561785111, "loss": 1.4574, "step": 109900 }, { "epoch": 27.5, "grad_norm": 0.04659149423241615, "learning_rate": 0.0002958951059441215, "loss": 1.6488, "step": 110000 }, { "epoch": 27.525, "grad_norm": 0.05683763325214386, "learning_rate": 0.00029589135570973186, "loss": 1.6128, "step": 110100 }, { "epoch": 27.55, "grad_norm": 0.0504351444542408, "learning_rate": 0.00029588760547534217, "loss": 1.6495, "step": 110200 }, { "epoch": 27.575, "grad_norm": 0.04385405406355858, "learning_rate": 0.00029588385524095253, "loss": 1.5644, "step": 110300 }, { "epoch": 27.6, "grad_norm": 0.056605253368616104, "learning_rate": 0.0002958801050065629, "loss": 1.4853, "step": 110400 }, { "epoch": 27.625, "grad_norm": 0.061634745448827744, "learning_rate": 0.00029587635477217327, "loss": 1.7518, "step": 110500 }, { "epoch": 27.65, "grad_norm": 0.05308396369218826, "learning_rate": 0.0002958726045377836, "loss": 1.4906, "step": 110600 }, { "epoch": 27.675, "grad_norm": 0.05271327123045921, "learning_rate": 0.00029586885430339394, "loss": 1.591, "step": 110700 }, { "epoch": 27.7, "grad_norm": 0.04924798756837845, "learning_rate": 0.00029586510406900426, "loss": 1.5645, "step": 110800 }, { "epoch": 27.725, "grad_norm": 0.05398215353488922, "learning_rate": 0.0002958613538346146, "loss": 1.5635, "step": 110900 }, { "epoch": 27.75, "grad_norm": 0.04747261479496956, "learning_rate": 0.000295857603600225, "loss": 1.501, "step": 111000 }, { "epoch": 27.775, "grad_norm": 0.048297274857759476, "learning_rate": 0.00029585389086817925, "loss": 1.4673, "step": 111100 }, { "epoch": 27.8, "grad_norm": 0.047769028693437576, "learning_rate": 0.00029585014063378956, "loss": 1.5335, "step": 111200 }, { "epoch": 27.825, "grad_norm": 0.05535224825143814, "learning_rate": 0.00029584639039939993, "loss": 1.5235, "step": 111300 }, { "epoch": 27.85, "grad_norm": 0.04392020031809807, "learning_rate": 0.0002958426401650103, "loss": 1.5657, "step": 111400 }, { "epoch": 27.875, "grad_norm": 0.052205685526132584, "learning_rate": 0.00029583888993062066, "loss": 1.5018, "step": 111500 }, { "epoch": 27.9, "grad_norm": 0.0470951683819294, "learning_rate": 0.00029583513969623097, "loss": 1.3486, "step": 111600 }, { "epoch": 27.925, "grad_norm": 0.045637097209692, "learning_rate": 0.00029583138946184134, "loss": 1.5814, "step": 111700 }, { "epoch": 27.95, "grad_norm": 0.050197433680295944, "learning_rate": 0.0002958276392274517, "loss": 1.6106, "step": 111800 }, { "epoch": 27.975, "grad_norm": 0.047528669238090515, "learning_rate": 0.00029582388899306207, "loss": 1.5872, "step": 111900 }, { "epoch": 28.0, "grad_norm": 0.052580513060092926, "learning_rate": 0.0002958201387586724, "loss": 1.4037, "step": 112000 }, { "epoch": 28.025, "grad_norm": 0.05215739831328392, "learning_rate": 0.00029581638852428275, "loss": 1.5155, "step": 112100 }, { "epoch": 28.05, "grad_norm": 0.0481177382171154, "learning_rate": 0.0002958126382898931, "loss": 1.5689, "step": 112200 }, { "epoch": 28.075, "grad_norm": 0.06459362804889679, "learning_rate": 0.0002958088880555035, "loss": 1.4518, "step": 112300 }, { "epoch": 28.1, "grad_norm": 0.0489063635468483, "learning_rate": 0.0002958051378211138, "loss": 1.5451, "step": 112400 }, { "epoch": 28.125, "grad_norm": 0.05155845358967781, "learning_rate": 0.00029580138758672416, "loss": 1.4813, "step": 112500 }, { "epoch": 28.15, "grad_norm": 0.05029693618416786, "learning_rate": 0.00029579763735233447, "loss": 1.4739, "step": 112600 }, { "epoch": 28.175, "grad_norm": 0.06580676138401031, "learning_rate": 0.00029579388711794484, "loss": 1.5699, "step": 112700 }, { "epoch": 28.2, "grad_norm": 0.04858999699354172, "learning_rate": 0.0002957901368835552, "loss": 1.4865, "step": 112800 }, { "epoch": 28.225, "grad_norm": 0.048569995909929276, "learning_rate": 0.00029578638664916557, "loss": 1.466, "step": 112900 }, { "epoch": 28.25, "grad_norm": 0.05034118890762329, "learning_rate": 0.0002957826364147759, "loss": 1.5571, "step": 113000 }, { "epoch": 28.275, "grad_norm": 0.05421663448214531, "learning_rate": 0.00029577888618038625, "loss": 1.5187, "step": 113100 }, { "epoch": 28.3, "grad_norm": 0.04554268717765808, "learning_rate": 0.0002957751359459966, "loss": 1.4526, "step": 113200 }, { "epoch": 28.325, "grad_norm": 0.04670153930783272, "learning_rate": 0.0002957713857116069, "loss": 1.4785, "step": 113300 }, { "epoch": 28.35, "grad_norm": 0.05041331797838211, "learning_rate": 0.0002957676354772173, "loss": 1.4533, "step": 113400 }, { "epoch": 28.375, "grad_norm": 0.042034462094306946, "learning_rate": 0.00029576388524282766, "loss": 1.4947, "step": 113500 }, { "epoch": 28.4, "grad_norm": 0.050760041922330856, "learning_rate": 0.000295760135008438, "loss": 1.5469, "step": 113600 }, { "epoch": 28.425, "grad_norm": 0.04767528921365738, "learning_rate": 0.00029575638477404834, "loss": 1.4801, "step": 113700 }, { "epoch": 28.45, "grad_norm": 0.05914180353283882, "learning_rate": 0.0002957526720420026, "loss": 1.5372, "step": 113800 }, { "epoch": 28.475, "grad_norm": 0.05601555109024048, "learning_rate": 0.00029574892180761296, "loss": 1.4325, "step": 113900 }, { "epoch": 28.5, "grad_norm": 0.056612931191921234, "learning_rate": 0.00029574517157322333, "loss": 1.4873, "step": 114000 }, { "epoch": 28.525, "grad_norm": 0.04357181489467621, "learning_rate": 0.00029574142133883364, "loss": 1.4405, "step": 114100 }, { "epoch": 28.55, "grad_norm": 0.05303529277443886, "learning_rate": 0.000295737671104444, "loss": 1.4365, "step": 114200 }, { "epoch": 28.575, "grad_norm": 0.048596885055303574, "learning_rate": 0.0002957339208700543, "loss": 1.4425, "step": 114300 }, { "epoch": 28.6, "grad_norm": 0.05361025035381317, "learning_rate": 0.0002957301706356647, "loss": 1.4063, "step": 114400 }, { "epoch": 28.625, "grad_norm": 0.05975283682346344, "learning_rate": 0.00029572642040127505, "loss": 1.4549, "step": 114500 }, { "epoch": 28.65, "grad_norm": 0.04482881724834442, "learning_rate": 0.0002957226701668854, "loss": 1.3836, "step": 114600 }, { "epoch": 28.675, "grad_norm": 0.05114329233765602, "learning_rate": 0.00029571891993249573, "loss": 1.5901, "step": 114700 }, { "epoch": 28.7, "grad_norm": 0.04038051888346672, "learning_rate": 0.0002957151696981061, "loss": 1.5117, "step": 114800 }, { "epoch": 28.725, "grad_norm": 0.052758511155843735, "learning_rate": 0.00029571141946371646, "loss": 1.4111, "step": 114900 }, { "epoch": 28.75, "grad_norm": 0.049384575337171555, "learning_rate": 0.00029570766922932683, "loss": 1.4381, "step": 115000 }, { "epoch": 28.775, "grad_norm": 0.047072507441043854, "learning_rate": 0.00029570391899493714, "loss": 1.4444, "step": 115100 }, { "epoch": 28.8, "grad_norm": 0.05382237955927849, "learning_rate": 0.0002957001687605475, "loss": 1.4174, "step": 115200 }, { "epoch": 28.825, "grad_norm": 0.04967265948653221, "learning_rate": 0.00029569641852615787, "loss": 1.4709, "step": 115300 }, { "epoch": 28.85, "grad_norm": 0.045560047030448914, "learning_rate": 0.00029569266829176824, "loss": 1.5302, "step": 115400 }, { "epoch": 28.875, "grad_norm": 0.058798883110284805, "learning_rate": 0.00029568891805737855, "loss": 1.4022, "step": 115500 }, { "epoch": 28.9, "grad_norm": 0.04776821285486221, "learning_rate": 0.0002956851678229889, "loss": 1.3512, "step": 115600 }, { "epoch": 28.925, "grad_norm": 0.05173936486244202, "learning_rate": 0.0002956814175885993, "loss": 1.5405, "step": 115700 }, { "epoch": 28.95, "grad_norm": 0.04927581176161766, "learning_rate": 0.00029567766735420965, "loss": 1.435, "step": 115800 }, { "epoch": 28.975, "grad_norm": 0.04748755320906639, "learning_rate": 0.00029567391711981996, "loss": 1.4073, "step": 115900 }, { "epoch": 29.0, "grad_norm": 0.04827181622385979, "learning_rate": 0.0002956701668854303, "loss": 1.4046, "step": 116000 }, { "epoch": 29.025, "grad_norm": 0.05039271339774132, "learning_rate": 0.00029566645415338453, "loss": 1.3616, "step": 116100 }, { "epoch": 29.05, "grad_norm": 0.046831537038087845, "learning_rate": 0.0002956627039189949, "loss": 1.3991, "step": 116200 }, { "epoch": 29.075, "grad_norm": 0.056436687707901, "learning_rate": 0.00029565895368460527, "loss": 1.448, "step": 116300 }, { "epoch": 29.1, "grad_norm": 0.04817488044500351, "learning_rate": 0.00029565520345021563, "loss": 1.363, "step": 116400 }, { "epoch": 29.125, "grad_norm": 0.05330492928624153, "learning_rate": 0.00029565145321582594, "loss": 1.4313, "step": 116500 }, { "epoch": 29.15, "grad_norm": 0.05745427682995796, "learning_rate": 0.0002956477029814363, "loss": 1.5579, "step": 116600 }, { "epoch": 29.175, "grad_norm": 0.05263765901327133, "learning_rate": 0.0002956439527470467, "loss": 1.5836, "step": 116700 }, { "epoch": 29.2, "grad_norm": 0.044311635196208954, "learning_rate": 0.00029564020251265704, "loss": 1.4367, "step": 116800 }, { "epoch": 29.225, "grad_norm": 0.053102701902389526, "learning_rate": 0.00029563645227826735, "loss": 1.4936, "step": 116900 }, { "epoch": 29.25, "grad_norm": 0.04289867728948593, "learning_rate": 0.0002956327020438777, "loss": 1.438, "step": 117000 }, { "epoch": 29.275, "grad_norm": 0.05283905565738678, "learning_rate": 0.0002956289518094881, "loss": 1.5341, "step": 117100 }, { "epoch": 29.3, "grad_norm": 0.0411902479827404, "learning_rate": 0.0002956252015750984, "loss": 1.3774, "step": 117200 }, { "epoch": 29.325, "grad_norm": 0.0581793412566185, "learning_rate": 0.00029562145134070877, "loss": 1.4712, "step": 117300 }, { "epoch": 29.35, "grad_norm": 0.04655259847640991, "learning_rate": 0.00029561770110631913, "loss": 1.2906, "step": 117400 }, { "epoch": 29.375, "grad_norm": 0.05028205364942551, "learning_rate": 0.0002956139508719295, "loss": 1.3921, "step": 117500 }, { "epoch": 29.4, "grad_norm": 0.049044106155633926, "learning_rate": 0.0002956102006375398, "loss": 1.4684, "step": 117600 }, { "epoch": 29.425, "grad_norm": 0.05344530567526817, "learning_rate": 0.0002956064504031502, "loss": 1.399, "step": 117700 }, { "epoch": 29.45, "grad_norm": 0.05248359963297844, "learning_rate": 0.0002956027001687605, "loss": 1.3738, "step": 117800 }, { "epoch": 29.475, "grad_norm": 0.053722232580184937, "learning_rate": 0.00029559894993437085, "loss": 1.27, "step": 117900 }, { "epoch": 29.5, "grad_norm": 0.05581889674067497, "learning_rate": 0.0002955951996999812, "loss": 1.4523, "step": 118000 }, { "epoch": 29.525, "grad_norm": 0.04724375531077385, "learning_rate": 0.0002955914494655916, "loss": 1.2637, "step": 118100 }, { "epoch": 29.55, "grad_norm": 0.04487941041588783, "learning_rate": 0.0002955877367335458, "loss": 1.3064, "step": 118200 }, { "epoch": 29.575, "grad_norm": 0.04799391329288483, "learning_rate": 0.00029558398649915616, "loss": 1.4433, "step": 118300 }, { "epoch": 29.6, "grad_norm": 0.04437430948019028, "learning_rate": 0.0002955802362647665, "loss": 1.3427, "step": 118400 }, { "epoch": 29.625, "grad_norm": 0.04969744756817818, "learning_rate": 0.0002955764860303769, "loss": 1.3415, "step": 118500 }, { "epoch": 29.65, "grad_norm": 0.05268990993499756, "learning_rate": 0.0002955727357959872, "loss": 1.37, "step": 118600 }, { "epoch": 29.675, "grad_norm": 0.05563261732459068, "learning_rate": 0.00029556898556159757, "loss": 1.3404, "step": 118700 }, { "epoch": 29.7, "grad_norm": 0.045039862394332886, "learning_rate": 0.00029556523532720794, "loss": 1.2967, "step": 118800 }, { "epoch": 29.725, "grad_norm": 0.06740451604127884, "learning_rate": 0.0002955614850928183, "loss": 1.4316, "step": 118900 }, { "epoch": 29.75, "grad_norm": 0.046530742198228836, "learning_rate": 0.0002955577348584286, "loss": 1.3871, "step": 119000 }, { "epoch": 29.775, "grad_norm": 0.04662451893091202, "learning_rate": 0.000295553984624039, "loss": 1.3832, "step": 119100 }, { "epoch": 29.8, "grad_norm": 0.05180426687002182, "learning_rate": 0.00029555023438964935, "loss": 1.3783, "step": 119200 }, { "epoch": 29.825, "grad_norm": 0.04919251427054405, "learning_rate": 0.0002955464841552597, "loss": 1.3789, "step": 119300 }, { "epoch": 29.85, "grad_norm": 0.04741760343313217, "learning_rate": 0.00029554273392087, "loss": 1.392, "step": 119400 }, { "epoch": 29.875, "grad_norm": 0.05151817202568054, "learning_rate": 0.0002955389836864804, "loss": 1.3472, "step": 119500 }, { "epoch": 29.9, "grad_norm": 0.05211416259407997, "learning_rate": 0.0002955352334520907, "loss": 1.4448, "step": 119600 }, { "epoch": 29.925, "grad_norm": 0.04866619408130646, "learning_rate": 0.00029553148321770107, "loss": 1.3788, "step": 119700 }, { "epoch": 29.95, "grad_norm": 0.056409094482660294, "learning_rate": 0.00029552773298331143, "loss": 1.4182, "step": 119800 }, { "epoch": 29.975, "grad_norm": 0.045399557799100876, "learning_rate": 0.0002955239827489218, "loss": 1.3579, "step": 119900 }, { "epoch": 30.0, "grad_norm": 0.05333389341831207, "learning_rate": 0.0002955202325145321, "loss": 1.4833, "step": 120000 }, { "epoch": 30.025, "grad_norm": 0.047169484198093414, "learning_rate": 0.0002955164822801425, "loss": 1.3531, "step": 120100 }, { "epoch": 30.05, "grad_norm": 0.04647146537899971, "learning_rate": 0.00029551273204575285, "loss": 1.3722, "step": 120200 }, { "epoch": 30.075, "grad_norm": 0.05528531223535538, "learning_rate": 0.0002955089818113632, "loss": 1.268, "step": 120300 }, { "epoch": 30.1, "grad_norm": 0.050155188888311386, "learning_rate": 0.0002955052315769735, "loss": 1.3659, "step": 120400 }, { "epoch": 30.125, "grad_norm": 0.047319624572992325, "learning_rate": 0.0002955014813425839, "loss": 1.4225, "step": 120500 }, { "epoch": 30.15, "grad_norm": 0.04249805584549904, "learning_rate": 0.00029549773110819426, "loss": 1.4412, "step": 120600 }, { "epoch": 30.175, "grad_norm": 0.05880492925643921, "learning_rate": 0.0002954939808738046, "loss": 1.5054, "step": 120700 }, { "epoch": 30.2, "grad_norm": 0.047143761068582535, "learning_rate": 0.00029549023063941493, "loss": 1.3931, "step": 120800 }, { "epoch": 30.225, "grad_norm": 0.04481210932135582, "learning_rate": 0.00029548648040502525, "loss": 1.2962, "step": 120900 }, { "epoch": 30.25, "grad_norm": 0.044143520295619965, "learning_rate": 0.00029548273017063567, "loss": 1.2338, "step": 121000 }, { "epoch": 30.275, "grad_norm": 0.06169132515788078, "learning_rate": 0.000295478979936246, "loss": 1.3578, "step": 121100 }, { "epoch": 30.3, "grad_norm": 0.061004914343357086, "learning_rate": 0.00029547522970185634, "loss": 1.334, "step": 121200 }, { "epoch": 30.325, "grad_norm": 0.04402782768011093, "learning_rate": 0.00029547147946746666, "loss": 1.404, "step": 121300 }, { "epoch": 30.35, "grad_norm": 0.05749357491731644, "learning_rate": 0.000295467729233077, "loss": 1.2942, "step": 121400 }, { "epoch": 30.375, "grad_norm": 0.052716564387083054, "learning_rate": 0.0002954639789986874, "loss": 1.2753, "step": 121500 }, { "epoch": 30.4, "grad_norm": 0.04735216125845909, "learning_rate": 0.00029546022876429775, "loss": 1.3316, "step": 121600 }, { "epoch": 30.425, "grad_norm": 0.05518503487110138, "learning_rate": 0.00029545651603225196, "loss": 1.3901, "step": 121700 }, { "epoch": 30.45, "grad_norm": 0.04617263004183769, "learning_rate": 0.00029545276579786233, "loss": 1.3542, "step": 121800 }, { "epoch": 30.475, "grad_norm": 0.04624765366315842, "learning_rate": 0.0002954490155634727, "loss": 1.3594, "step": 121900 }, { "epoch": 30.5, "grad_norm": 0.05599815025925636, "learning_rate": 0.00029544526532908306, "loss": 1.3957, "step": 122000 }, { "epoch": 30.525, "grad_norm": 0.047623343765735626, "learning_rate": 0.00029544151509469337, "loss": 1.3099, "step": 122100 }, { "epoch": 30.55, "grad_norm": 0.04954765364527702, "learning_rate": 0.00029543776486030374, "loss": 1.4809, "step": 122200 }, { "epoch": 30.575, "grad_norm": 0.057207658886909485, "learning_rate": 0.0002954340146259141, "loss": 1.3149, "step": 122300 }, { "epoch": 30.6, "grad_norm": 0.04670143872499466, "learning_rate": 0.00029543026439152447, "loss": 1.3461, "step": 122400 }, { "epoch": 30.625, "grad_norm": 0.04433277249336243, "learning_rate": 0.0002954265141571348, "loss": 1.1924, "step": 122500 }, { "epoch": 30.65, "grad_norm": 0.045901257544755936, "learning_rate": 0.00029542276392274515, "loss": 1.3508, "step": 122600 }, { "epoch": 30.675, "grad_norm": 0.048084866255521774, "learning_rate": 0.0002954190136883555, "loss": 1.3341, "step": 122700 }, { "epoch": 30.7, "grad_norm": 0.04639054462313652, "learning_rate": 0.0002954152634539659, "loss": 1.2832, "step": 122800 }, { "epoch": 30.725, "grad_norm": 0.05224520340561867, "learning_rate": 0.0002954115132195762, "loss": 1.2682, "step": 122900 }, { "epoch": 30.75, "grad_norm": 0.05258006602525711, "learning_rate": 0.00029540776298518656, "loss": 1.3085, "step": 123000 }, { "epoch": 30.775, "grad_norm": 0.0506523959338665, "learning_rate": 0.00029540401275079687, "loss": 1.3224, "step": 123100 }, { "epoch": 30.8, "grad_norm": 0.046581752598285675, "learning_rate": 0.00029540026251640724, "loss": 1.2794, "step": 123200 }, { "epoch": 30.825, "grad_norm": 0.04979027807712555, "learning_rate": 0.0002953965122820176, "loss": 1.1661, "step": 123300 }, { "epoch": 30.85, "grad_norm": 0.07573187351226807, "learning_rate": 0.00029539276204762797, "loss": 1.3565, "step": 123400 }, { "epoch": 30.875, "grad_norm": 0.05088147893548012, "learning_rate": 0.0002953890118132383, "loss": 1.3488, "step": 123500 }, { "epoch": 30.9, "grad_norm": 0.05240534245967865, "learning_rate": 0.00029538526157884865, "loss": 1.336, "step": 123600 }, { "epoch": 30.925, "grad_norm": 0.04134645685553551, "learning_rate": 0.000295381511344459, "loss": 1.2747, "step": 123700 }, { "epoch": 30.95, "grad_norm": 0.05094057694077492, "learning_rate": 0.0002953777611100694, "loss": 1.3445, "step": 123800 }, { "epoch": 30.975, "grad_norm": 0.045938342809677124, "learning_rate": 0.0002953740108756797, "loss": 1.2555, "step": 123900 }, { "epoch": 31.0, "grad_norm": 0.04664922505617142, "learning_rate": 0.00029537026064129006, "loss": 1.3741, "step": 124000 }, { "epoch": 31.025, "grad_norm": 0.04887442663311958, "learning_rate": 0.0002953665104069004, "loss": 1.2055, "step": 124100 }, { "epoch": 31.05, "grad_norm": 0.04919900372624397, "learning_rate": 0.0002953627601725108, "loss": 1.1721, "step": 124200 }, { "epoch": 31.075, "grad_norm": 0.048029493540525436, "learning_rate": 0.0002953590099381211, "loss": 1.3029, "step": 124300 }, { "epoch": 31.1, "grad_norm": 0.053546350449323654, "learning_rate": 0.00029535525970373147, "loss": 1.3137, "step": 124400 }, { "epoch": 31.125, "grad_norm": 0.04450497403740883, "learning_rate": 0.0002953515094693418, "loss": 1.3236, "step": 124500 }, { "epoch": 31.15, "grad_norm": 0.04896382614970207, "learning_rate": 0.0002953477592349522, "loss": 1.2933, "step": 124600 }, { "epoch": 31.175, "grad_norm": 0.04476182907819748, "learning_rate": 0.0002953440465029064, "loss": 1.3332, "step": 124700 }, { "epoch": 31.2, "grad_norm": 0.054897475987672806, "learning_rate": 0.0002953402962685167, "loss": 1.3213, "step": 124800 }, { "epoch": 31.225, "grad_norm": 0.04679589346051216, "learning_rate": 0.0002953365460341271, "loss": 1.3065, "step": 124900 }, { "epoch": 31.25, "grad_norm": 0.04921596497297287, "learning_rate": 0.00029533279579973745, "loss": 1.1591, "step": 125000 }, { "epoch": 31.275, "grad_norm": 0.0433526448905468, "learning_rate": 0.0002953290455653478, "loss": 1.3262, "step": 125100 }, { "epoch": 31.3, "grad_norm": 0.043862484395504, "learning_rate": 0.00029532529533095813, "loss": 1.2693, "step": 125200 }, { "epoch": 31.325, "grad_norm": 0.06467683613300323, "learning_rate": 0.0002953215450965685, "loss": 1.3879, "step": 125300 }, { "epoch": 31.35, "grad_norm": 0.05398791283369064, "learning_rate": 0.00029531779486217886, "loss": 1.2593, "step": 125400 }, { "epoch": 31.375, "grad_norm": 0.06727266311645508, "learning_rate": 0.00029531404462778923, "loss": 1.3277, "step": 125500 }, { "epoch": 31.4, "grad_norm": 0.0463390052318573, "learning_rate": 0.00029531029439339954, "loss": 1.3013, "step": 125600 }, { "epoch": 31.425, "grad_norm": 0.04781678318977356, "learning_rate": 0.0002953065441590099, "loss": 1.2572, "step": 125700 }, { "epoch": 31.45, "grad_norm": 0.0504741370677948, "learning_rate": 0.00029530279392462027, "loss": 1.276, "step": 125800 }, { "epoch": 31.475, "grad_norm": 0.08227650821208954, "learning_rate": 0.00029529904369023064, "loss": 1.3546, "step": 125900 }, { "epoch": 31.5, "grad_norm": 0.04831939563155174, "learning_rate": 0.00029529529345584095, "loss": 1.2622, "step": 126000 }, { "epoch": 31.525, "grad_norm": 0.04759907349944115, "learning_rate": 0.0002952915432214513, "loss": 1.3973, "step": 126100 }, { "epoch": 31.55, "grad_norm": 0.0501595176756382, "learning_rate": 0.00029528779298706163, "loss": 1.309, "step": 126200 }, { "epoch": 31.575, "grad_norm": 0.04236988723278046, "learning_rate": 0.00029528404275267205, "loss": 1.2076, "step": 126300 }, { "epoch": 31.6, "grad_norm": 0.045248087495565414, "learning_rate": 0.00029528029251828236, "loss": 1.1881, "step": 126400 }, { "epoch": 31.625, "grad_norm": 0.05358180031180382, "learning_rate": 0.00029527654228389273, "loss": 1.242, "step": 126500 }, { "epoch": 31.65, "grad_norm": 0.06812089681625366, "learning_rate": 0.00029527279204950304, "loss": 1.3071, "step": 126600 }, { "epoch": 31.675, "grad_norm": 0.0523652583360672, "learning_rate": 0.0002952690418151134, "loss": 1.2635, "step": 126700 }, { "epoch": 31.7, "grad_norm": 0.054195646196603775, "learning_rate": 0.00029526529158072377, "loss": 1.3601, "step": 126800 }, { "epoch": 31.725, "grad_norm": 0.05106286332011223, "learning_rate": 0.00029526154134633414, "loss": 1.2716, "step": 126900 }, { "epoch": 31.75, "grad_norm": 0.04490172490477562, "learning_rate": 0.00029525779111194445, "loss": 1.1354, "step": 127000 }, { "epoch": 31.775, "grad_norm": 0.04846130311489105, "learning_rate": 0.0002952540408775548, "loss": 1.3259, "step": 127100 }, { "epoch": 31.8, "grad_norm": 0.050297126173973083, "learning_rate": 0.0002952502906431652, "loss": 1.1898, "step": 127200 }, { "epoch": 31.825, "grad_norm": 0.0532267764210701, "learning_rate": 0.00029524654040877555, "loss": 1.1544, "step": 127300 }, { "epoch": 31.85, "grad_norm": 0.03898947685956955, "learning_rate": 0.00029524282767672976, "loss": 1.3027, "step": 127400 }, { "epoch": 31.875, "grad_norm": 0.055518005043268204, "learning_rate": 0.0002952390774423401, "loss": 1.1795, "step": 127500 }, { "epoch": 31.9, "grad_norm": 0.045770760625600815, "learning_rate": 0.0002952353272079505, "loss": 1.2203, "step": 127600 }, { "epoch": 31.925, "grad_norm": 0.04108942300081253, "learning_rate": 0.00029523157697356085, "loss": 1.2737, "step": 127700 }, { "epoch": 31.95, "grad_norm": 0.04591604694724083, "learning_rate": 0.00029522782673917117, "loss": 1.2465, "step": 127800 }, { "epoch": 31.975, "grad_norm": 0.04735784977674484, "learning_rate": 0.00029522407650478153, "loss": 1.3007, "step": 127900 }, { "epoch": 32.0, "grad_norm": 0.04895665496587753, "learning_rate": 0.0002952203262703919, "loss": 1.3006, "step": 128000 }, { "epoch": 32.025, "grad_norm": 0.05351528897881508, "learning_rate": 0.00029521657603600226, "loss": 1.2599, "step": 128100 }, { "epoch": 32.05, "grad_norm": 0.04478209838271141, "learning_rate": 0.0002952128258016126, "loss": 1.2839, "step": 128200 }, { "epoch": 32.075, "grad_norm": 0.05886415019631386, "learning_rate": 0.00029520907556722294, "loss": 1.2412, "step": 128300 }, { "epoch": 32.1, "grad_norm": 0.04743971303105354, "learning_rate": 0.00029520532533283325, "loss": 1.2031, "step": 128400 }, { "epoch": 32.125, "grad_norm": 0.046698570251464844, "learning_rate": 0.0002952015750984436, "loss": 1.2691, "step": 128500 }, { "epoch": 32.15, "grad_norm": 0.04950440675020218, "learning_rate": 0.000295197824864054, "loss": 1.2178, "step": 128600 }, { "epoch": 32.175, "grad_norm": 0.047533079981803894, "learning_rate": 0.0002951940746296643, "loss": 1.1742, "step": 128700 }, { "epoch": 32.2, "grad_norm": 0.1709842085838318, "learning_rate": 0.00029519032439527466, "loss": 1.2904, "step": 128800 }, { "epoch": 32.225, "grad_norm": 0.053603630512952805, "learning_rate": 0.00029518657416088503, "loss": 1.2806, "step": 128900 }, { "epoch": 32.25, "grad_norm": 0.05528594180941582, "learning_rate": 0.0002951828239264954, "loss": 1.2891, "step": 129000 }, { "epoch": 32.275, "grad_norm": 0.051689211279153824, "learning_rate": 0.0002951790736921057, "loss": 1.3107, "step": 129100 }, { "epoch": 32.3, "grad_norm": 0.0504557229578495, "learning_rate": 0.0002951753234577161, "loss": 1.2528, "step": 129200 }, { "epoch": 32.325, "grad_norm": 0.048762448132038116, "learning_rate": 0.00029517157322332644, "loss": 1.1503, "step": 129300 }, { "epoch": 32.35, "grad_norm": 0.05114434286952019, "learning_rate": 0.0002951678229889368, "loss": 1.1685, "step": 129400 }, { "epoch": 32.375, "grad_norm": 0.04877127707004547, "learning_rate": 0.0002951640727545471, "loss": 1.1642, "step": 129500 }, { "epoch": 32.4, "grad_norm": 0.04645070433616638, "learning_rate": 0.0002951603225201575, "loss": 1.2363, "step": 129600 }, { "epoch": 32.425, "grad_norm": 0.049255430698394775, "learning_rate": 0.0002951565722857678, "loss": 1.286, "step": 129700 }, { "epoch": 32.45, "grad_norm": 0.05051419138908386, "learning_rate": 0.00029515282205137816, "loss": 1.2311, "step": 129800 }, { "epoch": 32.475, "grad_norm": 0.05819782614707947, "learning_rate": 0.00029514907181698853, "loss": 1.2218, "step": 129900 }, { "epoch": 32.5, "grad_norm": 0.04523173347115517, "learning_rate": 0.0002951453215825989, "loss": 1.17, "step": 130000 }, { "epoch": 32.525, "grad_norm": 0.047802697867155075, "learning_rate": 0.0002951415713482092, "loss": 1.2679, "step": 130100 }, { "epoch": 32.55, "grad_norm": 0.04578109085559845, "learning_rate": 0.0002951378211138196, "loss": 1.134, "step": 130200 }, { "epoch": 32.575, "grad_norm": 0.040033962577581406, "learning_rate": 0.00029513407087942994, "loss": 1.222, "step": 130300 }, { "epoch": 32.6, "grad_norm": 0.04128117114305496, "learning_rate": 0.0002951303206450403, "loss": 1.2106, "step": 130400 }, { "epoch": 32.625, "grad_norm": 0.04531345143914223, "learning_rate": 0.0002951265704106506, "loss": 1.186, "step": 130500 }, { "epoch": 32.65, "grad_norm": 0.043665412813425064, "learning_rate": 0.000295122820176261, "loss": 1.2078, "step": 130600 }, { "epoch": 32.675, "grad_norm": 0.04887350648641586, "learning_rate": 0.00029511906994187135, "loss": 1.2482, "step": 130700 }, { "epoch": 32.7, "grad_norm": 0.05151134356856346, "learning_rate": 0.0002951153197074817, "loss": 1.2568, "step": 130800 }, { "epoch": 32.725, "grad_norm": 0.042473357170820236, "learning_rate": 0.00029511156947309203, "loss": 1.1829, "step": 130900 }, { "epoch": 32.75, "grad_norm": 0.05092649906873703, "learning_rate": 0.0002951078192387024, "loss": 1.1481, "step": 131000 }, { "epoch": 32.775, "grad_norm": 0.044292863458395004, "learning_rate": 0.00029510406900431276, "loss": 1.1682, "step": 131100 }, { "epoch": 32.8, "grad_norm": 0.054200585931539536, "learning_rate": 0.0002951003187699231, "loss": 1.2387, "step": 131200 }, { "epoch": 32.825, "grad_norm": 0.04644659161567688, "learning_rate": 0.00029509656853553344, "loss": 1.2118, "step": 131300 }, { "epoch": 32.85, "grad_norm": 0.06080161780118942, "learning_rate": 0.0002950928558034877, "loss": 1.1483, "step": 131400 }, { "epoch": 32.875, "grad_norm": 0.07698054611682892, "learning_rate": 0.000295089105569098, "loss": 1.1887, "step": 131500 }, { "epoch": 32.9, "grad_norm": 0.038868315517902374, "learning_rate": 0.00029508535533470843, "loss": 1.1528, "step": 131600 }, { "epoch": 32.925, "grad_norm": 0.05261719226837158, "learning_rate": 0.00029508160510031874, "loss": 1.085, "step": 131700 }, { "epoch": 32.95, "grad_norm": 0.043816640973091125, "learning_rate": 0.0002950778548659291, "loss": 1.2063, "step": 131800 }, { "epoch": 32.975, "grad_norm": 0.042075928300619125, "learning_rate": 0.0002950741046315394, "loss": 1.1792, "step": 131900 }, { "epoch": 33.0, "grad_norm": 0.04904596507549286, "learning_rate": 0.0002950703543971498, "loss": 1.2376, "step": 132000 }, { "epoch": 33.025, "grad_norm": 0.051781512796878815, "learning_rate": 0.00029506660416276015, "loss": 1.181, "step": 132100 }, { "epoch": 33.05, "grad_norm": 0.055431291460990906, "learning_rate": 0.0002950628539283705, "loss": 1.1771, "step": 132200 }, { "epoch": 33.075, "grad_norm": 0.04665238782763481, "learning_rate": 0.00029505910369398083, "loss": 1.1322, "step": 132300 }, { "epoch": 33.1, "grad_norm": 0.04755477234721184, "learning_rate": 0.0002950553534595912, "loss": 1.2262, "step": 132400 }, { "epoch": 33.125, "grad_norm": 0.0748729407787323, "learning_rate": 0.00029505164072754546, "loss": 1.0936, "step": 132500 }, { "epoch": 33.15, "grad_norm": 0.05131325498223305, "learning_rate": 0.00029504789049315577, "loss": 1.2296, "step": 132600 }, { "epoch": 33.175, "grad_norm": 0.051855139434337616, "learning_rate": 0.00029504414025876614, "loss": 1.2527, "step": 132700 }, { "epoch": 33.2, "grad_norm": 0.04259216785430908, "learning_rate": 0.0002950403900243765, "loss": 1.1978, "step": 132800 }, { "epoch": 33.225, "grad_norm": 0.0451393760740757, "learning_rate": 0.00029503663978998687, "loss": 1.1695, "step": 132900 }, { "epoch": 33.25, "grad_norm": 0.0477844700217247, "learning_rate": 0.0002950328895555972, "loss": 1.1885, "step": 133000 }, { "epoch": 33.275, "grad_norm": 0.04242611676454544, "learning_rate": 0.00029502913932120755, "loss": 1.1393, "step": 133100 }, { "epoch": 33.3, "grad_norm": 0.046090077608823776, "learning_rate": 0.00029502538908681786, "loss": 1.1158, "step": 133200 }, { "epoch": 33.325, "grad_norm": 0.04372167959809303, "learning_rate": 0.0002950216388524283, "loss": 1.1583, "step": 133300 }, { "epoch": 33.35, "grad_norm": 0.044858288019895554, "learning_rate": 0.0002950178886180386, "loss": 1.1877, "step": 133400 }, { "epoch": 33.375, "grad_norm": 0.042134176939725876, "learning_rate": 0.00029501413838364896, "loss": 1.1365, "step": 133500 }, { "epoch": 33.4, "grad_norm": 0.05012949928641319, "learning_rate": 0.00029501038814925927, "loss": 1.2341, "step": 133600 }, { "epoch": 33.425, "grad_norm": 0.04589414969086647, "learning_rate": 0.00029500663791486964, "loss": 1.1346, "step": 133700 }, { "epoch": 33.45, "grad_norm": 0.059703532606363297, "learning_rate": 0.00029500288768048, "loss": 1.2177, "step": 133800 }, { "epoch": 33.475, "grad_norm": 0.04715392366051674, "learning_rate": 0.00029499913744609037, "loss": 1.2044, "step": 133900 }, { "epoch": 33.5, "grad_norm": 0.04391086474061012, "learning_rate": 0.0002949953872117007, "loss": 1.1846, "step": 134000 }, { "epoch": 33.525, "grad_norm": 0.04045191779732704, "learning_rate": 0.00029499163697731105, "loss": 1.2048, "step": 134100 }, { "epoch": 33.55, "grad_norm": 0.04283670708537102, "learning_rate": 0.0002949878867429214, "loss": 1.2246, "step": 134200 }, { "epoch": 33.575, "grad_norm": 0.04338289797306061, "learning_rate": 0.0002949841365085318, "loss": 1.2334, "step": 134300 }, { "epoch": 33.6, "grad_norm": 0.05026433989405632, "learning_rate": 0.0002949803862741421, "loss": 1.1017, "step": 134400 }, { "epoch": 33.625, "grad_norm": 0.04827344790101051, "learning_rate": 0.00029497663603975246, "loss": 1.1765, "step": 134500 }, { "epoch": 33.65, "grad_norm": 0.055267006158828735, "learning_rate": 0.0002949728858053628, "loss": 1.0555, "step": 134600 }, { "epoch": 33.675, "grad_norm": 0.05551549047231674, "learning_rate": 0.0002949691730733171, "loss": 1.1171, "step": 134700 }, { "epoch": 33.7, "grad_norm": 0.04356600344181061, "learning_rate": 0.0002949654228389274, "loss": 1.2224, "step": 134800 }, { "epoch": 33.725, "grad_norm": 0.049372829496860504, "learning_rate": 0.00029496167260453776, "loss": 1.0843, "step": 134900 }, { "epoch": 33.75, "grad_norm": 0.04735811799764633, "learning_rate": 0.00029495792237014813, "loss": 1.2027, "step": 135000 }, { "epoch": 33.775, "grad_norm": 0.048068366944789886, "learning_rate": 0.0002949541721357585, "loss": 1.182, "step": 135100 }, { "epoch": 33.8, "grad_norm": 0.05330264940857887, "learning_rate": 0.0002949504219013688, "loss": 1.1519, "step": 135200 }, { "epoch": 33.825, "grad_norm": 0.04151195287704468, "learning_rate": 0.0002949466716669792, "loss": 1.0107, "step": 135300 }, { "epoch": 33.85, "grad_norm": 0.04683278128504753, "learning_rate": 0.0002949429214325895, "loss": 1.2629, "step": 135400 }, { "epoch": 33.875, "grad_norm": 0.04796934127807617, "learning_rate": 0.00029493917119819985, "loss": 1.0715, "step": 135500 }, { "epoch": 33.9, "grad_norm": 0.048207636922597885, "learning_rate": 0.0002949354209638102, "loss": 1.1114, "step": 135600 }, { "epoch": 33.925, "grad_norm": 0.0472245067358017, "learning_rate": 0.0002949316707294206, "loss": 1.1557, "step": 135700 }, { "epoch": 33.95, "grad_norm": 0.051259011030197144, "learning_rate": 0.0002949279204950309, "loss": 1.1246, "step": 135800 }, { "epoch": 33.975, "grad_norm": 0.054303720593452454, "learning_rate": 0.00029492417026064126, "loss": 1.0731, "step": 135900 }, { "epoch": 34.0, "grad_norm": 0.06228245794773102, "learning_rate": 0.00029492042002625163, "loss": 1.1498, "step": 136000 }, { "epoch": 34.025, "grad_norm": 0.04442556947469711, "learning_rate": 0.000294916669791862, "loss": 1.1424, "step": 136100 }, { "epoch": 34.05, "grad_norm": 0.05475945398211479, "learning_rate": 0.0002949129195574723, "loss": 1.1854, "step": 136200 }, { "epoch": 34.075, "grad_norm": 0.058647606521844864, "learning_rate": 0.0002949091693230827, "loss": 1.2086, "step": 136300 }, { "epoch": 34.1, "grad_norm": 0.04777631536126137, "learning_rate": 0.00029490541908869304, "loss": 1.175, "step": 136400 }, { "epoch": 34.125, "grad_norm": 0.04744923487305641, "learning_rate": 0.00029490166885430335, "loss": 1.0887, "step": 136500 }, { "epoch": 34.15, "grad_norm": 0.04286637902259827, "learning_rate": 0.0002948979186199137, "loss": 1.1652, "step": 136600 }, { "epoch": 34.175, "grad_norm": 0.0456664115190506, "learning_rate": 0.00029489416838552403, "loss": 1.0565, "step": 136700 }, { "epoch": 34.2, "grad_norm": 0.06168069317936897, "learning_rate": 0.0002948904181511344, "loss": 1.2153, "step": 136800 }, { "epoch": 34.225, "grad_norm": 0.04141145944595337, "learning_rate": 0.00029488666791674476, "loss": 1.1138, "step": 136900 }, { "epoch": 34.25, "grad_norm": 0.04432584345340729, "learning_rate": 0.00029488291768235513, "loss": 1.1477, "step": 137000 }, { "epoch": 34.275, "grad_norm": 0.04956555366516113, "learning_rate": 0.00029487916744796544, "loss": 1.0743, "step": 137100 }, { "epoch": 34.3, "grad_norm": 0.04936617240309715, "learning_rate": 0.0002948754172135758, "loss": 0.988, "step": 137200 }, { "epoch": 34.325, "grad_norm": 0.04362035542726517, "learning_rate": 0.00029487166697918617, "loss": 1.0981, "step": 137300 }, { "epoch": 34.35, "grad_norm": 0.051287226378917694, "learning_rate": 0.00029486791674479654, "loss": 1.088, "step": 137400 }, { "epoch": 34.375, "grad_norm": 0.03998219966888428, "learning_rate": 0.00029486416651040685, "loss": 1.1762, "step": 137500 }, { "epoch": 34.4, "grad_norm": 0.048108555376529694, "learning_rate": 0.0002948604162760172, "loss": 1.084, "step": 137600 }, { "epoch": 34.425, "grad_norm": 0.04450273886322975, "learning_rate": 0.0002948566660416276, "loss": 1.0954, "step": 137700 }, { "epoch": 34.45, "grad_norm": 0.04805700480937958, "learning_rate": 0.00029485291580723795, "loss": 0.9584, "step": 137800 }, { "epoch": 34.475, "grad_norm": 0.05516688898205757, "learning_rate": 0.00029484916557284826, "loss": 1.2255, "step": 137900 }, { "epoch": 34.5, "grad_norm": 0.04300745949149132, "learning_rate": 0.0002948454153384586, "loss": 1.113, "step": 138000 }, { "epoch": 34.525, "grad_norm": 0.04395318031311035, "learning_rate": 0.000294841665104069, "loss": 1.0804, "step": 138100 }, { "epoch": 34.55, "grad_norm": 0.0548313707113266, "learning_rate": 0.00029483791486967936, "loss": 1.1407, "step": 138200 }, { "epoch": 34.575, "grad_norm": 0.04328515753149986, "learning_rate": 0.00029483416463528967, "loss": 1.1493, "step": 138300 }, { "epoch": 34.6, "grad_norm": 0.0498124323785305, "learning_rate": 0.00029483041440090004, "loss": 1.1091, "step": 138400 }, { "epoch": 34.625, "grad_norm": 0.0529802069067955, "learning_rate": 0.00029482666416651035, "loss": 1.1526, "step": 138500 }, { "epoch": 34.65, "grad_norm": 0.0480722077190876, "learning_rate": 0.0002948229139321207, "loss": 1.1961, "step": 138600 }, { "epoch": 34.675, "grad_norm": 0.03908173367381096, "learning_rate": 0.000294819201200075, "loss": 1.0955, "step": 138700 }, { "epoch": 34.7, "grad_norm": 0.04808943718671799, "learning_rate": 0.00029481545096568534, "loss": 1.1239, "step": 138800 }, { "epoch": 34.725, "grad_norm": 0.046047843992710114, "learning_rate": 0.00029481170073129565, "loss": 1.0062, "step": 138900 }, { "epoch": 34.75, "grad_norm": 0.041441336274147034, "learning_rate": 0.000294807950496906, "loss": 1.1386, "step": 139000 }, { "epoch": 34.775, "grad_norm": 0.044936537742614746, "learning_rate": 0.0002948042002625164, "loss": 1.0692, "step": 139100 }, { "epoch": 34.8, "grad_norm": 0.04202251508831978, "learning_rate": 0.00029480045002812675, "loss": 1.1048, "step": 139200 }, { "epoch": 34.825, "grad_norm": 0.06056401878595352, "learning_rate": 0.00029479669979373706, "loss": 1.0427, "step": 139300 }, { "epoch": 34.85, "grad_norm": 0.047068677842617035, "learning_rate": 0.00029479294955934743, "loss": 1.0166, "step": 139400 }, { "epoch": 34.875, "grad_norm": 0.0437459833920002, "learning_rate": 0.0002947891993249578, "loss": 1.1336, "step": 139500 }, { "epoch": 34.9, "grad_norm": 0.04363924637436867, "learning_rate": 0.00029478544909056816, "loss": 1.0419, "step": 139600 }, { "epoch": 34.925, "grad_norm": 0.04847422614693642, "learning_rate": 0.0002947816988561785, "loss": 1.1885, "step": 139700 }, { "epoch": 34.95, "grad_norm": 0.04593125358223915, "learning_rate": 0.00029477794862178884, "loss": 1.1173, "step": 139800 }, { "epoch": 34.975, "grad_norm": 0.04662812873721123, "learning_rate": 0.0002947741983873992, "loss": 1.0086, "step": 139900 }, { "epoch": 35.0, "grad_norm": 0.04696165770292282, "learning_rate": 0.0002947704481530096, "loss": 0.9674, "step": 140000 }, { "epoch": 35.025, "grad_norm": 0.04659904167056084, "learning_rate": 0.0002947666979186199, "loss": 1.0319, "step": 140100 }, { "epoch": 35.05, "grad_norm": 0.0433788076043129, "learning_rate": 0.00029476294768423025, "loss": 1.0989, "step": 140200 }, { "epoch": 35.075, "grad_norm": 0.04491908475756645, "learning_rate": 0.00029475919744984056, "loss": 1.0623, "step": 140300 }, { "epoch": 35.1, "grad_norm": 0.045701559633016586, "learning_rate": 0.00029475544721545093, "loss": 1.1146, "step": 140400 }, { "epoch": 35.125, "grad_norm": 0.04654062166810036, "learning_rate": 0.0002947517344834052, "loss": 1.0735, "step": 140500 }, { "epoch": 35.15, "grad_norm": 0.05366494506597519, "learning_rate": 0.0002947479842490155, "loss": 1.1706, "step": 140600 }, { "epoch": 35.175, "grad_norm": 0.047658320516347885, "learning_rate": 0.00029474423401462587, "loss": 1.1263, "step": 140700 }, { "epoch": 35.2, "grad_norm": 0.04554996266961098, "learning_rate": 0.00029474048378023624, "loss": 1.1135, "step": 140800 }, { "epoch": 35.225, "grad_norm": 0.04832541570067406, "learning_rate": 0.0002947367335458466, "loss": 1.0375, "step": 140900 }, { "epoch": 35.25, "grad_norm": 0.0434059239923954, "learning_rate": 0.0002947329833114569, "loss": 1.0696, "step": 141000 }, { "epoch": 35.275, "grad_norm": 0.04571983963251114, "learning_rate": 0.0002947292330770673, "loss": 1.1276, "step": 141100 }, { "epoch": 35.3, "grad_norm": 0.04176199808716774, "learning_rate": 0.00029472548284267765, "loss": 0.957, "step": 141200 }, { "epoch": 35.325, "grad_norm": 0.06178323179483414, "learning_rate": 0.000294721732608288, "loss": 1.0451, "step": 141300 }, { "epoch": 35.35, "grad_norm": 0.05882290005683899, "learning_rate": 0.0002947179823738983, "loss": 1.1542, "step": 141400 }, { "epoch": 35.375, "grad_norm": 0.04132578894495964, "learning_rate": 0.0002947142321395087, "loss": 0.9828, "step": 141500 }, { "epoch": 35.4, "grad_norm": 0.04464949667453766, "learning_rate": 0.00029471048190511906, "loss": 1.0171, "step": 141600 }, { "epoch": 35.425, "grad_norm": 0.04540353640913963, "learning_rate": 0.0002947067316707294, "loss": 1.1018, "step": 141700 }, { "epoch": 35.45, "grad_norm": 0.04491226375102997, "learning_rate": 0.00029470298143633973, "loss": 1.1166, "step": 141800 }, { "epoch": 35.475, "grad_norm": 0.0440848246216774, "learning_rate": 0.0002946992312019501, "loss": 1.039, "step": 141900 }, { "epoch": 35.5, "grad_norm": 0.04919476807117462, "learning_rate": 0.0002946954809675604, "loss": 0.9442, "step": 142000 } ], "logging_steps": 100, "max_steps": 8000000, "num_input_tokens_seen": 0, "num_train_epochs": 2000, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.68516799709184e+17, "train_batch_size": 125, "trial_name": null, "trial_params": null }