{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1355, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007380073800738007, "grad_norm": 52.96585464477539, "learning_rate": 0.0, "loss": 7.329597473144531, "step": 1 }, { "epoch": 0.0014760147601476014, "grad_norm": 42.55315017700195, "learning_rate": 1.4e-05, "loss": 7.168418884277344, "step": 2 }, { "epoch": 0.002214022140221402, "grad_norm": 21.52372169494629, "learning_rate": 2.8e-05, "loss": 6.947352409362793, "step": 3 }, { "epoch": 0.002952029520295203, "grad_norm": 19.89319610595703, "learning_rate": 4.2e-05, "loss": 6.611477851867676, "step": 4 }, { "epoch": 0.0036900369003690036, "grad_norm": 12.127403259277344, "learning_rate": 5.6e-05, "loss": 6.690403938293457, "step": 5 }, { "epoch": 0.004428044280442804, "grad_norm": 11.600789070129395, "learning_rate": 7.000000000000001e-05, "loss": 6.540159225463867, "step": 6 }, { "epoch": 0.0051660516605166054, "grad_norm": 8.64883804321289, "learning_rate": 8.4e-05, "loss": 6.4675188064575195, "step": 7 }, { "epoch": 0.005904059040590406, "grad_norm": 8.694304466247559, "learning_rate": 9.800000000000001e-05, "loss": 6.344979286193848, "step": 8 }, { "epoch": 0.006642066420664207, "grad_norm": 8.474891662597656, "learning_rate": 0.000112, "loss": 6.483427047729492, "step": 9 }, { "epoch": 0.007380073800738007, "grad_norm": 8.267909049987793, "learning_rate": 0.000126, "loss": 6.328839302062988, "step": 10 }, { "epoch": 0.008118081180811807, "grad_norm": 6.391619682312012, "learning_rate": 0.00014000000000000001, "loss": 6.344330787658691, "step": 11 }, { "epoch": 0.008856088560885609, "grad_norm": 4.2130842208862305, "learning_rate": 0.000154, "loss": 6.2279744148254395, "step": 12 }, { "epoch": 0.00959409594095941, "grad_norm": 4.580661296844482, "learning_rate": 0.000168, "loss": 6.267205715179443, "step": 13 }, { "epoch": 0.010332103321033211, "grad_norm": 4.600402355194092, "learning_rate": 0.000182, "loss": 6.177546501159668, "step": 14 }, { "epoch": 0.01107011070110701, "grad_norm": 6.000468730926514, "learning_rate": 0.00019600000000000002, "loss": 6.123383522033691, "step": 15 }, { "epoch": 0.011808118081180811, "grad_norm": 6.2054548263549805, "learning_rate": 0.00020999999999999998, "loss": 6.029158592224121, "step": 16 }, { "epoch": 0.012546125461254613, "grad_norm": 5.766181945800781, "learning_rate": 0.000224, "loss": 6.146026611328125, "step": 17 }, { "epoch": 0.013284132841328414, "grad_norm": 3.5282742977142334, "learning_rate": 0.000238, "loss": 6.347329139709473, "step": 18 }, { "epoch": 0.014022140221402213, "grad_norm": 10.378168106079102, "learning_rate": 0.000252, "loss": 5.982622146606445, "step": 19 }, { "epoch": 0.014760147601476014, "grad_norm": 6.26217794418335, "learning_rate": 0.000266, "loss": 6.232936859130859, "step": 20 }, { "epoch": 0.015498154981549815, "grad_norm": 3.640542984008789, "learning_rate": 0.00028000000000000003, "loss": 6.282479763031006, "step": 21 }, { "epoch": 0.016236162361623615, "grad_norm": 4.074864864349365, "learning_rate": 0.000294, "loss": 6.1197309494018555, "step": 22 }, { "epoch": 0.016974169741697416, "grad_norm": 4.15755558013916, "learning_rate": 0.000308, "loss": 6.1190900802612305, "step": 23 }, { "epoch": 0.017712177121771217, "grad_norm": 8.528851509094238, "learning_rate": 0.000322, "loss": 6.108499050140381, "step": 24 }, { "epoch": 0.01845018450184502, "grad_norm": 4.248746395111084, "learning_rate": 0.000336, "loss": 5.993032932281494, "step": 25 }, { "epoch": 0.01918819188191882, "grad_norm": 5.643017292022705, "learning_rate": 0.00035, "loss": 6.173605918884277, "step": 26 }, { "epoch": 0.01992619926199262, "grad_norm": 3.0032365322113037, "learning_rate": 0.000364, "loss": 5.9087629318237305, "step": 27 }, { "epoch": 0.020664206642066422, "grad_norm": 6.890568733215332, "learning_rate": 0.000378, "loss": 6.026180267333984, "step": 28 }, { "epoch": 0.021402214022140223, "grad_norm": 4.55826473236084, "learning_rate": 0.00039200000000000004, "loss": 6.302541732788086, "step": 29 }, { "epoch": 0.02214022140221402, "grad_norm": 5.366292953491211, "learning_rate": 0.00040599999999999995, "loss": 6.086678981781006, "step": 30 }, { "epoch": 0.022878228782287822, "grad_norm": 2.9198176860809326, "learning_rate": 0.00041999999999999996, "loss": 6.034950256347656, "step": 31 }, { "epoch": 0.023616236162361623, "grad_norm": 3.0416109561920166, "learning_rate": 0.000434, "loss": 5.959887981414795, "step": 32 }, { "epoch": 0.024354243542435424, "grad_norm": 3.6983375549316406, "learning_rate": 0.000448, "loss": 5.958649635314941, "step": 33 }, { "epoch": 0.025092250922509225, "grad_norm": 3.3332769870758057, "learning_rate": 0.000462, "loss": 6.053283214569092, "step": 34 }, { "epoch": 0.025830258302583026, "grad_norm": 4.3135857582092285, "learning_rate": 0.000476, "loss": 5.938570499420166, "step": 35 }, { "epoch": 0.026568265682656828, "grad_norm": 3.9662985801696777, "learning_rate": 0.00049, "loss": 6.1224799156188965, "step": 36 }, { "epoch": 0.02730627306273063, "grad_norm": 3.4459118843078613, "learning_rate": 0.000504, "loss": 6.048614501953125, "step": 37 }, { "epoch": 0.028044280442804426, "grad_norm": 4.011275768280029, "learning_rate": 0.000518, "loss": 6.095024108886719, "step": 38 }, { "epoch": 0.028782287822878228, "grad_norm": 4.109455108642578, "learning_rate": 0.000532, "loss": 6.097041130065918, "step": 39 }, { "epoch": 0.02952029520295203, "grad_norm": 2.0187416076660156, "learning_rate": 0.000546, "loss": 5.855551719665527, "step": 40 }, { "epoch": 0.03025830258302583, "grad_norm": 4.543977737426758, "learning_rate": 0.0005600000000000001, "loss": 5.990810394287109, "step": 41 }, { "epoch": 0.03099630996309963, "grad_norm": 3.6285860538482666, "learning_rate": 0.000574, "loss": 6.1089982986450195, "step": 42 }, { "epoch": 0.03173431734317343, "grad_norm": 2.802408218383789, "learning_rate": 0.000588, "loss": 6.059175491333008, "step": 43 }, { "epoch": 0.03247232472324723, "grad_norm": 5.055509090423584, "learning_rate": 0.000602, "loss": 6.0541791915893555, "step": 44 }, { "epoch": 0.033210332103321034, "grad_norm": 5.420635223388672, "learning_rate": 0.000616, "loss": 5.914989471435547, "step": 45 }, { "epoch": 0.03394833948339483, "grad_norm": 3.779264211654663, "learning_rate": 0.00063, "loss": 5.772123336791992, "step": 46 }, { "epoch": 0.03468634686346864, "grad_norm": 4.194505214691162, "learning_rate": 0.000644, "loss": 6.127632141113281, "step": 47 }, { "epoch": 0.035424354243542434, "grad_norm": 2.183096170425415, "learning_rate": 0.000658, "loss": 5.898839950561523, "step": 48 }, { "epoch": 0.03616236162361624, "grad_norm": 3.0196142196655273, "learning_rate": 0.000672, "loss": 5.775443077087402, "step": 49 }, { "epoch": 0.03690036900369004, "grad_norm": 4.503098011016846, "learning_rate": 0.000686, "loss": 5.992775917053223, "step": 50 }, { "epoch": 0.037638376383763834, "grad_norm": 2.646671772003174, "learning_rate": 0.0007, "loss": 5.891811370849609, "step": 51 }, { "epoch": 0.03837638376383764, "grad_norm": 4.828780651092529, "learning_rate": 0.0006999989858164525, "loss": 5.944026947021484, "step": 52 }, { "epoch": 0.03911439114391144, "grad_norm": 5.056863784790039, "learning_rate": 0.0006999959432716873, "loss": 6.092121601104736, "step": 53 }, { "epoch": 0.03985239852398524, "grad_norm": 2.9205923080444336, "learning_rate": 0.0006999908723833372, "loss": 6.128796577453613, "step": 54 }, { "epoch": 0.04059040590405904, "grad_norm": 2.503229856491089, "learning_rate": 0.0006999837731807897, "loss": 5.857043266296387, "step": 55 }, { "epoch": 0.041328413284132844, "grad_norm": 2.815605640411377, "learning_rate": 0.0006999746457051868, "loss": 5.79864501953125, "step": 56 }, { "epoch": 0.04206642066420664, "grad_norm": 2.630692481994629, "learning_rate": 0.0006999634900094256, "loss": 6.038992881774902, "step": 57 }, { "epoch": 0.042804428044280446, "grad_norm": 2.103322982788086, "learning_rate": 0.0006999503061581567, "loss": 5.8827619552612305, "step": 58 }, { "epoch": 0.043542435424354244, "grad_norm": 4.4402265548706055, "learning_rate": 0.0006999350942277852, "loss": 6.193219184875488, "step": 59 }, { "epoch": 0.04428044280442804, "grad_norm": 2.784449815750122, "learning_rate": 0.0006999178543064694, "loss": 5.896166801452637, "step": 60 }, { "epoch": 0.045018450184501846, "grad_norm": 2.158843755722046, "learning_rate": 0.0006998985864941203, "loss": 5.9794487953186035, "step": 61 }, { "epoch": 0.045756457564575644, "grad_norm": 3.826530933380127, "learning_rate": 0.0006998772909024012, "loss": 5.747754096984863, "step": 62 }, { "epoch": 0.04649446494464945, "grad_norm": 1.7147290706634521, "learning_rate": 0.0006998539676547274, "loss": 5.834345817565918, "step": 63 }, { "epoch": 0.047232472324723246, "grad_norm": 2.6357598304748535, "learning_rate": 0.0006998286168862646, "loss": 5.970273017883301, "step": 64 }, { "epoch": 0.04797047970479705, "grad_norm": 2.0640320777893066, "learning_rate": 0.0006998012387439294, "loss": 6.20042610168457, "step": 65 }, { "epoch": 0.04870848708487085, "grad_norm": 1.840738296508789, "learning_rate": 0.0006997718333863869, "loss": 5.69251823425293, "step": 66 }, { "epoch": 0.04944649446494465, "grad_norm": 1.5103991031646729, "learning_rate": 0.0006997404009840512, "loss": 5.718031883239746, "step": 67 }, { "epoch": 0.05018450184501845, "grad_norm": 2.454057455062866, "learning_rate": 0.0006997069417190837, "loss": 5.718637466430664, "step": 68 }, { "epoch": 0.05092250922509225, "grad_norm": 1.7299764156341553, "learning_rate": 0.0006996714557853919, "loss": 5.874034404754639, "step": 69 }, { "epoch": 0.05166051660516605, "grad_norm": 1.983879566192627, "learning_rate": 0.0006996339433886285, "loss": 5.866864204406738, "step": 70 }, { "epoch": 0.05239852398523985, "grad_norm": 1.7243304252624512, "learning_rate": 0.0006995944047461907, "loss": 5.6140642166137695, "step": 71 }, { "epoch": 0.053136531365313655, "grad_norm": 2.1467807292938232, "learning_rate": 0.0006995528400872179, "loss": 5.7456207275390625, "step": 72 }, { "epoch": 0.05387453874538745, "grad_norm": 1.8860361576080322, "learning_rate": 0.0006995092496525912, "loss": 5.868312835693359, "step": 73 }, { "epoch": 0.05461254612546126, "grad_norm": 1.9977107048034668, "learning_rate": 0.000699463633694932, "loss": 5.8535919189453125, "step": 74 }, { "epoch": 0.055350553505535055, "grad_norm": 1.6792536973953247, "learning_rate": 0.0006994159924785998, "loss": 5.957564353942871, "step": 75 }, { "epoch": 0.05608856088560885, "grad_norm": 1.83674955368042, "learning_rate": 0.0006993663262796917, "loss": 5.801642894744873, "step": 76 }, { "epoch": 0.05682656826568266, "grad_norm": 1.7811754941940308, "learning_rate": 0.0006993146353860395, "loss": 5.8649001121521, "step": 77 }, { "epoch": 0.057564575645756455, "grad_norm": 2.130631446838379, "learning_rate": 0.0006992609200972095, "loss": 5.959519386291504, "step": 78 }, { "epoch": 0.05830258302583026, "grad_norm": 1.914402961730957, "learning_rate": 0.0006992051807244997, "loss": 5.6643877029418945, "step": 79 }, { "epoch": 0.05904059040590406, "grad_norm": 2.480494737625122, "learning_rate": 0.0006991474175909385, "loss": 5.705104827880859, "step": 80 }, { "epoch": 0.05977859778597786, "grad_norm": 1.6274583339691162, "learning_rate": 0.0006990876310312825, "loss": 5.786376953125, "step": 81 }, { "epoch": 0.06051660516605166, "grad_norm": 3.1301629543304443, "learning_rate": 0.0006990258213920147, "loss": 5.652984142303467, "step": 82 }, { "epoch": 0.061254612546125464, "grad_norm": 1.7219048738479614, "learning_rate": 0.0006989619890313428, "loss": 5.684242248535156, "step": 83 }, { "epoch": 0.06199261992619926, "grad_norm": 2.016432046890259, "learning_rate": 0.0006988961343191968, "loss": 5.919116973876953, "step": 84 }, { "epoch": 0.06273062730627306, "grad_norm": 1.9674099683761597, "learning_rate": 0.0006988282576372264, "loss": 5.706339359283447, "step": 85 }, { "epoch": 0.06346863468634686, "grad_norm": 1.6389487981796265, "learning_rate": 0.0006987583593788001, "loss": 6.144864082336426, "step": 86 }, { "epoch": 0.06420664206642067, "grad_norm": 1.9105358123779297, "learning_rate": 0.0006986864399490014, "loss": 5.812554359436035, "step": 87 }, { "epoch": 0.06494464944649446, "grad_norm": 2.2395148277282715, "learning_rate": 0.0006986124997646276, "loss": 5.818288803100586, "step": 88 }, { "epoch": 0.06568265682656826, "grad_norm": 1.4297045469284058, "learning_rate": 0.0006985365392541869, "loss": 5.988651275634766, "step": 89 }, { "epoch": 0.06642066420664207, "grad_norm": 2.393372058868408, "learning_rate": 0.0006984585588578955, "loss": 5.834245681762695, "step": 90 }, { "epoch": 0.06715867158671587, "grad_norm": 1.7424498796463013, "learning_rate": 0.0006983785590276763, "loss": 5.847927570343018, "step": 91 }, { "epoch": 0.06789667896678966, "grad_norm": 1.7180150747299194, "learning_rate": 0.0006982965402271549, "loss": 5.745847702026367, "step": 92 }, { "epoch": 0.06863468634686347, "grad_norm": 1.5406665802001953, "learning_rate": 0.0006982125029316576, "loss": 5.680943012237549, "step": 93 }, { "epoch": 0.06937269372693727, "grad_norm": 1.9634002447128296, "learning_rate": 0.0006981264476282089, "loss": 5.660253524780273, "step": 94 }, { "epoch": 0.07011070110701106, "grad_norm": 1.7053471803665161, "learning_rate": 0.0006980383748155278, "loss": 5.777673721313477, "step": 95 }, { "epoch": 0.07084870848708487, "grad_norm": 1.8611632585525513, "learning_rate": 0.0006979482850040258, "loss": 5.753267288208008, "step": 96 }, { "epoch": 0.07158671586715867, "grad_norm": 2.236954689025879, "learning_rate": 0.0006978561787158036, "loss": 5.762792587280273, "step": 97 }, { "epoch": 0.07232472324723248, "grad_norm": 1.5513856410980225, "learning_rate": 0.0006977620564846479, "loss": 5.847312927246094, "step": 98 }, { "epoch": 0.07306273062730627, "grad_norm": 1.6298314332962036, "learning_rate": 0.0006976659188560285, "loss": 5.568481922149658, "step": 99 }, { "epoch": 0.07380073800738007, "grad_norm": 1.6806327104568481, "learning_rate": 0.0006975677663870951, "loss": 5.746288776397705, "step": 100 }, { "epoch": 0.07453874538745388, "grad_norm": 1.5393524169921875, "learning_rate": 0.0006974675996466741, "loss": 5.562119960784912, "step": 101 }, { "epoch": 0.07527675276752767, "grad_norm": 1.3935660123825073, "learning_rate": 0.0006973654192152653, "loss": 5.655695915222168, "step": 102 }, { "epoch": 0.07601476014760147, "grad_norm": 1.8384559154510498, "learning_rate": 0.0006972612256850385, "loss": 5.717691421508789, "step": 103 }, { "epoch": 0.07675276752767528, "grad_norm": 1.4056777954101562, "learning_rate": 0.00069715501965983, "loss": 5.4914422035217285, "step": 104 }, { "epoch": 0.07749077490774908, "grad_norm": 1.5063185691833496, "learning_rate": 0.0006970468017551393, "loss": 5.804128170013428, "step": 105 }, { "epoch": 0.07822878228782287, "grad_norm": 1.5670958757400513, "learning_rate": 0.0006969365725981253, "loss": 5.555459976196289, "step": 106 }, { "epoch": 0.07896678966789668, "grad_norm": 1.4736913442611694, "learning_rate": 0.000696824332827603, "loss": 5.734355926513672, "step": 107 }, { "epoch": 0.07970479704797048, "grad_norm": 1.2875981330871582, "learning_rate": 0.0006967100830940393, "loss": 5.615688800811768, "step": 108 }, { "epoch": 0.08044280442804429, "grad_norm": 1.6725730895996094, "learning_rate": 0.0006965938240595497, "loss": 5.705436706542969, "step": 109 }, { "epoch": 0.08118081180811808, "grad_norm": 1.2822149991989136, "learning_rate": 0.000696475556397894, "loss": 5.77439022064209, "step": 110 }, { "epoch": 0.08191881918819188, "grad_norm": 1.5231584310531616, "learning_rate": 0.0006963552807944731, "loss": 5.540444374084473, "step": 111 }, { "epoch": 0.08265682656826569, "grad_norm": 1.3938168287277222, "learning_rate": 0.0006962329979463242, "loss": 5.578408241271973, "step": 112 }, { "epoch": 0.08339483394833948, "grad_norm": 1.80418062210083, "learning_rate": 0.0006961087085621174, "loss": 5.822021484375, "step": 113 }, { "epoch": 0.08413284132841328, "grad_norm": 1.3559857606887817, "learning_rate": 0.0006959824133621514, "loss": 5.527395248413086, "step": 114 }, { "epoch": 0.08487084870848709, "grad_norm": 1.6934373378753662, "learning_rate": 0.0006958541130783489, "loss": 5.64322566986084, "step": 115 }, { "epoch": 0.08560885608856089, "grad_norm": 2.645036220550537, "learning_rate": 0.0006957238084542531, "loss": 5.786375999450684, "step": 116 }, { "epoch": 0.08634686346863468, "grad_norm": 1.7617570161819458, "learning_rate": 0.0006955915002450227, "loss": 5.706923484802246, "step": 117 }, { "epoch": 0.08708487084870849, "grad_norm": 1.4721003770828247, "learning_rate": 0.0006954571892174282, "loss": 5.816807746887207, "step": 118 }, { "epoch": 0.08782287822878229, "grad_norm": 1.4024418592453003, "learning_rate": 0.0006953208761498471, "loss": 5.504622459411621, "step": 119 }, { "epoch": 0.08856088560885608, "grad_norm": 1.1762746572494507, "learning_rate": 0.0006951825618322589, "loss": 5.638977527618408, "step": 120 }, { "epoch": 0.08929889298892989, "grad_norm": 1.4858025312423706, "learning_rate": 0.0006950422470662416, "loss": 5.7883405685424805, "step": 121 }, { "epoch": 0.09003690036900369, "grad_norm": 1.197791576385498, "learning_rate": 0.0006948999326649661, "loss": 5.5270586013793945, "step": 122 }, { "epoch": 0.0907749077490775, "grad_norm": 1.280106782913208, "learning_rate": 0.000694755619453192, "loss": 5.614171504974365, "step": 123 }, { "epoch": 0.09151291512915129, "grad_norm": 1.1635382175445557, "learning_rate": 0.0006946093082672625, "loss": 5.714271545410156, "step": 124 }, { "epoch": 0.09225092250922509, "grad_norm": 1.5833303928375244, "learning_rate": 0.0006944609999551001, "loss": 5.534208297729492, "step": 125 }, { "epoch": 0.0929889298892989, "grad_norm": 1.2109582424163818, "learning_rate": 0.0006943106953762009, "loss": 5.419297218322754, "step": 126 }, { "epoch": 0.09372693726937269, "grad_norm": 1.551060676574707, "learning_rate": 0.0006941583954016304, "loss": 5.700986385345459, "step": 127 }, { "epoch": 0.09446494464944649, "grad_norm": 1.1270159482955933, "learning_rate": 0.0006940041009140178, "loss": 5.61196231842041, "step": 128 }, { "epoch": 0.0952029520295203, "grad_norm": 1.288231372833252, "learning_rate": 0.0006938478128075513, "loss": 5.599189758300781, "step": 129 }, { "epoch": 0.0959409594095941, "grad_norm": 1.7800358533859253, "learning_rate": 0.0006936895319879727, "loss": 5.359455108642578, "step": 130 }, { "epoch": 0.09667896678966789, "grad_norm": 1.5556919574737549, "learning_rate": 0.0006935292593725724, "loss": 5.530261516571045, "step": 131 }, { "epoch": 0.0974169741697417, "grad_norm": 1.737862229347229, "learning_rate": 0.0006933669958901836, "loss": 5.362129211425781, "step": 132 }, { "epoch": 0.0981549815498155, "grad_norm": 1.5239074230194092, "learning_rate": 0.0006932027424811779, "loss": 5.559414863586426, "step": 133 }, { "epoch": 0.0988929889298893, "grad_norm": 1.206781029701233, "learning_rate": 0.0006930365000974584, "loss": 5.415935516357422, "step": 134 }, { "epoch": 0.0996309963099631, "grad_norm": 1.5241954326629639, "learning_rate": 0.0006928682697024555, "loss": 5.514790058135986, "step": 135 }, { "epoch": 0.1003690036900369, "grad_norm": 1.7540452480316162, "learning_rate": 0.0006926980522711204, "loss": 5.370218276977539, "step": 136 }, { "epoch": 0.1011070110701107, "grad_norm": 1.4406752586364746, "learning_rate": 0.0006925258487899203, "loss": 5.334672451019287, "step": 137 }, { "epoch": 0.1018450184501845, "grad_norm": 1.2946128845214844, "learning_rate": 0.000692351660256832, "loss": 5.602551460266113, "step": 138 }, { "epoch": 0.1025830258302583, "grad_norm": 1.2579693794250488, "learning_rate": 0.0006921754876813361, "loss": 5.522645473480225, "step": 139 }, { "epoch": 0.1033210332103321, "grad_norm": 1.2886651754379272, "learning_rate": 0.0006919973320844118, "loss": 5.577740669250488, "step": 140 }, { "epoch": 0.10405904059040591, "grad_norm": 1.0571826696395874, "learning_rate": 0.0006918171944985303, "loss": 5.557397842407227, "step": 141 }, { "epoch": 0.1047970479704797, "grad_norm": 1.4176267385482788, "learning_rate": 0.0006916350759676493, "loss": 5.38129997253418, "step": 142 }, { "epoch": 0.1055350553505535, "grad_norm": 1.2939625978469849, "learning_rate": 0.0006914509775472065, "loss": 5.3804121017456055, "step": 143 }, { "epoch": 0.10627306273062731, "grad_norm": 1.3399301767349243, "learning_rate": 0.0006912649003041137, "loss": 5.509670734405518, "step": 144 }, { "epoch": 0.1070110701107011, "grad_norm": 1.1282126903533936, "learning_rate": 0.000691076845316751, "loss": 5.5377583503723145, "step": 145 }, { "epoch": 0.1077490774907749, "grad_norm": 1.372504711151123, "learning_rate": 0.00069088681367496, "loss": 5.6342878341674805, "step": 146 }, { "epoch": 0.10848708487084871, "grad_norm": 1.4673429727554321, "learning_rate": 0.0006906948064800376, "loss": 5.346056938171387, "step": 147 }, { "epoch": 0.10922509225092251, "grad_norm": 1.4786832332611084, "learning_rate": 0.0006905008248447296, "loss": 5.530672073364258, "step": 148 }, { "epoch": 0.1099630996309963, "grad_norm": 1.14403235912323, "learning_rate": 0.0006903048698932245, "loss": 5.126125812530518, "step": 149 }, { "epoch": 0.11070110701107011, "grad_norm": 1.4274934530258179, "learning_rate": 0.0006901069427611469, "loss": 5.36081600189209, "step": 150 }, { "epoch": 0.11143911439114391, "grad_norm": 1.224621295928955, "learning_rate": 0.0006899070445955507, "loss": 5.192722797393799, "step": 151 }, { "epoch": 0.1121771217712177, "grad_norm": 1.1289647817611694, "learning_rate": 0.0006897051765549127, "loss": 5.438913822174072, "step": 152 }, { "epoch": 0.11291512915129151, "grad_norm": 1.3115386962890625, "learning_rate": 0.0006895013398091256, "loss": 5.402008533477783, "step": 153 }, { "epoch": 0.11365313653136531, "grad_norm": 1.4054917097091675, "learning_rate": 0.0006892955355394918, "loss": 5.593056678771973, "step": 154 }, { "epoch": 0.11439114391143912, "grad_norm": 1.2027919292449951, "learning_rate": 0.0006890877649387155, "loss": 5.359673500061035, "step": 155 }, { "epoch": 0.11512915129151291, "grad_norm": 1.1730295419692993, "learning_rate": 0.0006888780292108971, "loss": 5.578248023986816, "step": 156 }, { "epoch": 0.11586715867158671, "grad_norm": 1.2120227813720703, "learning_rate": 0.0006886663295715254, "loss": 5.643091678619385, "step": 157 }, { "epoch": 0.11660516605166052, "grad_norm": 1.2268054485321045, "learning_rate": 0.0006884526672474704, "loss": 5.381834030151367, "step": 158 }, { "epoch": 0.11734317343173432, "grad_norm": 1.3834030628204346, "learning_rate": 0.0006882370434769769, "loss": 5.615821838378906, "step": 159 }, { "epoch": 0.11808118081180811, "grad_norm": 1.7289725542068481, "learning_rate": 0.0006880194595096567, "loss": 5.346611499786377, "step": 160 }, { "epoch": 0.11881918819188192, "grad_norm": 1.434497356414795, "learning_rate": 0.0006877999166064817, "loss": 5.427518844604492, "step": 161 }, { "epoch": 0.11955719557195572, "grad_norm": 1.2287393808364868, "learning_rate": 0.0006875784160397766, "loss": 5.595153331756592, "step": 162 }, { "epoch": 0.12029520295202951, "grad_norm": 1.327791690826416, "learning_rate": 0.0006873549590932111, "loss": 5.294317722320557, "step": 163 }, { "epoch": 0.12103321033210332, "grad_norm": 1.358208179473877, "learning_rate": 0.0006871295470617932, "loss": 5.65151309967041, "step": 164 }, { "epoch": 0.12177121771217712, "grad_norm": 1.1277738809585571, "learning_rate": 0.0006869021812518607, "loss": 5.721683979034424, "step": 165 }, { "epoch": 0.12250922509225093, "grad_norm": 1.407368540763855, "learning_rate": 0.0006866728629810749, "loss": 5.473011016845703, "step": 166 }, { "epoch": 0.12324723247232472, "grad_norm": 1.3105313777923584, "learning_rate": 0.0006864415935784116, "loss": 5.670052528381348, "step": 167 }, { "epoch": 0.12398523985239852, "grad_norm": 1.4188215732574463, "learning_rate": 0.0006862083743841545, "loss": 5.493824005126953, "step": 168 }, { "epoch": 0.12472324723247233, "grad_norm": 1.2717117071151733, "learning_rate": 0.0006859732067498869, "loss": 5.524445056915283, "step": 169 }, { "epoch": 0.12546125461254612, "grad_norm": 1.1162827014923096, "learning_rate": 0.0006857360920384839, "loss": 5.39989709854126, "step": 170 }, { "epoch": 0.12619926199261994, "grad_norm": 1.166066288948059, "learning_rate": 0.0006854970316241045, "loss": 5.495843887329102, "step": 171 }, { "epoch": 0.12693726937269373, "grad_norm": 1.9042305946350098, "learning_rate": 0.0006852560268921838, "loss": 5.403502464294434, "step": 172 }, { "epoch": 0.12767527675276752, "grad_norm": 1.0880268812179565, "learning_rate": 0.0006850130792394249, "loss": 5.439591407775879, "step": 173 }, { "epoch": 0.12841328413284134, "grad_norm": 1.0691889524459839, "learning_rate": 0.0006847681900737907, "loss": 5.504947185516357, "step": 174 }, { "epoch": 0.12915129151291513, "grad_norm": 1.2986247539520264, "learning_rate": 0.0006845213608144958, "loss": 5.43480920791626, "step": 175 }, { "epoch": 0.12988929889298892, "grad_norm": 1.1326215267181396, "learning_rate": 0.0006842725928919984, "loss": 5.448299407958984, "step": 176 }, { "epoch": 0.13062730627306274, "grad_norm": 1.1839748620986938, "learning_rate": 0.0006840218877479918, "loss": 5.370269775390625, "step": 177 }, { "epoch": 0.13136531365313653, "grad_norm": 1.3466558456420898, "learning_rate": 0.0006837692468353963, "loss": 5.503698348999023, "step": 178 }, { "epoch": 0.13210332103321032, "grad_norm": 1.2086361646652222, "learning_rate": 0.0006835146716183503, "loss": 5.3210554122924805, "step": 179 }, { "epoch": 0.13284132841328414, "grad_norm": 1.0457011461257935, "learning_rate": 0.0006832581635722026, "loss": 5.430882930755615, "step": 180 }, { "epoch": 0.13357933579335793, "grad_norm": 1.2964543104171753, "learning_rate": 0.0006829997241835029, "loss": 5.3685688972473145, "step": 181 }, { "epoch": 0.13431734317343175, "grad_norm": 1.12661612033844, "learning_rate": 0.0006827393549499941, "loss": 5.366943359375, "step": 182 }, { "epoch": 0.13505535055350554, "grad_norm": 1.4851716756820679, "learning_rate": 0.0006824770573806029, "loss": 5.4124755859375, "step": 183 }, { "epoch": 0.13579335793357933, "grad_norm": 2.0913474559783936, "learning_rate": 0.0006822128329954316, "loss": 5.477243423461914, "step": 184 }, { "epoch": 0.13653136531365315, "grad_norm": 1.6759217977523804, "learning_rate": 0.0006819466833257487, "loss": 5.315946578979492, "step": 185 }, { "epoch": 0.13726937269372694, "grad_norm": 1.5114970207214355, "learning_rate": 0.0006816786099139809, "loss": 5.488532066345215, "step": 186 }, { "epoch": 0.13800738007380073, "grad_norm": 1.229912519454956, "learning_rate": 0.0006814086143137029, "loss": 5.235088348388672, "step": 187 }, { "epoch": 0.13874538745387455, "grad_norm": 1.1838656663894653, "learning_rate": 0.0006811366980896299, "loss": 5.650766372680664, "step": 188 }, { "epoch": 0.13948339483394834, "grad_norm": 1.2359192371368408, "learning_rate": 0.0006808628628176073, "loss": 5.51072883605957, "step": 189 }, { "epoch": 0.14022140221402213, "grad_norm": 1.2534209489822388, "learning_rate": 0.0006805871100846018, "loss": 5.4855170249938965, "step": 190 }, { "epoch": 0.14095940959409595, "grad_norm": 1.1044737100601196, "learning_rate": 0.0006803094414886932, "loss": 5.416131973266602, "step": 191 }, { "epoch": 0.14169741697416974, "grad_norm": 1.1578259468078613, "learning_rate": 0.0006800298586390637, "loss": 5.303211688995361, "step": 192 }, { "epoch": 0.14243542435424356, "grad_norm": 1.2732160091400146, "learning_rate": 0.0006797483631559893, "loss": 5.596409320831299, "step": 193 }, { "epoch": 0.14317343173431735, "grad_norm": 1.3185418844223022, "learning_rate": 0.0006794649566708308, "loss": 5.081386089324951, "step": 194 }, { "epoch": 0.14391143911439114, "grad_norm": 1.2399559020996094, "learning_rate": 0.0006791796408260233, "loss": 5.367499828338623, "step": 195 }, { "epoch": 0.14464944649446496, "grad_norm": 1.4244142770767212, "learning_rate": 0.000678892417275068, "loss": 5.420333385467529, "step": 196 }, { "epoch": 0.14538745387453875, "grad_norm": 1.079671025276184, "learning_rate": 0.000678603287682521, "loss": 5.452577114105225, "step": 197 }, { "epoch": 0.14612546125461254, "grad_norm": 1.2236963510513306, "learning_rate": 0.0006783122537239852, "loss": 5.477599143981934, "step": 198 }, { "epoch": 0.14686346863468636, "grad_norm": 1.2248585224151611, "learning_rate": 0.0006780193170860999, "loss": 5.277920722961426, "step": 199 }, { "epoch": 0.14760147601476015, "grad_norm": 1.1838936805725098, "learning_rate": 0.0006777244794665307, "loss": 5.3089447021484375, "step": 200 }, { "epoch": 0.14833948339483394, "grad_norm": 1.0920487642288208, "learning_rate": 0.0006774277425739603, "loss": 5.312920570373535, "step": 201 }, { "epoch": 0.14907749077490776, "grad_norm": 1.5156118869781494, "learning_rate": 0.0006771291081280784, "loss": 5.365443229675293, "step": 202 }, { "epoch": 0.14981549815498155, "grad_norm": 1.1590790748596191, "learning_rate": 0.0006768285778595714, "loss": 5.726003646850586, "step": 203 }, { "epoch": 0.15055350553505534, "grad_norm": 1.1078206300735474, "learning_rate": 0.0006765261535101128, "loss": 5.49555778503418, "step": 204 }, { "epoch": 0.15129151291512916, "grad_norm": 1.1094913482666016, "learning_rate": 0.0006762218368323528, "loss": 5.463008880615234, "step": 205 }, { "epoch": 0.15202952029520295, "grad_norm": 1.043042540550232, "learning_rate": 0.0006759156295899086, "loss": 5.329763889312744, "step": 206 }, { "epoch": 0.15276752767527677, "grad_norm": 0.9944074153900146, "learning_rate": 0.0006756075335573533, "loss": 5.15687370300293, "step": 207 }, { "epoch": 0.15350553505535056, "grad_norm": 1.320447564125061, "learning_rate": 0.0006752975505202067, "loss": 5.366092681884766, "step": 208 }, { "epoch": 0.15424354243542435, "grad_norm": 0.9683417081832886, "learning_rate": 0.0006749856822749241, "loss": 5.286744117736816, "step": 209 }, { "epoch": 0.15498154981549817, "grad_norm": 1.0429140329360962, "learning_rate": 0.0006746719306288863, "loss": 5.36182165145874, "step": 210 }, { "epoch": 0.15571955719557196, "grad_norm": 0.9789266586303711, "learning_rate": 0.0006743562974003891, "loss": 5.401203155517578, "step": 211 }, { "epoch": 0.15645756457564575, "grad_norm": 1.5062106847763062, "learning_rate": 0.0006740387844186328, "loss": 5.4269890785217285, "step": 212 }, { "epoch": 0.15719557195571957, "grad_norm": 1.2152825593948364, "learning_rate": 0.0006737193935237112, "loss": 5.164780616760254, "step": 213 }, { "epoch": 0.15793357933579336, "grad_norm": 1.0402345657348633, "learning_rate": 0.0006733981265666012, "loss": 5.193200588226318, "step": 214 }, { "epoch": 0.15867158671586715, "grad_norm": 1.123574137687683, "learning_rate": 0.0006730749854091528, "loss": 5.191850185394287, "step": 215 }, { "epoch": 0.15940959409594097, "grad_norm": 1.2188745737075806, "learning_rate": 0.0006727499719240766, "loss": 5.239185810089111, "step": 216 }, { "epoch": 0.16014760147601476, "grad_norm": 1.1848610639572144, "learning_rate": 0.0006724230879949348, "loss": 5.381966590881348, "step": 217 }, { "epoch": 0.16088560885608857, "grad_norm": 1.0746642351150513, "learning_rate": 0.000672094335516129, "loss": 5.212441444396973, "step": 218 }, { "epoch": 0.16162361623616237, "grad_norm": 1.389511227607727, "learning_rate": 0.0006717637163928899, "loss": 5.391989707946777, "step": 219 }, { "epoch": 0.16236162361623616, "grad_norm": 1.4301270246505737, "learning_rate": 0.0006714312325412659, "loss": 5.462432861328125, "step": 220 }, { "epoch": 0.16309963099630997, "grad_norm": 0.9488272666931152, "learning_rate": 0.000671096885888112, "loss": 5.5252790451049805, "step": 221 }, { "epoch": 0.16383763837638377, "grad_norm": 1.0613595247268677, "learning_rate": 0.0006707606783710791, "loss": 5.263217926025391, "step": 222 }, { "epoch": 0.16457564575645756, "grad_norm": 1.04259192943573, "learning_rate": 0.0006704226119386022, "loss": 5.378625869750977, "step": 223 }, { "epoch": 0.16531365313653137, "grad_norm": 1.0206512212753296, "learning_rate": 0.0006700826885498893, "loss": 5.315357208251953, "step": 224 }, { "epoch": 0.16605166051660517, "grad_norm": 0.9734926819801331, "learning_rate": 0.0006697409101749102, "loss": 5.143043518066406, "step": 225 }, { "epoch": 0.16678966789667896, "grad_norm": 1.2763937711715698, "learning_rate": 0.0006693972787943851, "loss": 5.372148513793945, "step": 226 }, { "epoch": 0.16752767527675277, "grad_norm": 1.0929063558578491, "learning_rate": 0.0006690517963997727, "loss": 5.465537071228027, "step": 227 }, { "epoch": 0.16826568265682657, "grad_norm": 1.098317265510559, "learning_rate": 0.0006687044649932588, "loss": 5.2183990478515625, "step": 228 }, { "epoch": 0.16900369003690036, "grad_norm": 1.4758684635162354, "learning_rate": 0.0006683552865877454, "loss": 5.08128023147583, "step": 229 }, { "epoch": 0.16974169741697417, "grad_norm": 1.425257921218872, "learning_rate": 0.0006680042632068382, "loss": 5.4712233543396, "step": 230 }, { "epoch": 0.17047970479704797, "grad_norm": 1.0762962102890015, "learning_rate": 0.000667651396884835, "loss": 5.113118648529053, "step": 231 }, { "epoch": 0.17121771217712178, "grad_norm": 1.028893232345581, "learning_rate": 0.0006672966896667142, "loss": 5.485983848571777, "step": 232 }, { "epoch": 0.17195571955719557, "grad_norm": 0.9485227465629578, "learning_rate": 0.0006669401436081229, "loss": 5.1485090255737305, "step": 233 }, { "epoch": 0.17269372693726937, "grad_norm": 1.146479606628418, "learning_rate": 0.0006665817607753645, "loss": 5.232944011688232, "step": 234 }, { "epoch": 0.17343173431734318, "grad_norm": 1.1146334409713745, "learning_rate": 0.0006662215432453878, "loss": 5.381141662597656, "step": 235 }, { "epoch": 0.17416974169741697, "grad_norm": 1.4399679899215698, "learning_rate": 0.0006658594931057739, "loss": 5.011406421661377, "step": 236 }, { "epoch": 0.17490774907749077, "grad_norm": 1.268681287765503, "learning_rate": 0.0006654956124547241, "loss": 5.245846748352051, "step": 237 }, { "epoch": 0.17564575645756458, "grad_norm": 1.001254677772522, "learning_rate": 0.0006651299034010487, "loss": 5.424437522888184, "step": 238 }, { "epoch": 0.17638376383763837, "grad_norm": 1.2181994915008545, "learning_rate": 0.0006647623680641542, "loss": 5.456673622131348, "step": 239 }, { "epoch": 0.17712177121771217, "grad_norm": 1.1333999633789062, "learning_rate": 0.0006643930085740306, "loss": 5.315772533416748, "step": 240 }, { "epoch": 0.17785977859778598, "grad_norm": 1.0992910861968994, "learning_rate": 0.0006640218270712397, "loss": 5.436305999755859, "step": 241 }, { "epoch": 0.17859778597785977, "grad_norm": 1.0746663808822632, "learning_rate": 0.0006636488257069027, "loss": 5.308970928192139, "step": 242 }, { "epoch": 0.1793357933579336, "grad_norm": 1.0561422109603882, "learning_rate": 0.0006632740066426873, "loss": 5.426042079925537, "step": 243 }, { "epoch": 0.18007380073800738, "grad_norm": 1.031684160232544, "learning_rate": 0.0006628973720507951, "loss": 5.2547478675842285, "step": 244 }, { "epoch": 0.18081180811808117, "grad_norm": 1.0969058275222778, "learning_rate": 0.0006625189241139498, "loss": 5.28012752532959, "step": 245 }, { "epoch": 0.181549815498155, "grad_norm": 1.047112226486206, "learning_rate": 0.0006621386650253838, "loss": 5.20250129699707, "step": 246 }, { "epoch": 0.18228782287822878, "grad_norm": 0.9869337677955627, "learning_rate": 0.0006617565969888257, "loss": 5.25740909576416, "step": 247 }, { "epoch": 0.18302583025830257, "grad_norm": 1.0927937030792236, "learning_rate": 0.0006613727222184874, "loss": 5.5139288902282715, "step": 248 }, { "epoch": 0.1837638376383764, "grad_norm": 1.2841873168945312, "learning_rate": 0.000660987042939052, "loss": 5.233647346496582, "step": 249 }, { "epoch": 0.18450184501845018, "grad_norm": 0.9890136122703552, "learning_rate": 0.0006605995613856595, "loss": 5.420958518981934, "step": 250 }, { "epoch": 0.18523985239852397, "grad_norm": 0.8926162719726562, "learning_rate": 0.0006602102798038957, "loss": 5.308608055114746, "step": 251 }, { "epoch": 0.1859778597785978, "grad_norm": 1.0019422769546509, "learning_rate": 0.0006598192004497771, "loss": 5.302347660064697, "step": 252 }, { "epoch": 0.18671586715867158, "grad_norm": 0.8486745953559875, "learning_rate": 0.0006594263255897396, "loss": 5.099376678466797, "step": 253 }, { "epoch": 0.18745387453874537, "grad_norm": 1.0783238410949707, "learning_rate": 0.0006590316575006244, "loss": 5.218788146972656, "step": 254 }, { "epoch": 0.1881918819188192, "grad_norm": 0.9183611869812012, "learning_rate": 0.0006586351984696653, "loss": 5.240777969360352, "step": 255 }, { "epoch": 0.18892988929889298, "grad_norm": 0.9513900876045227, "learning_rate": 0.0006582369507944747, "loss": 5.222758769989014, "step": 256 }, { "epoch": 0.1896678966789668, "grad_norm": 0.9337455630302429, "learning_rate": 0.0006578369167830314, "loss": 5.062905311584473, "step": 257 }, { "epoch": 0.1904059040590406, "grad_norm": 1.158604383468628, "learning_rate": 0.0006574350987536662, "loss": 5.026293754577637, "step": 258 }, { "epoch": 0.19114391143911438, "grad_norm": 1.0550696849822998, "learning_rate": 0.000657031499035049, "loss": 5.1148905754089355, "step": 259 }, { "epoch": 0.1918819188191882, "grad_norm": 0.9606300592422485, "learning_rate": 0.0006566261199661753, "loss": 5.163092613220215, "step": 260 }, { "epoch": 0.192619926199262, "grad_norm": 1.0590009689331055, "learning_rate": 0.0006562189638963524, "loss": 5.3179521560668945, "step": 261 }, { "epoch": 0.19335793357933578, "grad_norm": 0.9940695762634277, "learning_rate": 0.0006558100331851859, "loss": 5.129310607910156, "step": 262 }, { "epoch": 0.1940959409594096, "grad_norm": 1.0227980613708496, "learning_rate": 0.0006553993302025659, "loss": 5.162182807922363, "step": 263 }, { "epoch": 0.1948339483394834, "grad_norm": 1.0441575050354004, "learning_rate": 0.0006549868573286539, "loss": 5.2034454345703125, "step": 264 }, { "epoch": 0.19557195571955718, "grad_norm": 1.1191506385803223, "learning_rate": 0.0006545726169538681, "loss": 4.916297435760498, "step": 265 }, { "epoch": 0.196309963099631, "grad_norm": 1.1132999658584595, "learning_rate": 0.00065415661147887, "loss": 5.2382707595825195, "step": 266 }, { "epoch": 0.1970479704797048, "grad_norm": 1.352728247642517, "learning_rate": 0.0006537388433145504, "loss": 5.228781700134277, "step": 267 }, { "epoch": 0.1977859778597786, "grad_norm": 1.0661629438400269, "learning_rate": 0.0006533193148820159, "loss": 5.341499328613281, "step": 268 }, { "epoch": 0.1985239852398524, "grad_norm": 1.1771162748336792, "learning_rate": 0.0006528980286125739, "loss": 5.339306831359863, "step": 269 }, { "epoch": 0.1992619926199262, "grad_norm": 0.9680821895599365, "learning_rate": 0.0006524749869477192, "loss": 5.367077827453613, "step": 270 }, { "epoch": 0.2, "grad_norm": 1.0213592052459717, "learning_rate": 0.00065205019233912, "loss": 5.1391096115112305, "step": 271 }, { "epoch": 0.2007380073800738, "grad_norm": 0.8894791603088379, "learning_rate": 0.0006516236472486032, "loss": 5.218973159790039, "step": 272 }, { "epoch": 0.2014760147601476, "grad_norm": 1.1796555519104004, "learning_rate": 0.00065119535414814, "loss": 5.110937118530273, "step": 273 }, { "epoch": 0.2022140221402214, "grad_norm": 0.9279013872146606, "learning_rate": 0.0006507653155198322, "loss": 5.301558494567871, "step": 274 }, { "epoch": 0.2029520295202952, "grad_norm": 0.9340477585792542, "learning_rate": 0.000650333533855898, "loss": 5.283820152282715, "step": 275 }, { "epoch": 0.203690036900369, "grad_norm": 1.0362911224365234, "learning_rate": 0.0006499000116586562, "loss": 4.982748031616211, "step": 276 }, { "epoch": 0.2044280442804428, "grad_norm": 1.1206884384155273, "learning_rate": 0.0006494647514405131, "loss": 4.973568916320801, "step": 277 }, { "epoch": 0.2051660516605166, "grad_norm": 1.0366051197052002, "learning_rate": 0.0006490277557239472, "loss": 5.242402076721191, "step": 278 }, { "epoch": 0.2059040590405904, "grad_norm": 1.0412499904632568, "learning_rate": 0.000648589027041495, "loss": 5.113008499145508, "step": 279 }, { "epoch": 0.2066420664206642, "grad_norm": 1.0954289436340332, "learning_rate": 0.0006481485679357359, "loss": 5.448449611663818, "step": 280 }, { "epoch": 0.207380073800738, "grad_norm": 0.9032571911811829, "learning_rate": 0.0006477063809592778, "loss": 4.939189910888672, "step": 281 }, { "epoch": 0.20811808118081182, "grad_norm": 0.890612006187439, "learning_rate": 0.0006472624686747421, "loss": 5.256400108337402, "step": 282 }, { "epoch": 0.2088560885608856, "grad_norm": 0.9753661751747131, "learning_rate": 0.000646816833654749, "loss": 5.353178024291992, "step": 283 }, { "epoch": 0.2095940959409594, "grad_norm": 0.8233433365821838, "learning_rate": 0.0006463694784819029, "loss": 5.223405838012695, "step": 284 }, { "epoch": 0.21033210332103322, "grad_norm": 1.0614573955535889, "learning_rate": 0.0006459204057487762, "loss": 5.132536888122559, "step": 285 }, { "epoch": 0.211070110701107, "grad_norm": 1.074107050895691, "learning_rate": 0.0006454696180578957, "loss": 5.2558369636535645, "step": 286 }, { "epoch": 0.2118081180811808, "grad_norm": 1.0157700777053833, "learning_rate": 0.0006450171180217273, "loss": 4.989593505859375, "step": 287 }, { "epoch": 0.21254612546125462, "grad_norm": 0.886896550655365, "learning_rate": 0.0006445629082626595, "loss": 5.041266441345215, "step": 288 }, { "epoch": 0.2132841328413284, "grad_norm": 0.8866286873817444, "learning_rate": 0.0006441069914129903, "loss": 5.1668171882629395, "step": 289 }, { "epoch": 0.2140221402214022, "grad_norm": 0.9136367440223694, "learning_rate": 0.0006436493701149102, "loss": 5.044548988342285, "step": 290 }, { "epoch": 0.21476014760147602, "grad_norm": 1.0716575384140015, "learning_rate": 0.0006431900470204876, "loss": 4.962906837463379, "step": 291 }, { "epoch": 0.2154981549815498, "grad_norm": 1.0485093593597412, "learning_rate": 0.0006427290247916537, "loss": 5.0265655517578125, "step": 292 }, { "epoch": 0.21623616236162363, "grad_norm": 0.9726313352584839, "learning_rate": 0.0006422663061001865, "loss": 5.10546875, "step": 293 }, { "epoch": 0.21697416974169742, "grad_norm": 0.8890307545661926, "learning_rate": 0.0006418018936276956, "loss": 4.885697841644287, "step": 294 }, { "epoch": 0.2177121771217712, "grad_norm": 1.256881594657898, "learning_rate": 0.0006413357900656066, "loss": 5.05020809173584, "step": 295 }, { "epoch": 0.21845018450184503, "grad_norm": 0.8236335515975952, "learning_rate": 0.0006408679981151456, "loss": 5.077518463134766, "step": 296 }, { "epoch": 0.21918819188191882, "grad_norm": 1.0636200904846191, "learning_rate": 0.0006403985204873235, "loss": 5.087857246398926, "step": 297 }, { "epoch": 0.2199261992619926, "grad_norm": 1.0351738929748535, "learning_rate": 0.0006399273599029202, "loss": 5.218321800231934, "step": 298 }, { "epoch": 0.22066420664206643, "grad_norm": 1.1184179782867432, "learning_rate": 0.000639454519092469, "loss": 5.41963529586792, "step": 299 }, { "epoch": 0.22140221402214022, "grad_norm": 1.005051851272583, "learning_rate": 0.0006389800007962404, "loss": 5.267976760864258, "step": 300 }, { "epoch": 0.222140221402214, "grad_norm": 0.8542754054069519, "learning_rate": 0.0006385038077642268, "loss": 5.143088340759277, "step": 301 }, { "epoch": 0.22287822878228783, "grad_norm": 1.0211315155029297, "learning_rate": 0.0006380259427561262, "loss": 5.287484169006348, "step": 302 }, { "epoch": 0.22361623616236162, "grad_norm": 0.9097702503204346, "learning_rate": 0.000637546408541326, "loss": 5.212584972381592, "step": 303 }, { "epoch": 0.2243542435424354, "grad_norm": 1.0342856645584106, "learning_rate": 0.0006370652078988876, "loss": 5.081629753112793, "step": 304 }, { "epoch": 0.22509225092250923, "grad_norm": 1.046463131904602, "learning_rate": 0.0006365823436175296, "loss": 5.043882369995117, "step": 305 }, { "epoch": 0.22583025830258302, "grad_norm": 0.9955232739448547, "learning_rate": 0.0006360978184956121, "loss": 5.135004997253418, "step": 306 }, { "epoch": 0.22656826568265684, "grad_norm": 0.8027132153511047, "learning_rate": 0.0006356116353411203, "loss": 5.337245941162109, "step": 307 }, { "epoch": 0.22730627306273063, "grad_norm": 1.2090736627578735, "learning_rate": 0.0006351237969716482, "loss": 5.095905780792236, "step": 308 }, { "epoch": 0.22804428044280442, "grad_norm": 1.090334177017212, "learning_rate": 0.0006346343062143824, "loss": 5.060598373413086, "step": 309 }, { "epoch": 0.22878228782287824, "grad_norm": 1.190928339958191, "learning_rate": 0.0006341431659060856, "loss": 5.230974197387695, "step": 310 }, { "epoch": 0.22952029520295203, "grad_norm": 1.0362082719802856, "learning_rate": 0.0006336503788930801, "loss": 4.835149765014648, "step": 311 }, { "epoch": 0.23025830258302582, "grad_norm": 1.1221998929977417, "learning_rate": 0.0006331559480312316, "loss": 5.359483242034912, "step": 312 }, { "epoch": 0.23099630996309964, "grad_norm": 0.8904932737350464, "learning_rate": 0.0006326598761859323, "loss": 5.057035446166992, "step": 313 }, { "epoch": 0.23173431734317343, "grad_norm": 0.9242574572563171, "learning_rate": 0.0006321621662320847, "loss": 5.011726379394531, "step": 314 }, { "epoch": 0.23247232472324722, "grad_norm": 1.059004306793213, "learning_rate": 0.0006316628210540842, "loss": 4.693303108215332, "step": 315 }, { "epoch": 0.23321033210332104, "grad_norm": 1.2370541095733643, "learning_rate": 0.0006311618435458034, "loss": 5.188898086547852, "step": 316 }, { "epoch": 0.23394833948339483, "grad_norm": 1.0214468240737915, "learning_rate": 0.0006306592366105744, "loss": 5.003267288208008, "step": 317 }, { "epoch": 0.23468634686346865, "grad_norm": 0.9486777782440186, "learning_rate": 0.0006301550031611726, "loss": 4.848117828369141, "step": 318 }, { "epoch": 0.23542435424354244, "grad_norm": 0.9645982980728149, "learning_rate": 0.0006296491461197996, "loss": 5.02429723739624, "step": 319 }, { "epoch": 0.23616236162361623, "grad_norm": 1.2168879508972168, "learning_rate": 0.0006291416684180662, "loss": 5.0632429122924805, "step": 320 }, { "epoch": 0.23690036900369005, "grad_norm": 0.8020527362823486, "learning_rate": 0.0006286325729969753, "loss": 4.867977142333984, "step": 321 }, { "epoch": 0.23763837638376384, "grad_norm": 0.900245726108551, "learning_rate": 0.0006281218628069054, "loss": 4.880187511444092, "step": 322 }, { "epoch": 0.23837638376383763, "grad_norm": 0.9947592616081238, "learning_rate": 0.0006276095408075927, "loss": 5.083812236785889, "step": 323 }, { "epoch": 0.23911439114391145, "grad_norm": 0.8965998888015747, "learning_rate": 0.0006270956099681148, "loss": 4.908682823181152, "step": 324 }, { "epoch": 0.23985239852398524, "grad_norm": 1.1312414407730103, "learning_rate": 0.0006265800732668727, "loss": 5.093230247497559, "step": 325 }, { "epoch": 0.24059040590405903, "grad_norm": 0.925629734992981, "learning_rate": 0.0006260629336915741, "loss": 4.874239444732666, "step": 326 }, { "epoch": 0.24132841328413285, "grad_norm": 1.307646632194519, "learning_rate": 0.0006255441942392159, "loss": 5.057682514190674, "step": 327 }, { "epoch": 0.24206642066420664, "grad_norm": 1.0218299627304077, "learning_rate": 0.0006250238579160666, "loss": 5.127986907958984, "step": 328 }, { "epoch": 0.24280442804428043, "grad_norm": 0.8645507097244263, "learning_rate": 0.0006245019277376496, "loss": 5.13686466217041, "step": 329 }, { "epoch": 0.24354243542435425, "grad_norm": 0.9296532273292542, "learning_rate": 0.0006239784067287245, "loss": 5.124481678009033, "step": 330 }, { "epoch": 0.24428044280442804, "grad_norm": 0.9728212952613831, "learning_rate": 0.0006234532979232711, "loss": 5.0022687911987305, "step": 331 }, { "epoch": 0.24501845018450186, "grad_norm": 0.8225215077400208, "learning_rate": 0.0006229266043644702, "loss": 4.9633378982543945, "step": 332 }, { "epoch": 0.24575645756457565, "grad_norm": 0.960574209690094, "learning_rate": 0.0006223983291046875, "loss": 4.844850540161133, "step": 333 }, { "epoch": 0.24649446494464944, "grad_norm": 0.9003048539161682, "learning_rate": 0.0006218684752054549, "loss": 5.180695056915283, "step": 334 }, { "epoch": 0.24723247232472326, "grad_norm": 0.9519006013870239, "learning_rate": 0.0006213370457374527, "loss": 4.989326477050781, "step": 335 }, { "epoch": 0.24797047970479705, "grad_norm": 0.9743554592132568, "learning_rate": 0.0006208040437804927, "loss": 4.731540679931641, "step": 336 }, { "epoch": 0.24870848708487084, "grad_norm": 0.9855546951293945, "learning_rate": 0.0006202694724234994, "loss": 5.105901718139648, "step": 337 }, { "epoch": 0.24944649446494466, "grad_norm": 1.8261312246322632, "learning_rate": 0.0006197333347644928, "loss": 5.079566478729248, "step": 338 }, { "epoch": 0.25018450184501845, "grad_norm": 1.058996558189392, "learning_rate": 0.0006191956339105701, "loss": 4.985716819763184, "step": 339 }, { "epoch": 0.25092250922509224, "grad_norm": 1.018185019493103, "learning_rate": 0.0006186563729778875, "loss": 4.921426296234131, "step": 340 }, { "epoch": 0.25166051660516603, "grad_norm": 1.154246211051941, "learning_rate": 0.0006181155550916423, "loss": 5.044010162353516, "step": 341 }, { "epoch": 0.2523985239852399, "grad_norm": 0.9054587483406067, "learning_rate": 0.0006175731833860554, "loss": 4.953484535217285, "step": 342 }, { "epoch": 0.25313653136531367, "grad_norm": 0.8154107928276062, "learning_rate": 0.0006170292610043523, "loss": 5.044363975524902, "step": 343 }, { "epoch": 0.25387453874538746, "grad_norm": 1.3822500705718994, "learning_rate": 0.0006164837910987449, "loss": 5.227883338928223, "step": 344 }, { "epoch": 0.25461254612546125, "grad_norm": 0.9022698402404785, "learning_rate": 0.000615936776830414, "loss": 4.860300064086914, "step": 345 }, { "epoch": 0.25535055350553504, "grad_norm": 1.0594429969787598, "learning_rate": 0.0006153882213694903, "loss": 5.256074905395508, "step": 346 }, { "epoch": 0.25608856088560883, "grad_norm": 0.9493646025657654, "learning_rate": 0.0006148381278950362, "loss": 4.957509994506836, "step": 347 }, { "epoch": 0.2568265682656827, "grad_norm": 1.0270938873291016, "learning_rate": 0.0006142864995950273, "loss": 4.809982776641846, "step": 348 }, { "epoch": 0.25756457564575647, "grad_norm": 1.663167953491211, "learning_rate": 0.0006137333396663342, "loss": 4.888598918914795, "step": 349 }, { "epoch": 0.25830258302583026, "grad_norm": 0.983447253704071, "learning_rate": 0.0006131786513147038, "loss": 5.165590763092041, "step": 350 }, { "epoch": 0.25904059040590405, "grad_norm": 0.980798065662384, "learning_rate": 0.0006126224377547408, "loss": 4.966999053955078, "step": 351 }, { "epoch": 0.25977859778597784, "grad_norm": 1.074250340461731, "learning_rate": 0.0006120647022098887, "loss": 4.936653137207031, "step": 352 }, { "epoch": 0.2605166051660517, "grad_norm": 0.9961046576499939, "learning_rate": 0.0006115054479124115, "loss": 5.0761308670043945, "step": 353 }, { "epoch": 0.2612546125461255, "grad_norm": 0.937942385673523, "learning_rate": 0.0006109446781033752, "loss": 4.909850597381592, "step": 354 }, { "epoch": 0.26199261992619927, "grad_norm": 1.0551375150680542, "learning_rate": 0.0006103823960326283, "loss": 4.967006683349609, "step": 355 }, { "epoch": 0.26273062730627306, "grad_norm": 1.0866034030914307, "learning_rate": 0.0006098186049587834, "loss": 5.049051284790039, "step": 356 }, { "epoch": 0.26346863468634685, "grad_norm": 1.0815985202789307, "learning_rate": 0.0006092533081491987, "loss": 4.931700229644775, "step": 357 }, { "epoch": 0.26420664206642064, "grad_norm": 1.0863465070724487, "learning_rate": 0.000608686508879958, "loss": 5.032581329345703, "step": 358 }, { "epoch": 0.2649446494464945, "grad_norm": 0.871529757976532, "learning_rate": 0.000608118210435853, "loss": 5.066904544830322, "step": 359 }, { "epoch": 0.2656826568265683, "grad_norm": 0.9786545038223267, "learning_rate": 0.0006075484161103631, "loss": 5.073785305023193, "step": 360 }, { "epoch": 0.26642066420664207, "grad_norm": 0.8924750089645386, "learning_rate": 0.000606977129205637, "loss": 4.997740745544434, "step": 361 }, { "epoch": 0.26715867158671586, "grad_norm": 1.49006986618042, "learning_rate": 0.0006064043530324738, "loss": 5.006748676300049, "step": 362 }, { "epoch": 0.26789667896678965, "grad_norm": 1.0208152532577515, "learning_rate": 0.0006058300909103026, "loss": 5.057985305786133, "step": 363 }, { "epoch": 0.2686346863468635, "grad_norm": 0.8836379051208496, "learning_rate": 0.000605254346167165, "loss": 4.999290466308594, "step": 364 }, { "epoch": 0.2693726937269373, "grad_norm": 0.8716019988059998, "learning_rate": 0.0006046771221396938, "loss": 5.058474540710449, "step": 365 }, { "epoch": 0.2701107011070111, "grad_norm": 1.1286225318908691, "learning_rate": 0.0006040984221730958, "loss": 4.990628719329834, "step": 366 }, { "epoch": 0.27084870848708487, "grad_norm": 1.099913477897644, "learning_rate": 0.0006035182496211308, "loss": 4.981925010681152, "step": 367 }, { "epoch": 0.27158671586715866, "grad_norm": 1.0594391822814941, "learning_rate": 0.0006029366078460929, "loss": 4.859918594360352, "step": 368 }, { "epoch": 0.27232472324723245, "grad_norm": 0.8651653528213501, "learning_rate": 0.0006023535002187907, "loss": 4.930809020996094, "step": 369 }, { "epoch": 0.2730627306273063, "grad_norm": 0.9700394868850708, "learning_rate": 0.0006017689301185279, "loss": 4.7630720138549805, "step": 370 }, { "epoch": 0.2738007380073801, "grad_norm": 0.9684885740280151, "learning_rate": 0.000601182900933084, "loss": 4.800506114959717, "step": 371 }, { "epoch": 0.2745387453874539, "grad_norm": 1.2140804529190063, "learning_rate": 0.0006005954160586941, "loss": 5.034149646759033, "step": 372 }, { "epoch": 0.27527675276752767, "grad_norm": 1.0811138153076172, "learning_rate": 0.0006000064789000295, "loss": 4.837162494659424, "step": 373 }, { "epoch": 0.27601476014760146, "grad_norm": 1.328092098236084, "learning_rate": 0.0005994160928701782, "loss": 5.215338706970215, "step": 374 }, { "epoch": 0.2767527675276753, "grad_norm": 0.9813052415847778, "learning_rate": 0.0005988242613906248, "loss": 5.164502143859863, "step": 375 }, { "epoch": 0.2774907749077491, "grad_norm": 1.1087919473648071, "learning_rate": 0.0005982309878912306, "loss": 5.113296031951904, "step": 376 }, { "epoch": 0.2782287822878229, "grad_norm": 1.0566635131835938, "learning_rate": 0.000597636275810214, "loss": 4.566821098327637, "step": 377 }, { "epoch": 0.2789667896678967, "grad_norm": 1.1309762001037598, "learning_rate": 0.0005970401285941305, "loss": 5.184887886047363, "step": 378 }, { "epoch": 0.27970479704797047, "grad_norm": 1.3037056922912598, "learning_rate": 0.0005964425496978528, "loss": 4.654736042022705, "step": 379 }, { "epoch": 0.28044280442804426, "grad_norm": 1.0882046222686768, "learning_rate": 0.0005958435425845504, "loss": 4.828828811645508, "step": 380 }, { "epoch": 0.2811808118081181, "grad_norm": 0.9877819418907166, "learning_rate": 0.0005952431107256698, "loss": 4.909351348876953, "step": 381 }, { "epoch": 0.2819188191881919, "grad_norm": 1.0387706756591797, "learning_rate": 0.0005946412576009148, "loss": 4.700501441955566, "step": 382 }, { "epoch": 0.2826568265682657, "grad_norm": 0.9511588215827942, "learning_rate": 0.0005940379866982255, "loss": 4.84822940826416, "step": 383 }, { "epoch": 0.2833948339483395, "grad_norm": 1.4258911609649658, "learning_rate": 0.0005934333015137585, "loss": 4.82274055480957, "step": 384 }, { "epoch": 0.28413284132841327, "grad_norm": 0.9327899217605591, "learning_rate": 0.0005928272055518667, "loss": 4.844176292419434, "step": 385 }, { "epoch": 0.2848708487084871, "grad_norm": 0.9245155453681946, "learning_rate": 0.0005922197023250793, "loss": 5.153466701507568, "step": 386 }, { "epoch": 0.2856088560885609, "grad_norm": 1.0576754808425903, "learning_rate": 0.0005916107953540805, "loss": 4.96760368347168, "step": 387 }, { "epoch": 0.2863468634686347, "grad_norm": 0.8730959892272949, "learning_rate": 0.0005910004881676898, "loss": 4.808976650238037, "step": 388 }, { "epoch": 0.2870848708487085, "grad_norm": 0.8937351107597351, "learning_rate": 0.0005903887843028418, "loss": 4.953003883361816, "step": 389 }, { "epoch": 0.2878228782287823, "grad_norm": 0.9199606776237488, "learning_rate": 0.0005897756873045648, "loss": 5.063399314880371, "step": 390 }, { "epoch": 0.28856088560885607, "grad_norm": 0.9909579753875732, "learning_rate": 0.0005891612007259613, "loss": 4.7940473556518555, "step": 391 }, { "epoch": 0.2892988929889299, "grad_norm": 0.9309024214744568, "learning_rate": 0.0005885453281281863, "loss": 4.881161689758301, "step": 392 }, { "epoch": 0.2900369003690037, "grad_norm": 1.1199829578399658, "learning_rate": 0.0005879280730804277, "loss": 5.138465404510498, "step": 393 }, { "epoch": 0.2907749077490775, "grad_norm": 0.9535595178604126, "learning_rate": 0.000587309439159885, "loss": 4.985682487487793, "step": 394 }, { "epoch": 0.2915129151291513, "grad_norm": 0.9754979014396667, "learning_rate": 0.0005866894299517488, "loss": 4.827736854553223, "step": 395 }, { "epoch": 0.2922509225092251, "grad_norm": 0.9567784667015076, "learning_rate": 0.0005860680490491798, "loss": 4.916905879974365, "step": 396 }, { "epoch": 0.29298892988929887, "grad_norm": 0.9050018191337585, "learning_rate": 0.0005854453000532884, "loss": 5.034615993499756, "step": 397 }, { "epoch": 0.2937269372693727, "grad_norm": 0.8965482115745544, "learning_rate": 0.0005848211865731131, "loss": 4.918941497802734, "step": 398 }, { "epoch": 0.2944649446494465, "grad_norm": 0.9906476140022278, "learning_rate": 0.0005841957122256004, "loss": 4.973904609680176, "step": 399 }, { "epoch": 0.2952029520295203, "grad_norm": 0.9818461537361145, "learning_rate": 0.0005835688806355835, "loss": 4.993786811828613, "step": 400 }, { "epoch": 0.2959409594095941, "grad_norm": 0.99074786901474, "learning_rate": 0.0005829406954357611, "loss": 5.0351457595825195, "step": 401 }, { "epoch": 0.2966789667896679, "grad_norm": 0.9858592748641968, "learning_rate": 0.0005823111602666765, "loss": 4.854518413543701, "step": 402 }, { "epoch": 0.2974169741697417, "grad_norm": 1.0143122673034668, "learning_rate": 0.0005816802787766969, "loss": 4.9962921142578125, "step": 403 }, { "epoch": 0.2981549815498155, "grad_norm": 0.8965126276016235, "learning_rate": 0.0005810480546219914, "loss": 4.845615386962891, "step": 404 }, { "epoch": 0.2988929889298893, "grad_norm": 0.9247124791145325, "learning_rate": 0.0005804144914665105, "loss": 4.576415061950684, "step": 405 }, { "epoch": 0.2996309963099631, "grad_norm": 1.0036989450454712, "learning_rate": 0.0005797795929819646, "loss": 4.833454132080078, "step": 406 }, { "epoch": 0.3003690036900369, "grad_norm": 1.1179319620132446, "learning_rate": 0.0005791433628478031, "loss": 5.014064311981201, "step": 407 }, { "epoch": 0.3011070110701107, "grad_norm": 1.1040972471237183, "learning_rate": 0.0005785058047511922, "loss": 4.786684513092041, "step": 408 }, { "epoch": 0.3018450184501845, "grad_norm": 0.9538096785545349, "learning_rate": 0.0005778669223869945, "loss": 4.815490245819092, "step": 409 }, { "epoch": 0.3025830258302583, "grad_norm": 1.1620954275131226, "learning_rate": 0.0005772267194577469, "loss": 4.706133842468262, "step": 410 }, { "epoch": 0.3033210332103321, "grad_norm": 1.137211799621582, "learning_rate": 0.0005765851996736397, "loss": 4.959315299987793, "step": 411 }, { "epoch": 0.3040590405904059, "grad_norm": 0.9818885922431946, "learning_rate": 0.0005759423667524947, "loss": 4.72605037689209, "step": 412 }, { "epoch": 0.3047970479704797, "grad_norm": 0.9897336959838867, "learning_rate": 0.0005752982244197436, "loss": 4.857034683227539, "step": 413 }, { "epoch": 0.30553505535055353, "grad_norm": 0.9276419281959534, "learning_rate": 0.0005746527764084068, "loss": 4.825818061828613, "step": 414 }, { "epoch": 0.3062730627306273, "grad_norm": 0.9956037998199463, "learning_rate": 0.0005740060264590714, "loss": 4.663302421569824, "step": 415 }, { "epoch": 0.3070110701107011, "grad_norm": 0.9424338340759277, "learning_rate": 0.00057335797831987, "loss": 4.876203536987305, "step": 416 }, { "epoch": 0.3077490774907749, "grad_norm": 0.9253562092781067, "learning_rate": 0.000572708635746458, "loss": 4.914989471435547, "step": 417 }, { "epoch": 0.3084870848708487, "grad_norm": 1.0256803035736084, "learning_rate": 0.000572058002501993, "loss": 4.6925859451293945, "step": 418 }, { "epoch": 0.3092250922509225, "grad_norm": 0.9552437663078308, "learning_rate": 0.0005714060823571126, "loss": 4.923905372619629, "step": 419 }, { "epoch": 0.30996309963099633, "grad_norm": 0.8474723696708679, "learning_rate": 0.0005707528790899117, "loss": 4.9794769287109375, "step": 420 }, { "epoch": 0.3107011070110701, "grad_norm": 0.9562621712684631, "learning_rate": 0.0005700983964859219, "loss": 4.790196418762207, "step": 421 }, { "epoch": 0.3114391143911439, "grad_norm": 0.9205673336982727, "learning_rate": 0.000569442638338089, "loss": 4.9875407218933105, "step": 422 }, { "epoch": 0.3121771217712177, "grad_norm": 0.9803183674812317, "learning_rate": 0.0005687856084467509, "loss": 4.777838230133057, "step": 423 }, { "epoch": 0.3129151291512915, "grad_norm": 1.1361783742904663, "learning_rate": 0.0005681273106196154, "loss": 4.891695976257324, "step": 424 }, { "epoch": 0.31365313653136534, "grad_norm": 0.937116265296936, "learning_rate": 0.0005674677486717386, "loss": 4.8178324699401855, "step": 425 }, { "epoch": 0.31439114391143913, "grad_norm": 0.7977975606918335, "learning_rate": 0.000566806926425503, "loss": 4.76682710647583, "step": 426 }, { "epoch": 0.3151291512915129, "grad_norm": 1.0328474044799805, "learning_rate": 0.0005661448477105944, "loss": 4.845677852630615, "step": 427 }, { "epoch": 0.3158671586715867, "grad_norm": 0.977131187915802, "learning_rate": 0.0005654815163639804, "loss": 4.799696922302246, "step": 428 }, { "epoch": 0.3166051660516605, "grad_norm": 1.0650629997253418, "learning_rate": 0.0005648169362298881, "loss": 4.999658584594727, "step": 429 }, { "epoch": 0.3173431734317343, "grad_norm": 1.0764038562774658, "learning_rate": 0.0005641511111597818, "loss": 4.789190292358398, "step": 430 }, { "epoch": 0.31808118081180814, "grad_norm": 0.8251600861549377, "learning_rate": 0.0005634840450123405, "loss": 4.80035400390625, "step": 431 }, { "epoch": 0.31881918819188193, "grad_norm": 0.9509308338165283, "learning_rate": 0.0005628157416534356, "loss": 4.975335597991943, "step": 432 }, { "epoch": 0.3195571955719557, "grad_norm": 0.9815534353256226, "learning_rate": 0.000562146204956109, "loss": 4.860112190246582, "step": 433 }, { "epoch": 0.3202952029520295, "grad_norm": 1.0006123781204224, "learning_rate": 0.0005614754388005494, "loss": 4.970834732055664, "step": 434 }, { "epoch": 0.3210332103321033, "grad_norm": 0.9528962969779968, "learning_rate": 0.0005608034470740712, "loss": 4.864804267883301, "step": 435 }, { "epoch": 0.32177121771217715, "grad_norm": 0.9165347218513489, "learning_rate": 0.0005601302336710914, "loss": 4.844846725463867, "step": 436 }, { "epoch": 0.32250922509225094, "grad_norm": 0.8079850673675537, "learning_rate": 0.0005594558024931068, "loss": 4.501960754394531, "step": 437 }, { "epoch": 0.32324723247232473, "grad_norm": 1.0351104736328125, "learning_rate": 0.000558780157448672, "loss": 4.843764305114746, "step": 438 }, { "epoch": 0.3239852398523985, "grad_norm": 1.180903673171997, "learning_rate": 0.0005581033024533757, "loss": 4.818984508514404, "step": 439 }, { "epoch": 0.3247232472324723, "grad_norm": 1.0522313117980957, "learning_rate": 0.0005574252414298192, "loss": 4.750001907348633, "step": 440 }, { "epoch": 0.3254612546125461, "grad_norm": 1.0479143857955933, "learning_rate": 0.0005567459783075928, "loss": 4.75580358505249, "step": 441 }, { "epoch": 0.32619926199261995, "grad_norm": 0.9943499565124512, "learning_rate": 0.000556065517023254, "loss": 4.778286933898926, "step": 442 }, { "epoch": 0.32693726937269374, "grad_norm": 1.0374329090118408, "learning_rate": 0.0005553838615203031, "loss": 4.718173027038574, "step": 443 }, { "epoch": 0.32767527675276753, "grad_norm": 0.8165338635444641, "learning_rate": 0.0005547010157491621, "loss": 4.73118257522583, "step": 444 }, { "epoch": 0.3284132841328413, "grad_norm": 0.840587854385376, "learning_rate": 0.0005540169836671505, "loss": 4.625949859619141, "step": 445 }, { "epoch": 0.3291512915129151, "grad_norm": 0.9079518914222717, "learning_rate": 0.0005533317692384632, "loss": 4.873010158538818, "step": 446 }, { "epoch": 0.3298892988929889, "grad_norm": 0.9068413376808167, "learning_rate": 0.000552645376434147, "loss": 4.96326208114624, "step": 447 }, { "epoch": 0.33062730627306275, "grad_norm": 0.91420578956604, "learning_rate": 0.0005519578092320779, "loss": 4.897895336151123, "step": 448 }, { "epoch": 0.33136531365313654, "grad_norm": 0.986501932144165, "learning_rate": 0.0005512690716169378, "loss": 5.1402740478515625, "step": 449 }, { "epoch": 0.33210332103321033, "grad_norm": 0.9404821395874023, "learning_rate": 0.0005505791675801916, "loss": 4.783200740814209, "step": 450 }, { "epoch": 0.3328413284132841, "grad_norm": 0.9507829546928406, "learning_rate": 0.0005498881011200641, "loss": 4.688559532165527, "step": 451 }, { "epoch": 0.3335793357933579, "grad_norm": 0.9329268932342529, "learning_rate": 0.0005491958762415166, "loss": 4.877443790435791, "step": 452 }, { "epoch": 0.33431734317343176, "grad_norm": 1.0837727785110474, "learning_rate": 0.0005485024969562237, "loss": 4.7941789627075195, "step": 453 }, { "epoch": 0.33505535055350555, "grad_norm": 1.0305438041687012, "learning_rate": 0.0005478079672825504, "loss": 4.639592170715332, "step": 454 }, { "epoch": 0.33579335793357934, "grad_norm": 0.8832004070281982, "learning_rate": 0.0005471122912455287, "loss": 4.873642444610596, "step": 455 }, { "epoch": 0.33653136531365313, "grad_norm": 0.9681616425514221, "learning_rate": 0.0005464154728768339, "loss": 4.844632148742676, "step": 456 }, { "epoch": 0.3372693726937269, "grad_norm": 0.9066919088363647, "learning_rate": 0.0005457175162147614, "loss": 4.705622673034668, "step": 457 }, { "epoch": 0.3380073800738007, "grad_norm": 0.8449764251708984, "learning_rate": 0.0005450184253042037, "loss": 4.834818363189697, "step": 458 }, { "epoch": 0.33874538745387456, "grad_norm": 1.009404182434082, "learning_rate": 0.0005443182041966266, "loss": 4.893503665924072, "step": 459 }, { "epoch": 0.33948339483394835, "grad_norm": 0.9231082201004028, "learning_rate": 0.0005436168569500456, "loss": 4.946817398071289, "step": 460 }, { "epoch": 0.34022140221402214, "grad_norm": 0.9415441155433655, "learning_rate": 0.0005429143876290025, "loss": 4.941875457763672, "step": 461 }, { "epoch": 0.34095940959409593, "grad_norm": 0.8538420796394348, "learning_rate": 0.0005422108003045423, "loss": 4.770623207092285, "step": 462 }, { "epoch": 0.3416974169741697, "grad_norm": 1.1796035766601562, "learning_rate": 0.0005415060990541887, "loss": 5.057588577270508, "step": 463 }, { "epoch": 0.34243542435424357, "grad_norm": 0.9784668684005737, "learning_rate": 0.0005408002879619213, "loss": 4.873748779296875, "step": 464 }, { "epoch": 0.34317343173431736, "grad_norm": 0.7987930774688721, "learning_rate": 0.0005400933711181515, "loss": 4.990841865539551, "step": 465 }, { "epoch": 0.34391143911439115, "grad_norm": 0.9904403686523438, "learning_rate": 0.0005393853526196988, "loss": 4.766284942626953, "step": 466 }, { "epoch": 0.34464944649446494, "grad_norm": 1.3431742191314697, "learning_rate": 0.0005386762365697678, "loss": 4.805080413818359, "step": 467 }, { "epoch": 0.34538745387453873, "grad_norm": 0.9435102343559265, "learning_rate": 0.0005379660270779224, "loss": 4.853346824645996, "step": 468 }, { "epoch": 0.3461254612546125, "grad_norm": 0.9755591154098511, "learning_rate": 0.0005372547282600649, "loss": 4.719388008117676, "step": 469 }, { "epoch": 0.34686346863468637, "grad_norm": 0.8468083739280701, "learning_rate": 0.0005365423442384097, "loss": 4.8452301025390625, "step": 470 }, { "epoch": 0.34760147601476016, "grad_norm": 0.8244897723197937, "learning_rate": 0.0005358288791414604, "loss": 4.7897844314575195, "step": 471 }, { "epoch": 0.34833948339483395, "grad_norm": 1.2169724702835083, "learning_rate": 0.0005351143371039861, "loss": 4.922556400299072, "step": 472 }, { "epoch": 0.34907749077490774, "grad_norm": 0.7928814888000488, "learning_rate": 0.0005343987222669969, "loss": 4.509468078613281, "step": 473 }, { "epoch": 0.34981549815498153, "grad_norm": 0.8514037132263184, "learning_rate": 0.0005336820387777202, "loss": 4.7827959060668945, "step": 474 }, { "epoch": 0.3505535055350554, "grad_norm": 1.0094245672225952, "learning_rate": 0.0005329642907895766, "loss": 4.922459602355957, "step": 475 }, { "epoch": 0.35129151291512917, "grad_norm": 0.8496381044387817, "learning_rate": 0.0005322454824621558, "loss": 4.833901405334473, "step": 476 }, { "epoch": 0.35202952029520296, "grad_norm": 0.9164729714393616, "learning_rate": 0.0005315256179611926, "loss": 4.579873085021973, "step": 477 }, { "epoch": 0.35276752767527675, "grad_norm": 0.9636603593826294, "learning_rate": 0.0005308047014585427, "loss": 4.682124614715576, "step": 478 }, { "epoch": 0.35350553505535054, "grad_norm": 0.8201807141304016, "learning_rate": 0.000530082737132158, "loss": 4.792829513549805, "step": 479 }, { "epoch": 0.35424354243542433, "grad_norm": 1.1766455173492432, "learning_rate": 0.0005293597291660638, "loss": 4.957970142364502, "step": 480 }, { "epoch": 0.3549815498154982, "grad_norm": 1.2056677341461182, "learning_rate": 0.0005286356817503329, "loss": 4.584798812866211, "step": 481 }, { "epoch": 0.35571955719557197, "grad_norm": 0.9210802316665649, "learning_rate": 0.0005279105990810624, "loss": 4.629232406616211, "step": 482 }, { "epoch": 0.35645756457564576, "grad_norm": 0.9124312400817871, "learning_rate": 0.0005271844853603489, "loss": 4.6753435134887695, "step": 483 }, { "epoch": 0.35719557195571955, "grad_norm": 0.9933983087539673, "learning_rate": 0.0005264573447962644, "loss": 4.6301984786987305, "step": 484 }, { "epoch": 0.35793357933579334, "grad_norm": 0.93276047706604, "learning_rate": 0.0005257291816028317, "loss": 4.541720390319824, "step": 485 }, { "epoch": 0.3586715867158672, "grad_norm": 1.028709053993225, "learning_rate": 0.000525, "loss": 4.660837650299072, "step": 486 }, { "epoch": 0.359409594095941, "grad_norm": 0.935407817363739, "learning_rate": 0.0005242698042136208, "loss": 4.810506820678711, "step": 487 }, { "epoch": 0.36014760147601477, "grad_norm": 0.9050341844558716, "learning_rate": 0.000523538598475423, "loss": 4.896993637084961, "step": 488 }, { "epoch": 0.36088560885608856, "grad_norm": 0.9780540466308594, "learning_rate": 0.0005228063870229883, "loss": 4.808036804199219, "step": 489 }, { "epoch": 0.36162361623616235, "grad_norm": 1.107608675956726, "learning_rate": 0.0005220731740997273, "loss": 4.784989833831787, "step": 490 }, { "epoch": 0.36236162361623614, "grad_norm": 1.0185608863830566, "learning_rate": 0.0005213389639548539, "loss": 4.635310173034668, "step": 491 }, { "epoch": 0.36309963099631, "grad_norm": 0.9982399940490723, "learning_rate": 0.0005206037608433617, "loss": 4.810551643371582, "step": 492 }, { "epoch": 0.3638376383763838, "grad_norm": 0.7861829400062561, "learning_rate": 0.0005198675690259988, "loss": 4.704036712646484, "step": 493 }, { "epoch": 0.36457564575645757, "grad_norm": 0.935389518737793, "learning_rate": 0.0005191303927692428, "loss": 5.006328582763672, "step": 494 }, { "epoch": 0.36531365313653136, "grad_norm": 0.9794636368751526, "learning_rate": 0.0005183922363452768, "loss": 4.736790180206299, "step": 495 }, { "epoch": 0.36605166051660515, "grad_norm": 0.915691614151001, "learning_rate": 0.0005176531040319643, "loss": 4.851039409637451, "step": 496 }, { "epoch": 0.36678966789667894, "grad_norm": 1.203650712966919, "learning_rate": 0.0005169130001128246, "loss": 4.811964988708496, "step": 497 }, { "epoch": 0.3675276752767528, "grad_norm": 1.001997947692871, "learning_rate": 0.000516171928877007, "loss": 4.62536096572876, "step": 498 }, { "epoch": 0.3682656826568266, "grad_norm": 0.9129559993743896, "learning_rate": 0.0005154298946192679, "loss": 4.895463466644287, "step": 499 }, { "epoch": 0.36900369003690037, "grad_norm": 0.9465571045875549, "learning_rate": 0.0005146869016399432, "loss": 4.418019771575928, "step": 500 }, { "epoch": 0.36974169741697416, "grad_norm": 1.1695398092269897, "learning_rate": 0.0005139429542449265, "loss": 4.949154376983643, "step": 501 }, { "epoch": 0.37047970479704795, "grad_norm": 0.9523534774780273, "learning_rate": 0.0005131980567456417, "loss": 4.633477687835693, "step": 502 }, { "epoch": 0.3712177121771218, "grad_norm": 0.9152795076370239, "learning_rate": 0.0005124522134590188, "loss": 4.966670989990234, "step": 503 }, { "epoch": 0.3719557195571956, "grad_norm": 1.2700462341308594, "learning_rate": 0.0005117054287074694, "loss": 4.756679534912109, "step": 504 }, { "epoch": 0.3726937269372694, "grad_norm": 1.0250760316848755, "learning_rate": 0.0005109577068188609, "loss": 5.008725166320801, "step": 505 }, { "epoch": 0.37343173431734317, "grad_norm": 1.0058673620224, "learning_rate": 0.0005102090521264917, "loss": 4.794961452484131, "step": 506 }, { "epoch": 0.37416974169741696, "grad_norm": 0.9521406292915344, "learning_rate": 0.0005094594689690664, "loss": 4.621726989746094, "step": 507 }, { "epoch": 0.37490774907749075, "grad_norm": 1.1385680437088013, "learning_rate": 0.0005087089616906701, "loss": 4.789394378662109, "step": 508 }, { "epoch": 0.3756457564575646, "grad_norm": 1.1471185684204102, "learning_rate": 0.0005079575346407434, "loss": 4.895359039306641, "step": 509 }, { "epoch": 0.3763837638376384, "grad_norm": 0.9710941910743713, "learning_rate": 0.0005072051921740577, "loss": 4.706118583679199, "step": 510 }, { "epoch": 0.3771217712177122, "grad_norm": 1.0084307193756104, "learning_rate": 0.0005064519386506892, "loss": 4.653249740600586, "step": 511 }, { "epoch": 0.37785977859778597, "grad_norm": 0.8812188506126404, "learning_rate": 0.000505697778435994, "loss": 4.762184143066406, "step": 512 }, { "epoch": 0.37859778597785976, "grad_norm": 0.902651846408844, "learning_rate": 0.0005049427159005829, "loss": 4.499927520751953, "step": 513 }, { "epoch": 0.3793357933579336, "grad_norm": 0.9034081697463989, "learning_rate": 0.000504186755420296, "loss": 4.713489055633545, "step": 514 }, { "epoch": 0.3800738007380074, "grad_norm": 0.9847108721733093, "learning_rate": 0.000503429901376177, "loss": 4.567604064941406, "step": 515 }, { "epoch": 0.3808118081180812, "grad_norm": 1.009543776512146, "learning_rate": 0.0005026721581544485, "loss": 4.737997055053711, "step": 516 }, { "epoch": 0.381549815498155, "grad_norm": 0.8869209289550781, "learning_rate": 0.0005019135301464861, "loss": 4.873485565185547, "step": 517 }, { "epoch": 0.38228782287822877, "grad_norm": 1.083253026008606, "learning_rate": 0.0005011540217487924, "loss": 4.698840618133545, "step": 518 }, { "epoch": 0.38302583025830256, "grad_norm": 0.9394760131835938, "learning_rate": 0.0005003936373629732, "loss": 4.629981994628906, "step": 519 }, { "epoch": 0.3837638376383764, "grad_norm": 0.9423143863677979, "learning_rate": 0.00049963238139571, "loss": 4.612939834594727, "step": 520 }, { "epoch": 0.3845018450184502, "grad_norm": 0.9476136565208435, "learning_rate": 0.000498870258258736, "loss": 4.849869728088379, "step": 521 }, { "epoch": 0.385239852398524, "grad_norm": 0.9509242177009583, "learning_rate": 0.0004981072723688098, "loss": 4.818325996398926, "step": 522 }, { "epoch": 0.3859778597785978, "grad_norm": 0.834679901599884, "learning_rate": 0.0004973434281476899, "loss": 4.750872611999512, "step": 523 }, { "epoch": 0.38671586715867157, "grad_norm": 0.799146831035614, "learning_rate": 0.0004965787300221089, "loss": 4.632112503051758, "step": 524 }, { "epoch": 0.3874538745387454, "grad_norm": 0.8336266875267029, "learning_rate": 0.0004958131824237484, "loss": 4.630362510681152, "step": 525 }, { "epoch": 0.3881918819188192, "grad_norm": 0.9787878394126892, "learning_rate": 0.0004950467897892132, "loss": 4.63228702545166, "step": 526 }, { "epoch": 0.388929889298893, "grad_norm": 1.0535902976989746, "learning_rate": 0.0004942795565600044, "loss": 4.849504470825195, "step": 527 }, { "epoch": 0.3896678966789668, "grad_norm": 0.953353226184845, "learning_rate": 0.0004935114871824956, "loss": 4.9335222244262695, "step": 528 }, { "epoch": 0.3904059040590406, "grad_norm": 0.9515429735183716, "learning_rate": 0.0004927425861079057, "loss": 4.6670451164245605, "step": 529 }, { "epoch": 0.39114391143911437, "grad_norm": 1.0168793201446533, "learning_rate": 0.0004919728577922739, "loss": 4.654256820678711, "step": 530 }, { "epoch": 0.3918819188191882, "grad_norm": 0.9733904600143433, "learning_rate": 0.000491202306696433, "loss": 4.637208938598633, "step": 531 }, { "epoch": 0.392619926199262, "grad_norm": 0.9687494039535522, "learning_rate": 0.0004904309372859844, "loss": 4.994683742523193, "step": 532 }, { "epoch": 0.3933579335793358, "grad_norm": 1.066312313079834, "learning_rate": 0.0004896587540312722, "loss": 4.801863670349121, "step": 533 }, { "epoch": 0.3940959409594096, "grad_norm": 0.9010938405990601, "learning_rate": 0.0004888857614073565, "loss": 4.627843856811523, "step": 534 }, { "epoch": 0.3948339483394834, "grad_norm": 0.9718567728996277, "learning_rate": 0.00048811196389398823, "loss": 4.693809509277344, "step": 535 }, { "epoch": 0.3955719557195572, "grad_norm": 1.1631206274032593, "learning_rate": 0.00048733736597558264, "loss": 4.649688720703125, "step": 536 }, { "epoch": 0.396309963099631, "grad_norm": 1.0241073369979858, "learning_rate": 0.0004865619721411941, "loss": 4.654960632324219, "step": 537 }, { "epoch": 0.3970479704797048, "grad_norm": 0.9926833510398865, "learning_rate": 0.0004857857868844891, "loss": 4.601881504058838, "step": 538 }, { "epoch": 0.3977859778597786, "grad_norm": 1.0237977504730225, "learning_rate": 0.0004850088147037211, "loss": 4.73300838470459, "step": 539 }, { "epoch": 0.3985239852398524, "grad_norm": 1.060038685798645, "learning_rate": 0.0004842310601017036, "loss": 4.862484931945801, "step": 540 }, { "epoch": 0.3992619926199262, "grad_norm": 1.2825253009796143, "learning_rate": 0.00048345252758578484, "loss": 4.497199058532715, "step": 541 }, { "epoch": 0.4, "grad_norm": 0.9599003791809082, "learning_rate": 0.00048267322166782123, "loss": 4.726795673370361, "step": 542 }, { "epoch": 0.4007380073800738, "grad_norm": 0.9577689170837402, "learning_rate": 0.0004818931468641511, "loss": 4.560695648193359, "step": 543 }, { "epoch": 0.4014760147601476, "grad_norm": 0.9004948139190674, "learning_rate": 0.0004811123076955693, "loss": 4.900054931640625, "step": 544 }, { "epoch": 0.4022140221402214, "grad_norm": 1.0973906517028809, "learning_rate": 0.0004803307086872996, "loss": 4.605217933654785, "step": 545 }, { "epoch": 0.4029520295202952, "grad_norm": 0.9591420888900757, "learning_rate": 0.0004795483543689701, "loss": 4.580148696899414, "step": 546 }, { "epoch": 0.40369003690036903, "grad_norm": 0.9317168593406677, "learning_rate": 0.00047876524927458554, "loss": 4.676855087280273, "step": 547 }, { "epoch": 0.4044280442804428, "grad_norm": 0.8841069936752319, "learning_rate": 0.0004779813979425022, "loss": 4.510927677154541, "step": 548 }, { "epoch": 0.4051660516605166, "grad_norm": 0.8826556205749512, "learning_rate": 0.0004771968049154005, "loss": 4.688409805297852, "step": 549 }, { "epoch": 0.4059040590405904, "grad_norm": 1.0118300914764404, "learning_rate": 0.00047641147474025973, "loss": 4.612986087799072, "step": 550 }, { "epoch": 0.4066420664206642, "grad_norm": 0.7752644419670105, "learning_rate": 0.00047562541196833106, "loss": 4.80881929397583, "step": 551 }, { "epoch": 0.407380073800738, "grad_norm": 0.8387100100517273, "learning_rate": 0.000474838621155111, "loss": 4.728845596313477, "step": 552 }, { "epoch": 0.40811808118081183, "grad_norm": 0.8829529285430908, "learning_rate": 0.00047405110686031575, "loss": 4.959627151489258, "step": 553 }, { "epoch": 0.4088560885608856, "grad_norm": 0.8168521523475647, "learning_rate": 0.000473262873647854, "loss": 4.744089603424072, "step": 554 }, { "epoch": 0.4095940959409594, "grad_norm": 0.8763787150382996, "learning_rate": 0.000472473926085801, "loss": 4.78373908996582, "step": 555 }, { "epoch": 0.4103321033210332, "grad_norm": 0.8329116106033325, "learning_rate": 0.00047168426874637167, "loss": 4.739480018615723, "step": 556 }, { "epoch": 0.411070110701107, "grad_norm": 0.8480055332183838, "learning_rate": 0.0004708939062058946, "loss": 4.604061126708984, "step": 557 }, { "epoch": 0.4118081180811808, "grad_norm": 0.8095789551734924, "learning_rate": 0.0004701028430447852, "loss": 4.522249221801758, "step": 558 }, { "epoch": 0.41254612546125463, "grad_norm": 1.0202033519744873, "learning_rate": 0.00046931108384751897, "loss": 4.50852632522583, "step": 559 }, { "epoch": 0.4132841328413284, "grad_norm": 1.0811773538589478, "learning_rate": 0.00046851863320260544, "loss": 4.552791118621826, "step": 560 }, { "epoch": 0.4140221402214022, "grad_norm": 1.1033447980880737, "learning_rate": 0.00046772549570256125, "loss": 4.458186149597168, "step": 561 }, { "epoch": 0.414760147601476, "grad_norm": 0.8855783939361572, "learning_rate": 0.00046693167594388357, "loss": 4.8609724044799805, "step": 562 }, { "epoch": 0.4154981549815498, "grad_norm": 0.8844220042228699, "learning_rate": 0.00046613717852702345, "loss": 4.472495079040527, "step": 563 }, { "epoch": 0.41623616236162364, "grad_norm": 1.000057339668274, "learning_rate": 0.0004653420080563592, "loss": 4.571652412414551, "step": 564 }, { "epoch": 0.41697416974169743, "grad_norm": 1.2004189491271973, "learning_rate": 0.0004645461691401697, "loss": 4.222049713134766, "step": 565 }, { "epoch": 0.4177121771217712, "grad_norm": 0.891960859298706, "learning_rate": 0.0004637496663906077, "loss": 4.547060966491699, "step": 566 }, { "epoch": 0.418450184501845, "grad_norm": 0.895974338054657, "learning_rate": 0.0004629525044236733, "loss": 4.556779861450195, "step": 567 }, { "epoch": 0.4191881918819188, "grad_norm": 0.9765421152114868, "learning_rate": 0.0004621546878591865, "loss": 4.732317924499512, "step": 568 }, { "epoch": 0.4199261992619926, "grad_norm": 0.8740888237953186, "learning_rate": 0.00046135622132076153, "loss": 4.561002731323242, "step": 569 }, { "epoch": 0.42066420664206644, "grad_norm": 0.768431544303894, "learning_rate": 0.00046055710943577896, "loss": 4.428035259246826, "step": 570 }, { "epoch": 0.42140221402214023, "grad_norm": 0.9561269879341125, "learning_rate": 0.0004597573568353595, "loss": 4.324114799499512, "step": 571 }, { "epoch": 0.422140221402214, "grad_norm": 0.9126472473144531, "learning_rate": 0.00045895696815433687, "loss": 4.664113521575928, "step": 572 }, { "epoch": 0.4228782287822878, "grad_norm": 0.8882591128349304, "learning_rate": 0.0004581559480312316, "loss": 4.339204788208008, "step": 573 }, { "epoch": 0.4236162361623616, "grad_norm": 1.081982135772705, "learning_rate": 0.00045735430110822303, "loss": 4.641040802001953, "step": 574 }, { "epoch": 0.42435424354243545, "grad_norm": 0.7895275950431824, "learning_rate": 0.0004565520320311235, "loss": 4.488674163818359, "step": 575 }, { "epoch": 0.42509225092250924, "grad_norm": 0.9767966866493225, "learning_rate": 0.0004557491454493504, "loss": 5.026608943939209, "step": 576 }, { "epoch": 0.42583025830258303, "grad_norm": 0.8868175148963928, "learning_rate": 0.0004549456460159004, "loss": 4.576347351074219, "step": 577 }, { "epoch": 0.4265682656826568, "grad_norm": 0.8501465320587158, "learning_rate": 0.00045414153838732135, "loss": 4.619839668273926, "step": 578 }, { "epoch": 0.4273062730627306, "grad_norm": 0.8614507913589478, "learning_rate": 0.00045333682722368597, "loss": 4.661761283874512, "step": 579 }, { "epoch": 0.4280442804428044, "grad_norm": 1.0277959108352661, "learning_rate": 0.0004525315171885648, "loss": 4.562242031097412, "step": 580 }, { "epoch": 0.42878228782287825, "grad_norm": 0.9864504933357239, "learning_rate": 0.00045172561294899884, "loss": 4.4832258224487305, "step": 581 }, { "epoch": 0.42952029520295204, "grad_norm": 0.8841885924339294, "learning_rate": 0.0004509191191754728, "loss": 4.594321250915527, "step": 582 }, { "epoch": 0.43025830258302583, "grad_norm": 0.8487964272499084, "learning_rate": 0.00045011204054188784, "loss": 4.805062294006348, "step": 583 }, { "epoch": 0.4309963099630996, "grad_norm": 1.027441143989563, "learning_rate": 0.0004493043817255347, "loss": 4.6832685470581055, "step": 584 }, { "epoch": 0.4317343173431734, "grad_norm": 0.9376983046531677, "learning_rate": 0.0004484961474070665, "loss": 4.687745094299316, "step": 585 }, { "epoch": 0.43247232472324726, "grad_norm": 0.927667498588562, "learning_rate": 0.00044768734227047146, "loss": 4.67139196395874, "step": 586 }, { "epoch": 0.43321033210332105, "grad_norm": 0.8729023933410645, "learning_rate": 0.00044687797100304596, "loss": 4.648367404937744, "step": 587 }, { "epoch": 0.43394833948339484, "grad_norm": 0.9207971692085266, "learning_rate": 0.0004460680382953672, "loss": 4.687824249267578, "step": 588 }, { "epoch": 0.43468634686346863, "grad_norm": 0.8276870846748352, "learning_rate": 0.00044525754884126634, "loss": 4.622544288635254, "step": 589 }, { "epoch": 0.4354243542435424, "grad_norm": 0.9223991632461548, "learning_rate": 0.0004444465073378007, "loss": 4.5522003173828125, "step": 590 }, { "epoch": 0.4361623616236162, "grad_norm": 1.1231549978256226, "learning_rate": 0.00044363491848522737, "loss": 4.543008804321289, "step": 591 }, { "epoch": 0.43690036900369006, "grad_norm": 0.867957592010498, "learning_rate": 0.00044282278698697504, "loss": 4.716594219207764, "step": 592 }, { "epoch": 0.43763837638376385, "grad_norm": 0.7886962890625, "learning_rate": 0.0004420101175496176, "loss": 4.6924920082092285, "step": 593 }, { "epoch": 0.43837638376383764, "grad_norm": 0.8600431680679321, "learning_rate": 0.00044119691488284644, "loss": 4.623996257781982, "step": 594 }, { "epoch": 0.43911439114391143, "grad_norm": 0.8535895347595215, "learning_rate": 0.0004403831836994428, "loss": 4.559450149536133, "step": 595 }, { "epoch": 0.4398523985239852, "grad_norm": 0.8784546256065369, "learning_rate": 0.00043956892871525123, "loss": 4.410243988037109, "step": 596 }, { "epoch": 0.44059040590405907, "grad_norm": 0.9997196197509766, "learning_rate": 0.0004387541546491518, "loss": 4.677160739898682, "step": 597 }, { "epoch": 0.44132841328413286, "grad_norm": 0.9354564547538757, "learning_rate": 0.000437938866223033, "loss": 4.577181816101074, "step": 598 }, { "epoch": 0.44206642066420665, "grad_norm": 0.8507137298583984, "learning_rate": 0.00043712306816176365, "loss": 4.933267593383789, "step": 599 }, { "epoch": 0.44280442804428044, "grad_norm": 0.7964354753494263, "learning_rate": 0.0004363067651931667, "loss": 4.742018222808838, "step": 600 }, { "epoch": 0.44354243542435423, "grad_norm": 0.8398452997207642, "learning_rate": 0.0004354899620479909, "loss": 4.496376991271973, "step": 601 }, { "epoch": 0.444280442804428, "grad_norm": 0.8302538990974426, "learning_rate": 0.00043467266345988365, "loss": 4.4834885597229, "step": 602 }, { "epoch": 0.44501845018450187, "grad_norm": 0.8685540556907654, "learning_rate": 0.00043385487416536397, "loss": 4.598426342010498, "step": 603 }, { "epoch": 0.44575645756457566, "grad_norm": 1.008470892906189, "learning_rate": 0.0004330365989037941, "loss": 4.579464912414551, "step": 604 }, { "epoch": 0.44649446494464945, "grad_norm": 0.9266964793205261, "learning_rate": 0.00043221784241735315, "loss": 4.776824474334717, "step": 605 }, { "epoch": 0.44723247232472324, "grad_norm": 0.8900343775749207, "learning_rate": 0.00043139860945100864, "loss": 4.573504447937012, "step": 606 }, { "epoch": 0.44797047970479703, "grad_norm": 0.9872782826423645, "learning_rate": 0.0004305789047524901, "loss": 4.563179969787598, "step": 607 }, { "epoch": 0.4487084870848708, "grad_norm": 0.8987732529640198, "learning_rate": 0.00042975873307226, "loss": 4.483942031860352, "step": 608 }, { "epoch": 0.44944649446494467, "grad_norm": 0.9567626714706421, "learning_rate": 0.000428938099163488, "loss": 4.630576133728027, "step": 609 }, { "epoch": 0.45018450184501846, "grad_norm": 0.8059940934181213, "learning_rate": 0.000428117007782022, "loss": 4.429983615875244, "step": 610 }, { "epoch": 0.45092250922509225, "grad_norm": 0.8970604538917542, "learning_rate": 0.0004272954636863613, "loss": 4.672665596008301, "step": 611 }, { "epoch": 0.45166051660516604, "grad_norm": 0.9387950301170349, "learning_rate": 0.0004264734716376287, "loss": 4.554316520690918, "step": 612 }, { "epoch": 0.45239852398523983, "grad_norm": 0.8920540809631348, "learning_rate": 0.0004256510363995433, "loss": 4.600342750549316, "step": 613 }, { "epoch": 0.4531365313653137, "grad_norm": 1.0435482263565063, "learning_rate": 0.0004248281627383923, "loss": 4.5729475021362305, "step": 614 }, { "epoch": 0.45387453874538747, "grad_norm": 0.8200010657310486, "learning_rate": 0.0004240048554230039, "loss": 4.369121551513672, "step": 615 }, { "epoch": 0.45461254612546126, "grad_norm": 0.9972869157791138, "learning_rate": 0.0004231811192247195, "loss": 4.570677757263184, "step": 616 }, { "epoch": 0.45535055350553505, "grad_norm": 0.9263824224472046, "learning_rate": 0.00042235695891736585, "loss": 4.355930328369141, "step": 617 }, { "epoch": 0.45608856088560884, "grad_norm": 1.002906084060669, "learning_rate": 0.00042153237927722775, "loss": 4.620849609375, "step": 618 }, { "epoch": 0.45682656826568263, "grad_norm": 0.9105566143989563, "learning_rate": 0.00042070738508302003, "loss": 4.353985786437988, "step": 619 }, { "epoch": 0.4575645756457565, "grad_norm": 0.8016074895858765, "learning_rate": 0.0004198819811158601, "loss": 4.468338966369629, "step": 620 }, { "epoch": 0.45830258302583027, "grad_norm": 0.8135733604431152, "learning_rate": 0.00041905617215924, "loss": 4.608132362365723, "step": 621 }, { "epoch": 0.45904059040590406, "grad_norm": 0.9293224215507507, "learning_rate": 0.00041822996299899906, "loss": 4.565390586853027, "step": 622 }, { "epoch": 0.45977859778597785, "grad_norm": 1.1056631803512573, "learning_rate": 0.00041740335842329566, "loss": 4.949249267578125, "step": 623 }, { "epoch": 0.46051660516605164, "grad_norm": 0.840045154094696, "learning_rate": 0.00041657636322257993, "loss": 4.710245609283447, "step": 624 }, { "epoch": 0.4612546125461255, "grad_norm": 0.9296345710754395, "learning_rate": 0.0004157489821895657, "loss": 4.73885440826416, "step": 625 }, { "epoch": 0.4619926199261993, "grad_norm": 0.8654890656471252, "learning_rate": 0.0004149212201192029, "loss": 4.420188903808594, "step": 626 }, { "epoch": 0.46273062730627307, "grad_norm": 1.0963070392608643, "learning_rate": 0.0004140930818086497, "loss": 4.5778985023498535, "step": 627 }, { "epoch": 0.46346863468634686, "grad_norm": 0.8319039940834045, "learning_rate": 0.00041326457205724445, "loss": 4.544205188751221, "step": 628 }, { "epoch": 0.46420664206642065, "grad_norm": 0.9679455757141113, "learning_rate": 0.0004124356956664786, "loss": 4.58363151550293, "step": 629 }, { "epoch": 0.46494464944649444, "grad_norm": 0.9498420357704163, "learning_rate": 0.00041160645743996803, "loss": 4.450014114379883, "step": 630 }, { "epoch": 0.4656826568265683, "grad_norm": 0.8234408497810364, "learning_rate": 0.0004107768621834257, "loss": 4.5670857429504395, "step": 631 }, { "epoch": 0.4664206642066421, "grad_norm": 1.0177075862884521, "learning_rate": 0.0004099469147046336, "loss": 4.445223808288574, "step": 632 }, { "epoch": 0.46715867158671587, "grad_norm": 0.7691503167152405, "learning_rate": 0.0004091166198134151, "loss": 4.425694465637207, "step": 633 }, { "epoch": 0.46789667896678966, "grad_norm": 0.966654896736145, "learning_rate": 0.00040828598232160696, "loss": 4.650933265686035, "step": 634 }, { "epoch": 0.46863468634686345, "grad_norm": 1.0035220384597778, "learning_rate": 0.0004074550070430312, "loss": 4.69790506362915, "step": 635 }, { "epoch": 0.4693726937269373, "grad_norm": 0.8333247900009155, "learning_rate": 0.0004066236987934677, "loss": 4.4094438552856445, "step": 636 }, { "epoch": 0.4701107011070111, "grad_norm": 0.9272137880325317, "learning_rate": 0.0004057920623906257, "loss": 4.437854766845703, "step": 637 }, { "epoch": 0.4708487084870849, "grad_norm": 0.9257310628890991, "learning_rate": 0.0004049601026541166, "loss": 4.607282638549805, "step": 638 }, { "epoch": 0.47158671586715867, "grad_norm": 0.9032636880874634, "learning_rate": 0.0004041278244054253, "loss": 4.529732704162598, "step": 639 }, { "epoch": 0.47232472324723246, "grad_norm": 0.8978585600852966, "learning_rate": 0.0004032952324678826, "loss": 4.577826499938965, "step": 640 }, { "epoch": 0.47306273062730625, "grad_norm": 0.9967110753059387, "learning_rate": 0.0004024623316666376, "loss": 4.280439376831055, "step": 641 }, { "epoch": 0.4738007380073801, "grad_norm": 0.9799245595932007, "learning_rate": 0.00040162912682862884, "loss": 4.567631721496582, "step": 642 }, { "epoch": 0.4745387453874539, "grad_norm": 0.9125800728797913, "learning_rate": 0.00040079562278255726, "loss": 4.556615352630615, "step": 643 }, { "epoch": 0.4752767527675277, "grad_norm": 0.8560841679573059, "learning_rate": 0.00039996182435885744, "loss": 4.567816734313965, "step": 644 }, { "epoch": 0.47601476014760147, "grad_norm": 0.8384515643119812, "learning_rate": 0.00039912773638967053, "loss": 4.32409143447876, "step": 645 }, { "epoch": 0.47675276752767526, "grad_norm": 0.9469295144081116, "learning_rate": 0.0003982933637088151, "loss": 4.505819797515869, "step": 646 }, { "epoch": 0.4774907749077491, "grad_norm": 0.8418838381767273, "learning_rate": 0.0003974587111517601, "loss": 4.288963317871094, "step": 647 }, { "epoch": 0.4782287822878229, "grad_norm": 0.9017887711524963, "learning_rate": 0.00039662378355559636, "loss": 4.349027633666992, "step": 648 }, { "epoch": 0.4789667896678967, "grad_norm": 0.9656051993370056, "learning_rate": 0.00039578858575900857, "loss": 4.5458478927612305, "step": 649 }, { "epoch": 0.4797047970479705, "grad_norm": 0.8434630632400513, "learning_rate": 0.0003949531226022474, "loss": 4.536887168884277, "step": 650 }, { "epoch": 0.48044280442804427, "grad_norm": 0.8146916627883911, "learning_rate": 0.0003941173989271013, "loss": 4.554960250854492, "step": 651 }, { "epoch": 0.48118081180811806, "grad_norm": 0.8592056035995483, "learning_rate": 0.0003932814195768687, "loss": 4.47853422164917, "step": 652 }, { "epoch": 0.4819188191881919, "grad_norm": 0.8136284351348877, "learning_rate": 0.0003924451893963294, "loss": 4.614603042602539, "step": 653 }, { "epoch": 0.4826568265682657, "grad_norm": 0.8898813724517822, "learning_rate": 0.0003916087132317173, "loss": 4.604781150817871, "step": 654 }, { "epoch": 0.4833948339483395, "grad_norm": 0.9451072216033936, "learning_rate": 0.0003907719959306915, "loss": 4.379412651062012, "step": 655 }, { "epoch": 0.4841328413284133, "grad_norm": 1.0912781953811646, "learning_rate": 0.0003899350423423087, "loss": 4.53802490234375, "step": 656 }, { "epoch": 0.48487084870848707, "grad_norm": 0.9553581476211548, "learning_rate": 0.0003890978573169949, "loss": 4.305476188659668, "step": 657 }, { "epoch": 0.48560885608856086, "grad_norm": 0.942167341709137, "learning_rate": 0.00038826044570651756, "loss": 4.399786949157715, "step": 658 }, { "epoch": 0.4863468634686347, "grad_norm": 0.9437850117683411, "learning_rate": 0.00038742281236395703, "loss": 4.361236572265625, "step": 659 }, { "epoch": 0.4870848708487085, "grad_norm": 0.9068073034286499, "learning_rate": 0.00038658496214367873, "loss": 4.441727638244629, "step": 660 }, { "epoch": 0.4878228782287823, "grad_norm": 0.9844712615013123, "learning_rate": 0.00038574689990130513, "loss": 4.4561309814453125, "step": 661 }, { "epoch": 0.4885608856088561, "grad_norm": 0.8944956064224243, "learning_rate": 0.00038490863049368704, "loss": 4.5960493087768555, "step": 662 }, { "epoch": 0.48929889298892987, "grad_norm": 0.8984336853027344, "learning_rate": 0.0003840701587788765, "loss": 4.440349578857422, "step": 663 }, { "epoch": 0.4900369003690037, "grad_norm": 1.0019009113311768, "learning_rate": 0.0003832314896160973, "loss": 4.5855865478515625, "step": 664 }, { "epoch": 0.4907749077490775, "grad_norm": 0.949760913848877, "learning_rate": 0.00038239262786571787, "loss": 4.4265828132629395, "step": 665 }, { "epoch": 0.4915129151291513, "grad_norm": 1.0857264995574951, "learning_rate": 0.0003815535783892229, "loss": 4.488886833190918, "step": 666 }, { "epoch": 0.4922509225092251, "grad_norm": 1.0607296228408813, "learning_rate": 0.00038071434604918463, "loss": 4.221587657928467, "step": 667 }, { "epoch": 0.4929889298892989, "grad_norm": 1.056148648262024, "learning_rate": 0.0003798749357092352, "loss": 4.554340362548828, "step": 668 }, { "epoch": 0.49372693726937267, "grad_norm": 0.8420839309692383, "learning_rate": 0.00037903535223403855, "loss": 4.401950359344482, "step": 669 }, { "epoch": 0.4944649446494465, "grad_norm": 0.8287214040756226, "learning_rate": 0.00037819560048926173, "loss": 4.45570182800293, "step": 670 }, { "epoch": 0.4952029520295203, "grad_norm": 1.0356512069702148, "learning_rate": 0.000377355685341547, "loss": 4.568255424499512, "step": 671 }, { "epoch": 0.4959409594095941, "grad_norm": 0.9578806161880493, "learning_rate": 0.0003765156116584837, "loss": 4.606746673583984, "step": 672 }, { "epoch": 0.4966789667896679, "grad_norm": 0.8309308886528015, "learning_rate": 0.00037567538430857976, "loss": 4.480656147003174, "step": 673 }, { "epoch": 0.4974169741697417, "grad_norm": 0.9627919793128967, "learning_rate": 0.0003748350081612339, "loss": 4.540738105773926, "step": 674 }, { "epoch": 0.4981549815498155, "grad_norm": 0.7901818752288818, "learning_rate": 0.00037399448808670706, "loss": 4.378629684448242, "step": 675 }, { "epoch": 0.4988929889298893, "grad_norm": 1.1135075092315674, "learning_rate": 0.0003731538289560941, "loss": 4.591548442840576, "step": 676 }, { "epoch": 0.4996309963099631, "grad_norm": 0.8672391772270203, "learning_rate": 0.0003723130356412962, "loss": 4.584698677062988, "step": 677 }, { "epoch": 0.5003690036900369, "grad_norm": 0.879558265209198, "learning_rate": 0.00037147211301499176, "loss": 4.36656379699707, "step": 678 }, { "epoch": 0.5011070110701107, "grad_norm": 0.8770106434822083, "learning_rate": 0.0003706310659506087, "loss": 4.566497802734375, "step": 679 }, { "epoch": 0.5018450184501845, "grad_norm": 0.8778314590454102, "learning_rate": 0.0003697898993222961, "loss": 4.343081474304199, "step": 680 }, { "epoch": 0.5025830258302583, "grad_norm": 0.9130513668060303, "learning_rate": 0.00036894861800489614, "loss": 4.3984694480896, "step": 681 }, { "epoch": 0.5033210332103321, "grad_norm": 0.8704879879951477, "learning_rate": 0.00036810722687391544, "loss": 4.561816215515137, "step": 682 }, { "epoch": 0.5040590405904058, "grad_norm": 1.010489821434021, "learning_rate": 0.00036726573080549704, "loss": 4.25577449798584, "step": 683 }, { "epoch": 0.5047970479704798, "grad_norm": 0.9569144248962402, "learning_rate": 0.0003664241346763924, "loss": 4.4627227783203125, "step": 684 }, { "epoch": 0.5055350553505535, "grad_norm": 0.8847797513008118, "learning_rate": 0.00036558244336393236, "loss": 4.437929153442383, "step": 685 }, { "epoch": 0.5062730627306273, "grad_norm": 0.8487216830253601, "learning_rate": 0.00036474066174599986, "loss": 4.435924053192139, "step": 686 }, { "epoch": 0.5070110701107011, "grad_norm": 0.8881345391273499, "learning_rate": 0.00036389879470100095, "loss": 4.873279094696045, "step": 687 }, { "epoch": 0.5077490774907749, "grad_norm": 0.8549903035163879, "learning_rate": 0.00036305684710783684, "loss": 4.272536754608154, "step": 688 }, { "epoch": 0.5084870848708487, "grad_norm": 0.906299889087677, "learning_rate": 0.0003622148238458754, "loss": 4.555997848510742, "step": 689 }, { "epoch": 0.5092250922509225, "grad_norm": 0.922178328037262, "learning_rate": 0.0003613727297949232, "loss": 4.573604583740234, "step": 690 }, { "epoch": 0.5099630996309963, "grad_norm": 0.8890010118484497, "learning_rate": 0.00036053056983519706, "loss": 4.512640953063965, "step": 691 }, { "epoch": 0.5107011070110701, "grad_norm": 0.8093462586402893, "learning_rate": 0.00035968834884729555, "loss": 4.304255485534668, "step": 692 }, { "epoch": 0.5114391143911439, "grad_norm": 0.9470013380050659, "learning_rate": 0.00035884607171217126, "loss": 4.261716365814209, "step": 693 }, { "epoch": 0.5121771217712177, "grad_norm": 1.0242949724197388, "learning_rate": 0.0003580037433111018, "loss": 4.365228652954102, "step": 694 }, { "epoch": 0.5129151291512916, "grad_norm": 0.8128859996795654, "learning_rate": 0.0003571613685256623, "loss": 4.409188270568848, "step": 695 }, { "epoch": 0.5136531365313654, "grad_norm": 0.9906793236732483, "learning_rate": 0.00035631895223769614, "loss": 4.144466876983643, "step": 696 }, { "epoch": 0.5143911439114391, "grad_norm": 0.8540819883346558, "learning_rate": 0.0003554764993292878, "loss": 4.1609907150268555, "step": 697 }, { "epoch": 0.5151291512915129, "grad_norm": 0.8967404365539551, "learning_rate": 0.00035463401468273365, "loss": 4.335708141326904, "step": 698 }, { "epoch": 0.5158671586715867, "grad_norm": 0.9019063115119934, "learning_rate": 0.00035379150318051397, "loss": 4.435550689697266, "step": 699 }, { "epoch": 0.5166051660516605, "grad_norm": 0.8940041065216064, "learning_rate": 0.00035294896970526504, "loss": 4.551334381103516, "step": 700 }, { "epoch": 0.5173431734317343, "grad_norm": 0.9932311773300171, "learning_rate": 0.0003521064191397499, "loss": 4.3837890625, "step": 701 }, { "epoch": 0.5180811808118081, "grad_norm": 0.8308423757553101, "learning_rate": 0.0003512638563668313, "loss": 4.352203845977783, "step": 702 }, { "epoch": 0.5188191881918819, "grad_norm": 1.0316524505615234, "learning_rate": 0.00035042128626944203, "loss": 4.69419527053833, "step": 703 }, { "epoch": 0.5195571955719557, "grad_norm": 0.845513105392456, "learning_rate": 0.00034957871373055796, "loss": 4.403134346008301, "step": 704 }, { "epoch": 0.5202952029520295, "grad_norm": 1.1955550909042358, "learning_rate": 0.0003487361436331689, "loss": 4.507143974304199, "step": 705 }, { "epoch": 0.5210332103321034, "grad_norm": 0.9265637993812561, "learning_rate": 0.0003478935808602501, "loss": 4.529629707336426, "step": 706 }, { "epoch": 0.5217712177121772, "grad_norm": 0.7645063400268555, "learning_rate": 0.0003470510302947351, "loss": 4.37443733215332, "step": 707 }, { "epoch": 0.522509225092251, "grad_norm": 0.8971974849700928, "learning_rate": 0.0003462084968194861, "loss": 4.33015251159668, "step": 708 }, { "epoch": 0.5232472324723247, "grad_norm": 0.813549816608429, "learning_rate": 0.00034536598531726646, "loss": 4.5936079025268555, "step": 709 }, { "epoch": 0.5239852398523985, "grad_norm": 0.953991711139679, "learning_rate": 0.0003445235006707122, "loss": 4.403616905212402, "step": 710 }, { "epoch": 0.5247232472324723, "grad_norm": 0.8936543464660645, "learning_rate": 0.0003436810477623038, "loss": 4.279123306274414, "step": 711 }, { "epoch": 0.5254612546125461, "grad_norm": 0.9178246855735779, "learning_rate": 0.00034283863147433776, "loss": 4.134098052978516, "step": 712 }, { "epoch": 0.5261992619926199, "grad_norm": 0.9482448101043701, "learning_rate": 0.0003419962566888981, "loss": 4.32552433013916, "step": 713 }, { "epoch": 0.5269372693726937, "grad_norm": 0.787865161895752, "learning_rate": 0.0003411539282878288, "loss": 4.394043922424316, "step": 714 }, { "epoch": 0.5276752767527675, "grad_norm": 0.9969366788864136, "learning_rate": 0.00034031165115270444, "loss": 4.486443996429443, "step": 715 }, { "epoch": 0.5284132841328413, "grad_norm": 0.8682870268821716, "learning_rate": 0.00033946943016480304, "loss": 4.508628845214844, "step": 716 }, { "epoch": 0.5291512915129152, "grad_norm": 0.9052722454071045, "learning_rate": 0.0003386272702050769, "loss": 4.4580583572387695, "step": 717 }, { "epoch": 0.529889298892989, "grad_norm": 0.909764289855957, "learning_rate": 0.00033778517615412477, "loss": 4.225852012634277, "step": 718 }, { "epoch": 0.5306273062730628, "grad_norm": 0.8234180808067322, "learning_rate": 0.0003369431528921632, "loss": 4.583276748657227, "step": 719 }, { "epoch": 0.5313653136531366, "grad_norm": 0.8727805614471436, "learning_rate": 0.0003361012052989992, "loss": 4.535766124725342, "step": 720 }, { "epoch": 0.5321033210332103, "grad_norm": 1.0564109086990356, "learning_rate": 0.00033525933825400014, "loss": 4.4810943603515625, "step": 721 }, { "epoch": 0.5328413284132841, "grad_norm": 0.9636712074279785, "learning_rate": 0.0003344175566360676, "loss": 4.364070415496826, "step": 722 }, { "epoch": 0.5335793357933579, "grad_norm": 0.9482673406600952, "learning_rate": 0.00033357586532360765, "loss": 4.78449821472168, "step": 723 }, { "epoch": 0.5343173431734317, "grad_norm": 0.7933990955352783, "learning_rate": 0.00033273426919450285, "loss": 4.399996280670166, "step": 724 }, { "epoch": 0.5350553505535055, "grad_norm": 0.8797454237937927, "learning_rate": 0.0003318927731260846, "loss": 4.585580825805664, "step": 725 }, { "epoch": 0.5357933579335793, "grad_norm": 0.8431602716445923, "learning_rate": 0.00033105138199510386, "loss": 4.289941787719727, "step": 726 }, { "epoch": 0.5365313653136531, "grad_norm": 0.8969424962997437, "learning_rate": 0.00033021010067770396, "loss": 4.353963375091553, "step": 727 }, { "epoch": 0.537269372693727, "grad_norm": 0.7266989350318909, "learning_rate": 0.00032936893404939135, "loss": 4.287866592407227, "step": 728 }, { "epoch": 0.5380073800738008, "grad_norm": 0.9192749261856079, "learning_rate": 0.0003285278869850084, "loss": 4.431896209716797, "step": 729 }, { "epoch": 0.5387453874538746, "grad_norm": 0.9108367562294006, "learning_rate": 0.0003276869643587038, "loss": 4.330748558044434, "step": 730 }, { "epoch": 0.5394833948339484, "grad_norm": 0.789059579372406, "learning_rate": 0.000326846171043906, "loss": 4.409814834594727, "step": 731 }, { "epoch": 0.5402214022140222, "grad_norm": 0.931719183921814, "learning_rate": 0.000326005511913293, "loss": 4.5224928855896, "step": 732 }, { "epoch": 0.5409594095940959, "grad_norm": 0.9140210747718811, "learning_rate": 0.00032516499183876614, "loss": 4.469390869140625, "step": 733 }, { "epoch": 0.5416974169741697, "grad_norm": 0.7886836528778076, "learning_rate": 0.0003243246156914203, "loss": 4.169953346252441, "step": 734 }, { "epoch": 0.5424354243542435, "grad_norm": 0.9898924827575684, "learning_rate": 0.00032348438834151636, "loss": 4.523615837097168, "step": 735 }, { "epoch": 0.5431734317343173, "grad_norm": 0.9171273112297058, "learning_rate": 0.00032264431465845307, "loss": 4.362099647521973, "step": 736 }, { "epoch": 0.5439114391143911, "grad_norm": 0.8603449463844299, "learning_rate": 0.0003218043995107383, "loss": 4.252144813537598, "step": 737 }, { "epoch": 0.5446494464944649, "grad_norm": 0.9839322566986084, "learning_rate": 0.0003209646477659615, "loss": 4.401839256286621, "step": 738 }, { "epoch": 0.5453874538745388, "grad_norm": 1.1770368814468384, "learning_rate": 0.00032012506429076476, "loss": 4.247356414794922, "step": 739 }, { "epoch": 0.5461254612546126, "grad_norm": 0.8217732310295105, "learning_rate": 0.0003192856539508155, "loss": 4.566009521484375, "step": 740 }, { "epoch": 0.5468634686346864, "grad_norm": 1.1834269762039185, "learning_rate": 0.00031844642161077717, "loss": 4.510600566864014, "step": 741 }, { "epoch": 0.5476014760147602, "grad_norm": 0.9773359298706055, "learning_rate": 0.0003176073721342822, "loss": 4.310590744018555, "step": 742 }, { "epoch": 0.548339483394834, "grad_norm": 0.9322510957717896, "learning_rate": 0.00031676851038390277, "loss": 4.397828102111816, "step": 743 }, { "epoch": 0.5490774907749078, "grad_norm": 0.9611193537712097, "learning_rate": 0.00031592984122112363, "loss": 4.509471893310547, "step": 744 }, { "epoch": 0.5498154981549815, "grad_norm": 0.8511263132095337, "learning_rate": 0.00031509136950631295, "loss": 4.605403900146484, "step": 745 }, { "epoch": 0.5505535055350553, "grad_norm": 1.1331124305725098, "learning_rate": 0.00031425310009869497, "loss": 4.705798625946045, "step": 746 }, { "epoch": 0.5512915129151291, "grad_norm": 0.9317970871925354, "learning_rate": 0.0003134150378563213, "loss": 4.538765907287598, "step": 747 }, { "epoch": 0.5520295202952029, "grad_norm": 0.7060513496398926, "learning_rate": 0.00031257718763604296, "loss": 4.484154224395752, "step": 748 }, { "epoch": 0.5527675276752767, "grad_norm": 0.9105408191680908, "learning_rate": 0.00031173955429348254, "loss": 4.227485656738281, "step": 749 }, { "epoch": 0.5535055350553506, "grad_norm": 0.8890596628189087, "learning_rate": 0.000310902142683005, "loss": 4.158082008361816, "step": 750 }, { "epoch": 0.5542435424354244, "grad_norm": 1.074188470840454, "learning_rate": 0.00031006495765769135, "loss": 4.741909980773926, "step": 751 }, { "epoch": 0.5549815498154982, "grad_norm": 1.0221657752990723, "learning_rate": 0.0003092280040693085, "loss": 4.476526260375977, "step": 752 }, { "epoch": 0.555719557195572, "grad_norm": 0.9633339643478394, "learning_rate": 0.00030839128676828277, "loss": 4.336530685424805, "step": 753 }, { "epoch": 0.5564575645756458, "grad_norm": 1.0310927629470825, "learning_rate": 0.0003075548106036706, "loss": 4.326992988586426, "step": 754 }, { "epoch": 0.5571955719557196, "grad_norm": 0.8011588454246521, "learning_rate": 0.0003067185804231314, "loss": 4.4770827293396, "step": 755 }, { "epoch": 0.5579335793357934, "grad_norm": 0.921048641204834, "learning_rate": 0.00030588260107289875, "loss": 4.608548164367676, "step": 756 }, { "epoch": 0.5586715867158671, "grad_norm": 0.9670724272727966, "learning_rate": 0.0003050468773977527, "loss": 4.357841491699219, "step": 757 }, { "epoch": 0.5594095940959409, "grad_norm": 1.0081647634506226, "learning_rate": 0.00030421141424099153, "loss": 4.160003662109375, "step": 758 }, { "epoch": 0.5601476014760147, "grad_norm": 0.8587222695350647, "learning_rate": 0.0003033762164444036, "loss": 4.5625104904174805, "step": 759 }, { "epoch": 0.5608856088560885, "grad_norm": 0.9064732789993286, "learning_rate": 0.00030254128884823995, "loss": 4.558137893676758, "step": 760 }, { "epoch": 0.5616236162361624, "grad_norm": 0.9167226552963257, "learning_rate": 0.00030170663629118484, "loss": 4.650042533874512, "step": 761 }, { "epoch": 0.5623616236162362, "grad_norm": 0.9208563566207886, "learning_rate": 0.0003008722636103295, "loss": 4.311919212341309, "step": 762 }, { "epoch": 0.56309963099631, "grad_norm": 0.8243905305862427, "learning_rate": 0.0003000381756411425, "loss": 4.592479705810547, "step": 763 }, { "epoch": 0.5638376383763838, "grad_norm": 0.88048255443573, "learning_rate": 0.00029920437721744285, "loss": 4.383855819702148, "step": 764 }, { "epoch": 0.5645756457564576, "grad_norm": 0.8309145569801331, "learning_rate": 0.0002983708731713712, "loss": 4.523615837097168, "step": 765 }, { "epoch": 0.5653136531365314, "grad_norm": 0.9054703116416931, "learning_rate": 0.0002975376683333625, "loss": 4.120911121368408, "step": 766 }, { "epoch": 0.5660516605166052, "grad_norm": 0.8789876103401184, "learning_rate": 0.0002967047675321174, "loss": 4.314697265625, "step": 767 }, { "epoch": 0.566789667896679, "grad_norm": 1.055936336517334, "learning_rate": 0.0002958721755945748, "loss": 4.497006416320801, "step": 768 }, { "epoch": 0.5675276752767527, "grad_norm": 1.1139589548110962, "learning_rate": 0.00029503989734588345, "loss": 4.493967056274414, "step": 769 }, { "epoch": 0.5682656826568265, "grad_norm": 0.8091505169868469, "learning_rate": 0.0002942079376093742, "loss": 4.10081672668457, "step": 770 }, { "epoch": 0.5690036900369003, "grad_norm": 0.8381765484809875, "learning_rate": 0.00029337630120653235, "loss": 4.278990745544434, "step": 771 }, { "epoch": 0.5697416974169742, "grad_norm": 0.8964424729347229, "learning_rate": 0.00029254499295696876, "loss": 4.365828514099121, "step": 772 }, { "epoch": 0.570479704797048, "grad_norm": 0.8812311887741089, "learning_rate": 0.0002917140176783931, "loss": 4.407172679901123, "step": 773 }, { "epoch": 0.5712177121771218, "grad_norm": 0.9463404417037964, "learning_rate": 0.0002908833801865849, "loss": 4.176614761352539, "step": 774 }, { "epoch": 0.5719557195571956, "grad_norm": 0.9128312468528748, "learning_rate": 0.0002900530852953665, "loss": 4.4936604499816895, "step": 775 }, { "epoch": 0.5726937269372694, "grad_norm": 0.9788138270378113, "learning_rate": 0.0002892231378165744, "loss": 4.425959587097168, "step": 776 }, { "epoch": 0.5734317343173432, "grad_norm": 0.8016911149024963, "learning_rate": 0.0002883935425600321, "loss": 4.351809024810791, "step": 777 }, { "epoch": 0.574169741697417, "grad_norm": 0.8947065472602844, "learning_rate": 0.00028756430433352146, "loss": 4.333946228027344, "step": 778 }, { "epoch": 0.5749077490774908, "grad_norm": 0.8357275724411011, "learning_rate": 0.0002867354279427556, "loss": 4.609579086303711, "step": 779 }, { "epoch": 0.5756457564575646, "grad_norm": 0.9476321339607239, "learning_rate": 0.0002859069181913503, "loss": 4.475932598114014, "step": 780 }, { "epoch": 0.5763837638376383, "grad_norm": 0.9456009268760681, "learning_rate": 0.00028507877988079717, "loss": 4.241294860839844, "step": 781 }, { "epoch": 0.5771217712177121, "grad_norm": 0.7762236595153809, "learning_rate": 0.0002842510178104343, "loss": 4.2514777183532715, "step": 782 }, { "epoch": 0.5778597785977859, "grad_norm": 0.8480483889579773, "learning_rate": 0.00028342363677742, "loss": 4.5362043380737305, "step": 783 }, { "epoch": 0.5785977859778598, "grad_norm": 0.8248271942138672, "learning_rate": 0.00028259664157670434, "loss": 4.289585113525391, "step": 784 }, { "epoch": 0.5793357933579336, "grad_norm": 0.9554965496063232, "learning_rate": 0.00028177003700100093, "loss": 4.234594345092773, "step": 785 }, { "epoch": 0.5800738007380074, "grad_norm": 1.0218883752822876, "learning_rate": 0.00028094382784076005, "loss": 4.032539367675781, "step": 786 }, { "epoch": 0.5808118081180812, "grad_norm": 0.9201107621192932, "learning_rate": 0.00028011801888413996, "loss": 4.4474711418151855, "step": 787 }, { "epoch": 0.581549815498155, "grad_norm": 0.9545875191688538, "learning_rate": 0.00027929261491698, "loss": 4.290918350219727, "step": 788 }, { "epoch": 0.5822878228782288, "grad_norm": 0.9154767394065857, "learning_rate": 0.00027846762072277235, "loss": 4.266115188598633, "step": 789 }, { "epoch": 0.5830258302583026, "grad_norm": 0.9572087526321411, "learning_rate": 0.00027764304108263425, "loss": 4.489130973815918, "step": 790 }, { "epoch": 0.5837638376383764, "grad_norm": 0.864920973777771, "learning_rate": 0.0002768188807752806, "loss": 4.22702693939209, "step": 791 }, { "epoch": 0.5845018450184502, "grad_norm": 0.9186403751373291, "learning_rate": 0.0002759951445769962, "loss": 4.370454788208008, "step": 792 }, { "epoch": 0.5852398523985239, "grad_norm": 0.9486933350563049, "learning_rate": 0.00027517183726160775, "loss": 4.345991611480713, "step": 793 }, { "epoch": 0.5859778597785977, "grad_norm": 0.9103389382362366, "learning_rate": 0.0002743489636004567, "loss": 4.232224941253662, "step": 794 }, { "epoch": 0.5867158671586716, "grad_norm": 0.9209710359573364, "learning_rate": 0.0002735265283623713, "loss": 3.9969122409820557, "step": 795 }, { "epoch": 0.5874538745387454, "grad_norm": 1.2172404527664185, "learning_rate": 0.00027270453631363876, "loss": 4.3851318359375, "step": 796 }, { "epoch": 0.5881918819188192, "grad_norm": 1.0857105255126953, "learning_rate": 0.00027188299221797806, "loss": 4.543056488037109, "step": 797 }, { "epoch": 0.588929889298893, "grad_norm": 0.8917638659477234, "learning_rate": 0.00027106190083651206, "loss": 4.233307838439941, "step": 798 }, { "epoch": 0.5896678966789668, "grad_norm": 0.9834994077682495, "learning_rate": 0.0002702412669277401, "loss": 4.3369035720825195, "step": 799 }, { "epoch": 0.5904059040590406, "grad_norm": 0.9920309782028198, "learning_rate": 0.00026942109524751, "loss": 4.263988971710205, "step": 800 }, { "epoch": 0.5911439114391144, "grad_norm": 0.7995727062225342, "learning_rate": 0.00026860139054899146, "loss": 4.237081050872803, "step": 801 }, { "epoch": 0.5918819188191882, "grad_norm": 0.8966661095619202, "learning_rate": 0.00026778215758264696, "loss": 4.278907299041748, "step": 802 }, { "epoch": 0.592619926199262, "grad_norm": 0.8927947282791138, "learning_rate": 0.000266963401096206, "loss": 4.3486151695251465, "step": 803 }, { "epoch": 0.5933579335793358, "grad_norm": 0.7980582118034363, "learning_rate": 0.0002661451258346361, "loss": 4.231438636779785, "step": 804 }, { "epoch": 0.5940959409594095, "grad_norm": 0.8703809380531311, "learning_rate": 0.00026532733654011635, "loss": 4.2430419921875, "step": 805 }, { "epoch": 0.5948339483394834, "grad_norm": 1.0357931852340698, "learning_rate": 0.00026451003795200913, "loss": 4.256633281707764, "step": 806 }, { "epoch": 0.5955719557195572, "grad_norm": 0.8626582026481628, "learning_rate": 0.00026369323480683333, "loss": 4.278927326202393, "step": 807 }, { "epoch": 0.596309963099631, "grad_norm": 0.8148908615112305, "learning_rate": 0.0002628769318382364, "loss": 4.354986190795898, "step": 808 }, { "epoch": 0.5970479704797048, "grad_norm": 0.7945446372032166, "learning_rate": 0.000262061133776967, "loss": 4.433017730712891, "step": 809 }, { "epoch": 0.5977859778597786, "grad_norm": 0.8125186562538147, "learning_rate": 0.00026124584535084825, "loss": 4.323663711547852, "step": 810 }, { "epoch": 0.5985239852398524, "grad_norm": 0.8656073808670044, "learning_rate": 0.00026043107128474876, "loss": 4.364239692687988, "step": 811 }, { "epoch": 0.5992619926199262, "grad_norm": 0.7823298573493958, "learning_rate": 0.00025961681630055737, "loss": 4.095296382904053, "step": 812 }, { "epoch": 0.6, "grad_norm": 0.8082625865936279, "learning_rate": 0.00025880308511715366, "loss": 4.251285552978516, "step": 813 }, { "epoch": 0.6007380073800738, "grad_norm": 0.8128904104232788, "learning_rate": 0.00025798988245038243, "loss": 4.234792709350586, "step": 814 }, { "epoch": 0.6014760147601476, "grad_norm": 0.9591745138168335, "learning_rate": 0.00025717721301302495, "loss": 4.191695213317871, "step": 815 }, { "epoch": 0.6022140221402214, "grad_norm": 0.8306787014007568, "learning_rate": 0.0002563650815147728, "loss": 4.182519912719727, "step": 816 }, { "epoch": 0.6029520295202953, "grad_norm": 1.0368632078170776, "learning_rate": 0.0002555534926621994, "loss": 4.357141971588135, "step": 817 }, { "epoch": 0.603690036900369, "grad_norm": 0.9401784539222717, "learning_rate": 0.00025474245115873377, "loss": 4.2874016761779785, "step": 818 }, { "epoch": 0.6044280442804428, "grad_norm": 0.9086504578590393, "learning_rate": 0.00025393196170463286, "loss": 4.135937690734863, "step": 819 }, { "epoch": 0.6051660516605166, "grad_norm": 0.8185088634490967, "learning_rate": 0.00025312202899695403, "loss": 4.31793212890625, "step": 820 }, { "epoch": 0.6059040590405904, "grad_norm": 0.8340873718261719, "learning_rate": 0.00025231265772952864, "loss": 4.332757949829102, "step": 821 }, { "epoch": 0.6066420664206642, "grad_norm": 0.9770723581314087, "learning_rate": 0.00025150385259293346, "loss": 4.115085124969482, "step": 822 }, { "epoch": 0.607380073800738, "grad_norm": 1.0393363237380981, "learning_rate": 0.0002506956182744653, "loss": 4.164813995361328, "step": 823 }, { "epoch": 0.6081180811808118, "grad_norm": 0.9465534090995789, "learning_rate": 0.00024988795945811215, "loss": 4.53727912902832, "step": 824 }, { "epoch": 0.6088560885608856, "grad_norm": 0.8929158449172974, "learning_rate": 0.00024908088082452724, "loss": 4.265376091003418, "step": 825 }, { "epoch": 0.6095940959409594, "grad_norm": 0.7848824262619019, "learning_rate": 0.00024827438705100116, "loss": 4.300992965698242, "step": 826 }, { "epoch": 0.6103321033210332, "grad_norm": 0.7737518548965454, "learning_rate": 0.00024746848281143524, "loss": 4.297072410583496, "step": 827 }, { "epoch": 0.6110701107011071, "grad_norm": 1.0166592597961426, "learning_rate": 0.00024666317277631403, "loss": 4.4208478927612305, "step": 828 }, { "epoch": 0.6118081180811809, "grad_norm": 0.8515886664390564, "learning_rate": 0.00024585846161267875, "loss": 4.542513847351074, "step": 829 }, { "epoch": 0.6125461254612546, "grad_norm": 0.8427137732505798, "learning_rate": 0.00024505435398409966, "loss": 4.270936965942383, "step": 830 }, { "epoch": 0.6132841328413284, "grad_norm": 0.811477541923523, "learning_rate": 0.0002442508545506495, "loss": 4.223374366760254, "step": 831 }, { "epoch": 0.6140221402214022, "grad_norm": 0.9186045527458191, "learning_rate": 0.00024344796796887656, "loss": 4.369760036468506, "step": 832 }, { "epoch": 0.614760147601476, "grad_norm": 0.809533417224884, "learning_rate": 0.0002426456988917769, "loss": 4.350223541259766, "step": 833 }, { "epoch": 0.6154981549815498, "grad_norm": 0.8991212248802185, "learning_rate": 0.00024184405196876844, "loss": 4.136372089385986, "step": 834 }, { "epoch": 0.6162361623616236, "grad_norm": 0.8988363742828369, "learning_rate": 0.00024104303184566307, "loss": 4.202424049377441, "step": 835 }, { "epoch": 0.6169741697416974, "grad_norm": 1.3087947368621826, "learning_rate": 0.00024024264316464065, "loss": 4.428619384765625, "step": 836 }, { "epoch": 0.6177121771217712, "grad_norm": 0.7776771783828735, "learning_rate": 0.0002394428905642211, "loss": 4.351472854614258, "step": 837 }, { "epoch": 0.618450184501845, "grad_norm": 0.996083676815033, "learning_rate": 0.00023864377867923852, "loss": 3.9067325592041016, "step": 838 }, { "epoch": 0.6191881918819189, "grad_norm": 0.8904930949211121, "learning_rate": 0.00023784531214081348, "loss": 4.205554008483887, "step": 839 }, { "epoch": 0.6199261992619927, "grad_norm": 0.9460301399230957, "learning_rate": 0.00023704749557632688, "loss": 4.3381452560424805, "step": 840 }, { "epoch": 0.6206642066420665, "grad_norm": 0.8847654461860657, "learning_rate": 0.00023625033360939239, "loss": 4.210631370544434, "step": 841 }, { "epoch": 0.6214022140221402, "grad_norm": 0.9049587249755859, "learning_rate": 0.00023545383085983034, "loss": 4.128975868225098, "step": 842 }, { "epoch": 0.622140221402214, "grad_norm": 0.881879985332489, "learning_rate": 0.00023465799194364087, "loss": 4.109155654907227, "step": 843 }, { "epoch": 0.6228782287822878, "grad_norm": 0.9331649541854858, "learning_rate": 0.00023386282147297657, "loss": 4.180877685546875, "step": 844 }, { "epoch": 0.6236162361623616, "grad_norm": 1.0155686140060425, "learning_rate": 0.00023306832405611643, "loss": 4.2506818771362305, "step": 845 }, { "epoch": 0.6243542435424354, "grad_norm": 0.9788922667503357, "learning_rate": 0.00023227450429743867, "loss": 4.536131858825684, "step": 846 }, { "epoch": 0.6250922509225092, "grad_norm": 1.0663362741470337, "learning_rate": 0.00023148136679739453, "loss": 4.059211730957031, "step": 847 }, { "epoch": 0.625830258302583, "grad_norm": 0.8880152702331543, "learning_rate": 0.00023068891615248102, "loss": 4.163819313049316, "step": 848 }, { "epoch": 0.6265682656826568, "grad_norm": 0.9166035056114197, "learning_rate": 0.0002298971569552149, "loss": 4.22659158706665, "step": 849 }, { "epoch": 0.6273062730627307, "grad_norm": 1.1947702169418335, "learning_rate": 0.00022910609379410546, "loss": 4.3044633865356445, "step": 850 }, { "epoch": 0.6280442804428045, "grad_norm": 1.251198410987854, "learning_rate": 0.0002283157312536284, "loss": 4.213165283203125, "step": 851 }, { "epoch": 0.6287822878228783, "grad_norm": 0.9441475868225098, "learning_rate": 0.00022752607391419904, "loss": 4.37963342666626, "step": 852 }, { "epoch": 0.629520295202952, "grad_norm": 0.8944138884544373, "learning_rate": 0.0002267371263521461, "loss": 4.479311943054199, "step": 853 }, { "epoch": 0.6302583025830258, "grad_norm": 0.9756674766540527, "learning_rate": 0.00022594889313968424, "loss": 4.323942184448242, "step": 854 }, { "epoch": 0.6309963099630996, "grad_norm": 0.9520359039306641, "learning_rate": 0.00022516137884488895, "loss": 4.259498596191406, "step": 855 }, { "epoch": 0.6317343173431734, "grad_norm": 0.8389827609062195, "learning_rate": 0.000224374588031669, "loss": 4.353797435760498, "step": 856 }, { "epoch": 0.6324723247232472, "grad_norm": 0.9523439407348633, "learning_rate": 0.0002235885252597402, "loss": 4.485894203186035, "step": 857 }, { "epoch": 0.633210332103321, "grad_norm": 0.8450521230697632, "learning_rate": 0.00022280319508459953, "loss": 4.3302717208862305, "step": 858 }, { "epoch": 0.6339483394833948, "grad_norm": 0.9799603819847107, "learning_rate": 0.00022201860205749792, "loss": 4.216465950012207, "step": 859 }, { "epoch": 0.6346863468634686, "grad_norm": 0.8215528726577759, "learning_rate": 0.00022123475072541456, "loss": 4.218143463134766, "step": 860 }, { "epoch": 0.6354243542435425, "grad_norm": 0.8392944931983948, "learning_rate": 0.00022045164563102993, "loss": 4.393090724945068, "step": 861 }, { "epoch": 0.6361623616236163, "grad_norm": 0.9801323413848877, "learning_rate": 0.00021966929131270053, "loss": 4.3347978591918945, "step": 862 }, { "epoch": 0.6369003690036901, "grad_norm": 1.0346145629882812, "learning_rate": 0.00021888769230443076, "loss": 4.304266452789307, "step": 863 }, { "epoch": 0.6376383763837639, "grad_norm": 0.8837590217590332, "learning_rate": 0.00021810685313584894, "loss": 4.318976879119873, "step": 864 }, { "epoch": 0.6383763837638377, "grad_norm": 0.9550504088401794, "learning_rate": 0.00021732677833217884, "loss": 4.0572285652160645, "step": 865 }, { "epoch": 0.6391143911439114, "grad_norm": 0.9023411273956299, "learning_rate": 0.00021654747241421515, "loss": 4.210879325866699, "step": 866 }, { "epoch": 0.6398523985239852, "grad_norm": 1.2458837032318115, "learning_rate": 0.00021576893989829648, "loss": 4.031771183013916, "step": 867 }, { "epoch": 0.640590405904059, "grad_norm": 0.931896448135376, "learning_rate": 0.00021499118529627893, "loss": 4.238314151763916, "step": 868 }, { "epoch": 0.6413284132841328, "grad_norm": 0.8535945415496826, "learning_rate": 0.00021421421311551095, "loss": 4.30747652053833, "step": 869 }, { "epoch": 0.6420664206642066, "grad_norm": 0.8937339186668396, "learning_rate": 0.0002134380278588059, "loss": 4.368441581726074, "step": 870 }, { "epoch": 0.6428044280442804, "grad_norm": 0.9691210985183716, "learning_rate": 0.00021266263402441746, "loss": 4.286958694458008, "step": 871 }, { "epoch": 0.6435424354243543, "grad_norm": 0.9562344551086426, "learning_rate": 0.00021188803610601187, "loss": 4.331124305725098, "step": 872 }, { "epoch": 0.6442804428044281, "grad_norm": 0.9085299372673035, "learning_rate": 0.00021111423859264362, "loss": 4.204074859619141, "step": 873 }, { "epoch": 0.6450184501845019, "grad_norm": 1.0217558145523071, "learning_rate": 0.00021034124596872776, "loss": 4.061552047729492, "step": 874 }, { "epoch": 0.6457564575645757, "grad_norm": 0.8775967359542847, "learning_rate": 0.00020956906271401554, "loss": 4.252497673034668, "step": 875 }, { "epoch": 0.6464944649446495, "grad_norm": 0.9603700637817383, "learning_rate": 0.00020879769330356705, "loss": 4.17333984375, "step": 876 }, { "epoch": 0.6472324723247233, "grad_norm": 0.9519745707511902, "learning_rate": 0.0002080271422077262, "loss": 4.414155006408691, "step": 877 }, { "epoch": 0.647970479704797, "grad_norm": 0.8470144271850586, "learning_rate": 0.00020725741389209423, "loss": 4.405782699584961, "step": 878 }, { "epoch": 0.6487084870848708, "grad_norm": 0.872512698173523, "learning_rate": 0.00020648851281750437, "loss": 4.448093414306641, "step": 879 }, { "epoch": 0.6494464944649446, "grad_norm": 1.0624064207077026, "learning_rate": 0.00020572044343999566, "loss": 4.4731950759887695, "step": 880 }, { "epoch": 0.6501845018450184, "grad_norm": 0.9333707094192505, "learning_rate": 0.00020495321021078686, "loss": 4.351503849029541, "step": 881 }, { "epoch": 0.6509225092250922, "grad_norm": 0.8607699275016785, "learning_rate": 0.00020418681757625152, "loss": 4.024420738220215, "step": 882 }, { "epoch": 0.6516605166051661, "grad_norm": 0.8372026085853577, "learning_rate": 0.00020342126997789113, "loss": 4.254813194274902, "step": 883 }, { "epoch": 0.6523985239852399, "grad_norm": 0.8102350234985352, "learning_rate": 0.00020265657185231017, "loss": 4.309717178344727, "step": 884 }, { "epoch": 0.6531365313653137, "grad_norm": 0.8655620217323303, "learning_rate": 0.0002018927276311902, "loss": 4.270059108734131, "step": 885 }, { "epoch": 0.6538745387453875, "grad_norm": 0.8550220727920532, "learning_rate": 0.00020112974174126406, "loss": 4.238635063171387, "step": 886 }, { "epoch": 0.6546125461254613, "grad_norm": 0.8815758228302002, "learning_rate": 0.00020036761860428999, "loss": 4.169132232666016, "step": 887 }, { "epoch": 0.6553505535055351, "grad_norm": 0.9161958694458008, "learning_rate": 0.00019960636263702692, "loss": 4.314050674438477, "step": 888 }, { "epoch": 0.6560885608856089, "grad_norm": 1.0340604782104492, "learning_rate": 0.00019884597825120762, "loss": 3.9258623123168945, "step": 889 }, { "epoch": 0.6568265682656826, "grad_norm": 0.896084725856781, "learning_rate": 0.000198086469853514, "loss": 4.141365051269531, "step": 890 }, { "epoch": 0.6575645756457564, "grad_norm": 0.9871026277542114, "learning_rate": 0.00019732784184555138, "loss": 4.212796211242676, "step": 891 }, { "epoch": 0.6583025830258302, "grad_norm": 1.0540019273757935, "learning_rate": 0.00019657009862382286, "loss": 4.061999797821045, "step": 892 }, { "epoch": 0.659040590405904, "grad_norm": 0.8863611817359924, "learning_rate": 0.00019581324457970407, "loss": 4.253866195678711, "step": 893 }, { "epoch": 0.6597785977859778, "grad_norm": 1.1371312141418457, "learning_rate": 0.00019505728409941711, "loss": 4.08126163482666, "step": 894 }, { "epoch": 0.6605166051660517, "grad_norm": 2.1047496795654297, "learning_rate": 0.00019430222156400606, "loss": 4.196209907531738, "step": 895 }, { "epoch": 0.6612546125461255, "grad_norm": 0.85357266664505, "learning_rate": 0.00019354806134931087, "loss": 4.412619590759277, "step": 896 }, { "epoch": 0.6619926199261993, "grad_norm": 1.048453450202942, "learning_rate": 0.00019279480782594244, "loss": 4.392220497131348, "step": 897 }, { "epoch": 0.6627306273062731, "grad_norm": 0.8711747527122498, "learning_rate": 0.00019204246535925654, "loss": 4.262413024902344, "step": 898 }, { "epoch": 0.6634686346863469, "grad_norm": 0.7952659130096436, "learning_rate": 0.00019129103830933008, "loss": 4.36223840713501, "step": 899 }, { "epoch": 0.6642066420664207, "grad_norm": 0.8127221465110779, "learning_rate": 0.00019054053103093366, "loss": 4.27398681640625, "step": 900 }, { "epoch": 0.6649446494464945, "grad_norm": 0.8177223801612854, "learning_rate": 0.0001897909478735083, "loss": 3.997640609741211, "step": 901 }, { "epoch": 0.6656826568265682, "grad_norm": 1.2305352687835693, "learning_rate": 0.00018904229318113914, "loss": 4.09181022644043, "step": 902 }, { "epoch": 0.666420664206642, "grad_norm": 0.862445056438446, "learning_rate": 0.00018829457129253057, "loss": 4.322624206542969, "step": 903 }, { "epoch": 0.6671586715867158, "grad_norm": 0.8462716937065125, "learning_rate": 0.00018754778654098123, "loss": 4.413826942443848, "step": 904 }, { "epoch": 0.6678966789667896, "grad_norm": 0.8606178164482117, "learning_rate": 0.00018680194325435839, "loss": 4.309714317321777, "step": 905 }, { "epoch": 0.6686346863468635, "grad_norm": 0.8559933304786682, "learning_rate": 0.00018605704575507347, "loss": 4.162710189819336, "step": 906 }, { "epoch": 0.6693726937269373, "grad_norm": 0.9497646689414978, "learning_rate": 0.00018531309836005675, "loss": 4.144913673400879, "step": 907 }, { "epoch": 0.6701107011070111, "grad_norm": 0.8656502962112427, "learning_rate": 0.00018457010538073236, "loss": 4.23277473449707, "step": 908 }, { "epoch": 0.6708487084870849, "grad_norm": 0.9220851063728333, "learning_rate": 0.00018382807112299283, "loss": 4.004146099090576, "step": 909 }, { "epoch": 0.6715867158671587, "grad_norm": 0.8644999265670776, "learning_rate": 0.0001830869998871755, "loss": 4.135645389556885, "step": 910 }, { "epoch": 0.6723247232472325, "grad_norm": 0.9802985787391663, "learning_rate": 0.0001823468959680356, "loss": 4.413508892059326, "step": 911 }, { "epoch": 0.6730627306273063, "grad_norm": 0.8389285802841187, "learning_rate": 0.0001816077636547232, "loss": 4.484038829803467, "step": 912 }, { "epoch": 0.67380073800738, "grad_norm": 0.9547582864761353, "learning_rate": 0.00018086960723075727, "loss": 4.3295416831970215, "step": 913 }, { "epoch": 0.6745387453874538, "grad_norm": 0.8170531392097473, "learning_rate": 0.00018013243097400128, "loss": 4.145027160644531, "step": 914 }, { "epoch": 0.6752767527675276, "grad_norm": 0.8581196665763855, "learning_rate": 0.00017939623915663833, "loss": 4.246807098388672, "step": 915 }, { "epoch": 0.6760147601476014, "grad_norm": 0.9968565702438354, "learning_rate": 0.000178661036045146, "loss": 4.355518817901611, "step": 916 }, { "epoch": 0.6767527675276753, "grad_norm": 1.08475923538208, "learning_rate": 0.00017792682590027278, "loss": 4.216618061065674, "step": 917 }, { "epoch": 0.6774907749077491, "grad_norm": 0.9199729561805725, "learning_rate": 0.00017719361297701167, "loss": 4.03296422958374, "step": 918 }, { "epoch": 0.6782287822878229, "grad_norm": 0.9441756010055542, "learning_rate": 0.00017646140152457717, "loss": 4.32381010055542, "step": 919 }, { "epoch": 0.6789667896678967, "grad_norm": 0.8643115162849426, "learning_rate": 0.00017573019578637913, "loss": 4.274387359619141, "step": 920 }, { "epoch": 0.6797047970479705, "grad_norm": 0.8102643489837646, "learning_rate": 0.00017500000000000008, "loss": 4.232758522033691, "step": 921 }, { "epoch": 0.6804428044280443, "grad_norm": 1.063491702079773, "learning_rate": 0.0001742708183971684, "loss": 4.3541483879089355, "step": 922 }, { "epoch": 0.6811808118081181, "grad_norm": 0.7408610582351685, "learning_rate": 0.00017354265520373567, "loss": 4.151790618896484, "step": 923 }, { "epoch": 0.6819188191881919, "grad_norm": 0.7934446930885315, "learning_rate": 0.0001728155146396511, "loss": 4.396363258361816, "step": 924 }, { "epoch": 0.6826568265682657, "grad_norm": 0.954188883304596, "learning_rate": 0.00017208940091893756, "loss": 3.97440767288208, "step": 925 }, { "epoch": 0.6833948339483394, "grad_norm": 1.0053012371063232, "learning_rate": 0.00017136431824966715, "loss": 4.055703163146973, "step": 926 }, { "epoch": 0.6841328413284132, "grad_norm": 0.8948765397071838, "learning_rate": 0.00017064027083393612, "loss": 4.3566484451293945, "step": 927 }, { "epoch": 0.6848708487084871, "grad_norm": 0.8520956039428711, "learning_rate": 0.000169917262867842, "loss": 3.928354263305664, "step": 928 }, { "epoch": 0.6856088560885609, "grad_norm": 0.8816937804222107, "learning_rate": 0.00016919529854145745, "loss": 4.179725170135498, "step": 929 }, { "epoch": 0.6863468634686347, "grad_norm": 1.3806109428405762, "learning_rate": 0.00016847438203880735, "loss": 4.185024738311768, "step": 930 }, { "epoch": 0.6870848708487085, "grad_norm": 0.7780953049659729, "learning_rate": 0.00016775451753784414, "loss": 4.327208995819092, "step": 931 }, { "epoch": 0.6878228782287823, "grad_norm": 1.103068470954895, "learning_rate": 0.00016703570921042344, "loss": 4.4666948318481445, "step": 932 }, { "epoch": 0.6885608856088561, "grad_norm": 0.8747889995574951, "learning_rate": 0.00016631796122227983, "loss": 4.146649360656738, "step": 933 }, { "epoch": 0.6892988929889299, "grad_norm": 0.9435983896255493, "learning_rate": 0.00016560127773300313, "loss": 4.554599761962891, "step": 934 }, { "epoch": 0.6900369003690037, "grad_norm": 0.800839364528656, "learning_rate": 0.00016488566289601388, "loss": 4.008693218231201, "step": 935 }, { "epoch": 0.6907749077490775, "grad_norm": 0.8912091851234436, "learning_rate": 0.00016417112085853969, "loss": 4.274938583374023, "step": 936 }, { "epoch": 0.6915129151291513, "grad_norm": 0.9369155168533325, "learning_rate": 0.00016345765576159042, "loss": 4.299943447113037, "step": 937 }, { "epoch": 0.692250922509225, "grad_norm": 0.7870283722877502, "learning_rate": 0.000162745271739935, "loss": 4.334400177001953, "step": 938 }, { "epoch": 0.6929889298892989, "grad_norm": 0.8504934310913086, "learning_rate": 0.00016203397292207758, "loss": 4.140174865722656, "step": 939 }, { "epoch": 0.6937269372693727, "grad_norm": 1.016496181488037, "learning_rate": 0.00016132376343023233, "loss": 4.296517848968506, "step": 940 }, { "epoch": 0.6944649446494465, "grad_norm": 1.214504599571228, "learning_rate": 0.00016061464738030106, "loss": 4.107439041137695, "step": 941 }, { "epoch": 0.6952029520295203, "grad_norm": 0.9972517490386963, "learning_rate": 0.0001599066288818485, "loss": 4.203211307525635, "step": 942 }, { "epoch": 0.6959409594095941, "grad_norm": 0.8465280532836914, "learning_rate": 0.0001591997120380788, "loss": 4.145995616912842, "step": 943 }, { "epoch": 0.6966789667896679, "grad_norm": 0.9349222779273987, "learning_rate": 0.00015849390094581142, "loss": 4.01326847076416, "step": 944 }, { "epoch": 0.6974169741697417, "grad_norm": 1.018925666809082, "learning_rate": 0.0001577891996954578, "loss": 4.1635026931762695, "step": 945 }, { "epoch": 0.6981549815498155, "grad_norm": 0.7925598621368408, "learning_rate": 0.0001570856123709975, "loss": 4.1779022216796875, "step": 946 }, { "epoch": 0.6988929889298893, "grad_norm": 0.932461142539978, "learning_rate": 0.00015638314304995454, "loss": 4.21356201171875, "step": 947 }, { "epoch": 0.6996309963099631, "grad_norm": 0.9300697445869446, "learning_rate": 0.00015568179580337333, "loss": 4.0165696144104, "step": 948 }, { "epoch": 0.7003690036900369, "grad_norm": 0.8354659676551819, "learning_rate": 0.0001549815746957962, "loss": 4.235401630401611, "step": 949 }, { "epoch": 0.7011070110701108, "grad_norm": 0.926152765750885, "learning_rate": 0.00015428248378523865, "loss": 4.415463447570801, "step": 950 }, { "epoch": 0.7018450184501845, "grad_norm": 1.0506153106689453, "learning_rate": 0.0001535845271231662, "loss": 4.269872665405273, "step": 951 }, { "epoch": 0.7025830258302583, "grad_norm": 0.8655766248703003, "learning_rate": 0.00015288770875447128, "loss": 4.350858688354492, "step": 952 }, { "epoch": 0.7033210332103321, "grad_norm": 0.7818317413330078, "learning_rate": 0.00015219203271744954, "loss": 4.015618801116943, "step": 953 }, { "epoch": 0.7040590405904059, "grad_norm": 0.8752409815788269, "learning_rate": 0.00015149750304377645, "loss": 4.2518510818481445, "step": 954 }, { "epoch": 0.7047970479704797, "grad_norm": 1.2109910249710083, "learning_rate": 0.00015080412375848357, "loss": 4.2393035888671875, "step": 955 }, { "epoch": 0.7055350553505535, "grad_norm": 0.925682544708252, "learning_rate": 0.00015011189887993598, "loss": 4.126298904418945, "step": 956 }, { "epoch": 0.7062730627306273, "grad_norm": 0.9237503409385681, "learning_rate": 0.00014942083241980837, "loss": 3.981215476989746, "step": 957 }, { "epoch": 0.7070110701107011, "grad_norm": 0.9711774587631226, "learning_rate": 0.0001487309283830623, "loss": 4.350025177001953, "step": 958 }, { "epoch": 0.7077490774907749, "grad_norm": 0.9577892422676086, "learning_rate": 0.00014804219076792202, "loss": 4.178315162658691, "step": 959 }, { "epoch": 0.7084870848708487, "grad_norm": 0.9610137343406677, "learning_rate": 0.00014735462356585302, "loss": 4.048961639404297, "step": 960 }, { "epoch": 0.7092250922509226, "grad_norm": 0.8772600889205933, "learning_rate": 0.0001466682307615368, "loss": 4.249874114990234, "step": 961 }, { "epoch": 0.7099630996309964, "grad_norm": 0.8270952105522156, "learning_rate": 0.00014598301633284952, "loss": 4.296774387359619, "step": 962 }, { "epoch": 0.7107011070110701, "grad_norm": 0.8505011796951294, "learning_rate": 0.00014529898425083793, "loss": 4.15446662902832, "step": 963 }, { "epoch": 0.7114391143911439, "grad_norm": 0.7727055549621582, "learning_rate": 0.00014461613847969687, "loss": 4.255171298980713, "step": 964 }, { "epoch": 0.7121771217712177, "grad_norm": 1.0215280055999756, "learning_rate": 0.00014393448297674613, "loss": 4.0843987464904785, "step": 965 }, { "epoch": 0.7129151291512915, "grad_norm": 0.9580904841423035, "learning_rate": 0.00014325402169240717, "loss": 4.22476863861084, "step": 966 }, { "epoch": 0.7136531365313653, "grad_norm": 0.8007642030715942, "learning_rate": 0.0001425747585701809, "loss": 4.2899861335754395, "step": 967 }, { "epoch": 0.7143911439114391, "grad_norm": 1.048153042793274, "learning_rate": 0.00014189669754662433, "loss": 4.137915134429932, "step": 968 }, { "epoch": 0.7151291512915129, "grad_norm": 0.9277073740959167, "learning_rate": 0.00014121984255132812, "loss": 4.19291877746582, "step": 969 }, { "epoch": 0.7158671586715867, "grad_norm": 0.9412124752998352, "learning_rate": 0.00014054419750689302, "loss": 4.134371757507324, "step": 970 }, { "epoch": 0.7166051660516605, "grad_norm": 0.9520360827445984, "learning_rate": 0.0001398697663289086, "loss": 3.9994983673095703, "step": 971 }, { "epoch": 0.7173431734317344, "grad_norm": 1.108952522277832, "learning_rate": 0.00013919655292592885, "loss": 4.142839431762695, "step": 972 }, { "epoch": 0.7180811808118082, "grad_norm": 0.882947564125061, "learning_rate": 0.0001385245611994507, "loss": 4.331300258636475, "step": 973 }, { "epoch": 0.718819188191882, "grad_norm": 0.9011394381523132, "learning_rate": 0.00013785379504389108, "loss": 4.304719924926758, "step": 974 }, { "epoch": 0.7195571955719557, "grad_norm": 0.9489427208900452, "learning_rate": 0.00013718425834656427, "loss": 3.873215675354004, "step": 975 }, { "epoch": 0.7202952029520295, "grad_norm": 0.8889840841293335, "learning_rate": 0.00013651595498765954, "loss": 4.21721076965332, "step": 976 }, { "epoch": 0.7210332103321033, "grad_norm": 0.8962631821632385, "learning_rate": 0.0001358488888402181, "loss": 4.268343925476074, "step": 977 }, { "epoch": 0.7217712177121771, "grad_norm": 1.0096079111099243, "learning_rate": 0.0001351830637701119, "loss": 4.305258750915527, "step": 978 }, { "epoch": 0.7225092250922509, "grad_norm": 0.8910917043685913, "learning_rate": 0.0001345184836360196, "loss": 4.095419883728027, "step": 979 }, { "epoch": 0.7232472324723247, "grad_norm": 0.8660383224487305, "learning_rate": 0.00013385515228940572, "loss": 4.2480149269104, "step": 980 }, { "epoch": 0.7239852398523985, "grad_norm": 0.7730628252029419, "learning_rate": 0.00013319307357449696, "loss": 4.004230499267578, "step": 981 }, { "epoch": 0.7247232472324723, "grad_norm": 0.9015150666236877, "learning_rate": 0.00013253225132826138, "loss": 4.344229698181152, "step": 982 }, { "epoch": 0.7254612546125462, "grad_norm": 0.8757840991020203, "learning_rate": 0.0001318726893803847, "loss": 4.284424781799316, "step": 983 }, { "epoch": 0.72619926199262, "grad_norm": 0.8267972469329834, "learning_rate": 0.00013121439155324918, "loss": 3.9191102981567383, "step": 984 }, { "epoch": 0.7269372693726938, "grad_norm": 0.998901903629303, "learning_rate": 0.00013055736166191095, "loss": 4.020920276641846, "step": 985 }, { "epoch": 0.7276752767527676, "grad_norm": 0.9288577437400818, "learning_rate": 0.00012990160351407804, "loss": 4.161448001861572, "step": 986 }, { "epoch": 0.7284132841328413, "grad_norm": 0.8598924279212952, "learning_rate": 0.00012924712091008842, "loss": 4.157841205596924, "step": 987 }, { "epoch": 0.7291512915129151, "grad_norm": 0.8927615880966187, "learning_rate": 0.0001285939176428874, "loss": 4.054559230804443, "step": 988 }, { "epoch": 0.7298892988929889, "grad_norm": 0.8624060750007629, "learning_rate": 0.00012794199749800698, "loss": 4.096704006195068, "step": 989 }, { "epoch": 0.7306273062730627, "grad_norm": 0.9361541271209717, "learning_rate": 0.00012729136425354204, "loss": 4.233707427978516, "step": 990 }, { "epoch": 0.7313653136531365, "grad_norm": 0.9343904256820679, "learning_rate": 0.00012664202168013005, "loss": 3.9704904556274414, "step": 991 }, { "epoch": 0.7321033210332103, "grad_norm": 0.9579162001609802, "learning_rate": 0.0001259939735409285, "loss": 4.047106742858887, "step": 992 }, { "epoch": 0.7328413284132841, "grad_norm": 0.9848127365112305, "learning_rate": 0.0001253472235915933, "loss": 4.055668830871582, "step": 993 }, { "epoch": 0.7335793357933579, "grad_norm": 0.8801389932632446, "learning_rate": 0.00012470177558025652, "loss": 4.0792717933654785, "step": 994 }, { "epoch": 0.7343173431734318, "grad_norm": 1.0689746141433716, "learning_rate": 0.0001240576332475054, "loss": 4.3891496658325195, "step": 995 }, { "epoch": 0.7350553505535056, "grad_norm": 0.9340549111366272, "learning_rate": 0.00012341480032636035, "loss": 3.9269206523895264, "step": 996 }, { "epoch": 0.7357933579335794, "grad_norm": 1.6336300373077393, "learning_rate": 0.0001227732805422531, "loss": 4.390302658081055, "step": 997 }, { "epoch": 0.7365313653136532, "grad_norm": 1.0127570629119873, "learning_rate": 0.00012213307761300567, "loss": 4.110518455505371, "step": 998 }, { "epoch": 0.7372693726937269, "grad_norm": 0.9814800024032593, "learning_rate": 0.00012149419524880778, "loss": 4.395967960357666, "step": 999 }, { "epoch": 0.7380073800738007, "grad_norm": 0.8709611892700195, "learning_rate": 0.00012085663715219694, "loss": 4.2395758628845215, "step": 1000 }, { "epoch": 0.7387453874538745, "grad_norm": 0.810457706451416, "learning_rate": 0.00012022040701803532, "loss": 4.192559242248535, "step": 1001 }, { "epoch": 0.7394833948339483, "grad_norm": 0.8743382692337036, "learning_rate": 0.00011958550853348949, "loss": 4.053243637084961, "step": 1002 }, { "epoch": 0.7402214022140221, "grad_norm": 1.0082160234451294, "learning_rate": 0.0001189519453780086, "loss": 4.024561405181885, "step": 1003 }, { "epoch": 0.7409594095940959, "grad_norm": 0.9494944214820862, "learning_rate": 0.00011831972122330317, "loss": 4.133411407470703, "step": 1004 }, { "epoch": 0.7416974169741697, "grad_norm": 0.729489266872406, "learning_rate": 0.00011768883973332351, "loss": 4.2208356857299805, "step": 1005 }, { "epoch": 0.7424354243542436, "grad_norm": 0.8364970684051514, "learning_rate": 0.000117059304564239, "loss": 4.144787788391113, "step": 1006 }, { "epoch": 0.7431734317343174, "grad_norm": 1.0048389434814453, "learning_rate": 0.00011643111936441654, "loss": 4.1646552085876465, "step": 1007 }, { "epoch": 0.7439114391143912, "grad_norm": 0.8014469742774963, "learning_rate": 0.00011580428777439973, "loss": 4.183121681213379, "step": 1008 }, { "epoch": 0.744649446494465, "grad_norm": 1.1298073530197144, "learning_rate": 0.00011517881342688705, "loss": 4.2498016357421875, "step": 1009 }, { "epoch": 0.7453874538745388, "grad_norm": 0.9686313271522522, "learning_rate": 0.00011455469994671158, "loss": 4.157444000244141, "step": 1010 }, { "epoch": 0.7461254612546125, "grad_norm": 0.7569875121116638, "learning_rate": 0.00011393195095082015, "loss": 4.179769515991211, "step": 1011 }, { "epoch": 0.7468634686346863, "grad_norm": 0.9126372933387756, "learning_rate": 0.00011331057004825114, "loss": 4.2508544921875, "step": 1012 }, { "epoch": 0.7476014760147601, "grad_norm": 0.9252088069915771, "learning_rate": 0.00011269056084011492, "loss": 4.427289009094238, "step": 1013 }, { "epoch": 0.7483394833948339, "grad_norm": 0.8704126477241516, "learning_rate": 0.00011207192691957224, "loss": 4.120467185974121, "step": 1014 }, { "epoch": 0.7490774907749077, "grad_norm": 0.7337223291397095, "learning_rate": 0.00011145467187181378, "loss": 4.24467658996582, "step": 1015 }, { "epoch": 0.7498154981549815, "grad_norm": 1.0976858139038086, "learning_rate": 0.0001108387992740388, "loss": 4.356447696685791, "step": 1016 }, { "epoch": 0.7505535055350554, "grad_norm": 0.9983663558959961, "learning_rate": 0.00011022431269543517, "loss": 4.160353660583496, "step": 1017 }, { "epoch": 0.7512915129151292, "grad_norm": 1.2688814401626587, "learning_rate": 0.00010961121569715825, "loss": 4.209506988525391, "step": 1018 }, { "epoch": 0.752029520295203, "grad_norm": 0.8176230788230896, "learning_rate": 0.00010899951183231028, "loss": 4.172100067138672, "step": 1019 }, { "epoch": 0.7527675276752768, "grad_norm": 0.8766177892684937, "learning_rate": 0.00010838920464591952, "loss": 4.03950834274292, "step": 1020 }, { "epoch": 0.7535055350553506, "grad_norm": 0.8611599802970886, "learning_rate": 0.00010778029767492066, "loss": 4.484358787536621, "step": 1021 }, { "epoch": 0.7542435424354244, "grad_norm": 0.8686861395835876, "learning_rate": 0.00010717279444813325, "loss": 4.179934501647949, "step": 1022 }, { "epoch": 0.7549815498154981, "grad_norm": 0.8060294985771179, "learning_rate": 0.00010656669848624154, "loss": 4.116765975952148, "step": 1023 }, { "epoch": 0.7557195571955719, "grad_norm": 0.9301735162734985, "learning_rate": 0.0001059620133017745, "loss": 3.9561753273010254, "step": 1024 }, { "epoch": 0.7564575645756457, "grad_norm": 0.8732739686965942, "learning_rate": 0.00010535874239908514, "loss": 4.087579250335693, "step": 1025 }, { "epoch": 0.7571955719557195, "grad_norm": 0.8588765859603882, "learning_rate": 0.00010475688927433018, "loss": 4.3742876052856445, "step": 1026 }, { "epoch": 0.7579335793357933, "grad_norm": 0.7727426290512085, "learning_rate": 0.0001041564574154497, "loss": 4.053977012634277, "step": 1027 }, { "epoch": 0.7586715867158672, "grad_norm": 0.7998579144477844, "learning_rate": 0.00010355745030214725, "loss": 4.124699592590332, "step": 1028 }, { "epoch": 0.759409594095941, "grad_norm": 0.8935081362724304, "learning_rate": 0.00010295987140586949, "loss": 4.136198997497559, "step": 1029 }, { "epoch": 0.7601476014760148, "grad_norm": 0.8178191184997559, "learning_rate": 0.00010236372418978614, "loss": 4.050296783447266, "step": 1030 }, { "epoch": 0.7608856088560886, "grad_norm": 0.8848076462745667, "learning_rate": 0.00010176901210876947, "loss": 3.9550304412841797, "step": 1031 }, { "epoch": 0.7616236162361624, "grad_norm": 0.9924454689025879, "learning_rate": 0.00010117573860937533, "loss": 4.258056640625, "step": 1032 }, { "epoch": 0.7623616236162362, "grad_norm": 1.1687978506088257, "learning_rate": 0.00010058390712982184, "loss": 4.050140380859375, "step": 1033 }, { "epoch": 0.76309963099631, "grad_norm": 0.8403159379959106, "learning_rate": 9.999352109997051e-05, "loss": 4.150047302246094, "step": 1034 }, { "epoch": 0.7638376383763837, "grad_norm": 0.92585289478302, "learning_rate": 9.940458394130595e-05, "loss": 3.9567012786865234, "step": 1035 }, { "epoch": 0.7645756457564575, "grad_norm": 0.9140007495880127, "learning_rate": 9.881709906691602e-05, "loss": 4.100074291229248, "step": 1036 }, { "epoch": 0.7653136531365313, "grad_norm": 0.9550725817680359, "learning_rate": 9.823106988147217e-05, "loss": 4.270690441131592, "step": 1037 }, { "epoch": 0.7660516605166051, "grad_norm": 1.0491443872451782, "learning_rate": 9.764649978120944e-05, "loss": 4.158552169799805, "step": 1038 }, { "epoch": 0.766789667896679, "grad_norm": 0.827170729637146, "learning_rate": 9.706339215390715e-05, "loss": 4.432864189147949, "step": 1039 }, { "epoch": 0.7675276752767528, "grad_norm": 1.0315954685211182, "learning_rate": 9.64817503788692e-05, "loss": 4.234312534332275, "step": 1040 }, { "epoch": 0.7682656826568266, "grad_norm": 0.9796032905578613, "learning_rate": 9.590157782690429e-05, "loss": 3.9558591842651367, "step": 1041 }, { "epoch": 0.7690036900369004, "grad_norm": 1.082369327545166, "learning_rate": 9.532287786030617e-05, "loss": 4.016860485076904, "step": 1042 }, { "epoch": 0.7697416974169742, "grad_norm": 0.9409294724464417, "learning_rate": 9.474565383283518e-05, "loss": 4.121254920959473, "step": 1043 }, { "epoch": 0.770479704797048, "grad_norm": 0.9006357192993164, "learning_rate": 9.416990908969736e-05, "loss": 4.089673042297363, "step": 1044 }, { "epoch": 0.7712177121771218, "grad_norm": 1.0219764709472656, "learning_rate": 9.359564696752622e-05, "loss": 3.96942138671875, "step": 1045 }, { "epoch": 0.7719557195571956, "grad_norm": 0.9810526967048645, "learning_rate": 9.302287079436289e-05, "loss": 3.9760637283325195, "step": 1046 }, { "epoch": 0.7726937269372693, "grad_norm": 0.9141161441802979, "learning_rate": 9.245158388963689e-05, "loss": 4.305903434753418, "step": 1047 }, { "epoch": 0.7734317343173431, "grad_norm": 2.4086737632751465, "learning_rate": 9.188178956414705e-05, "loss": 4.438955307006836, "step": 1048 }, { "epoch": 0.7741697416974169, "grad_norm": 0.9075452089309692, "learning_rate": 9.131349112004189e-05, "loss": 4.143951416015625, "step": 1049 }, { "epoch": 0.7749077490774908, "grad_norm": 0.8627065420150757, "learning_rate": 9.074669185080134e-05, "loss": 4.145493984222412, "step": 1050 }, { "epoch": 0.7756457564575646, "grad_norm": 0.9488852620124817, "learning_rate": 9.018139504121653e-05, "loss": 4.0962677001953125, "step": 1051 }, { "epoch": 0.7763837638376384, "grad_norm": 0.8987521529197693, "learning_rate": 8.96176039673717e-05, "loss": 4.037075996398926, "step": 1052 }, { "epoch": 0.7771217712177122, "grad_norm": 1.5608737468719482, "learning_rate": 8.905532189662476e-05, "loss": 4.093520164489746, "step": 1053 }, { "epoch": 0.777859778597786, "grad_norm": 1.0437077283859253, "learning_rate": 8.849455208758849e-05, "loss": 4.453344821929932, "step": 1054 }, { "epoch": 0.7785977859778598, "grad_norm": 0.9098041653633118, "learning_rate": 8.793529779011133e-05, "loss": 3.896477699279785, "step": 1055 }, { "epoch": 0.7793357933579336, "grad_norm": 0.7569055557250977, "learning_rate": 8.737756224525918e-05, "loss": 4.115358352661133, "step": 1056 }, { "epoch": 0.7800738007380074, "grad_norm": 1.0293289422988892, "learning_rate": 8.68213486852961e-05, "loss": 4.121119976043701, "step": 1057 }, { "epoch": 0.7808118081180812, "grad_norm": 0.8127309083938599, "learning_rate": 8.626666033366578e-05, "loss": 4.106558799743652, "step": 1058 }, { "epoch": 0.7815498154981549, "grad_norm": 0.9031323790550232, "learning_rate": 8.57135004049728e-05, "loss": 3.9452483654022217, "step": 1059 }, { "epoch": 0.7822878228782287, "grad_norm": 1.2205437421798706, "learning_rate": 8.516187210496385e-05, "loss": 3.8204894065856934, "step": 1060 }, { "epoch": 0.7830258302583026, "grad_norm": 0.907437801361084, "learning_rate": 8.461177863050975e-05, "loss": 4.430585861206055, "step": 1061 }, { "epoch": 0.7837638376383764, "grad_norm": 0.8979167342185974, "learning_rate": 8.406322316958601e-05, "loss": 4.146002292633057, "step": 1062 }, { "epoch": 0.7845018450184502, "grad_norm": 0.9116492867469788, "learning_rate": 8.351620890125513e-05, "loss": 4.052881240844727, "step": 1063 }, { "epoch": 0.785239852398524, "grad_norm": 1.1355615854263306, "learning_rate": 8.297073899564777e-05, "loss": 4.160739898681641, "step": 1064 }, { "epoch": 0.7859778597785978, "grad_norm": 0.8989261984825134, "learning_rate": 8.242681661394466e-05, "loss": 3.9555885791778564, "step": 1065 }, { "epoch": 0.7867158671586716, "grad_norm": 1.0729719400405884, "learning_rate": 8.188444490835773e-05, "loss": 4.048243999481201, "step": 1066 }, { "epoch": 0.7874538745387454, "grad_norm": 0.9004929661750793, "learning_rate": 8.134362702211263e-05, "loss": 4.261412143707275, "step": 1067 }, { "epoch": 0.7881918819188192, "grad_norm": 0.7723477482795715, "learning_rate": 8.080436608942988e-05, "loss": 3.9241394996643066, "step": 1068 }, { "epoch": 0.788929889298893, "grad_norm": 0.833265483379364, "learning_rate": 8.026666523550708e-05, "loss": 4.336735248565674, "step": 1069 }, { "epoch": 0.7896678966789668, "grad_norm": 0.9609919190406799, "learning_rate": 7.973052757650058e-05, "loss": 3.9808225631713867, "step": 1070 }, { "epoch": 0.7904059040590405, "grad_norm": 1.0244325399398804, "learning_rate": 7.919595621950728e-05, "loss": 4.093958854675293, "step": 1071 }, { "epoch": 0.7911439114391144, "grad_norm": 0.7694634199142456, "learning_rate": 7.866295426254735e-05, "loss": 3.9361343383789062, "step": 1072 }, { "epoch": 0.7918819188191882, "grad_norm": 0.8412328958511353, "learning_rate": 7.813152479454516e-05, "loss": 4.3025431632995605, "step": 1073 }, { "epoch": 0.792619926199262, "grad_norm": 0.9007997512817383, "learning_rate": 7.760167089531244e-05, "loss": 4.1600799560546875, "step": 1074 }, { "epoch": 0.7933579335793358, "grad_norm": 0.8552005887031555, "learning_rate": 7.707339563552973e-05, "loss": 3.9373395442962646, "step": 1075 }, { "epoch": 0.7940959409594096, "grad_norm": 0.9131635427474976, "learning_rate": 7.654670207672905e-05, "loss": 4.242855072021484, "step": 1076 }, { "epoch": 0.7948339483394834, "grad_norm": 0.8459916710853577, "learning_rate": 7.602159327127555e-05, "loss": 4.222464084625244, "step": 1077 }, { "epoch": 0.7955719557195572, "grad_norm": 0.9173424243927002, "learning_rate": 7.549807226235051e-05, "loss": 4.072568416595459, "step": 1078 }, { "epoch": 0.796309963099631, "grad_norm": 0.9082213640213013, "learning_rate": 7.497614208393341e-05, "loss": 3.9589667320251465, "step": 1079 }, { "epoch": 0.7970479704797048, "grad_norm": 0.8568102717399597, "learning_rate": 7.44558057607843e-05, "loss": 4.217202663421631, "step": 1080 }, { "epoch": 0.7977859778597786, "grad_norm": 0.9027300477027893, "learning_rate": 7.393706630842592e-05, "loss": 4.339812278747559, "step": 1081 }, { "epoch": 0.7985239852398524, "grad_norm": 0.8236647844314575, "learning_rate": 7.341992673312733e-05, "loss": 3.9794492721557617, "step": 1082 }, { "epoch": 0.7992619926199263, "grad_norm": 1.059795618057251, "learning_rate": 7.290439003188531e-05, "loss": 4.107804298400879, "step": 1083 }, { "epoch": 0.8, "grad_norm": 1.0308328866958618, "learning_rate": 7.239045919240731e-05, "loss": 4.0905232429504395, "step": 1084 }, { "epoch": 0.8007380073800738, "grad_norm": 0.9931803345680237, "learning_rate": 7.187813719309466e-05, "loss": 3.8613810539245605, "step": 1085 }, { "epoch": 0.8014760147601476, "grad_norm": 0.9674167037010193, "learning_rate": 7.136742700302469e-05, "loss": 4.229313850402832, "step": 1086 }, { "epoch": 0.8022140221402214, "grad_norm": 1.1589391231536865, "learning_rate": 7.085833158193391e-05, "loss": 4.372422695159912, "step": 1087 }, { "epoch": 0.8029520295202952, "grad_norm": 1.1077316999435425, "learning_rate": 7.035085388020041e-05, "loss": 4.1049089431762695, "step": 1088 }, { "epoch": 0.803690036900369, "grad_norm": 0.9430045485496521, "learning_rate": 6.984499683882739e-05, "loss": 4.282869338989258, "step": 1089 }, { "epoch": 0.8044280442804428, "grad_norm": 1.254410982131958, "learning_rate": 6.934076338942564e-05, "loss": 3.9536659717559814, "step": 1090 }, { "epoch": 0.8051660516605166, "grad_norm": 0.8754069209098816, "learning_rate": 6.883815645419675e-05, "loss": 4.139862060546875, "step": 1091 }, { "epoch": 0.8059040590405904, "grad_norm": 0.9515761733055115, "learning_rate": 6.833717894591579e-05, "loss": 4.331487655639648, "step": 1092 }, { "epoch": 0.8066420664206642, "grad_norm": 1.1361658573150635, "learning_rate": 6.783783376791533e-05, "loss": 4.143629550933838, "step": 1093 }, { "epoch": 0.8073800738007381, "grad_norm": 0.8871273398399353, "learning_rate": 6.734012381406767e-05, "loss": 4.211644172668457, "step": 1094 }, { "epoch": 0.8081180811808119, "grad_norm": 0.8796087503433228, "learning_rate": 6.684405196876843e-05, "loss": 4.109099864959717, "step": 1095 }, { "epoch": 0.8088560885608856, "grad_norm": 1.0282338857650757, "learning_rate": 6.634962110691991e-05, "loss": 3.9217135906219482, "step": 1096 }, { "epoch": 0.8095940959409594, "grad_norm": 0.8852423429489136, "learning_rate": 6.585683409391441e-05, "loss": 3.826831579208374, "step": 1097 }, { "epoch": 0.8103321033210332, "grad_norm": 1.1207947731018066, "learning_rate": 6.536569378561766e-05, "loss": 4.236572265625, "step": 1098 }, { "epoch": 0.811070110701107, "grad_norm": 0.7631810307502747, "learning_rate": 6.487620302835181e-05, "loss": 4.135857582092285, "step": 1099 }, { "epoch": 0.8118081180811808, "grad_norm": 1.0373399257659912, "learning_rate": 6.438836465887968e-05, "loss": 3.926546096801758, "step": 1100 }, { "epoch": 0.8125461254612546, "grad_norm": 0.8193474411964417, "learning_rate": 6.390218150438787e-05, "loss": 4.056336402893066, "step": 1101 }, { "epoch": 0.8132841328413284, "grad_norm": 0.8076398968696594, "learning_rate": 6.341765638247046e-05, "loss": 4.038424968719482, "step": 1102 }, { "epoch": 0.8140221402214022, "grad_norm": 0.9038758873939514, "learning_rate": 6.29347921011124e-05, "loss": 4.076757431030273, "step": 1103 }, { "epoch": 0.814760147601476, "grad_norm": 1.0241302251815796, "learning_rate": 6.245359145867404e-05, "loss": 4.188800811767578, "step": 1104 }, { "epoch": 0.8154981549815498, "grad_norm": 0.8670378923416138, "learning_rate": 6.197405724387391e-05, "loss": 3.7736902236938477, "step": 1105 }, { "epoch": 0.8162361623616237, "grad_norm": 0.8043569922447205, "learning_rate": 6.149619223577322e-05, "loss": 4.0094099044799805, "step": 1106 }, { "epoch": 0.8169741697416975, "grad_norm": 1.0722813606262207, "learning_rate": 6.101999920375964e-05, "loss": 4.505285263061523, "step": 1107 }, { "epoch": 0.8177121771217712, "grad_norm": 0.8136195540428162, "learning_rate": 6.054548090753103e-05, "loss": 3.993842840194702, "step": 1108 }, { "epoch": 0.818450184501845, "grad_norm": 0.8687028288841248, "learning_rate": 6.0072640097079836e-05, "loss": 4.127281188964844, "step": 1109 }, { "epoch": 0.8191881918819188, "grad_norm": 0.879191517829895, "learning_rate": 5.960147951267643e-05, "loss": 4.027138710021973, "step": 1110 }, { "epoch": 0.8199261992619926, "grad_norm": 0.8649862408638, "learning_rate": 5.913200188485442e-05, "loss": 4.080497741699219, "step": 1111 }, { "epoch": 0.8206642066420664, "grad_norm": 0.9337714314460754, "learning_rate": 5.866420993439344e-05, "loss": 4.245942115783691, "step": 1112 }, { "epoch": 0.8214022140221402, "grad_norm": 0.8696949481964111, "learning_rate": 5.81981063723045e-05, "loss": 4.227627754211426, "step": 1113 }, { "epoch": 0.822140221402214, "grad_norm": 0.9521300792694092, "learning_rate": 5.773369389981347e-05, "loss": 4.130904197692871, "step": 1114 }, { "epoch": 0.8228782287822878, "grad_norm": 1.0789848566055298, "learning_rate": 5.7270975208346306e-05, "loss": 4.207403182983398, "step": 1115 }, { "epoch": 0.8236162361623616, "grad_norm": 0.8551551103591919, "learning_rate": 5.680995297951237e-05, "loss": 4.2299041748046875, "step": 1116 }, { "epoch": 0.8243542435424355, "grad_norm": 0.790813684463501, "learning_rate": 5.635062988508984e-05, "loss": 4.201531410217285, "step": 1117 }, { "epoch": 0.8250922509225093, "grad_norm": 0.7844054698944092, "learning_rate": 5.5893008587009665e-05, "loss": 3.9883697032928467, "step": 1118 }, { "epoch": 0.825830258302583, "grad_norm": 0.8120241165161133, "learning_rate": 5.543709173734044e-05, "loss": 3.9854788780212402, "step": 1119 }, { "epoch": 0.8265682656826568, "grad_norm": 1.1635088920593262, "learning_rate": 5.498288197827285e-05, "loss": 3.948390007019043, "step": 1120 }, { "epoch": 0.8273062730627306, "grad_norm": 0.8426750898361206, "learning_rate": 5.4530381942104213e-05, "loss": 4.034334182739258, "step": 1121 }, { "epoch": 0.8280442804428044, "grad_norm": 0.8258629441261292, "learning_rate": 5.4079594251223894e-05, "loss": 4.009230613708496, "step": 1122 }, { "epoch": 0.8287822878228782, "grad_norm": 0.8874958157539368, "learning_rate": 5.363052151809721e-05, "loss": 3.9225668907165527, "step": 1123 }, { "epoch": 0.829520295202952, "grad_norm": 0.9092878103256226, "learning_rate": 5.318316634525092e-05, "loss": 4.106935977935791, "step": 1124 }, { "epoch": 0.8302583025830258, "grad_norm": 1.0611941814422607, "learning_rate": 5.273753132525793e-05, "loss": 4.086188793182373, "step": 1125 }, { "epoch": 0.8309963099630996, "grad_norm": 0.9324346780776978, "learning_rate": 5.229361904072231e-05, "loss": 4.163631916046143, "step": 1126 }, { "epoch": 0.8317343173431734, "grad_norm": 0.901092529296875, "learning_rate": 5.1851432064264184e-05, "loss": 3.8213887214660645, "step": 1127 }, { "epoch": 0.8324723247232473, "grad_norm": 0.8883295655250549, "learning_rate": 5.141097295850506e-05, "loss": 4.020335674285889, "step": 1128 }, { "epoch": 0.8332103321033211, "grad_norm": 0.8910163044929504, "learning_rate": 5.0972244276052794e-05, "loss": 3.904737949371338, "step": 1129 }, { "epoch": 0.8339483394833949, "grad_norm": 0.9039924144744873, "learning_rate": 5.053524855948689e-05, "loss": 3.9267964363098145, "step": 1130 }, { "epoch": 0.8346863468634687, "grad_norm": 0.7226197123527527, "learning_rate": 5.0099988341343834e-05, "loss": 4.004914283752441, "step": 1131 }, { "epoch": 0.8354243542435424, "grad_norm": 1.0319029092788696, "learning_rate": 4.966646614410193e-05, "loss": 3.922898769378662, "step": 1132 }, { "epoch": 0.8361623616236162, "grad_norm": 0.8679114580154419, "learning_rate": 4.92346844801677e-05, "loss": 4.130770206451416, "step": 1133 }, { "epoch": 0.83690036900369, "grad_norm": 1.1186548471450806, "learning_rate": 4.8804645851860066e-05, "loss": 4.051120758056641, "step": 1134 }, { "epoch": 0.8376383763837638, "grad_norm": 0.9294642806053162, "learning_rate": 4.8376352751396885e-05, "loss": 4.042642593383789, "step": 1135 }, { "epoch": 0.8383763837638376, "grad_norm": 0.9107891917228699, "learning_rate": 4.794980766087991e-05, "loss": 4.207566261291504, "step": 1136 }, { "epoch": 0.8391143911439114, "grad_norm": 0.8764515519142151, "learning_rate": 4.752501305228076e-05, "loss": 3.926863670349121, "step": 1137 }, { "epoch": 0.8398523985239852, "grad_norm": 0.7595376372337341, "learning_rate": 4.7101971387426126e-05, "loss": 4.053175926208496, "step": 1138 }, { "epoch": 0.8405904059040591, "grad_norm": 1.015899419784546, "learning_rate": 4.668068511798407e-05, "loss": 4.323257923126221, "step": 1139 }, { "epoch": 0.8413284132841329, "grad_norm": 0.7909930348396301, "learning_rate": 4.62611566854495e-05, "loss": 4.0056915283203125, "step": 1140 }, { "epoch": 0.8420664206642067, "grad_norm": 0.9827620387077332, "learning_rate": 4.5843388521130024e-05, "loss": 4.075970649719238, "step": 1141 }, { "epoch": 0.8428044280442805, "grad_norm": 0.8181502223014832, "learning_rate": 4.5427383046131974e-05, "loss": 4.204850673675537, "step": 1142 }, { "epoch": 0.8435424354243543, "grad_norm": 0.917636513710022, "learning_rate": 4.5013142671346035e-05, "loss": 4.204797744750977, "step": 1143 }, { "epoch": 0.844280442804428, "grad_norm": 0.8521405458450317, "learning_rate": 4.46006697974341e-05, "loss": 3.8248231410980225, "step": 1144 }, { "epoch": 0.8450184501845018, "grad_norm": 0.9880871176719666, "learning_rate": 4.41899668148142e-05, "loss": 4.135077476501465, "step": 1145 }, { "epoch": 0.8457564575645756, "grad_norm": 0.7729653120040894, "learning_rate": 4.3781036103647625e-05, "loss": 4.0869975090026855, "step": 1146 }, { "epoch": 0.8464944649446494, "grad_norm": 0.8864187598228455, "learning_rate": 4.337388003382462e-05, "loss": 3.949108600616455, "step": 1147 }, { "epoch": 0.8472324723247232, "grad_norm": 0.7802934646606445, "learning_rate": 4.296850096495096e-05, "loss": 4.134548664093018, "step": 1148 }, { "epoch": 0.847970479704797, "grad_norm": 1.1349821090698242, "learning_rate": 4.2564901246333816e-05, "loss": 3.8663113117218018, "step": 1149 }, { "epoch": 0.8487084870848709, "grad_norm": 0.8297522068023682, "learning_rate": 4.216308321696862e-05, "loss": 4.069552421569824, "step": 1150 }, { "epoch": 0.8494464944649447, "grad_norm": 0.8287118077278137, "learning_rate": 4.1763049205525295e-05, "loss": 4.17302131652832, "step": 1151 }, { "epoch": 0.8501845018450185, "grad_norm": 0.9213622212409973, "learning_rate": 4.136480153033484e-05, "loss": 3.975867748260498, "step": 1152 }, { "epoch": 0.8509225092250923, "grad_norm": 0.937636137008667, "learning_rate": 4.096834249937555e-05, "loss": 4.308503150939941, "step": 1153 }, { "epoch": 0.8516605166051661, "grad_norm": 0.894312858581543, "learning_rate": 4.0573674410260384e-05, "loss": 4.0722808837890625, "step": 1154 }, { "epoch": 0.8523985239852399, "grad_norm": 0.8254060745239258, "learning_rate": 4.0180799550222964e-05, "loss": 4.237331390380859, "step": 1155 }, { "epoch": 0.8531365313653136, "grad_norm": 0.9793713092803955, "learning_rate": 3.9789720196104374e-05, "loss": 3.960724115371704, "step": 1156 }, { "epoch": 0.8538745387453874, "grad_norm": 0.9438844323158264, "learning_rate": 3.940043861434043e-05, "loss": 4.011446952819824, "step": 1157 }, { "epoch": 0.8546125461254612, "grad_norm": 0.9776595234870911, "learning_rate": 3.901295706094806e-05, "loss": 4.202037334442139, "step": 1158 }, { "epoch": 0.855350553505535, "grad_norm": 0.8546213507652283, "learning_rate": 3.862727778151262e-05, "loss": 4.176602363586426, "step": 1159 }, { "epoch": 0.8560885608856088, "grad_norm": 0.9939232468605042, "learning_rate": 3.8243403011174406e-05, "loss": 4.394288063049316, "step": 1160 }, { "epoch": 0.8568265682656827, "grad_norm": 0.8461161851882935, "learning_rate": 3.786133497461622e-05, "loss": 4.105259895324707, "step": 1161 }, { "epoch": 0.8575645756457565, "grad_norm": 0.8759531378746033, "learning_rate": 3.748107588605018e-05, "loss": 3.866830348968506, "step": 1162 }, { "epoch": 0.8583025830258303, "grad_norm": 0.9277933239936829, "learning_rate": 3.710262794920493e-05, "loss": 4.112336158752441, "step": 1163 }, { "epoch": 0.8590405904059041, "grad_norm": 0.8277254104614258, "learning_rate": 3.672599335731272e-05, "loss": 4.080126762390137, "step": 1164 }, { "epoch": 0.8597785977859779, "grad_norm": 0.9238406419754028, "learning_rate": 3.635117429309721e-05, "loss": 3.9586308002471924, "step": 1165 }, { "epoch": 0.8605166051660517, "grad_norm": 0.8009730577468872, "learning_rate": 3.597817292876031e-05, "loss": 4.31672477722168, "step": 1166 }, { "epoch": 0.8612546125461255, "grad_norm": 0.9162194728851318, "learning_rate": 3.560699142596952e-05, "loss": 4.007983684539795, "step": 1167 }, { "epoch": 0.8619926199261992, "grad_norm": 0.809906542301178, "learning_rate": 3.523763193584591e-05, "loss": 4.362383842468262, "step": 1168 }, { "epoch": 0.862730627306273, "grad_norm": 1.0050405263900757, "learning_rate": 3.487009659895132e-05, "loss": 3.949605941772461, "step": 1169 }, { "epoch": 0.8634686346863468, "grad_norm": 0.821631669998169, "learning_rate": 3.4504387545276056e-05, "loss": 4.222439765930176, "step": 1170 }, { "epoch": 0.8642066420664206, "grad_norm": 1.0776225328445435, "learning_rate": 3.414050689422626e-05, "loss": 4.083227157592773, "step": 1171 }, { "epoch": 0.8649446494464945, "grad_norm": 0.9266446232795715, "learning_rate": 3.3778456754612195e-05, "loss": 3.8666300773620605, "step": 1172 }, { "epoch": 0.8656826568265683, "grad_norm": 0.7478688359260559, "learning_rate": 3.341823922463545e-05, "loss": 3.9161956310272217, "step": 1173 }, { "epoch": 0.8664206642066421, "grad_norm": 0.9101294875144958, "learning_rate": 3.305985639187726e-05, "loss": 4.048511505126953, "step": 1174 }, { "epoch": 0.8671586715867159, "grad_norm": 0.9844819903373718, "learning_rate": 3.270331033328581e-05, "loss": 4.01615571975708, "step": 1175 }, { "epoch": 0.8678966789667897, "grad_norm": 0.8775573968887329, "learning_rate": 3.2348603115165085e-05, "loss": 4.202104568481445, "step": 1176 }, { "epoch": 0.8686346863468635, "grad_norm": 0.8407407999038696, "learning_rate": 3.199573679316183e-05, "loss": 4.121450424194336, "step": 1177 }, { "epoch": 0.8693726937269373, "grad_norm": 0.9224722981452942, "learning_rate": 3.164471341225457e-05, "loss": 3.914332389831543, "step": 1178 }, { "epoch": 0.870110701107011, "grad_norm": 0.8708797693252563, "learning_rate": 3.1295535006741184e-05, "loss": 3.9288840293884277, "step": 1179 }, { "epoch": 0.8708487084870848, "grad_norm": 0.8619300127029419, "learning_rate": 3.0948203600227365e-05, "loss": 4.033664226531982, "step": 1180 }, { "epoch": 0.8715867158671586, "grad_norm": 1.0247814655303955, "learning_rate": 3.060272120561491e-05, "loss": 3.908498764038086, "step": 1181 }, { "epoch": 0.8723247232472324, "grad_norm": 0.8571381568908691, "learning_rate": 3.0259089825089657e-05, "loss": 3.937492847442627, "step": 1182 }, { "epoch": 0.8730627306273063, "grad_norm": 0.8049359917640686, "learning_rate": 2.9917311450110688e-05, "loss": 4.154782295227051, "step": 1183 }, { "epoch": 0.8738007380073801, "grad_norm": 0.8404570817947388, "learning_rate": 2.9577388061397813e-05, "loss": 3.8062617778778076, "step": 1184 }, { "epoch": 0.8745387453874539, "grad_norm": 0.8313830494880676, "learning_rate": 2.92393216289209e-05, "loss": 4.034999847412109, "step": 1185 }, { "epoch": 0.8752767527675277, "grad_norm": 0.8629732131958008, "learning_rate": 2.8903114111887997e-05, "loss": 3.9214658737182617, "step": 1186 }, { "epoch": 0.8760147601476015, "grad_norm": 0.788813054561615, "learning_rate": 2.8568767458734206e-05, "loss": 4.004258155822754, "step": 1187 }, { "epoch": 0.8767527675276753, "grad_norm": 1.0045006275177002, "learning_rate": 2.8236283607110122e-05, "loss": 4.084541320800781, "step": 1188 }, { "epoch": 0.8774907749077491, "grad_norm": 0.9397458434104919, "learning_rate": 2.7905664483871018e-05, "loss": 4.225802421569824, "step": 1189 }, { "epoch": 0.8782287822878229, "grad_norm": 0.9364494681358337, "learning_rate": 2.757691200506522e-05, "loss": 4.048999786376953, "step": 1190 }, { "epoch": 0.8789667896678967, "grad_norm": 0.9341748952865601, "learning_rate": 2.7250028075923393e-05, "loss": 4.081840515136719, "step": 1191 }, { "epoch": 0.8797047970479704, "grad_norm": 1.1266894340515137, "learning_rate": 2.6925014590847357e-05, "loss": 4.127097129821777, "step": 1192 }, { "epoch": 0.8804428044280442, "grad_norm": 1.646851658821106, "learning_rate": 2.660187343339872e-05, "loss": 3.9492740631103516, "step": 1193 }, { "epoch": 0.8811808118081181, "grad_norm": 0.8900485038757324, "learning_rate": 2.628060647628891e-05, "loss": 4.004465103149414, "step": 1194 }, { "epoch": 0.8819188191881919, "grad_norm": 1.2839107513427734, "learning_rate": 2.596121558136723e-05, "loss": 3.9589195251464844, "step": 1195 }, { "epoch": 0.8826568265682657, "grad_norm": 0.9112879037857056, "learning_rate": 2.564370259961085e-05, "loss": 3.956997871398926, "step": 1196 }, { "epoch": 0.8833948339483395, "grad_norm": 0.9632598161697388, "learning_rate": 2.532806937111368e-05, "loss": 4.068366050720215, "step": 1197 }, { "epoch": 0.8841328413284133, "grad_norm": 1.0053609609603882, "learning_rate": 2.5014317725075963e-05, "loss": 4.128815650939941, "step": 1198 }, { "epoch": 0.8848708487084871, "grad_norm": 0.9752780199050903, "learning_rate": 2.470244947979335e-05, "loss": 4.243814468383789, "step": 1199 }, { "epoch": 0.8856088560885609, "grad_norm": 0.8026096820831299, "learning_rate": 2.439246644264672e-05, "loss": 3.8507800102233887, "step": 1200 }, { "epoch": 0.8863468634686347, "grad_norm": 1.0604981184005737, "learning_rate": 2.4084370410091432e-05, "loss": 4.058777332305908, "step": 1201 }, { "epoch": 0.8870848708487085, "grad_norm": 0.9318236708641052, "learning_rate": 2.377816316764712e-05, "loss": 3.807260751724243, "step": 1202 }, { "epoch": 0.8878228782287823, "grad_norm": 0.902949333190918, "learning_rate": 2.347384648988722e-05, "loss": 4.102638244628906, "step": 1203 }, { "epoch": 0.888560885608856, "grad_norm": 0.8091539144515991, "learning_rate": 2.317142214042854e-05, "loss": 4.0851216316223145, "step": 1204 }, { "epoch": 0.8892988929889298, "grad_norm": 0.9384903907775879, "learning_rate": 2.28708918719216e-05, "loss": 3.8504605293273926, "step": 1205 }, { "epoch": 0.8900369003690037, "grad_norm": 1.5000700950622559, "learning_rate": 2.2572257426039673e-05, "loss": 3.7018003463745117, "step": 1206 }, { "epoch": 0.8907749077490775, "grad_norm": 0.853367269039154, "learning_rate": 2.2275520533469324e-05, "loss": 4.134353160858154, "step": 1207 }, { "epoch": 0.8915129151291513, "grad_norm": 0.9626766443252563, "learning_rate": 2.1980682913900136e-05, "loss": 3.798292636871338, "step": 1208 }, { "epoch": 0.8922509225092251, "grad_norm": 1.0532819032669067, "learning_rate": 2.1687746276014825e-05, "loss": 3.908432960510254, "step": 1209 }, { "epoch": 0.8929889298892989, "grad_norm": 0.9355469346046448, "learning_rate": 2.1396712317479066e-05, "loss": 3.97414493560791, "step": 1210 }, { "epoch": 0.8937269372693727, "grad_norm": 0.9721041917800903, "learning_rate": 2.110758272493209e-05, "loss": 4.170253753662109, "step": 1211 }, { "epoch": 0.8944649446494465, "grad_norm": 1.4950439929962158, "learning_rate": 2.082035917397661e-05, "loss": 3.9789202213287354, "step": 1212 }, { "epoch": 0.8952029520295203, "grad_norm": 0.7959940433502197, "learning_rate": 2.05350433291692e-05, "loss": 3.8894667625427246, "step": 1213 }, { "epoch": 0.8959409594095941, "grad_norm": 0.885735034942627, "learning_rate": 2.0251636844010645e-05, "loss": 4.206930637359619, "step": 1214 }, { "epoch": 0.8966789667896679, "grad_norm": 0.8489437103271484, "learning_rate": 1.997014136093635e-05, "loss": 4.1217241287231445, "step": 1215 }, { "epoch": 0.8974169741697416, "grad_norm": 0.9283955693244934, "learning_rate": 1.9690558511306816e-05, "loss": 4.022772789001465, "step": 1216 }, { "epoch": 0.8981549815498155, "grad_norm": 1.0239266157150269, "learning_rate": 1.9412889915398164e-05, "loss": 3.9056153297424316, "step": 1217 }, { "epoch": 0.8988929889298893, "grad_norm": 0.8964755535125732, "learning_rate": 1.91371371823928e-05, "loss": 4.042991638183594, "step": 1218 }, { "epoch": 0.8996309963099631, "grad_norm": 0.8521272540092468, "learning_rate": 1.88633019103701e-05, "loss": 4.2974958419799805, "step": 1219 }, { "epoch": 0.9003690036900369, "grad_norm": 0.8455408215522766, "learning_rate": 1.859138568629708e-05, "loss": 4.03305721282959, "step": 1220 }, { "epoch": 0.9011070110701107, "grad_norm": 0.9025142788887024, "learning_rate": 1.832139008601918e-05, "loss": 4.064189434051514, "step": 1221 }, { "epoch": 0.9018450184501845, "grad_norm": 0.789932131767273, "learning_rate": 1.8053316674251256e-05, "loss": 3.8885226249694824, "step": 1222 }, { "epoch": 0.9025830258302583, "grad_norm": 0.8709638118743896, "learning_rate": 1.7787167004568416e-05, "loss": 4.0818986892700195, "step": 1223 }, { "epoch": 0.9033210332103321, "grad_norm": 1.0169392824172974, "learning_rate": 1.75229426193971e-05, "loss": 4.12254524230957, "step": 1224 }, { "epoch": 0.9040590405904059, "grad_norm": 0.9385191798210144, "learning_rate": 1.7260645050005903e-05, "loss": 3.894554853439331, "step": 1225 }, { "epoch": 0.9047970479704797, "grad_norm": 0.87216717004776, "learning_rate": 1.7000275816497063e-05, "loss": 4.0138773918151855, "step": 1226 }, { "epoch": 0.9055350553505535, "grad_norm": 0.8227195143699646, "learning_rate": 1.6741836427797447e-05, "loss": 3.842376708984375, "step": 1227 }, { "epoch": 0.9062730627306274, "grad_norm": 0.9171870350837708, "learning_rate": 1.6485328381649667e-05, "loss": 4.184534072875977, "step": 1228 }, { "epoch": 0.9070110701107011, "grad_norm": 0.8216118216514587, "learning_rate": 1.6230753164603735e-05, "loss": 3.9486520290374756, "step": 1229 }, { "epoch": 0.9077490774907749, "grad_norm": 0.8233117461204529, "learning_rate": 1.597811225200816e-05, "loss": 4.167961597442627, "step": 1230 }, { "epoch": 0.9084870848708487, "grad_norm": 0.9360010623931885, "learning_rate": 1.5727407108001634e-05, "loss": 4.118611812591553, "step": 1231 }, { "epoch": 0.9092250922509225, "grad_norm": 0.8383175730705261, "learning_rate": 1.5478639185504255e-05, "loss": 4.2346062660217285, "step": 1232 }, { "epoch": 0.9099630996309963, "grad_norm": 0.7830789685249329, "learning_rate": 1.52318099262094e-05, "loss": 4.022680759429932, "step": 1233 }, { "epoch": 0.9107011070110701, "grad_norm": 0.8860730528831482, "learning_rate": 1.4986920760575173e-05, "loss": 3.8851001262664795, "step": 1234 }, { "epoch": 0.9114391143911439, "grad_norm": 1.0096216201782227, "learning_rate": 1.4743973107816294e-05, "loss": 4.16072940826416, "step": 1235 }, { "epoch": 0.9121771217712177, "grad_norm": 0.8702698945999146, "learning_rate": 1.4502968375895542e-05, "loss": 4.074400901794434, "step": 1236 }, { "epoch": 0.9129151291512915, "grad_norm": 1.0048964023590088, "learning_rate": 1.4263907961516103e-05, "loss": 4.206517219543457, "step": 1237 }, { "epoch": 0.9136531365313653, "grad_norm": 1.0056480169296265, "learning_rate": 1.40267932501131e-05, "loss": 4.110833644866943, "step": 1238 }, { "epoch": 0.9143911439114392, "grad_norm": 0.9925635457038879, "learning_rate": 1.379162561584547e-05, "loss": 3.903393507003784, "step": 1239 }, { "epoch": 0.915129151291513, "grad_norm": 1.1309577226638794, "learning_rate": 1.3558406421588386e-05, "loss": 4.20203971862793, "step": 1240 }, { "epoch": 0.9158671586715867, "grad_norm": 0.9794964790344238, "learning_rate": 1.332713701892514e-05, "loss": 4.138725280761719, "step": 1241 }, { "epoch": 0.9166051660516605, "grad_norm": 0.9882869720458984, "learning_rate": 1.3097818748139284e-05, "loss": 3.934995174407959, "step": 1242 }, { "epoch": 0.9173431734317343, "grad_norm": 0.9639918804168701, "learning_rate": 1.2870452938206834e-05, "loss": 3.992349147796631, "step": 1243 }, { "epoch": 0.9180811808118081, "grad_norm": 0.761677086353302, "learning_rate": 1.2645040906788873e-05, "loss": 4.091512680053711, "step": 1244 }, { "epoch": 0.9188191881918819, "grad_norm": 0.8653919100761414, "learning_rate": 1.2421583960223403e-05, "loss": 4.175684452056885, "step": 1245 }, { "epoch": 0.9195571955719557, "grad_norm": 0.8463162779808044, "learning_rate": 1.22000833935183e-05, "loss": 3.7140493392944336, "step": 1246 }, { "epoch": 0.9202952029520295, "grad_norm": 1.5709336996078491, "learning_rate": 1.1980540490343322e-05, "loss": 4.196260452270508, "step": 1247 }, { "epoch": 0.9210332103321033, "grad_norm": 1.0555191040039062, "learning_rate": 1.1762956523023177e-05, "loss": 4.01348876953125, "step": 1248 }, { "epoch": 0.9217712177121771, "grad_norm": 0.8740063309669495, "learning_rate": 1.1547332752529649e-05, "loss": 4.2362470626831055, "step": 1249 }, { "epoch": 0.922509225092251, "grad_norm": 0.7975241541862488, "learning_rate": 1.1333670428474634e-05, "loss": 3.9546077251434326, "step": 1250 }, { "epoch": 0.9232472324723248, "grad_norm": 0.9999665021896362, "learning_rate": 1.1121970789102842e-05, "loss": 4.26607608795166, "step": 1251 }, { "epoch": 0.9239852398523986, "grad_norm": 0.8974668979644775, "learning_rate": 1.0912235061284481e-05, "loss": 3.8792271614074707, "step": 1252 }, { "epoch": 0.9247232472324723, "grad_norm": 0.9566179513931274, "learning_rate": 1.0704464460508312e-05, "loss": 3.9453883171081543, "step": 1253 }, { "epoch": 0.9254612546125461, "grad_norm": 0.8908551335334778, "learning_rate": 1.0498660190874298e-05, "loss": 4.036525726318359, "step": 1254 }, { "epoch": 0.9261992619926199, "grad_norm": 0.8145546317100525, "learning_rate": 1.0294823445087275e-05, "loss": 4.188867568969727, "step": 1255 }, { "epoch": 0.9269372693726937, "grad_norm": 0.8675177097320557, "learning_rate": 1.0092955404449255e-05, "loss": 4.099850654602051, "step": 1256 }, { "epoch": 0.9276752767527675, "grad_norm": 0.8431053757667542, "learning_rate": 9.893057238853053e-06, "loss": 4.123414039611816, "step": 1257 }, { "epoch": 0.9284132841328413, "grad_norm": 0.8683810830116272, "learning_rate": 9.69513010677545e-06, "loss": 4.246320724487305, "step": 1258 }, { "epoch": 0.9291512915129151, "grad_norm": 0.8188611268997192, "learning_rate": 9.499175155270433e-06, "loss": 4.140353202819824, "step": 1259 }, { "epoch": 0.9298892988929889, "grad_norm": 0.9163283705711365, "learning_rate": 9.30519351996243e-06, "loss": 4.289680480957031, "step": 1260 }, { "epoch": 0.9306273062730628, "grad_norm": 0.9404510855674744, "learning_rate": 9.113186325039935e-06, "loss": 3.925518035888672, "step": 1261 }, { "epoch": 0.9313653136531366, "grad_norm": 1.0115693807601929, "learning_rate": 8.923154683248873e-06, "loss": 4.255781173706055, "step": 1262 }, { "epoch": 0.9321033210332104, "grad_norm": 0.9950158596038818, "learning_rate": 8.735099695886261e-06, "loss": 4.2205657958984375, "step": 1263 }, { "epoch": 0.9328413284132842, "grad_norm": 0.886117160320282, "learning_rate": 8.549022452793597e-06, "loss": 4.172645568847656, "step": 1264 }, { "epoch": 0.933579335793358, "grad_norm": 0.8960427045822144, "learning_rate": 8.364924032350728e-06, "loss": 4.060420513153076, "step": 1265 }, { "epoch": 0.9343173431734317, "grad_norm": 0.9799672961235046, "learning_rate": 8.18280550146967e-06, "loss": 4.113704681396484, "step": 1266 }, { "epoch": 0.9350553505535055, "grad_norm": 1.0600509643554688, "learning_rate": 8.002667915588191e-06, "loss": 4.008590221405029, "step": 1267 }, { "epoch": 0.9357933579335793, "grad_norm": 1.0815836191177368, "learning_rate": 7.824512318663873e-06, "loss": 3.9697742462158203, "step": 1268 }, { "epoch": 0.9365313653136531, "grad_norm": 0.8676935434341431, "learning_rate": 7.648339743168008e-06, "loss": 3.9918062686920166, "step": 1269 }, { "epoch": 0.9372693726937269, "grad_norm": 0.8141337633132935, "learning_rate": 7.474151210079654e-06, "loss": 4.000406742095947, "step": 1270 }, { "epoch": 0.9380073800738007, "grad_norm": 0.9304051995277405, "learning_rate": 7.301947728879571e-06, "loss": 4.1023969650268555, "step": 1271 }, { "epoch": 0.9387453874538746, "grad_norm": 0.8569762110710144, "learning_rate": 7.131730297544547e-06, "loss": 3.9308419227600098, "step": 1272 }, { "epoch": 0.9394833948339484, "grad_norm": 0.9410969018936157, "learning_rate": 6.963499902541575e-06, "loss": 4.188969612121582, "step": 1273 }, { "epoch": 0.9402214022140222, "grad_norm": 1.1418845653533936, "learning_rate": 6.7972575188220975e-06, "loss": 3.789651870727539, "step": 1274 }, { "epoch": 0.940959409594096, "grad_norm": 0.8757267594337463, "learning_rate": 6.633004109816293e-06, "loss": 4.139651298522949, "step": 1275 }, { "epoch": 0.9416974169741698, "grad_norm": 0.794495165348053, "learning_rate": 6.4707406274276015e-06, "loss": 4.01513671875, "step": 1276 }, { "epoch": 0.9424354243542435, "grad_norm": 0.8835275769233704, "learning_rate": 6.310468012027321e-06, "loss": 4.235984802246094, "step": 1277 }, { "epoch": 0.9431734317343173, "grad_norm": 0.8845841884613037, "learning_rate": 6.152187192448738e-06, "loss": 4.170893669128418, "step": 1278 }, { "epoch": 0.9439114391143911, "grad_norm": 0.8521038293838501, "learning_rate": 5.995899085982198e-06, "loss": 4.143123626708984, "step": 1279 }, { "epoch": 0.9446494464944649, "grad_norm": 0.8681446313858032, "learning_rate": 5.841604598369543e-06, "loss": 3.9806013107299805, "step": 1280 }, { "epoch": 0.9453874538745387, "grad_norm": 0.8543822765350342, "learning_rate": 5.689304623799063e-06, "loss": 4.092264175415039, "step": 1281 }, { "epoch": 0.9461254612546125, "grad_norm": 0.8498107194900513, "learning_rate": 5.5390000448999e-06, "loss": 3.961348056793213, "step": 1282 }, { "epoch": 0.9468634686346864, "grad_norm": 0.9987668991088867, "learning_rate": 5.390691732737501e-06, "loss": 3.853841781616211, "step": 1283 }, { "epoch": 0.9476014760147602, "grad_norm": 0.8435112833976746, "learning_rate": 5.244380546808064e-06, "loss": 4.074121475219727, "step": 1284 }, { "epoch": 0.948339483394834, "grad_norm": 0.9243870377540588, "learning_rate": 5.100067335033909e-06, "loss": 3.9349491596221924, "step": 1285 }, { "epoch": 0.9490774907749078, "grad_norm": 0.9198288917541504, "learning_rate": 4.957752933758391e-06, "loss": 4.085498332977295, "step": 1286 }, { "epoch": 0.9498154981549816, "grad_norm": 1.0337499380111694, "learning_rate": 4.817438167741045e-06, "loss": 4.015393257141113, "step": 1287 }, { "epoch": 0.9505535055350554, "grad_norm": 0.9356955289840698, "learning_rate": 4.679123850152955e-06, "loss": 4.079366683959961, "step": 1288 }, { "epoch": 0.9512915129151291, "grad_norm": 0.8707476854324341, "learning_rate": 4.542810782571749e-06, "loss": 3.9717764854431152, "step": 1289 }, { "epoch": 0.9520295202952029, "grad_norm": 0.8522350788116455, "learning_rate": 4.4084997549773184e-06, "loss": 3.9818825721740723, "step": 1290 }, { "epoch": 0.9527675276752767, "grad_norm": 0.9296215176582336, "learning_rate": 4.276191545747004e-06, "loss": 4.0599365234375, "step": 1291 }, { "epoch": 0.9535055350553505, "grad_norm": 0.9785073399543762, "learning_rate": 4.145886921651165e-06, "loss": 3.9496049880981445, "step": 1292 }, { "epoch": 0.9542435424354243, "grad_norm": 1.0265014171600342, "learning_rate": 4.017586637848669e-06, "loss": 3.9062700271606445, "step": 1293 }, { "epoch": 0.9549815498154982, "grad_norm": 0.8512267470359802, "learning_rate": 3.891291437882544e-06, "loss": 3.8862087726593018, "step": 1294 }, { "epoch": 0.955719557195572, "grad_norm": 0.9458624124526978, "learning_rate": 3.7670020536757775e-06, "loss": 4.076284408569336, "step": 1295 }, { "epoch": 0.9564575645756458, "grad_norm": 1.0091397762298584, "learning_rate": 3.6447192055269694e-06, "loss": 4.171298503875732, "step": 1296 }, { "epoch": 0.9571955719557196, "grad_norm": 1.085514783859253, "learning_rate": 3.5244436021060143e-06, "loss": 4.2273736000061035, "step": 1297 }, { "epoch": 0.9579335793357934, "grad_norm": 0.9626381397247314, "learning_rate": 3.4061759404503734e-06, "loss": 3.9600830078125, "step": 1298 }, { "epoch": 0.9586715867158672, "grad_norm": 0.892805814743042, "learning_rate": 3.2899169059607216e-06, "loss": 4.168842315673828, "step": 1299 }, { "epoch": 0.959409594095941, "grad_norm": 1.0419422388076782, "learning_rate": 3.1756671723969843e-06, "loss": 4.045411109924316, "step": 1300 }, { "epoch": 0.9601476014760147, "grad_norm": 0.9553176760673523, "learning_rate": 3.0634274018746466e-06, "loss": 4.090331077575684, "step": 1301 }, { "epoch": 0.9608856088560885, "grad_norm": 0.7899879813194275, "learning_rate": 2.9531982448607108e-06, "loss": 4.1696577072143555, "step": 1302 }, { "epoch": 0.9616236162361623, "grad_norm": 0.8720894455909729, "learning_rate": 2.8449803401700445e-06, "loss": 4.093484878540039, "step": 1303 }, { "epoch": 0.9623616236162361, "grad_norm": 0.93953937292099, "learning_rate": 2.738774314961534e-06, "loss": 4.138238430023193, "step": 1304 }, { "epoch": 0.9630996309963099, "grad_norm": 0.8301063179969788, "learning_rate": 2.6345807847347413e-06, "loss": 4.250385761260986, "step": 1305 }, { "epoch": 0.9638376383763838, "grad_norm": 0.9756575226783752, "learning_rate": 2.532400353325903e-06, "loss": 4.020941734313965, "step": 1306 }, { "epoch": 0.9645756457564576, "grad_norm": 0.955294132232666, "learning_rate": 2.4322336129049384e-06, "loss": 4.259998321533203, "step": 1307 }, { "epoch": 0.9653136531365314, "grad_norm": 0.8463285565376282, "learning_rate": 2.3340811439715223e-06, "loss": 4.142585754394531, "step": 1308 }, { "epoch": 0.9660516605166052, "grad_norm": 1.4179046154022217, "learning_rate": 2.237943515352098e-06, "loss": 3.9881856441497803, "step": 1309 }, { "epoch": 0.966789667896679, "grad_norm": 0.9249223470687866, "learning_rate": 2.1438212841963734e-06, "loss": 4.063835144042969, "step": 1310 }, { "epoch": 0.9675276752767528, "grad_norm": 0.8510631322860718, "learning_rate": 2.051714995974141e-06, "loss": 3.82594895362854, "step": 1311 }, { "epoch": 0.9682656826568266, "grad_norm": 1.0183271169662476, "learning_rate": 1.9616251844722042e-06, "loss": 4.1191205978393555, "step": 1312 }, { "epoch": 0.9690036900369003, "grad_norm": 0.9534389972686768, "learning_rate": 1.873552371791115e-06, "loss": 4.130201816558838, "step": 1313 }, { "epoch": 0.9697416974169741, "grad_norm": 0.9956035017967224, "learning_rate": 1.7874970683423364e-06, "loss": 3.9721202850341797, "step": 1314 }, { "epoch": 0.9704797047970479, "grad_norm": 0.8220584988594055, "learning_rate": 1.703459772845095e-06, "loss": 4.076860427856445, "step": 1315 }, { "epoch": 0.9712177121771217, "grad_norm": 0.9613221287727356, "learning_rate": 1.6214409723236623e-06, "loss": 3.937884569168091, "step": 1316 }, { "epoch": 0.9719557195571956, "grad_norm": 0.8680334687232971, "learning_rate": 1.5414411421044382e-06, "loss": 4.201882362365723, "step": 1317 }, { "epoch": 0.9726937269372694, "grad_norm": 0.759444534778595, "learning_rate": 1.4634607458131555e-06, "loss": 4.007597923278809, "step": 1318 }, { "epoch": 0.9734317343173432, "grad_norm": 1.0436582565307617, "learning_rate": 1.387500235372352e-06, "loss": 4.142274379730225, "step": 1319 }, { "epoch": 0.974169741697417, "grad_norm": 0.9300816059112549, "learning_rate": 1.3135600509985745e-06, "loss": 4.145612716674805, "step": 1320 }, { "epoch": 0.9749077490774908, "grad_norm": 0.9446290731430054, "learning_rate": 1.2416406211999298e-06, "loss": 4.090052604675293, "step": 1321 }, { "epoch": 0.9756457564575646, "grad_norm": 0.8639572262763977, "learning_rate": 1.171742362773559e-06, "loss": 3.974188804626465, "step": 1322 }, { "epoch": 0.9763837638376384, "grad_norm": 0.8869412541389465, "learning_rate": 1.1038656808032675e-06, "loss": 3.864497184753418, "step": 1323 }, { "epoch": 0.9771217712177122, "grad_norm": 1.0420503616333008, "learning_rate": 1.0380109686571549e-06, "loss": 4.054100036621094, "step": 1324 }, { "epoch": 0.977859778597786, "grad_norm": 0.8223108053207397, "learning_rate": 9.74178607985282e-07, "loss": 3.966116428375244, "step": 1325 }, { "epoch": 0.9785977859778597, "grad_norm": 0.9738601446151733, "learning_rate": 9.123689687175751e-07, "loss": 4.062865257263184, "step": 1326 }, { "epoch": 0.9793357933579335, "grad_norm": 0.9468348026275635, "learning_rate": 8.525824090615308e-07, "loss": 3.845522165298462, "step": 1327 }, { "epoch": 0.9800738007380074, "grad_norm": 0.9388923048973083, "learning_rate": 7.948192755002747e-07, "loss": 4.0565595626831055, "step": 1328 }, { "epoch": 0.9808118081180812, "grad_norm": 0.8884272575378418, "learning_rate": 7.390799027904627e-07, "loss": 3.9518604278564453, "step": 1329 }, { "epoch": 0.981549815498155, "grad_norm": 0.8875818252563477, "learning_rate": 6.85364613960493e-07, "loss": 4.188823223114014, "step": 1330 }, { "epoch": 0.9822878228782288, "grad_norm": 0.7383257150650024, "learning_rate": 6.336737203083698e-07, "loss": 4.029225826263428, "step": 1331 }, { "epoch": 0.9830258302583026, "grad_norm": 0.8472557663917542, "learning_rate": 5.840075214001095e-07, "loss": 4.239529609680176, "step": 1332 }, { "epoch": 0.9837638376383764, "grad_norm": 0.8474605679512024, "learning_rate": 5.363663050679535e-07, "loss": 4.032186508178711, "step": 1333 }, { "epoch": 0.9845018450184502, "grad_norm": 1.059869647026062, "learning_rate": 4.90750347408736e-07, "loss": 4.1005859375, "step": 1334 }, { "epoch": 0.985239852398524, "grad_norm": 0.8023120760917664, "learning_rate": 4.4715991278213576e-07, "loss": 3.901467800140381, "step": 1335 }, { "epoch": 0.9859778597785978, "grad_norm": 0.8805201053619385, "learning_rate": 4.0559525380935435e-07, "loss": 3.9754929542541504, "step": 1336 }, { "epoch": 0.9867158671586715, "grad_norm": 0.844008207321167, "learning_rate": 3.660566113714847e-07, "loss": 4.112626552581787, "step": 1337 }, { "epoch": 0.9874538745387453, "grad_norm": 0.8604761958122253, "learning_rate": 3.2854421460815075e-07, "loss": 3.6430814266204834, "step": 1338 }, { "epoch": 0.9881918819188192, "grad_norm": 0.8547300100326538, "learning_rate": 2.930582809162641e-07, "loss": 4.121878623962402, "step": 1339 }, { "epoch": 0.988929889298893, "grad_norm": 0.9670364260673523, "learning_rate": 2.5959901594870273e-07, "loss": 3.9410769939422607, "step": 1340 }, { "epoch": 0.9896678966789668, "grad_norm": 1.0530565977096558, "learning_rate": 2.281666136130678e-07, "loss": 3.989541530609131, "step": 1341 }, { "epoch": 0.9904059040590406, "grad_norm": 1.0222362279891968, "learning_rate": 1.9876125607067309e-07, "loss": 4.04118537902832, "step": 1342 }, { "epoch": 0.9911439114391144, "grad_norm": 0.7860491275787354, "learning_rate": 1.713831137353794e-07, "loss": 3.8432321548461914, "step": 1343 }, { "epoch": 0.9918819188191882, "grad_norm": 0.8329381942749023, "learning_rate": 1.460323452727008e-07, "loss": 3.9724719524383545, "step": 1344 }, { "epoch": 0.992619926199262, "grad_norm": 0.8542125225067139, "learning_rate": 1.2270909759879432e-07, "loss": 4.1086506843566895, "step": 1345 }, { "epoch": 0.9933579335793358, "grad_norm": 0.9141987562179565, "learning_rate": 1.0141350587972164e-07, "loss": 3.7987635135650635, "step": 1346 }, { "epoch": 0.9940959409594096, "grad_norm": 0.8679295778274536, "learning_rate": 8.214569353055534e-08, "loss": 4.131080627441406, "step": 1347 }, { "epoch": 0.9948339483394834, "grad_norm": 0.9177278876304626, "learning_rate": 6.490577221467953e-08, "loss": 4.434652328491211, "step": 1348 }, { "epoch": 0.9955719557195571, "grad_norm": 0.8580910563468933, "learning_rate": 4.9693841843245764e-08, "loss": 4.102374076843262, "step": 1349 }, { "epoch": 0.996309963099631, "grad_norm": 0.9112648367881775, "learning_rate": 3.6509990574473684e-08, "loss": 4.2107343673706055, "step": 1350 }, { "epoch": 0.9970479704797048, "grad_norm": 1.0079255104064941, "learning_rate": 2.535429481318463e-08, "loss": 3.846949577331543, "step": 1351 }, { "epoch": 0.9977859778597786, "grad_norm": 0.7972182035446167, "learning_rate": 1.622681921033542e-08, "loss": 3.9979095458984375, "step": 1352 }, { "epoch": 0.9985239852398524, "grad_norm": 0.8567417860031128, "learning_rate": 9.127616662746307e-09, "loss": 3.994305372238159, "step": 1353 }, { "epoch": 0.9992619926199262, "grad_norm": 0.8455564379692078, "learning_rate": 4.0567283126347055e-09, "loss": 4.195075988769531, "step": 1354 }, { "epoch": 1.0, "grad_norm": 0.7844815254211426, "learning_rate": 1.0141835475374616e-09, "loss": 4.3078813552856445, "step": 1355 } ], "logging_steps": 1, "max_steps": 1355, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.572106671245492e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }