{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16412276382734287, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010941517588489524, "grad_norm": 6.748959064483643, "learning_rate": 0.0001, "loss": 1.6219, "step": 1 }, { "epoch": 0.00021883035176979048, "grad_norm": 5.348644256591797, "learning_rate": 0.0002, "loss": 1.5018, "step": 2 }, { "epoch": 0.0003282455276546857, "grad_norm": 0.9826696515083313, "learning_rate": 0.0003, "loss": 1.2755, "step": 3 }, { "epoch": 0.00043766070353958096, "grad_norm": 1.5212827920913696, "learning_rate": 0.0004, "loss": 1.1538, "step": 4 }, { "epoch": 0.0005470758794244762, "grad_norm": 0.5145948529243469, "learning_rate": 0.0005, "loss": 1.0551, "step": 5 }, { "epoch": 0.0006564910553093714, "grad_norm": 0.41598066687583923, "learning_rate": 0.0004999452594701116, "loss": 1.0235, "step": 6 }, { "epoch": 0.0007659062311942666, "grad_norm": 1.0200426578521729, "learning_rate": 0.0004998905189402234, "loss": 1.1521, "step": 7 }, { "epoch": 0.0008753214070791619, "grad_norm": 0.7854887843132019, "learning_rate": 0.000499835778410335, "loss": 1.0688, "step": 8 }, { "epoch": 0.0009847365829640572, "grad_norm": 0.6088876128196716, "learning_rate": 0.0004997810378804467, "loss": 1.066, "step": 9 }, { "epoch": 0.0010941517588489524, "grad_norm": 0.7103877067565918, "learning_rate": 0.0004997262973505584, "loss": 1.0584, "step": 10 }, { "epoch": 0.0012035669347338475, "grad_norm": 0.44480615854263306, "learning_rate": 0.00049967155682067, "loss": 1.0342, "step": 11 }, { "epoch": 0.0013129821106187428, "grad_norm": 0.3577936589717865, "learning_rate": 0.0004996168162907817, "loss": 1.0479, "step": 12 }, { "epoch": 0.001422397286503638, "grad_norm": 0.2826107442378998, "learning_rate": 0.0004995620757608934, "loss": 1.0007, "step": 13 }, { "epoch": 0.0015318124623885333, "grad_norm": 0.41260913014411926, "learning_rate": 0.000499507335231005, "loss": 0.9696, "step": 14 }, { "epoch": 0.0016412276382734286, "grad_norm": 0.3187173902988434, "learning_rate": 0.0004994525947011167, "loss": 0.9202, "step": 15 }, { "epoch": 0.0017506428141583238, "grad_norm": 0.25816890597343445, "learning_rate": 0.0004993978541712284, "loss": 0.9584, "step": 16 }, { "epoch": 0.001860057990043219, "grad_norm": 0.22255472838878632, "learning_rate": 0.00049934311364134, "loss": 0.9722, "step": 17 }, { "epoch": 0.0019694731659281144, "grad_norm": 0.245590940117836, "learning_rate": 0.0004992883731114517, "loss": 0.9414, "step": 18 }, { "epoch": 0.0020788883418130094, "grad_norm": 0.1839824616909027, "learning_rate": 0.0004992336325815634, "loss": 0.9789, "step": 19 }, { "epoch": 0.002188303517697905, "grad_norm": 0.19542355835437775, "learning_rate": 0.000499178892051675, "loss": 0.9795, "step": 20 }, { "epoch": 0.0022977186935828, "grad_norm": 0.15559178590774536, "learning_rate": 0.0004991241515217867, "loss": 0.8858, "step": 21 }, { "epoch": 0.002407133869467695, "grad_norm": 0.16895073652267456, "learning_rate": 0.0004990694109918984, "loss": 0.9077, "step": 22 }, { "epoch": 0.0025165490453525905, "grad_norm": 0.16406983137130737, "learning_rate": 0.0004990146704620101, "loss": 0.8751, "step": 23 }, { "epoch": 0.0026259642212374855, "grad_norm": 0.1688385158777237, "learning_rate": 0.0004989599299321217, "loss": 0.9304, "step": 24 }, { "epoch": 0.002735379397122381, "grad_norm": 0.2053556591272354, "learning_rate": 0.0004989051894022335, "loss": 0.9449, "step": 25 }, { "epoch": 0.002844794573007276, "grad_norm": 0.1790640652179718, "learning_rate": 0.0004988504488723451, "loss": 1.0121, "step": 26 }, { "epoch": 0.0029542097488921715, "grad_norm": 0.177456334233284, "learning_rate": 0.0004987957083424567, "loss": 0.9087, "step": 27 }, { "epoch": 0.0030636249247770666, "grad_norm": 0.16537578403949738, "learning_rate": 0.0004987409678125685, "loss": 0.9006, "step": 28 }, { "epoch": 0.0031730401006619616, "grad_norm": 0.17812395095825195, "learning_rate": 0.0004986862272826801, "loss": 0.8916, "step": 29 }, { "epoch": 0.003282455276546857, "grad_norm": 0.20445552468299866, "learning_rate": 0.0004986314867527917, "loss": 0.9592, "step": 30 }, { "epoch": 0.003391870452431752, "grad_norm": 0.19043420255184174, "learning_rate": 0.0004985767462229035, "loss": 0.9394, "step": 31 }, { "epoch": 0.0035012856283166477, "grad_norm": 0.15710125863552094, "learning_rate": 0.0004985220056930151, "loss": 0.8872, "step": 32 }, { "epoch": 0.0036107008042015427, "grad_norm": 0.18654228746891022, "learning_rate": 0.0004984672651631267, "loss": 0.9067, "step": 33 }, { "epoch": 0.003720115980086438, "grad_norm": 0.1369885355234146, "learning_rate": 0.0004984125246332385, "loss": 0.9542, "step": 34 }, { "epoch": 0.0038295311559713332, "grad_norm": 0.14511971175670624, "learning_rate": 0.0004983577841033501, "loss": 0.958, "step": 35 }, { "epoch": 0.003938946331856229, "grad_norm": 0.14135710895061493, "learning_rate": 0.0004983030435734617, "loss": 0.8547, "step": 36 }, { "epoch": 0.004048361507741124, "grad_norm": 0.16319400072097778, "learning_rate": 0.0004982483030435735, "loss": 0.8845, "step": 37 }, { "epoch": 0.004157776683626019, "grad_norm": 0.15757504105567932, "learning_rate": 0.0004981935625136851, "loss": 0.9367, "step": 38 }, { "epoch": 0.004267191859510914, "grad_norm": 0.1803285777568817, "learning_rate": 0.0004981388219837967, "loss": 0.9021, "step": 39 }, { "epoch": 0.00437660703539581, "grad_norm": 0.20967647433280945, "learning_rate": 0.0004980840814539085, "loss": 0.9414, "step": 40 }, { "epoch": 0.004486022211280705, "grad_norm": 0.13856051862239838, "learning_rate": 0.0004980293409240201, "loss": 0.9014, "step": 41 }, { "epoch": 0.0045954373871656, "grad_norm": 0.1282176375389099, "learning_rate": 0.0004979746003941319, "loss": 0.9273, "step": 42 }, { "epoch": 0.004704852563050495, "grad_norm": 0.1424468755722046, "learning_rate": 0.0004979198598642435, "loss": 0.9141, "step": 43 }, { "epoch": 0.00481426773893539, "grad_norm": 0.13319255411624908, "learning_rate": 0.0004978651193343552, "loss": 0.9494, "step": 44 }, { "epoch": 0.004923682914820286, "grad_norm": 0.15850073099136353, "learning_rate": 0.0004978103788044669, "loss": 0.8939, "step": 45 }, { "epoch": 0.005033098090705181, "grad_norm": 0.12534229457378387, "learning_rate": 0.0004977556382745785, "loss": 0.8928, "step": 46 }, { "epoch": 0.005142513266590076, "grad_norm": 0.1287834197282791, "learning_rate": 0.0004977008977446902, "loss": 0.8928, "step": 47 }, { "epoch": 0.005251928442474971, "grad_norm": 0.13171854615211487, "learning_rate": 0.0004976461572148019, "loss": 0.8621, "step": 48 }, { "epoch": 0.005361343618359866, "grad_norm": 0.12498301267623901, "learning_rate": 0.0004975914166849135, "loss": 0.8832, "step": 49 }, { "epoch": 0.005470758794244762, "grad_norm": 0.13915930688381195, "learning_rate": 0.0004975366761550252, "loss": 0.8766, "step": 50 }, { "epoch": 0.005580173970129657, "grad_norm": 0.15387104451656342, "learning_rate": 0.0004974819356251369, "loss": 0.9552, "step": 51 }, { "epoch": 0.005689589146014552, "grad_norm": 0.12831054627895355, "learning_rate": 0.0004974271950952485, "loss": 0.8813, "step": 52 }, { "epoch": 0.005799004321899447, "grad_norm": 0.1431848704814911, "learning_rate": 0.0004973724545653602, "loss": 0.9444, "step": 53 }, { "epoch": 0.005908419497784343, "grad_norm": 0.1562032401561737, "learning_rate": 0.0004973177140354719, "loss": 0.8691, "step": 54 }, { "epoch": 0.006017834673669238, "grad_norm": 0.13038943707942963, "learning_rate": 0.0004972629735055835, "loss": 0.8521, "step": 55 }, { "epoch": 0.006127249849554133, "grad_norm": 0.17848436534404755, "learning_rate": 0.0004972082329756952, "loss": 0.9113, "step": 56 }, { "epoch": 0.006236665025439028, "grad_norm": 0.12383371591567993, "learning_rate": 0.0004971534924458069, "loss": 0.8843, "step": 57 }, { "epoch": 0.006346080201323923, "grad_norm": 0.157698854804039, "learning_rate": 0.0004970987519159186, "loss": 0.8829, "step": 58 }, { "epoch": 0.006455495377208819, "grad_norm": 0.13125739991664886, "learning_rate": 0.0004970440113860302, "loss": 0.942, "step": 59 }, { "epoch": 0.006564910553093714, "grad_norm": 0.14343556761741638, "learning_rate": 0.0004969892708561419, "loss": 0.8928, "step": 60 }, { "epoch": 0.006674325728978609, "grad_norm": 0.14037324488162994, "learning_rate": 0.0004969345303262536, "loss": 0.8987, "step": 61 }, { "epoch": 0.006783740904863504, "grad_norm": 0.14872388541698456, "learning_rate": 0.0004968797897963652, "loss": 0.8837, "step": 62 }, { "epoch": 0.006893156080748399, "grad_norm": 0.13239559531211853, "learning_rate": 0.000496825049266477, "loss": 0.8691, "step": 63 }, { "epoch": 0.007002571256633295, "grad_norm": 0.12761270999908447, "learning_rate": 0.0004967703087365886, "loss": 0.8896, "step": 64 }, { "epoch": 0.00711198643251819, "grad_norm": 0.13671258091926575, "learning_rate": 0.0004967155682067002, "loss": 0.9217, "step": 65 }, { "epoch": 0.007221401608403085, "grad_norm": 0.14649002254009247, "learning_rate": 0.000496660827676812, "loss": 0.8832, "step": 66 }, { "epoch": 0.0073308167842879805, "grad_norm": 0.13539502024650574, "learning_rate": 0.0004966060871469236, "loss": 0.9107, "step": 67 }, { "epoch": 0.007440231960172876, "grad_norm": 0.13384485244750977, "learning_rate": 0.0004965513466170352, "loss": 0.8369, "step": 68 }, { "epoch": 0.007549647136057771, "grad_norm": 0.12043328583240509, "learning_rate": 0.000496496606087147, "loss": 0.8589, "step": 69 }, { "epoch": 0.0076590623119426665, "grad_norm": 0.13288399577140808, "learning_rate": 0.0004964418655572586, "loss": 0.9253, "step": 70 }, { "epoch": 0.0077684774878275615, "grad_norm": 0.1376030147075653, "learning_rate": 0.0004963871250273702, "loss": 0.922, "step": 71 }, { "epoch": 0.007877892663712457, "grad_norm": 0.12452669441699982, "learning_rate": 0.000496332384497482, "loss": 0.9787, "step": 72 }, { "epoch": 0.007987307839597352, "grad_norm": 0.11370225250720978, "learning_rate": 0.0004962776439675936, "loss": 0.8997, "step": 73 }, { "epoch": 0.008096723015482248, "grad_norm": 0.12678754329681396, "learning_rate": 0.0004962229034377052, "loss": 0.8961, "step": 74 }, { "epoch": 0.008206138191367143, "grad_norm": 0.12327083945274353, "learning_rate": 0.000496168162907817, "loss": 0.8764, "step": 75 }, { "epoch": 0.008315553367252038, "grad_norm": 0.11941748112440109, "learning_rate": 0.0004961134223779286, "loss": 0.919, "step": 76 }, { "epoch": 0.008424968543136933, "grad_norm": 0.12977023422718048, "learning_rate": 0.0004960586818480403, "loss": 0.8596, "step": 77 }, { "epoch": 0.008534383719021828, "grad_norm": 0.13321086764335632, "learning_rate": 0.000496003941318152, "loss": 0.885, "step": 78 }, { "epoch": 0.008643798894906723, "grad_norm": 0.2903243601322174, "learning_rate": 0.0004959492007882637, "loss": 0.8988, "step": 79 }, { "epoch": 0.00875321407079162, "grad_norm": 0.1437711864709854, "learning_rate": 0.0004958944602583753, "loss": 0.7611, "step": 80 }, { "epoch": 0.008862629246676515, "grad_norm": 0.13083909451961517, "learning_rate": 0.000495839719728487, "loss": 0.8904, "step": 81 }, { "epoch": 0.00897204442256141, "grad_norm": 0.1541178673505783, "learning_rate": 0.0004957849791985987, "loss": 0.9018, "step": 82 }, { "epoch": 0.009081459598446305, "grad_norm": 0.13288000226020813, "learning_rate": 0.0004957302386687103, "loss": 0.8189, "step": 83 }, { "epoch": 0.0091908747743312, "grad_norm": 0.1527184247970581, "learning_rate": 0.000495675498138822, "loss": 0.9415, "step": 84 }, { "epoch": 0.009300289950216095, "grad_norm": 0.14945630729198456, "learning_rate": 0.0004956207576089337, "loss": 0.9519, "step": 85 }, { "epoch": 0.00940970512610099, "grad_norm": 0.169093057513237, "learning_rate": 0.0004955660170790453, "loss": 0.9417, "step": 86 }, { "epoch": 0.009519120301985885, "grad_norm": 0.14720186591148376, "learning_rate": 0.000495511276549157, "loss": 0.935, "step": 87 }, { "epoch": 0.00962853547787078, "grad_norm": 0.1609087884426117, "learning_rate": 0.0004954565360192687, "loss": 0.7815, "step": 88 }, { "epoch": 0.009737950653755677, "grad_norm": 0.14608846604824066, "learning_rate": 0.0004954017954893803, "loss": 0.8978, "step": 89 }, { "epoch": 0.009847365829640572, "grad_norm": 0.14358682930469513, "learning_rate": 0.000495347054959492, "loss": 0.8393, "step": 90 }, { "epoch": 0.009956781005525467, "grad_norm": 0.14243635535240173, "learning_rate": 0.0004952923144296037, "loss": 0.8549, "step": 91 }, { "epoch": 0.010066196181410362, "grad_norm": 0.1514996886253357, "learning_rate": 0.0004952375738997153, "loss": 0.8941, "step": 92 }, { "epoch": 0.010175611357295257, "grad_norm": 0.13923238217830658, "learning_rate": 0.0004951828333698271, "loss": 0.8625, "step": 93 }, { "epoch": 0.010285026533180152, "grad_norm": 0.14224301278591156, "learning_rate": 0.0004951280928399387, "loss": 0.8556, "step": 94 }, { "epoch": 0.010394441709065047, "grad_norm": 0.14466360211372375, "learning_rate": 0.0004950733523100503, "loss": 0.8589, "step": 95 }, { "epoch": 0.010503856884949942, "grad_norm": 0.13258099555969238, "learning_rate": 0.0004950186117801621, "loss": 0.9174, "step": 96 }, { "epoch": 0.010613272060834837, "grad_norm": 0.13822390139102936, "learning_rate": 0.0004949638712502737, "loss": 0.8659, "step": 97 }, { "epoch": 0.010722687236719732, "grad_norm": 0.13189956545829773, "learning_rate": 0.0004949091307203854, "loss": 0.8829, "step": 98 }, { "epoch": 0.010832102412604629, "grad_norm": 0.12616467475891113, "learning_rate": 0.0004948543901904971, "loss": 0.8405, "step": 99 }, { "epoch": 0.010941517588489524, "grad_norm": 0.457931786775589, "learning_rate": 0.0004947996496606087, "loss": 0.8525, "step": 100 }, { "epoch": 0.011050932764374419, "grad_norm": 0.12417889386415482, "learning_rate": 0.0004947449091307204, "loss": 0.8701, "step": 101 }, { "epoch": 0.011160347940259314, "grad_norm": 0.1292286515235901, "learning_rate": 0.0004946901686008321, "loss": 0.8196, "step": 102 }, { "epoch": 0.01126976311614421, "grad_norm": 0.7958583235740662, "learning_rate": 0.0004946354280709437, "loss": 0.8337, "step": 103 }, { "epoch": 0.011379178292029104, "grad_norm": 0.14659330248832703, "learning_rate": 0.0004945806875410554, "loss": 0.9135, "step": 104 }, { "epoch": 0.011488593467914, "grad_norm": 0.1416400521993637, "learning_rate": 0.0004945259470111671, "loss": 0.8083, "step": 105 }, { "epoch": 0.011598008643798894, "grad_norm": 0.15555381774902344, "learning_rate": 0.0004944712064812787, "loss": 0.9182, "step": 106 }, { "epoch": 0.01170742381968379, "grad_norm": 0.15293537080287933, "learning_rate": 0.0004944164659513904, "loss": 0.8978, "step": 107 }, { "epoch": 0.011816838995568686, "grad_norm": 0.16708214581012726, "learning_rate": 0.0004943617254215021, "loss": 0.8825, "step": 108 }, { "epoch": 0.011926254171453581, "grad_norm": 0.15999279916286469, "learning_rate": 0.0004943069848916137, "loss": 0.844, "step": 109 }, { "epoch": 0.012035669347338476, "grad_norm": 0.16234947741031647, "learning_rate": 0.0004942522443617254, "loss": 0.8173, "step": 110 }, { "epoch": 0.012145084523223371, "grad_norm": 0.1565627008676529, "learning_rate": 0.0004941975038318371, "loss": 0.9369, "step": 111 }, { "epoch": 0.012254499699108266, "grad_norm": 0.1558198630809784, "learning_rate": 0.0004941427633019488, "loss": 1.0324, "step": 112 }, { "epoch": 0.012363914874993161, "grad_norm": 0.14520087838172913, "learning_rate": 0.0004940880227720604, "loss": 0.9035, "step": 113 }, { "epoch": 0.012473330050878056, "grad_norm": 0.15729227662086487, "learning_rate": 0.0004940332822421721, "loss": 0.8385, "step": 114 }, { "epoch": 0.012582745226762952, "grad_norm": 0.14594535529613495, "learning_rate": 0.0004939785417122838, "loss": 0.9033, "step": 115 }, { "epoch": 0.012692160402647847, "grad_norm": 0.13702966272830963, "learning_rate": 0.0004939238011823954, "loss": 0.9528, "step": 116 }, { "epoch": 0.012801575578532743, "grad_norm": 0.14287912845611572, "learning_rate": 0.0004938690606525072, "loss": 0.8234, "step": 117 }, { "epoch": 0.012910990754417638, "grad_norm": 0.13964290916919708, "learning_rate": 0.0004938143201226188, "loss": 0.8635, "step": 118 }, { "epoch": 0.013020405930302533, "grad_norm": 0.14050424098968506, "learning_rate": 0.0004937595795927304, "loss": 0.8428, "step": 119 }, { "epoch": 0.013129821106187428, "grad_norm": 0.1534089297056198, "learning_rate": 0.0004937048390628422, "loss": 0.9291, "step": 120 }, { "epoch": 0.013239236282072324, "grad_norm": 0.1387636810541153, "learning_rate": 0.0004936500985329538, "loss": 0.8649, "step": 121 }, { "epoch": 0.013348651457957219, "grad_norm": 0.13886477053165436, "learning_rate": 0.0004935953580030654, "loss": 0.8611, "step": 122 }, { "epoch": 0.013458066633842114, "grad_norm": 0.17605999112129211, "learning_rate": 0.0004935406174731772, "loss": 0.9295, "step": 123 }, { "epoch": 0.013567481809727009, "grad_norm": 0.17614100873470306, "learning_rate": 0.0004934858769432888, "loss": 0.8318, "step": 124 }, { "epoch": 0.013676896985611904, "grad_norm": 0.16368156671524048, "learning_rate": 0.0004934311364134004, "loss": 0.9563, "step": 125 }, { "epoch": 0.013786312161496799, "grad_norm": 0.14977481961250305, "learning_rate": 0.0004933763958835122, "loss": 0.786, "step": 126 }, { "epoch": 0.013895727337381696, "grad_norm": 0.1488548219203949, "learning_rate": 0.0004933216553536238, "loss": 0.8962, "step": 127 }, { "epoch": 0.01400514251326659, "grad_norm": 0.14431630074977875, "learning_rate": 0.0004932669148237354, "loss": 0.8703, "step": 128 }, { "epoch": 0.014114557689151486, "grad_norm": 0.13175436854362488, "learning_rate": 0.0004932121742938472, "loss": 0.812, "step": 129 }, { "epoch": 0.01422397286503638, "grad_norm": 0.13051766157150269, "learning_rate": 0.0004931574337639588, "loss": 0.8011, "step": 130 }, { "epoch": 0.014333388040921276, "grad_norm": 0.14817772805690765, "learning_rate": 0.0004931026932340706, "loss": 0.8485, "step": 131 }, { "epoch": 0.01444280321680617, "grad_norm": 0.12780654430389404, "learning_rate": 0.0004930479527041822, "loss": 0.8645, "step": 132 }, { "epoch": 0.014552218392691066, "grad_norm": 0.1372184008359909, "learning_rate": 0.0004929932121742939, "loss": 0.8937, "step": 133 }, { "epoch": 0.014661633568575961, "grad_norm": 0.13617576658725739, "learning_rate": 0.0004929384716444056, "loss": 0.8305, "step": 134 }, { "epoch": 0.014771048744460856, "grad_norm": 0.1279815137386322, "learning_rate": 0.0004928837311145172, "loss": 0.8093, "step": 135 }, { "epoch": 0.014880463920345753, "grad_norm": 0.1756022870540619, "learning_rate": 0.0004928289905846289, "loss": 0.8474, "step": 136 }, { "epoch": 0.014989879096230648, "grad_norm": 0.3069600462913513, "learning_rate": 0.0004927742500547406, "loss": 0.8264, "step": 137 }, { "epoch": 0.015099294272115543, "grad_norm": 0.15297222137451172, "learning_rate": 0.0004927195095248522, "loss": 0.8526, "step": 138 }, { "epoch": 0.015208709448000438, "grad_norm": 0.18646004796028137, "learning_rate": 0.0004926647689949639, "loss": 0.8253, "step": 139 }, { "epoch": 0.015318124623885333, "grad_norm": 0.15774445235729218, "learning_rate": 0.0004926100284650756, "loss": 0.8315, "step": 140 }, { "epoch": 0.015427539799770228, "grad_norm": 0.18145789206027985, "learning_rate": 0.0004925552879351872, "loss": 0.8466, "step": 141 }, { "epoch": 0.015536954975655123, "grad_norm": 0.18059922754764557, "learning_rate": 0.0004925005474052989, "loss": 0.8983, "step": 142 }, { "epoch": 0.01564637015154002, "grad_norm": 0.14341312646865845, "learning_rate": 0.0004924458068754106, "loss": 0.9127, "step": 143 }, { "epoch": 0.015755785327424915, "grad_norm": 0.15536214411258698, "learning_rate": 0.0004923910663455222, "loss": 0.9245, "step": 144 }, { "epoch": 0.01586520050330981, "grad_norm": 0.1529022753238678, "learning_rate": 0.0004923363258156339, "loss": 0.9025, "step": 145 }, { "epoch": 0.015974615679194705, "grad_norm": 0.1431504786014557, "learning_rate": 0.0004922815852857456, "loss": 0.8116, "step": 146 }, { "epoch": 0.0160840308550796, "grad_norm": 0.1411881446838379, "learning_rate": 0.0004922268447558573, "loss": 0.9079, "step": 147 }, { "epoch": 0.016193446030964495, "grad_norm": 0.13766123354434967, "learning_rate": 0.0004921721042259689, "loss": 0.825, "step": 148 }, { "epoch": 0.01630286120684939, "grad_norm": 0.17526914179325104, "learning_rate": 0.0004921173636960806, "loss": 0.8982, "step": 149 }, { "epoch": 0.016412276382734285, "grad_norm": 0.1324506253004074, "learning_rate": 0.0004920626231661923, "loss": 0.8594, "step": 150 }, { "epoch": 0.01652169155861918, "grad_norm": 0.153593048453331, "learning_rate": 0.0004920078826363039, "loss": 0.8159, "step": 151 }, { "epoch": 0.016631106734504075, "grad_norm": 0.1556576043367386, "learning_rate": 0.0004919531421064157, "loss": 0.8357, "step": 152 }, { "epoch": 0.01674052191038897, "grad_norm": 0.1526254564523697, "learning_rate": 0.0004918984015765273, "loss": 0.835, "step": 153 }, { "epoch": 0.016849937086273865, "grad_norm": 0.13940195739269257, "learning_rate": 0.0004918436610466389, "loss": 0.9228, "step": 154 }, { "epoch": 0.01695935226215876, "grad_norm": 0.1408044397830963, "learning_rate": 0.0004917889205167507, "loss": 0.783, "step": 155 }, { "epoch": 0.017068767438043655, "grad_norm": 0.1356937289237976, "learning_rate": 0.0004917341799868623, "loss": 0.8597, "step": 156 }, { "epoch": 0.01717818261392855, "grad_norm": 0.14266614615917206, "learning_rate": 0.0004916794394569739, "loss": 0.849, "step": 157 }, { "epoch": 0.017287597789813446, "grad_norm": 0.13175524771213531, "learning_rate": 0.0004916246989270857, "loss": 0.8036, "step": 158 }, { "epoch": 0.01739701296569834, "grad_norm": 0.1707000881433487, "learning_rate": 0.0004915699583971973, "loss": 0.8458, "step": 159 }, { "epoch": 0.01750642814158324, "grad_norm": 0.13843651115894318, "learning_rate": 0.0004915152178673089, "loss": 0.8604, "step": 160 }, { "epoch": 0.017615843317468134, "grad_norm": 0.1356249302625656, "learning_rate": 0.0004914604773374207, "loss": 0.8409, "step": 161 }, { "epoch": 0.01772525849335303, "grad_norm": 0.1492074579000473, "learning_rate": 0.0004914057368075323, "loss": 0.8454, "step": 162 }, { "epoch": 0.017834673669237924, "grad_norm": 0.1343483328819275, "learning_rate": 0.0004913509962776439, "loss": 0.898, "step": 163 }, { "epoch": 0.01794408884512282, "grad_norm": 0.14064320921897888, "learning_rate": 0.0004912962557477557, "loss": 0.8653, "step": 164 }, { "epoch": 0.018053504021007714, "grad_norm": 0.12447015941143036, "learning_rate": 0.0004912415152178673, "loss": 0.9226, "step": 165 }, { "epoch": 0.01816291919689261, "grad_norm": 0.1371176540851593, "learning_rate": 0.000491186774687979, "loss": 0.9185, "step": 166 }, { "epoch": 0.018272334372777504, "grad_norm": 0.12911225855350494, "learning_rate": 0.0004911320341580907, "loss": 0.8732, "step": 167 }, { "epoch": 0.0183817495486624, "grad_norm": 0.14181435108184814, "learning_rate": 0.0004910772936282023, "loss": 0.8419, "step": 168 }, { "epoch": 0.018491164724547295, "grad_norm": 0.13337838649749756, "learning_rate": 0.000491022553098314, "loss": 0.8549, "step": 169 }, { "epoch": 0.01860057990043219, "grad_norm": 0.1270197033882141, "learning_rate": 0.0004909678125684257, "loss": 0.8134, "step": 170 }, { "epoch": 0.018709995076317085, "grad_norm": 0.1448320746421814, "learning_rate": 0.0004909130720385374, "loss": 0.8984, "step": 171 }, { "epoch": 0.01881941025220198, "grad_norm": 0.15902382135391235, "learning_rate": 0.000490858331508649, "loss": 0.8631, "step": 172 }, { "epoch": 0.018928825428086875, "grad_norm": 0.1365499049425125, "learning_rate": 0.0004908035909787607, "loss": 0.8567, "step": 173 }, { "epoch": 0.01903824060397177, "grad_norm": 0.1331349015235901, "learning_rate": 0.0004907488504488724, "loss": 0.8374, "step": 174 }, { "epoch": 0.019147655779856665, "grad_norm": 0.1263165920972824, "learning_rate": 0.000490694109918984, "loss": 0.7637, "step": 175 }, { "epoch": 0.01925707095574156, "grad_norm": 0.1363866925239563, "learning_rate": 0.0004906393693890957, "loss": 0.8468, "step": 176 }, { "epoch": 0.019366486131626455, "grad_norm": 0.16107019782066345, "learning_rate": 0.0004905846288592074, "loss": 0.7973, "step": 177 }, { "epoch": 0.019475901307511354, "grad_norm": 0.12075755000114441, "learning_rate": 0.000490529888329319, "loss": 0.7858, "step": 178 }, { "epoch": 0.01958531648339625, "grad_norm": 0.11825218051671982, "learning_rate": 0.0004904751477994307, "loss": 0.8706, "step": 179 }, { "epoch": 0.019694731659281144, "grad_norm": 0.1258508414030075, "learning_rate": 0.0004904204072695424, "loss": 0.8769, "step": 180 }, { "epoch": 0.01980414683516604, "grad_norm": 0.13747736811637878, "learning_rate": 0.000490365666739654, "loss": 0.8544, "step": 181 }, { "epoch": 0.019913562011050934, "grad_norm": 0.12983323633670807, "learning_rate": 0.0004903109262097657, "loss": 0.8742, "step": 182 }, { "epoch": 0.02002297718693583, "grad_norm": 0.1278667449951172, "learning_rate": 0.0004902561856798774, "loss": 0.8766, "step": 183 }, { "epoch": 0.020132392362820724, "grad_norm": 0.13013461232185364, "learning_rate": 0.000490201445149989, "loss": 0.8823, "step": 184 }, { "epoch": 0.02024180753870562, "grad_norm": 0.1314740777015686, "learning_rate": 0.0004901467046201008, "loss": 0.813, "step": 185 }, { "epoch": 0.020351222714590514, "grad_norm": 0.14077121019363403, "learning_rate": 0.0004900919640902124, "loss": 0.8308, "step": 186 }, { "epoch": 0.02046063789047541, "grad_norm": 0.12999317049980164, "learning_rate": 0.0004900372235603241, "loss": 0.8386, "step": 187 }, { "epoch": 0.020570053066360304, "grad_norm": 0.1378699243068695, "learning_rate": 0.0004899824830304358, "loss": 0.827, "step": 188 }, { "epoch": 0.0206794682422452, "grad_norm": 0.12532277405261993, "learning_rate": 0.0004899277425005474, "loss": 0.8671, "step": 189 }, { "epoch": 0.020788883418130094, "grad_norm": 0.12412350624799728, "learning_rate": 0.0004898730019706591, "loss": 0.8725, "step": 190 }, { "epoch": 0.02089829859401499, "grad_norm": 0.12123074382543564, "learning_rate": 0.0004898182614407708, "loss": 0.7653, "step": 191 }, { "epoch": 0.021007713769899884, "grad_norm": 0.13051626086235046, "learning_rate": 0.0004897635209108824, "loss": 0.8419, "step": 192 }, { "epoch": 0.02111712894578478, "grad_norm": 0.12628591060638428, "learning_rate": 0.0004897087803809941, "loss": 0.8487, "step": 193 }, { "epoch": 0.021226544121669674, "grad_norm": 0.11880683153867722, "learning_rate": 0.0004896540398511058, "loss": 0.8446, "step": 194 }, { "epoch": 0.02133595929755457, "grad_norm": 0.13202403485774994, "learning_rate": 0.0004895992993212174, "loss": 0.9101, "step": 195 }, { "epoch": 0.021445374473439464, "grad_norm": 0.13024359941482544, "learning_rate": 0.0004895445587913291, "loss": 0.8662, "step": 196 }, { "epoch": 0.021554789649324363, "grad_norm": 0.1338254064321518, "learning_rate": 0.0004894898182614408, "loss": 0.8772, "step": 197 }, { "epoch": 0.021664204825209258, "grad_norm": 0.12384945154190063, "learning_rate": 0.0004894350777315524, "loss": 0.7823, "step": 198 }, { "epoch": 0.021773620001094153, "grad_norm": 0.11963345110416412, "learning_rate": 0.0004893803372016641, "loss": 0.8701, "step": 199 }, { "epoch": 0.021883035176979048, "grad_norm": 0.14850996434688568, "learning_rate": 0.0004893255966717758, "loss": 0.8102, "step": 200 }, { "epoch": 0.021992450352863943, "grad_norm": 0.11774320900440216, "learning_rate": 0.0004892708561418875, "loss": 0.828, "step": 201 }, { "epoch": 0.022101865528748838, "grad_norm": 0.1327088177204132, "learning_rate": 0.0004892161156119991, "loss": 0.8672, "step": 202 }, { "epoch": 0.022211280704633733, "grad_norm": 0.12870842218399048, "learning_rate": 0.0004891613750821108, "loss": 0.8626, "step": 203 }, { "epoch": 0.02232069588051863, "grad_norm": 0.13005399703979492, "learning_rate": 0.0004891066345522225, "loss": 0.8697, "step": 204 }, { "epoch": 0.022430111056403523, "grad_norm": 0.12203162908554077, "learning_rate": 0.0004890518940223341, "loss": 0.8867, "step": 205 }, { "epoch": 0.02253952623228842, "grad_norm": 0.1309415102005005, "learning_rate": 0.0004889971534924459, "loss": 0.865, "step": 206 }, { "epoch": 0.022648941408173313, "grad_norm": 0.11920128017663956, "learning_rate": 0.0004889424129625575, "loss": 0.7743, "step": 207 }, { "epoch": 0.02275835658405821, "grad_norm": 0.14900828897953033, "learning_rate": 0.0004888876724326691, "loss": 0.8416, "step": 208 }, { "epoch": 0.022867771759943104, "grad_norm": 0.1147937998175621, "learning_rate": 0.0004888329319027809, "loss": 0.8395, "step": 209 }, { "epoch": 0.022977186935828, "grad_norm": 0.1286356896162033, "learning_rate": 0.0004887781913728925, "loss": 0.8955, "step": 210 }, { "epoch": 0.023086602111712894, "grad_norm": 0.1349114179611206, "learning_rate": 0.0004887234508430041, "loss": 0.8312, "step": 211 }, { "epoch": 0.02319601728759779, "grad_norm": 0.14344258606433868, "learning_rate": 0.0004886687103131159, "loss": 0.912, "step": 212 }, { "epoch": 0.023305432463482684, "grad_norm": 0.12790583074092865, "learning_rate": 0.0004886139697832275, "loss": 0.8775, "step": 213 }, { "epoch": 0.02341484763936758, "grad_norm": 0.13284027576446533, "learning_rate": 0.0004885592292533391, "loss": 0.7917, "step": 214 }, { "epoch": 0.023524262815252474, "grad_norm": 0.12785430252552032, "learning_rate": 0.0004885044887234509, "loss": 0.8784, "step": 215 }, { "epoch": 0.023633677991137372, "grad_norm": 0.12593692541122437, "learning_rate": 0.0004884497481935625, "loss": 0.861, "step": 216 }, { "epoch": 0.023743093167022267, "grad_norm": 0.12369520217180252, "learning_rate": 0.0004883950076636741, "loss": 0.8757, "step": 217 }, { "epoch": 0.023852508342907162, "grad_norm": 0.1321219950914383, "learning_rate": 0.0004883402671337859, "loss": 0.8259, "step": 218 }, { "epoch": 0.023961923518792057, "grad_norm": 0.1315348595380783, "learning_rate": 0.0004882855266038975, "loss": 0.8222, "step": 219 }, { "epoch": 0.024071338694676953, "grad_norm": 0.1322745978832245, "learning_rate": 0.00048823078607400925, "loss": 0.8464, "step": 220 }, { "epoch": 0.024180753870561848, "grad_norm": 0.1268254518508911, "learning_rate": 0.0004881760455441209, "loss": 0.8314, "step": 221 }, { "epoch": 0.024290169046446743, "grad_norm": 0.12347619980573654, "learning_rate": 0.00048812130501423256, "loss": 0.8457, "step": 222 }, { "epoch": 0.024399584222331638, "grad_norm": 0.1304161697626114, "learning_rate": 0.0004880665644843442, "loss": 0.8432, "step": 223 }, { "epoch": 0.024508999398216533, "grad_norm": 0.12853313982486725, "learning_rate": 0.0004880118239544559, "loss": 0.7822, "step": 224 }, { "epoch": 0.024618414574101428, "grad_norm": 0.14689983427524567, "learning_rate": 0.00048795708342456756, "loss": 0.8363, "step": 225 }, { "epoch": 0.024727829749986323, "grad_norm": 0.13197535276412964, "learning_rate": 0.0004879023428946792, "loss": 0.8509, "step": 226 }, { "epoch": 0.024837244925871218, "grad_norm": 0.13846123218536377, "learning_rate": 0.00048784760236479093, "loss": 0.821, "step": 227 }, { "epoch": 0.024946660101756113, "grad_norm": 0.12475208938121796, "learning_rate": 0.00048779286183490256, "loss": 0.843, "step": 228 }, { "epoch": 0.025056075277641008, "grad_norm": 0.13594871759414673, "learning_rate": 0.0004877381213050142, "loss": 0.8609, "step": 229 }, { "epoch": 0.025165490453525903, "grad_norm": 0.13174864649772644, "learning_rate": 0.00048768338077512593, "loss": 0.8532, "step": 230 }, { "epoch": 0.025274905629410798, "grad_norm": 0.15541617572307587, "learning_rate": 0.00048762864024523756, "loss": 0.7958, "step": 231 }, { "epoch": 0.025384320805295693, "grad_norm": 0.14390912652015686, "learning_rate": 0.00048757389971534925, "loss": 0.8682, "step": 232 }, { "epoch": 0.025493735981180588, "grad_norm": 0.13307932019233704, "learning_rate": 0.00048751915918546093, "loss": 0.8408, "step": 233 }, { "epoch": 0.025603151157065487, "grad_norm": 0.12708593904972076, "learning_rate": 0.0004874644186555726, "loss": 0.8102, "step": 234 }, { "epoch": 0.025712566332950382, "grad_norm": 0.12773257493972778, "learning_rate": 0.00048740967812568425, "loss": 0.8109, "step": 235 }, { "epoch": 0.025821981508835277, "grad_norm": 0.13425928354263306, "learning_rate": 0.00048735493759579593, "loss": 0.834, "step": 236 }, { "epoch": 0.025931396684720172, "grad_norm": 0.12936341762542725, "learning_rate": 0.0004873001970659076, "loss": 0.8253, "step": 237 }, { "epoch": 0.026040811860605067, "grad_norm": 0.13622316718101501, "learning_rate": 0.00048724545653601925, "loss": 0.8613, "step": 238 }, { "epoch": 0.026150227036489962, "grad_norm": 0.14686493575572968, "learning_rate": 0.000487190716006131, "loss": 0.7861, "step": 239 }, { "epoch": 0.026259642212374857, "grad_norm": 0.12417619675397873, "learning_rate": 0.0004871359754762426, "loss": 0.7846, "step": 240 }, { "epoch": 0.026369057388259752, "grad_norm": 0.13221241533756256, "learning_rate": 0.0004870812349463543, "loss": 0.9353, "step": 241 }, { "epoch": 0.026478472564144647, "grad_norm": 0.1345079094171524, "learning_rate": 0.00048702649441646593, "loss": 0.8175, "step": 242 }, { "epoch": 0.026587887740029542, "grad_norm": 0.19912089407444, "learning_rate": 0.0004869717538865776, "loss": 0.8517, "step": 243 }, { "epoch": 0.026697302915914437, "grad_norm": 0.1491551548242569, "learning_rate": 0.0004869170133566893, "loss": 0.9153, "step": 244 }, { "epoch": 0.026806718091799332, "grad_norm": 0.13533416390419006, "learning_rate": 0.00048686227282680093, "loss": 0.8371, "step": 245 }, { "epoch": 0.026916133267684227, "grad_norm": 0.14181853830814362, "learning_rate": 0.0004868075322969127, "loss": 0.8304, "step": 246 }, { "epoch": 0.027025548443569122, "grad_norm": 0.13739491999149323, "learning_rate": 0.0004867527917670243, "loss": 0.7633, "step": 247 }, { "epoch": 0.027134963619454017, "grad_norm": 0.1531188189983368, "learning_rate": 0.000486698051237136, "loss": 0.898, "step": 248 }, { "epoch": 0.027244378795338912, "grad_norm": 0.13637474179267883, "learning_rate": 0.0004866433107072477, "loss": 0.838, "step": 249 }, { "epoch": 0.027353793971223807, "grad_norm": 0.14042706787586212, "learning_rate": 0.0004865885701773593, "loss": 0.8121, "step": 250 }, { "epoch": 0.027463209147108703, "grad_norm": 0.1260949969291687, "learning_rate": 0.000486533829647471, "loss": 0.837, "step": 251 }, { "epoch": 0.027572624322993598, "grad_norm": 0.13275760412216187, "learning_rate": 0.0004864790891175827, "loss": 0.7835, "step": 252 }, { "epoch": 0.027682039498878496, "grad_norm": 0.12232906371355057, "learning_rate": 0.00048642434858769436, "loss": 0.8785, "step": 253 }, { "epoch": 0.02779145467476339, "grad_norm": 0.12081768363714218, "learning_rate": 0.000486369608057806, "loss": 0.909, "step": 254 }, { "epoch": 0.027900869850648286, "grad_norm": 0.12284121662378311, "learning_rate": 0.00048631486752791773, "loss": 0.7896, "step": 255 }, { "epoch": 0.02801028502653318, "grad_norm": 0.1346869319677353, "learning_rate": 0.00048626012699802936, "loss": 0.9161, "step": 256 }, { "epoch": 0.028119700202418076, "grad_norm": 0.12553107738494873, "learning_rate": 0.000486205386468141, "loss": 0.8256, "step": 257 }, { "epoch": 0.02822911537830297, "grad_norm": 0.12672337889671326, "learning_rate": 0.0004861506459382527, "loss": 0.8799, "step": 258 }, { "epoch": 0.028338530554187866, "grad_norm": 0.15716969966888428, "learning_rate": 0.00048609590540836436, "loss": 0.8602, "step": 259 }, { "epoch": 0.02844794573007276, "grad_norm": 0.13170772790908813, "learning_rate": 0.00048604116487847604, "loss": 0.8857, "step": 260 }, { "epoch": 0.028557360905957656, "grad_norm": 0.12984032928943634, "learning_rate": 0.0004859864243485877, "loss": 0.9047, "step": 261 }, { "epoch": 0.02866677608184255, "grad_norm": 0.12875130772590637, "learning_rate": 0.0004859316838186994, "loss": 0.8208, "step": 262 }, { "epoch": 0.028776191257727447, "grad_norm": 0.1412181854248047, "learning_rate": 0.00048587694328881104, "loss": 0.8295, "step": 263 }, { "epoch": 0.02888560643361234, "grad_norm": 0.14098694920539856, "learning_rate": 0.0004858222027589227, "loss": 0.84, "step": 264 }, { "epoch": 0.028995021609497237, "grad_norm": 0.15128925442695618, "learning_rate": 0.0004857674622290344, "loss": 0.8191, "step": 265 }, { "epoch": 0.029104436785382132, "grad_norm": 0.14116854965686798, "learning_rate": 0.00048571272169914604, "loss": 0.8821, "step": 266 }, { "epoch": 0.029213851961267027, "grad_norm": 0.14722275733947754, "learning_rate": 0.00048565798116925773, "loss": 0.8198, "step": 267 }, { "epoch": 0.029323267137151922, "grad_norm": 0.13787107169628143, "learning_rate": 0.0004856032406393694, "loss": 0.9446, "step": 268 }, { "epoch": 0.029432682313036817, "grad_norm": 0.12594938278198242, "learning_rate": 0.0004855485001094811, "loss": 0.8061, "step": 269 }, { "epoch": 0.029542097488921712, "grad_norm": 0.1432928442955017, "learning_rate": 0.00048549375957959273, "loss": 0.7676, "step": 270 }, { "epoch": 0.029651512664806607, "grad_norm": 0.12749522924423218, "learning_rate": 0.0004854390190497044, "loss": 0.8302, "step": 271 }, { "epoch": 0.029760927840691506, "grad_norm": 0.12526798248291016, "learning_rate": 0.0004853842785198161, "loss": 0.8412, "step": 272 }, { "epoch": 0.0298703430165764, "grad_norm": 0.14048653841018677, "learning_rate": 0.00048532953798992773, "loss": 0.874, "step": 273 }, { "epoch": 0.029979758192461296, "grad_norm": 0.1280943751335144, "learning_rate": 0.00048527479746003947, "loss": 0.7686, "step": 274 }, { "epoch": 0.03008917336834619, "grad_norm": 0.13356173038482666, "learning_rate": 0.0004852200569301511, "loss": 0.869, "step": 275 }, { "epoch": 0.030198588544231086, "grad_norm": 0.13505426049232483, "learning_rate": 0.0004851653164002628, "loss": 0.7175, "step": 276 }, { "epoch": 0.03030800372011598, "grad_norm": 0.13522036373615265, "learning_rate": 0.0004851105758703744, "loss": 0.88, "step": 277 }, { "epoch": 0.030417418896000876, "grad_norm": 0.13343246281147003, "learning_rate": 0.0004850558353404861, "loss": 0.8535, "step": 278 }, { "epoch": 0.03052683407188577, "grad_norm": 0.18214909732341766, "learning_rate": 0.0004850010948105978, "loss": 0.8569, "step": 279 }, { "epoch": 0.030636249247770666, "grad_norm": 0.15554703772068024, "learning_rate": 0.0004849463542807094, "loss": 0.8525, "step": 280 }, { "epoch": 0.03074566442365556, "grad_norm": 0.13330568373203278, "learning_rate": 0.00048489161375082115, "loss": 0.8584, "step": 281 }, { "epoch": 0.030855079599540456, "grad_norm": 0.13182665407657623, "learning_rate": 0.0004848368732209328, "loss": 0.8451, "step": 282 }, { "epoch": 0.03096449477542535, "grad_norm": 0.14277927577495575, "learning_rate": 0.00048478213269104447, "loss": 0.7768, "step": 283 }, { "epoch": 0.031073909951310246, "grad_norm": 0.14235122501850128, "learning_rate": 0.00048472739216115615, "loss": 0.7961, "step": 284 }, { "epoch": 0.03118332512719514, "grad_norm": 0.15761759877204895, "learning_rate": 0.0004846726516312678, "loss": 0.9086, "step": 285 }, { "epoch": 0.03129274030308004, "grad_norm": 0.1271880567073822, "learning_rate": 0.00048461791110137947, "loss": 0.8106, "step": 286 }, { "epoch": 0.03140215547896493, "grad_norm": 0.20635129511356354, "learning_rate": 0.00048456317057149115, "loss": 0.949, "step": 287 }, { "epoch": 0.03151157065484983, "grad_norm": 0.12266603857278824, "learning_rate": 0.00048450843004160284, "loss": 0.8335, "step": 288 }, { "epoch": 0.03162098583073472, "grad_norm": 0.132630854845047, "learning_rate": 0.00048445368951171447, "loss": 0.8594, "step": 289 }, { "epoch": 0.03173040100661962, "grad_norm": 0.13536503911018372, "learning_rate": 0.0004843989489818262, "loss": 0.8359, "step": 290 }, { "epoch": 0.03183981618250451, "grad_norm": 0.12953446805477142, "learning_rate": 0.00048434420845193784, "loss": 0.9094, "step": 291 }, { "epoch": 0.03194923135838941, "grad_norm": 0.13808174431324005, "learning_rate": 0.00048428946792204947, "loss": 0.8242, "step": 292 }, { "epoch": 0.0320586465342743, "grad_norm": 0.13716864585876465, "learning_rate": 0.00048423472739216115, "loss": 0.9143, "step": 293 }, { "epoch": 0.0321680617101592, "grad_norm": 0.17680105566978455, "learning_rate": 0.00048417998686227284, "loss": 0.8371, "step": 294 }, { "epoch": 0.03227747688604409, "grad_norm": 0.1293095350265503, "learning_rate": 0.0004841252463323845, "loss": 0.9229, "step": 295 }, { "epoch": 0.03238689206192899, "grad_norm": 0.13231034576892853, "learning_rate": 0.00048407050580249615, "loss": 0.8743, "step": 296 }, { "epoch": 0.03249630723781388, "grad_norm": 0.14851507544517517, "learning_rate": 0.00048401576527260784, "loss": 0.8384, "step": 297 }, { "epoch": 0.03260572241369878, "grad_norm": 0.13708654046058655, "learning_rate": 0.0004839610247427195, "loss": 0.7347, "step": 298 }, { "epoch": 0.03271513758958367, "grad_norm": 0.13999919593334198, "learning_rate": 0.00048390628421283115, "loss": 0.9064, "step": 299 }, { "epoch": 0.03282455276546857, "grad_norm": 0.13744334876537323, "learning_rate": 0.0004838515436829429, "loss": 0.8536, "step": 300 }, { "epoch": 0.03293396794135347, "grad_norm": 0.29050105810165405, "learning_rate": 0.0004837968031530545, "loss": 0.9396, "step": 301 }, { "epoch": 0.03304338311723836, "grad_norm": 0.12851062417030334, "learning_rate": 0.0004837420626231662, "loss": 0.8705, "step": 302 }, { "epoch": 0.03315279829312326, "grad_norm": 0.12897340953350067, "learning_rate": 0.0004836873220932779, "loss": 0.9053, "step": 303 }, { "epoch": 0.03326221346900815, "grad_norm": 0.13862080872058868, "learning_rate": 0.0004836325815633895, "loss": 0.9275, "step": 304 }, { "epoch": 0.03337162864489305, "grad_norm": 0.1366620659828186, "learning_rate": 0.0004835778410335012, "loss": 0.8504, "step": 305 }, { "epoch": 0.03348104382077794, "grad_norm": 0.13112667202949524, "learning_rate": 0.0004835231005036129, "loss": 0.8772, "step": 306 }, { "epoch": 0.03359045899666284, "grad_norm": 0.12157533317804337, "learning_rate": 0.0004834683599737246, "loss": 0.8848, "step": 307 }, { "epoch": 0.03369987417254773, "grad_norm": 0.17539173364639282, "learning_rate": 0.0004834136194438362, "loss": 0.8286, "step": 308 }, { "epoch": 0.03380928934843263, "grad_norm": 0.13544873893260956, "learning_rate": 0.00048335887891394794, "loss": 0.8707, "step": 309 }, { "epoch": 0.03391870452431752, "grad_norm": 0.12689419090747833, "learning_rate": 0.0004833041383840596, "loss": 0.8646, "step": 310 }, { "epoch": 0.03402811970020242, "grad_norm": 0.15279771387577057, "learning_rate": 0.0004832493978541712, "loss": 0.8417, "step": 311 }, { "epoch": 0.03413753487608731, "grad_norm": 0.14196592569351196, "learning_rate": 0.0004831946573242829, "loss": 0.8012, "step": 312 }, { "epoch": 0.03424695005197221, "grad_norm": 0.152607262134552, "learning_rate": 0.0004831399167943946, "loss": 0.823, "step": 313 }, { "epoch": 0.0343563652278571, "grad_norm": 0.13974478840827942, "learning_rate": 0.00048308517626450626, "loss": 0.8201, "step": 314 }, { "epoch": 0.034465780403742, "grad_norm": 0.18035024404525757, "learning_rate": 0.0004830304357346179, "loss": 0.8354, "step": 315 }, { "epoch": 0.03457519557962689, "grad_norm": 0.14140082895755768, "learning_rate": 0.00048297569520472963, "loss": 0.7757, "step": 316 }, { "epoch": 0.03468461075551179, "grad_norm": 0.16026467084884644, "learning_rate": 0.00048292095467484126, "loss": 0.831, "step": 317 }, { "epoch": 0.03479402593139668, "grad_norm": 0.13814222812652588, "learning_rate": 0.0004828662141449529, "loss": 0.8489, "step": 318 }, { "epoch": 0.03490344110728158, "grad_norm": 0.14738810062408447, "learning_rate": 0.00048281147361506463, "loss": 0.8469, "step": 319 }, { "epoch": 0.03501285628316648, "grad_norm": 0.13862577080726624, "learning_rate": 0.00048275673308517626, "loss": 0.8305, "step": 320 }, { "epoch": 0.03512227145905137, "grad_norm": 0.13592027127742767, "learning_rate": 0.00048270199255528794, "loss": 0.8833, "step": 321 }, { "epoch": 0.03523168663493627, "grad_norm": 0.1452496498823166, "learning_rate": 0.00048264725202539963, "loss": 0.8832, "step": 322 }, { "epoch": 0.03534110181082116, "grad_norm": 0.1382453292608261, "learning_rate": 0.0004825925114955113, "loss": 0.8313, "step": 323 }, { "epoch": 0.03545051698670606, "grad_norm": 0.1344103068113327, "learning_rate": 0.00048253777096562294, "loss": 0.8376, "step": 324 }, { "epoch": 0.03555993216259095, "grad_norm": 0.13858944177627563, "learning_rate": 0.00048248303043573463, "loss": 0.8882, "step": 325 }, { "epoch": 0.03566934733847585, "grad_norm": 0.11499838531017303, "learning_rate": 0.0004824282899058463, "loss": 0.7864, "step": 326 }, { "epoch": 0.03577876251436074, "grad_norm": 0.1376533955335617, "learning_rate": 0.00048237354937595794, "loss": 0.8035, "step": 327 }, { "epoch": 0.03588817769024564, "grad_norm": 0.14145523309707642, "learning_rate": 0.00048231880884606963, "loss": 0.8442, "step": 328 }, { "epoch": 0.03599759286613053, "grad_norm": 0.13443966209888458, "learning_rate": 0.0004822640683161813, "loss": 0.8433, "step": 329 }, { "epoch": 0.03610700804201543, "grad_norm": 0.12933747470378876, "learning_rate": 0.000482209327786293, "loss": 0.8373, "step": 330 }, { "epoch": 0.03621642321790032, "grad_norm": 0.13795332610607147, "learning_rate": 0.00048215458725640463, "loss": 0.9143, "step": 331 }, { "epoch": 0.03632583839378522, "grad_norm": 0.14005737006664276, "learning_rate": 0.0004820998467265163, "loss": 0.7724, "step": 332 }, { "epoch": 0.03643525356967011, "grad_norm": 0.14076226949691772, "learning_rate": 0.000482045106196628, "loss": 0.8689, "step": 333 }, { "epoch": 0.03654466874555501, "grad_norm": 0.11951719224452972, "learning_rate": 0.00048199036566673963, "loss": 0.7808, "step": 334 }, { "epoch": 0.0366540839214399, "grad_norm": 0.11970534175634384, "learning_rate": 0.00048193562513685137, "loss": 0.8385, "step": 335 }, { "epoch": 0.0367634990973248, "grad_norm": 0.1368328183889389, "learning_rate": 0.000481880884606963, "loss": 0.8792, "step": 336 }, { "epoch": 0.0368729142732097, "grad_norm": 0.1458551287651062, "learning_rate": 0.0004818261440770747, "loss": 0.8217, "step": 337 }, { "epoch": 0.03698232944909459, "grad_norm": 0.1367243528366089, "learning_rate": 0.00048177140354718637, "loss": 0.9092, "step": 338 }, { "epoch": 0.03709174462497949, "grad_norm": 0.12709486484527588, "learning_rate": 0.000481716663017298, "loss": 0.8613, "step": 339 }, { "epoch": 0.03720115980086438, "grad_norm": 0.1327936053276062, "learning_rate": 0.0004816619224874097, "loss": 0.7813, "step": 340 }, { "epoch": 0.03731057497674928, "grad_norm": 0.15766604244709015, "learning_rate": 0.00048160718195752137, "loss": 0.8042, "step": 341 }, { "epoch": 0.03741999015263417, "grad_norm": 0.13956274092197418, "learning_rate": 0.00048155244142763305, "loss": 0.8529, "step": 342 }, { "epoch": 0.03752940532851907, "grad_norm": 0.1399865746498108, "learning_rate": 0.0004814977008977447, "loss": 0.8448, "step": 343 }, { "epoch": 0.03763882050440396, "grad_norm": 0.17616799473762512, "learning_rate": 0.0004814429603678564, "loss": 0.914, "step": 344 }, { "epoch": 0.03774823568028886, "grad_norm": 0.11687588691711426, "learning_rate": 0.00048138821983796805, "loss": 0.8684, "step": 345 }, { "epoch": 0.03785765085617375, "grad_norm": 0.11483744531869888, "learning_rate": 0.0004813334793080797, "loss": 0.8068, "step": 346 }, { "epoch": 0.03796706603205865, "grad_norm": 0.12279192358255386, "learning_rate": 0.00048127873877819137, "loss": 0.8473, "step": 347 }, { "epoch": 0.03807648120794354, "grad_norm": 0.1315828263759613, "learning_rate": 0.00048122399824830305, "loss": 0.8495, "step": 348 }, { "epoch": 0.03818589638382844, "grad_norm": 0.12246734648942947, "learning_rate": 0.00048116925771841474, "loss": 0.8309, "step": 349 }, { "epoch": 0.03829531155971333, "grad_norm": 0.13004641234874725, "learning_rate": 0.00048111451718852637, "loss": 0.852, "step": 350 }, { "epoch": 0.03840472673559823, "grad_norm": 0.11880378425121307, "learning_rate": 0.0004810597766586381, "loss": 0.8608, "step": 351 }, { "epoch": 0.03851414191148312, "grad_norm": 0.12172593921422958, "learning_rate": 0.00048100503612874974, "loss": 0.792, "step": 352 }, { "epoch": 0.03862355708736802, "grad_norm": 0.12583863735198975, "learning_rate": 0.00048095029559886137, "loss": 0.8466, "step": 353 }, { "epoch": 0.03873297226325291, "grad_norm": 0.12995636463165283, "learning_rate": 0.0004808955550689731, "loss": 0.7885, "step": 354 }, { "epoch": 0.03884238743913781, "grad_norm": 0.1281077265739441, "learning_rate": 0.00048084081453908474, "loss": 0.7984, "step": 355 }, { "epoch": 0.03895180261502271, "grad_norm": 0.1271606981754303, "learning_rate": 0.0004807860740091964, "loss": 0.8428, "step": 356 }, { "epoch": 0.0390612177909076, "grad_norm": 0.14155694842338562, "learning_rate": 0.0004807313334793081, "loss": 0.858, "step": 357 }, { "epoch": 0.0391706329667925, "grad_norm": 0.13719260692596436, "learning_rate": 0.00048067659294941974, "loss": 0.8201, "step": 358 }, { "epoch": 0.03928004814267739, "grad_norm": 0.1307201236486435, "learning_rate": 0.0004806218524195314, "loss": 0.8092, "step": 359 }, { "epoch": 0.03938946331856229, "grad_norm": 0.13807396590709686, "learning_rate": 0.0004805671118896431, "loss": 0.8208, "step": 360 }, { "epoch": 0.03949887849444718, "grad_norm": 0.1288439780473709, "learning_rate": 0.0004805123713597548, "loss": 0.7994, "step": 361 }, { "epoch": 0.03960829367033208, "grad_norm": 0.13726359605789185, "learning_rate": 0.0004804576308298664, "loss": 0.8079, "step": 362 }, { "epoch": 0.03971770884621697, "grad_norm": 0.15435628592967987, "learning_rate": 0.0004804028902999781, "loss": 0.9132, "step": 363 }, { "epoch": 0.03982712402210187, "grad_norm": 0.1330219954252243, "learning_rate": 0.0004803481497700898, "loss": 0.8342, "step": 364 }, { "epoch": 0.03993653919798676, "grad_norm": 0.13101331889629364, "learning_rate": 0.0004802934092402014, "loss": 0.8728, "step": 365 }, { "epoch": 0.04004595437387166, "grad_norm": 0.12459966540336609, "learning_rate": 0.0004802386687103131, "loss": 0.8359, "step": 366 }, { "epoch": 0.04015536954975655, "grad_norm": 0.13876336812973022, "learning_rate": 0.0004801839281804248, "loss": 0.7782, "step": 367 }, { "epoch": 0.04026478472564145, "grad_norm": 0.12267450243234634, "learning_rate": 0.0004801291876505365, "loss": 0.7747, "step": 368 }, { "epoch": 0.04037419990152634, "grad_norm": 0.1124066486954689, "learning_rate": 0.0004800744471206481, "loss": 0.8076, "step": 369 }, { "epoch": 0.04048361507741124, "grad_norm": 0.14169226586818695, "learning_rate": 0.00048001970659075985, "loss": 0.8114, "step": 370 }, { "epoch": 0.04059303025329613, "grad_norm": 0.12065637856721878, "learning_rate": 0.0004799649660608715, "loss": 0.7946, "step": 371 }, { "epoch": 0.04070244542918103, "grad_norm": 0.12634219229221344, "learning_rate": 0.0004799102255309831, "loss": 0.8097, "step": 372 }, { "epoch": 0.04081186060506592, "grad_norm": 0.11397214978933334, "learning_rate": 0.00047985548500109485, "loss": 0.868, "step": 373 }, { "epoch": 0.04092127578095082, "grad_norm": 0.11848662793636322, "learning_rate": 0.0004798007444712065, "loss": 0.7573, "step": 374 }, { "epoch": 0.041030690956835716, "grad_norm": 0.11919302493333817, "learning_rate": 0.00047974600394131816, "loss": 0.7716, "step": 375 }, { "epoch": 0.04114010613272061, "grad_norm": 0.12730705738067627, "learning_rate": 0.00047969126341142985, "loss": 0.7895, "step": 376 }, { "epoch": 0.04124952130860551, "grad_norm": 0.12870383262634277, "learning_rate": 0.00047963652288154153, "loss": 0.8618, "step": 377 }, { "epoch": 0.0413589364844904, "grad_norm": 0.12116729468107224, "learning_rate": 0.00047958178235165316, "loss": 0.8034, "step": 378 }, { "epoch": 0.0414683516603753, "grad_norm": 0.1221083253622055, "learning_rate": 0.00047952704182176485, "loss": 0.8086, "step": 379 }, { "epoch": 0.04157776683626019, "grad_norm": 0.12973731756210327, "learning_rate": 0.00047947230129187653, "loss": 0.7649, "step": 380 }, { "epoch": 0.04168718201214509, "grad_norm": 0.13131606578826904, "learning_rate": 0.00047941756076198816, "loss": 0.8153, "step": 381 }, { "epoch": 0.04179659718802998, "grad_norm": 0.1252298206090927, "learning_rate": 0.00047936282023209985, "loss": 0.8057, "step": 382 }, { "epoch": 0.04190601236391488, "grad_norm": 0.12150246649980545, "learning_rate": 0.00047930807970221153, "loss": 0.8764, "step": 383 }, { "epoch": 0.04201542753979977, "grad_norm": 0.11678726971149445, "learning_rate": 0.0004792533391723232, "loss": 0.7729, "step": 384 }, { "epoch": 0.04212484271568467, "grad_norm": 0.1183658242225647, "learning_rate": 0.00047919859864243485, "loss": 0.8218, "step": 385 }, { "epoch": 0.04223425789156956, "grad_norm": 0.12238837033510208, "learning_rate": 0.00047914385811254653, "loss": 0.8725, "step": 386 }, { "epoch": 0.04234367306745446, "grad_norm": 0.12167118489742279, "learning_rate": 0.0004790891175826582, "loss": 0.8851, "step": 387 }, { "epoch": 0.04245308824333935, "grad_norm": 0.13302160799503326, "learning_rate": 0.00047903437705276985, "loss": 0.8258, "step": 388 }, { "epoch": 0.04256250341922425, "grad_norm": 0.13581429421901703, "learning_rate": 0.0004789796365228816, "loss": 0.8461, "step": 389 }, { "epoch": 0.04267191859510914, "grad_norm": 0.13201041519641876, "learning_rate": 0.0004789248959929932, "loss": 0.8255, "step": 390 }, { "epoch": 0.04278133377099404, "grad_norm": 0.13232508301734924, "learning_rate": 0.0004788701554631049, "loss": 0.8186, "step": 391 }, { "epoch": 0.04289074894687893, "grad_norm": 0.13206316530704498, "learning_rate": 0.0004788154149332166, "loss": 0.7449, "step": 392 }, { "epoch": 0.04300016412276383, "grad_norm": 0.1378641277551651, "learning_rate": 0.0004787606744033282, "loss": 0.7649, "step": 393 }, { "epoch": 0.043109579298648726, "grad_norm": 0.11533229053020477, "learning_rate": 0.0004787059338734399, "loss": 0.8901, "step": 394 }, { "epoch": 0.04321899447453362, "grad_norm": 0.13201822340488434, "learning_rate": 0.0004786511933435516, "loss": 0.7864, "step": 395 }, { "epoch": 0.043328409650418516, "grad_norm": 0.1398223638534546, "learning_rate": 0.00047859645281366327, "loss": 0.8626, "step": 396 }, { "epoch": 0.04343782482630341, "grad_norm": 0.12339071184396744, "learning_rate": 0.0004785417122837749, "loss": 0.8471, "step": 397 }, { "epoch": 0.043547240002188306, "grad_norm": 0.12447839230298996, "learning_rate": 0.0004784869717538866, "loss": 0.7923, "step": 398 }, { "epoch": 0.0436566551780732, "grad_norm": 0.1843576729297638, "learning_rate": 0.00047843223122399827, "loss": 0.7722, "step": 399 }, { "epoch": 0.043766070353958096, "grad_norm": 0.1763126105070114, "learning_rate": 0.0004783774906941099, "loss": 0.7855, "step": 400 }, { "epoch": 0.04387548552984299, "grad_norm": 0.15224714577198029, "learning_rate": 0.0004783227501642216, "loss": 0.8312, "step": 401 }, { "epoch": 0.043984900705727886, "grad_norm": 0.1346595734357834, "learning_rate": 0.00047826800963433327, "loss": 0.876, "step": 402 }, { "epoch": 0.04409431588161278, "grad_norm": 0.12431242316961288, "learning_rate": 0.00047821326910444496, "loss": 0.8948, "step": 403 }, { "epoch": 0.044203731057497676, "grad_norm": 0.15662507712841034, "learning_rate": 0.0004781585285745566, "loss": 0.8171, "step": 404 }, { "epoch": 0.04431314623338257, "grad_norm": 0.13906443119049072, "learning_rate": 0.0004781037880446683, "loss": 0.7881, "step": 405 }, { "epoch": 0.044422561409267466, "grad_norm": 0.13181500136852264, "learning_rate": 0.00047804904751477996, "loss": 0.8434, "step": 406 }, { "epoch": 0.04453197658515236, "grad_norm": 0.1325126588344574, "learning_rate": 0.0004779943069848916, "loss": 0.8205, "step": 407 }, { "epoch": 0.04464139176103726, "grad_norm": 0.12682099640369415, "learning_rate": 0.0004779395664550033, "loss": 0.8018, "step": 408 }, { "epoch": 0.04475080693692215, "grad_norm": 0.13513289391994476, "learning_rate": 0.00047788482592511496, "loss": 0.8174, "step": 409 }, { "epoch": 0.04486022211280705, "grad_norm": 0.13072939217090607, "learning_rate": 0.00047783008539522664, "loss": 0.9023, "step": 410 }, { "epoch": 0.04496963728869194, "grad_norm": 0.12470893561840057, "learning_rate": 0.0004777753448653383, "loss": 0.7338, "step": 411 }, { "epoch": 0.04507905246457684, "grad_norm": 0.1329692304134369, "learning_rate": 0.00047772060433545, "loss": 0.8108, "step": 412 }, { "epoch": 0.045188467640461735, "grad_norm": 0.12628582119941711, "learning_rate": 0.00047766586380556164, "loss": 0.769, "step": 413 }, { "epoch": 0.04529788281634663, "grad_norm": 0.13004402816295624, "learning_rate": 0.0004776111232756733, "loss": 0.7406, "step": 414 }, { "epoch": 0.045407297992231525, "grad_norm": 0.12384099513292313, "learning_rate": 0.000477556382745785, "loss": 0.8578, "step": 415 }, { "epoch": 0.04551671316811642, "grad_norm": 0.1328175961971283, "learning_rate": 0.00047750164221589664, "loss": 0.8833, "step": 416 }, { "epoch": 0.045626128344001315, "grad_norm": 0.13750024139881134, "learning_rate": 0.0004774469016860083, "loss": 0.9228, "step": 417 }, { "epoch": 0.04573554351988621, "grad_norm": 0.1349189132452011, "learning_rate": 0.00047739216115612, "loss": 0.836, "step": 418 }, { "epoch": 0.045844958695771106, "grad_norm": 0.12512297928333282, "learning_rate": 0.00047733742062623164, "loss": 0.8629, "step": 419 }, { "epoch": 0.045954373871656, "grad_norm": 0.1281662881374359, "learning_rate": 0.0004772826800963433, "loss": 0.7301, "step": 420 }, { "epoch": 0.046063789047540896, "grad_norm": 0.13380907475948334, "learning_rate": 0.000477227939566455, "loss": 0.826, "step": 421 }, { "epoch": 0.04617320422342579, "grad_norm": 0.14681455492973328, "learning_rate": 0.0004771731990365667, "loss": 0.7906, "step": 422 }, { "epoch": 0.046282619399310686, "grad_norm": 0.16080713272094727, "learning_rate": 0.0004771184585066783, "loss": 0.8402, "step": 423 }, { "epoch": 0.04639203457519558, "grad_norm": 0.1346859186887741, "learning_rate": 0.00047706371797679006, "loss": 0.769, "step": 424 }, { "epoch": 0.046501449751080476, "grad_norm": 0.140382319688797, "learning_rate": 0.0004770089774469017, "loss": 0.7305, "step": 425 }, { "epoch": 0.04661086492696537, "grad_norm": 0.14222441613674164, "learning_rate": 0.0004769542369170133, "loss": 0.8143, "step": 426 }, { "epoch": 0.046720280102850266, "grad_norm": 0.16781143844127655, "learning_rate": 0.00047689949638712506, "loss": 0.8355, "step": 427 }, { "epoch": 0.04682969527873516, "grad_norm": 0.20867043733596802, "learning_rate": 0.0004768447558572367, "loss": 0.8366, "step": 428 }, { "epoch": 0.046939110454620056, "grad_norm": 0.15092511475086212, "learning_rate": 0.0004767900153273484, "loss": 0.8035, "step": 429 }, { "epoch": 0.04704852563050495, "grad_norm": 0.16088160872459412, "learning_rate": 0.00047673527479746006, "loss": 0.908, "step": 430 }, { "epoch": 0.047157940806389846, "grad_norm": 0.13905419409275055, "learning_rate": 0.00047668053426757175, "loss": 0.7526, "step": 431 }, { "epoch": 0.047267355982274745, "grad_norm": 0.12037703394889832, "learning_rate": 0.0004766257937376834, "loss": 0.8245, "step": 432 }, { "epoch": 0.047376771158159636, "grad_norm": 0.14218132197856903, "learning_rate": 0.000476571053207795, "loss": 0.7887, "step": 433 }, { "epoch": 0.047486186334044535, "grad_norm": 0.20737305283546448, "learning_rate": 0.00047651631267790675, "loss": 0.7946, "step": 434 }, { "epoch": 0.047595601509929426, "grad_norm": 0.15414604544639587, "learning_rate": 0.0004764615721480184, "loss": 0.7925, "step": 435 }, { "epoch": 0.047705016685814325, "grad_norm": 0.13833996653556824, "learning_rate": 0.00047640683161813006, "loss": 0.8423, "step": 436 }, { "epoch": 0.047814431861699216, "grad_norm": 0.12981440126895905, "learning_rate": 0.00047635209108824175, "loss": 0.857, "step": 437 }, { "epoch": 0.047923847037584115, "grad_norm": 0.13583309948444366, "learning_rate": 0.00047629735055835343, "loss": 0.8156, "step": 438 }, { "epoch": 0.04803326221346901, "grad_norm": 0.13116657733917236, "learning_rate": 0.00047624261002846506, "loss": 0.8069, "step": 439 }, { "epoch": 0.048142677389353905, "grad_norm": 0.1354154497385025, "learning_rate": 0.00047618786949857675, "loss": 0.7622, "step": 440 }, { "epoch": 0.0482520925652388, "grad_norm": 0.13979236781597137, "learning_rate": 0.00047613312896868843, "loss": 0.8073, "step": 441 }, { "epoch": 0.048361507741123695, "grad_norm": 0.1377575844526291, "learning_rate": 0.00047607838843880006, "loss": 0.8671, "step": 442 }, { "epoch": 0.04847092291700859, "grad_norm": 0.14632603526115417, "learning_rate": 0.0004760236479089118, "loss": 0.7922, "step": 443 }, { "epoch": 0.048580338092893485, "grad_norm": 0.1309819519519806, "learning_rate": 0.00047596890737902343, "loss": 0.8138, "step": 444 }, { "epoch": 0.04868975326877838, "grad_norm": 0.12831667065620422, "learning_rate": 0.0004759141668491351, "loss": 0.8214, "step": 445 }, { "epoch": 0.048799168444663275, "grad_norm": 0.14683941006660461, "learning_rate": 0.0004758594263192468, "loss": 0.817, "step": 446 }, { "epoch": 0.04890858362054817, "grad_norm": 0.130832701921463, "learning_rate": 0.00047580468578935843, "loss": 0.8549, "step": 447 }, { "epoch": 0.049017998796433065, "grad_norm": 0.14086203277111053, "learning_rate": 0.0004757499452594701, "loss": 0.8361, "step": 448 }, { "epoch": 0.04912741397231796, "grad_norm": 0.12889981269836426, "learning_rate": 0.0004756952047295818, "loss": 0.8345, "step": 449 }, { "epoch": 0.049236829148202856, "grad_norm": 0.1334647387266159, "learning_rate": 0.0004756404641996935, "loss": 0.7711, "step": 450 }, { "epoch": 0.049346244324087754, "grad_norm": 0.14128762483596802, "learning_rate": 0.0004755857236698051, "loss": 0.8652, "step": 451 }, { "epoch": 0.049455659499972646, "grad_norm": 0.129248708486557, "learning_rate": 0.0004755309831399168, "loss": 0.8101, "step": 452 }, { "epoch": 0.049565074675857544, "grad_norm": 0.1451921910047531, "learning_rate": 0.0004754762426100285, "loss": 0.8593, "step": 453 }, { "epoch": 0.049674489851742436, "grad_norm": 0.1253187209367752, "learning_rate": 0.0004754215020801401, "loss": 0.7332, "step": 454 }, { "epoch": 0.049783905027627334, "grad_norm": 0.13685990869998932, "learning_rate": 0.0004753667615502518, "loss": 0.8495, "step": 455 }, { "epoch": 0.049893320203512226, "grad_norm": 0.11595848947763443, "learning_rate": 0.0004753120210203635, "loss": 0.7705, "step": 456 }, { "epoch": 0.050002735379397124, "grad_norm": 0.13273455202579498, "learning_rate": 0.0004752572804904752, "loss": 0.7953, "step": 457 }, { "epoch": 0.050112150555282016, "grad_norm": 0.13202500343322754, "learning_rate": 0.0004752025399605868, "loss": 0.8634, "step": 458 }, { "epoch": 0.050221565731166914, "grad_norm": 0.13565777242183685, "learning_rate": 0.00047514779943069854, "loss": 0.768, "step": 459 }, { "epoch": 0.050330980907051806, "grad_norm": 0.13175652921199799, "learning_rate": 0.0004750930589008102, "loss": 0.7571, "step": 460 }, { "epoch": 0.050440396082936705, "grad_norm": 0.12729625403881073, "learning_rate": 0.0004750383183709218, "loss": 0.7903, "step": 461 }, { "epoch": 0.050549811258821596, "grad_norm": 0.12637971341609955, "learning_rate": 0.00047498357784103354, "loss": 0.7971, "step": 462 }, { "epoch": 0.050659226434706495, "grad_norm": 0.1296805590391159, "learning_rate": 0.0004749288373111452, "loss": 0.8539, "step": 463 }, { "epoch": 0.050768641610591386, "grad_norm": 0.138379767537117, "learning_rate": 0.00047487409678125686, "loss": 0.8425, "step": 464 }, { "epoch": 0.050878056786476285, "grad_norm": 0.1355636566877365, "learning_rate": 0.00047481935625136854, "loss": 0.773, "step": 465 }, { "epoch": 0.050987471962361176, "grad_norm": 0.13277575373649597, "learning_rate": 0.00047476461572148023, "loss": 0.8548, "step": 466 }, { "epoch": 0.051096887138246075, "grad_norm": 0.13311153650283813, "learning_rate": 0.00047470987519159186, "loss": 0.8618, "step": 467 }, { "epoch": 0.05120630231413097, "grad_norm": 0.13007399439811707, "learning_rate": 0.0004746551346617035, "loss": 0.8652, "step": 468 }, { "epoch": 0.051315717490015865, "grad_norm": 0.135046124458313, "learning_rate": 0.00047460039413181523, "loss": 0.8234, "step": 469 }, { "epoch": 0.051425132665900763, "grad_norm": 0.14026938378810883, "learning_rate": 0.00047454565360192686, "loss": 0.8039, "step": 470 }, { "epoch": 0.051534547841785655, "grad_norm": 0.12763498723506927, "learning_rate": 0.00047449091307203854, "loss": 0.8583, "step": 471 }, { "epoch": 0.051643963017670554, "grad_norm": 0.1405477672815323, "learning_rate": 0.00047443617254215023, "loss": 0.7494, "step": 472 }, { "epoch": 0.051753378193555445, "grad_norm": 0.14984366297721863, "learning_rate": 0.0004743814320122619, "loss": 0.8587, "step": 473 }, { "epoch": 0.051862793369440344, "grad_norm": 0.13336025178432465, "learning_rate": 0.00047432669148237354, "loss": 0.8799, "step": 474 }, { "epoch": 0.051972208545325235, "grad_norm": 0.12535150349140167, "learning_rate": 0.00047427195095248523, "loss": 0.7878, "step": 475 }, { "epoch": 0.052081623721210134, "grad_norm": 0.14053194224834442, "learning_rate": 0.0004742172104225969, "loss": 0.8301, "step": 476 }, { "epoch": 0.052191038897095025, "grad_norm": 0.14059722423553467, "learning_rate": 0.00047416246989270854, "loss": 0.8352, "step": 477 }, { "epoch": 0.052300454072979924, "grad_norm": 0.1304231435060501, "learning_rate": 0.0004741077293628203, "loss": 0.8226, "step": 478 }, { "epoch": 0.052409869248864815, "grad_norm": 0.15285630524158478, "learning_rate": 0.0004740529888329319, "loss": 0.838, "step": 479 }, { "epoch": 0.052519284424749714, "grad_norm": 0.11948804557323456, "learning_rate": 0.0004739982483030436, "loss": 0.85, "step": 480 }, { "epoch": 0.052628699600634606, "grad_norm": 0.13626250624656677, "learning_rate": 0.0004739435077731553, "loss": 0.8465, "step": 481 }, { "epoch": 0.052738114776519504, "grad_norm": 0.1226978749036789, "learning_rate": 0.0004738887672432669, "loss": 0.8446, "step": 482 }, { "epoch": 0.052847529952404396, "grad_norm": 0.12458670884370804, "learning_rate": 0.0004738340267133786, "loss": 0.8313, "step": 483 }, { "epoch": 0.052956945128289294, "grad_norm": 0.13627855479717255, "learning_rate": 0.0004737792861834903, "loss": 0.787, "step": 484 }, { "epoch": 0.053066360304174186, "grad_norm": 0.13069342076778412, "learning_rate": 0.00047372454565360197, "loss": 0.789, "step": 485 }, { "epoch": 0.053175775480059084, "grad_norm": 0.13109619915485382, "learning_rate": 0.0004736698051237136, "loss": 0.7937, "step": 486 }, { "epoch": 0.05328519065594398, "grad_norm": 0.1246679350733757, "learning_rate": 0.00047361506459382523, "loss": 0.8338, "step": 487 }, { "epoch": 0.053394605831828874, "grad_norm": 0.13287188112735748, "learning_rate": 0.00047356032406393697, "loss": 0.8254, "step": 488 }, { "epoch": 0.05350402100771377, "grad_norm": 0.14065100252628326, "learning_rate": 0.0004735055835340486, "loss": 0.7921, "step": 489 }, { "epoch": 0.053613436183598664, "grad_norm": 0.1384941041469574, "learning_rate": 0.0004734508430041603, "loss": 0.8004, "step": 490 }, { "epoch": 0.05372285135948356, "grad_norm": 0.11589401215314865, "learning_rate": 0.00047339610247427197, "loss": 0.8039, "step": 491 }, { "epoch": 0.053832266535368455, "grad_norm": 0.13655462861061096, "learning_rate": 0.00047334136194438365, "loss": 0.7761, "step": 492 }, { "epoch": 0.05394168171125335, "grad_norm": 0.15496814250946045, "learning_rate": 0.0004732866214144953, "loss": 0.8612, "step": 493 }, { "epoch": 0.054051096887138245, "grad_norm": 0.13407987356185913, "learning_rate": 0.00047323188088460697, "loss": 0.7939, "step": 494 }, { "epoch": 0.05416051206302314, "grad_norm": 0.1404959112405777, "learning_rate": 0.00047317714035471865, "loss": 0.8315, "step": 495 }, { "epoch": 0.054269927238908035, "grad_norm": 0.13380727171897888, "learning_rate": 0.0004731223998248303, "loss": 0.7543, "step": 496 }, { "epoch": 0.05437934241479293, "grad_norm": 0.1390526294708252, "learning_rate": 0.000473067659294942, "loss": 0.8038, "step": 497 }, { "epoch": 0.054488757590677825, "grad_norm": 0.1304234266281128, "learning_rate": 0.00047301291876505365, "loss": 0.7964, "step": 498 }, { "epoch": 0.05459817276656272, "grad_norm": 0.12416510283946991, "learning_rate": 0.00047295817823516534, "loss": 0.8506, "step": 499 }, { "epoch": 0.054707587942447615, "grad_norm": 0.1303643137216568, "learning_rate": 0.000472903437705277, "loss": 0.7898, "step": 500 }, { "epoch": 0.054817003118332513, "grad_norm": 0.1315460503101349, "learning_rate": 0.00047284869717538865, "loss": 0.8121, "step": 501 }, { "epoch": 0.054926418294217405, "grad_norm": 0.12373746931552887, "learning_rate": 0.00047279395664550034, "loss": 0.8744, "step": 502 }, { "epoch": 0.055035833470102304, "grad_norm": 0.1263662874698639, "learning_rate": 0.00047273921611561197, "loss": 0.7829, "step": 503 }, { "epoch": 0.055145248645987195, "grad_norm": 0.13189184665679932, "learning_rate": 0.0004726844755857237, "loss": 0.7083, "step": 504 }, { "epoch": 0.055254663821872094, "grad_norm": 0.1423538476228714, "learning_rate": 0.00047262973505583534, "loss": 0.8398, "step": 505 }, { "epoch": 0.05536407899775699, "grad_norm": 0.14177332818508148, "learning_rate": 0.000472574994525947, "loss": 0.8434, "step": 506 }, { "epoch": 0.055473494173641884, "grad_norm": 0.13676130771636963, "learning_rate": 0.0004725202539960587, "loss": 0.7446, "step": 507 }, { "epoch": 0.05558290934952678, "grad_norm": 0.1501990705728531, "learning_rate": 0.00047246551346617034, "loss": 0.7777, "step": 508 }, { "epoch": 0.055692324525411674, "grad_norm": 0.13246706128120422, "learning_rate": 0.000472410772936282, "loss": 0.7802, "step": 509 }, { "epoch": 0.05580173970129657, "grad_norm": 0.1298731118440628, "learning_rate": 0.0004723560324063937, "loss": 0.7795, "step": 510 }, { "epoch": 0.055911154877181464, "grad_norm": 0.13104499876499176, "learning_rate": 0.0004723012918765054, "loss": 0.8536, "step": 511 }, { "epoch": 0.05602057005306636, "grad_norm": 0.1356210857629776, "learning_rate": 0.000472246551346617, "loss": 0.8242, "step": 512 }, { "epoch": 0.056129985228951254, "grad_norm": 0.12274765968322754, "learning_rate": 0.00047219181081672876, "loss": 0.8104, "step": 513 }, { "epoch": 0.05623940040483615, "grad_norm": 0.1381741464138031, "learning_rate": 0.0004721370702868404, "loss": 0.8614, "step": 514 }, { "epoch": 0.056348815580721044, "grad_norm": 0.12555018067359924, "learning_rate": 0.000472082329756952, "loss": 0.8417, "step": 515 }, { "epoch": 0.05645823075660594, "grad_norm": 0.12701943516731262, "learning_rate": 0.00047202758922706376, "loss": 0.7351, "step": 516 }, { "epoch": 0.056567645932490834, "grad_norm": 0.13290026783943176, "learning_rate": 0.0004719728486971754, "loss": 0.8929, "step": 517 }, { "epoch": 0.05667706110837573, "grad_norm": 0.13139797747135162, "learning_rate": 0.0004719181081672871, "loss": 0.7804, "step": 518 }, { "epoch": 0.056786476284260624, "grad_norm": 0.13975876569747925, "learning_rate": 0.00047186336763739876, "loss": 0.8072, "step": 519 }, { "epoch": 0.05689589146014552, "grad_norm": 0.13449639081954956, "learning_rate": 0.00047180862710751044, "loss": 0.8093, "step": 520 }, { "epoch": 0.057005306636030414, "grad_norm": 0.13777483999729156, "learning_rate": 0.0004717538865776221, "loss": 0.8823, "step": 521 }, { "epoch": 0.05711472181191531, "grad_norm": 0.12297987937927246, "learning_rate": 0.0004716991460477337, "loss": 0.7295, "step": 522 }, { "epoch": 0.057224136987800205, "grad_norm": 0.13800039887428284, "learning_rate": 0.00047164440551784544, "loss": 0.8355, "step": 523 }, { "epoch": 0.0573335521636851, "grad_norm": 0.12213094532489777, "learning_rate": 0.0004715896649879571, "loss": 0.8375, "step": 524 }, { "epoch": 0.05744296733957, "grad_norm": 0.14648137986660004, "learning_rate": 0.00047153492445806876, "loss": 0.7572, "step": 525 }, { "epoch": 0.05755238251545489, "grad_norm": 0.17608322203159332, "learning_rate": 0.00047148018392818044, "loss": 0.8605, "step": 526 }, { "epoch": 0.05766179769133979, "grad_norm": 0.1197759211063385, "learning_rate": 0.00047142544339829213, "loss": 0.8306, "step": 527 }, { "epoch": 0.05777121286722468, "grad_norm": 0.14035165309906006, "learning_rate": 0.00047137070286840376, "loss": 0.7852, "step": 528 }, { "epoch": 0.05788062804310958, "grad_norm": 0.1412707418203354, "learning_rate": 0.00047131596233851544, "loss": 0.8008, "step": 529 }, { "epoch": 0.05799004321899447, "grad_norm": 0.13915716111660004, "learning_rate": 0.00047126122180862713, "loss": 0.9325, "step": 530 }, { "epoch": 0.05809945839487937, "grad_norm": 0.14317084848880768, "learning_rate": 0.00047120648127873876, "loss": 0.8572, "step": 531 }, { "epoch": 0.058208873570764263, "grad_norm": 0.12259969115257263, "learning_rate": 0.0004711517407488505, "loss": 0.8812, "step": 532 }, { "epoch": 0.05831828874664916, "grad_norm": 0.14278283715248108, "learning_rate": 0.00047109700021896213, "loss": 0.8177, "step": 533 }, { "epoch": 0.058427703922534054, "grad_norm": 0.12821869552135468, "learning_rate": 0.0004710422596890738, "loss": 0.828, "step": 534 }, { "epoch": 0.05853711909841895, "grad_norm": 0.12583594024181366, "learning_rate": 0.0004709875191591855, "loss": 0.7852, "step": 535 }, { "epoch": 0.058646534274303844, "grad_norm": 0.14253616333007812, "learning_rate": 0.00047093277862929713, "loss": 0.7543, "step": 536 }, { "epoch": 0.05875594945018874, "grad_norm": 0.12869729101657867, "learning_rate": 0.0004708780380994088, "loss": 0.8779, "step": 537 }, { "epoch": 0.058865364626073634, "grad_norm": 0.12630493938922882, "learning_rate": 0.00047082329756952044, "loss": 0.7555, "step": 538 }, { "epoch": 0.05897477980195853, "grad_norm": 0.1225631907582283, "learning_rate": 0.0004707685570396322, "loss": 0.8265, "step": 539 }, { "epoch": 0.059084194977843424, "grad_norm": 0.14105695486068726, "learning_rate": 0.0004707138165097438, "loss": 0.7313, "step": 540 }, { "epoch": 0.05919361015372832, "grad_norm": 0.1276872754096985, "learning_rate": 0.0004706590759798555, "loss": 0.7925, "step": 541 }, { "epoch": 0.059303025329613214, "grad_norm": 0.11875477433204651, "learning_rate": 0.0004706043354499672, "loss": 0.7873, "step": 542 }, { "epoch": 0.05941244050549811, "grad_norm": 0.14192385971546173, "learning_rate": 0.0004705495949200788, "loss": 0.8505, "step": 543 }, { "epoch": 0.05952185568138301, "grad_norm": 0.12448369711637497, "learning_rate": 0.0004704948543901905, "loss": 0.8473, "step": 544 }, { "epoch": 0.0596312708572679, "grad_norm": 0.11890346556901932, "learning_rate": 0.0004704401138603022, "loss": 0.731, "step": 545 }, { "epoch": 0.0597406860331528, "grad_norm": 0.13036148250102997, "learning_rate": 0.00047038537333041387, "loss": 0.7939, "step": 546 }, { "epoch": 0.05985010120903769, "grad_norm": 0.13924115896224976, "learning_rate": 0.0004703306328005255, "loss": 0.8527, "step": 547 }, { "epoch": 0.05995951638492259, "grad_norm": 0.11722873151302338, "learning_rate": 0.0004702758922706372, "loss": 0.7602, "step": 548 }, { "epoch": 0.06006893156080748, "grad_norm": 0.14304529130458832, "learning_rate": 0.00047022115174074887, "loss": 0.7499, "step": 549 }, { "epoch": 0.06017834673669238, "grad_norm": 0.13572187721729279, "learning_rate": 0.0004701664112108605, "loss": 0.8017, "step": 550 }, { "epoch": 0.06028776191257727, "grad_norm": 0.12979310750961304, "learning_rate": 0.00047011167068097224, "loss": 0.8733, "step": 551 }, { "epoch": 0.06039717708846217, "grad_norm": 0.13578380644321442, "learning_rate": 0.00047005693015108387, "loss": 0.7225, "step": 552 }, { "epoch": 0.06050659226434706, "grad_norm": 0.1233299970626831, "learning_rate": 0.00047000218962119555, "loss": 0.8021, "step": 553 }, { "epoch": 0.06061600744023196, "grad_norm": 0.13357064127922058, "learning_rate": 0.00046994744909130724, "loss": 0.8186, "step": 554 }, { "epoch": 0.06072542261611685, "grad_norm": 0.130630224943161, "learning_rate": 0.00046989270856141887, "loss": 0.8157, "step": 555 }, { "epoch": 0.06083483779200175, "grad_norm": 0.12634019553661346, "learning_rate": 0.00046983796803153055, "loss": 0.8338, "step": 556 }, { "epoch": 0.06094425296788664, "grad_norm": 0.13366109132766724, "learning_rate": 0.0004697832275016422, "loss": 0.7575, "step": 557 }, { "epoch": 0.06105366814377154, "grad_norm": 0.12227834016084671, "learning_rate": 0.0004697284869717539, "loss": 0.791, "step": 558 }, { "epoch": 0.06116308331965643, "grad_norm": 0.12996412813663483, "learning_rate": 0.00046967374644186555, "loss": 0.863, "step": 559 }, { "epoch": 0.06127249849554133, "grad_norm": 0.12549655139446259, "learning_rate": 0.00046961900591197724, "loss": 0.7955, "step": 560 }, { "epoch": 0.06138191367142622, "grad_norm": 0.14375489950180054, "learning_rate": 0.0004695642653820889, "loss": 0.76, "step": 561 }, { "epoch": 0.06149132884731112, "grad_norm": 0.13011205196380615, "learning_rate": 0.00046950952485220055, "loss": 0.8017, "step": 562 }, { "epoch": 0.06160074402319602, "grad_norm": 0.12246464192867279, "learning_rate": 0.00046945478432231224, "loss": 0.7618, "step": 563 }, { "epoch": 0.06171015919908091, "grad_norm": 0.1271633803844452, "learning_rate": 0.0004694000437924239, "loss": 0.7679, "step": 564 }, { "epoch": 0.06181957437496581, "grad_norm": 0.14037491381168365, "learning_rate": 0.0004693453032625356, "loss": 0.8537, "step": 565 }, { "epoch": 0.0619289895508507, "grad_norm": 0.1436329334974289, "learning_rate": 0.00046929056273264724, "loss": 0.7778, "step": 566 }, { "epoch": 0.0620384047267356, "grad_norm": 0.1376689076423645, "learning_rate": 0.000469235822202759, "loss": 0.7946, "step": 567 }, { "epoch": 0.06214781990262049, "grad_norm": 0.14051944017410278, "learning_rate": 0.0004691810816728706, "loss": 0.7819, "step": 568 }, { "epoch": 0.06225723507850539, "grad_norm": 0.14609172940254211, "learning_rate": 0.00046912634114298224, "loss": 0.7691, "step": 569 }, { "epoch": 0.06236665025439028, "grad_norm": 0.1264907717704773, "learning_rate": 0.000469071600613094, "loss": 0.8026, "step": 570 }, { "epoch": 0.06247606543027518, "grad_norm": 0.14280982315540314, "learning_rate": 0.0004690168600832056, "loss": 0.7942, "step": 571 }, { "epoch": 0.06258548060616008, "grad_norm": 0.13052652776241302, "learning_rate": 0.0004689621195533173, "loss": 0.8055, "step": 572 }, { "epoch": 0.06269489578204497, "grad_norm": 0.12681901454925537, "learning_rate": 0.0004689073790234289, "loss": 0.818, "step": 573 }, { "epoch": 0.06280431095792986, "grad_norm": 0.13195925951004028, "learning_rate": 0.00046885263849354066, "loss": 0.8095, "step": 574 }, { "epoch": 0.06291372613381475, "grad_norm": 0.13278184831142426, "learning_rate": 0.0004687978979636523, "loss": 0.8224, "step": 575 }, { "epoch": 0.06302314130969966, "grad_norm": 0.1525234431028366, "learning_rate": 0.0004687431574337639, "loss": 0.7757, "step": 576 }, { "epoch": 0.06313255648558455, "grad_norm": 0.12720610201358795, "learning_rate": 0.00046868841690387566, "loss": 0.8068, "step": 577 }, { "epoch": 0.06324197166146944, "grad_norm": 0.13211318850517273, "learning_rate": 0.0004686336763739873, "loss": 0.8132, "step": 578 }, { "epoch": 0.06335138683735433, "grad_norm": 0.12570929527282715, "learning_rate": 0.000468578935844099, "loss": 0.77, "step": 579 }, { "epoch": 0.06346080201323924, "grad_norm": 0.138916015625, "learning_rate": 0.00046852419531421066, "loss": 0.795, "step": 580 }, { "epoch": 0.06357021718912413, "grad_norm": 0.12386522442102432, "learning_rate": 0.00046846945478432235, "loss": 0.861, "step": 581 }, { "epoch": 0.06367963236500902, "grad_norm": 0.1275731772184372, "learning_rate": 0.000468414714254434, "loss": 0.8065, "step": 582 }, { "epoch": 0.06378904754089393, "grad_norm": 0.13213448226451874, "learning_rate": 0.00046835997372454566, "loss": 0.8164, "step": 583 }, { "epoch": 0.06389846271677882, "grad_norm": 0.1227845549583435, "learning_rate": 0.00046830523319465735, "loss": 0.7722, "step": 584 }, { "epoch": 0.06400787789266371, "grad_norm": 0.1237664595246315, "learning_rate": 0.000468250492664769, "loss": 0.7814, "step": 585 }, { "epoch": 0.0641172930685486, "grad_norm": 0.12868838012218475, "learning_rate": 0.0004681957521348807, "loss": 0.8311, "step": 586 }, { "epoch": 0.06422670824443351, "grad_norm": 0.11838873475790024, "learning_rate": 0.00046814101160499235, "loss": 0.7809, "step": 587 }, { "epoch": 0.0643361234203184, "grad_norm": 0.1331378072500229, "learning_rate": 0.00046808627107510403, "loss": 0.848, "step": 588 }, { "epoch": 0.06444553859620329, "grad_norm": 0.13517355918884277, "learning_rate": 0.00046803153054521566, "loss": 0.7883, "step": 589 }, { "epoch": 0.06455495377208818, "grad_norm": 0.12798786163330078, "learning_rate": 0.00046797679001532735, "loss": 0.8001, "step": 590 }, { "epoch": 0.06466436894797309, "grad_norm": 0.12432023882865906, "learning_rate": 0.00046792204948543903, "loss": 0.8476, "step": 591 }, { "epoch": 0.06477378412385798, "grad_norm": 0.133591890335083, "learning_rate": 0.00046786730895555066, "loss": 0.8376, "step": 592 }, { "epoch": 0.06488319929974287, "grad_norm": 0.12150149047374725, "learning_rate": 0.0004678125684256624, "loss": 0.7636, "step": 593 }, { "epoch": 0.06499261447562776, "grad_norm": 0.1254023015499115, "learning_rate": 0.00046775782789577403, "loss": 0.7485, "step": 594 }, { "epoch": 0.06510202965151267, "grad_norm": 0.12058507651090622, "learning_rate": 0.0004677030873658857, "loss": 0.7953, "step": 595 }, { "epoch": 0.06521144482739756, "grad_norm": 0.12763969600200653, "learning_rate": 0.0004676483468359974, "loss": 0.8285, "step": 596 }, { "epoch": 0.06532086000328245, "grad_norm": 0.12653779983520508, "learning_rate": 0.00046759360630610903, "loss": 0.8228, "step": 597 }, { "epoch": 0.06543027517916734, "grad_norm": 0.13308772444725037, "learning_rate": 0.0004675388657762207, "loss": 0.8241, "step": 598 }, { "epoch": 0.06553969035505225, "grad_norm": 0.13980732858181, "learning_rate": 0.0004674841252463324, "loss": 0.8756, "step": 599 }, { "epoch": 0.06564910553093714, "grad_norm": 0.13959132134914398, "learning_rate": 0.0004674293847164441, "loss": 0.8122, "step": 600 }, { "epoch": 0.06575852070682203, "grad_norm": 0.12147784233093262, "learning_rate": 0.0004673746441865557, "loss": 0.7558, "step": 601 }, { "epoch": 0.06586793588270694, "grad_norm": 0.12732960283756256, "learning_rate": 0.00046731990365666746, "loss": 0.851, "step": 602 }, { "epoch": 0.06597735105859183, "grad_norm": 0.14001739025115967, "learning_rate": 0.0004672651631267791, "loss": 0.7754, "step": 603 }, { "epoch": 0.06608676623447672, "grad_norm": 0.13588491082191467, "learning_rate": 0.0004672104225968907, "loss": 0.8219, "step": 604 }, { "epoch": 0.06619618141036161, "grad_norm": 0.12781395018100739, "learning_rate": 0.00046715568206700246, "loss": 0.7882, "step": 605 }, { "epoch": 0.06630559658624652, "grad_norm": 0.12739777565002441, "learning_rate": 0.0004671009415371141, "loss": 0.7631, "step": 606 }, { "epoch": 0.06641501176213141, "grad_norm": 0.13478794693946838, "learning_rate": 0.00046704620100722577, "loss": 0.7845, "step": 607 }, { "epoch": 0.0665244269380163, "grad_norm": 0.1125001609325409, "learning_rate": 0.0004669914604773374, "loss": 0.7953, "step": 608 }, { "epoch": 0.06663384211390119, "grad_norm": 0.1294303834438324, "learning_rate": 0.0004669367199474491, "loss": 0.7665, "step": 609 }, { "epoch": 0.0667432572897861, "grad_norm": 0.13305005431175232, "learning_rate": 0.00046688197941756077, "loss": 0.8686, "step": 610 }, { "epoch": 0.06685267246567099, "grad_norm": 0.12168179452419281, "learning_rate": 0.0004668272388876724, "loss": 0.8315, "step": 611 }, { "epoch": 0.06696208764155588, "grad_norm": 0.12148953974246979, "learning_rate": 0.00046677249835778414, "loss": 0.808, "step": 612 }, { "epoch": 0.06707150281744077, "grad_norm": 0.1295832097530365, "learning_rate": 0.00046671775782789577, "loss": 0.7886, "step": 613 }, { "epoch": 0.06718091799332568, "grad_norm": 0.1361664980649948, "learning_rate": 0.00046666301729800746, "loss": 0.7962, "step": 614 }, { "epoch": 0.06729033316921057, "grad_norm": 0.13372081518173218, "learning_rate": 0.00046660827676811914, "loss": 0.8154, "step": 615 }, { "epoch": 0.06739974834509546, "grad_norm": 0.12010551244020462, "learning_rate": 0.00046655353623823077, "loss": 0.8037, "step": 616 }, { "epoch": 0.06750916352098035, "grad_norm": 0.12622663378715515, "learning_rate": 0.00046649879570834246, "loss": 0.7951, "step": 617 }, { "epoch": 0.06761857869686526, "grad_norm": 0.12775279581546783, "learning_rate": 0.00046644405517845414, "loss": 0.8354, "step": 618 }, { "epoch": 0.06772799387275015, "grad_norm": 0.12514543533325195, "learning_rate": 0.0004663893146485658, "loss": 0.7763, "step": 619 }, { "epoch": 0.06783740904863504, "grad_norm": 0.11726637184619904, "learning_rate": 0.00046633457411867746, "loss": 0.8017, "step": 620 }, { "epoch": 0.06794682422451995, "grad_norm": 0.1312878131866455, "learning_rate": 0.0004662798335887892, "loss": 0.791, "step": 621 }, { "epoch": 0.06805623940040484, "grad_norm": 0.12784713506698608, "learning_rate": 0.0004662250930589008, "loss": 0.8453, "step": 622 }, { "epoch": 0.06816565457628973, "grad_norm": 0.1377086192369461, "learning_rate": 0.00046617035252901246, "loss": 0.7802, "step": 623 }, { "epoch": 0.06827506975217462, "grad_norm": 0.13536842167377472, "learning_rate": 0.00046611561199912414, "loss": 0.789, "step": 624 }, { "epoch": 0.06838448492805953, "grad_norm": 0.1367691606283188, "learning_rate": 0.0004660608714692358, "loss": 0.7448, "step": 625 }, { "epoch": 0.06849390010394442, "grad_norm": 0.19393286108970642, "learning_rate": 0.0004660061309393475, "loss": 0.8085, "step": 626 }, { "epoch": 0.06860331527982931, "grad_norm": 0.13058878481388092, "learning_rate": 0.00046595139040945914, "loss": 0.7947, "step": 627 }, { "epoch": 0.0687127304557142, "grad_norm": 0.23999661207199097, "learning_rate": 0.0004658966498795709, "loss": 0.7252, "step": 628 }, { "epoch": 0.06882214563159911, "grad_norm": 0.1341647058725357, "learning_rate": 0.0004658419093496825, "loss": 0.8046, "step": 629 }, { "epoch": 0.068931560807484, "grad_norm": 0.13841235637664795, "learning_rate": 0.00046578716881979414, "loss": 0.78, "step": 630 }, { "epoch": 0.06904097598336889, "grad_norm": 0.12886326014995575, "learning_rate": 0.0004657324282899059, "loss": 0.7799, "step": 631 }, { "epoch": 0.06915039115925378, "grad_norm": 0.14886216819286346, "learning_rate": 0.0004656776877600175, "loss": 0.8385, "step": 632 }, { "epoch": 0.06925980633513869, "grad_norm": 0.13308022916316986, "learning_rate": 0.0004656229472301292, "loss": 0.772, "step": 633 }, { "epoch": 0.06936922151102358, "grad_norm": 0.14792108535766602, "learning_rate": 0.0004655682067002409, "loss": 0.7965, "step": 634 }, { "epoch": 0.06947863668690847, "grad_norm": 0.13456743955612183, "learning_rate": 0.00046551346617035256, "loss": 0.8122, "step": 635 }, { "epoch": 0.06958805186279336, "grad_norm": 0.12842796742916107, "learning_rate": 0.0004654587256404642, "loss": 0.8563, "step": 636 }, { "epoch": 0.06969746703867827, "grad_norm": 0.13942907750606537, "learning_rate": 0.0004654039851105759, "loss": 0.835, "step": 637 }, { "epoch": 0.06980688221456316, "grad_norm": 0.13513164222240448, "learning_rate": 0.00046534924458068756, "loss": 0.8086, "step": 638 }, { "epoch": 0.06991629739044805, "grad_norm": 0.12416902929544449, "learning_rate": 0.0004652945040507992, "loss": 0.7941, "step": 639 }, { "epoch": 0.07002571256633296, "grad_norm": 0.13336944580078125, "learning_rate": 0.00046523976352091093, "loss": 0.7957, "step": 640 }, { "epoch": 0.07013512774221785, "grad_norm": 0.14218859374523163, "learning_rate": 0.00046518502299102256, "loss": 0.8557, "step": 641 }, { "epoch": 0.07024454291810274, "grad_norm": 0.16235743463039398, "learning_rate": 0.00046513028246113425, "loss": 0.8287, "step": 642 }, { "epoch": 0.07035395809398763, "grad_norm": 0.15705278515815735, "learning_rate": 0.0004650755419312459, "loss": 0.7972, "step": 643 }, { "epoch": 0.07046337326987254, "grad_norm": 0.13714385032653809, "learning_rate": 0.00046502080140135756, "loss": 0.7617, "step": 644 }, { "epoch": 0.07057278844575743, "grad_norm": 0.1451435536146164, "learning_rate": 0.00046496606087146925, "loss": 0.8366, "step": 645 }, { "epoch": 0.07068220362164232, "grad_norm": 0.2505577504634857, "learning_rate": 0.0004649113203415809, "loss": 0.7858, "step": 646 }, { "epoch": 0.07079161879752721, "grad_norm": 0.13205477595329285, "learning_rate": 0.0004648565798116926, "loss": 0.7768, "step": 647 }, { "epoch": 0.07090103397341212, "grad_norm": 0.13973627984523773, "learning_rate": 0.00046480183928180425, "loss": 0.838, "step": 648 }, { "epoch": 0.07101044914929701, "grad_norm": 0.1369800716638565, "learning_rate": 0.00046474709875191593, "loss": 0.8141, "step": 649 }, { "epoch": 0.0711198643251819, "grad_norm": 0.12767469882965088, "learning_rate": 0.0004646923582220276, "loss": 0.7693, "step": 650 }, { "epoch": 0.07122927950106679, "grad_norm": 0.14099447429180145, "learning_rate": 0.00046463761769213925, "loss": 0.7916, "step": 651 }, { "epoch": 0.0713386946769517, "grad_norm": 0.14479169249534607, "learning_rate": 0.00046458287716225093, "loss": 0.8076, "step": 652 }, { "epoch": 0.07144810985283659, "grad_norm": 0.14097611606121063, "learning_rate": 0.0004645281366323626, "loss": 0.822, "step": 653 }, { "epoch": 0.07155752502872148, "grad_norm": 0.1321667581796646, "learning_rate": 0.0004644733961024743, "loss": 0.7491, "step": 654 }, { "epoch": 0.07166694020460639, "grad_norm": 0.14424948394298553, "learning_rate": 0.00046441865557258593, "loss": 0.8241, "step": 655 }, { "epoch": 0.07177635538049128, "grad_norm": 0.13472658395767212, "learning_rate": 0.00046436391504269767, "loss": 0.8303, "step": 656 }, { "epoch": 0.07188577055637617, "grad_norm": 0.12333603203296661, "learning_rate": 0.0004643091745128093, "loss": 0.7597, "step": 657 }, { "epoch": 0.07199518573226106, "grad_norm": 0.13285815715789795, "learning_rate": 0.00046425443398292093, "loss": 0.7774, "step": 658 }, { "epoch": 0.07210460090814597, "grad_norm": 0.12795621156692505, "learning_rate": 0.0004641996934530326, "loss": 0.8494, "step": 659 }, { "epoch": 0.07221401608403086, "grad_norm": 0.13559001684188843, "learning_rate": 0.0004641449529231443, "loss": 0.8506, "step": 660 }, { "epoch": 0.07232343125991575, "grad_norm": 0.1488664299249649, "learning_rate": 0.000464090212393256, "loss": 0.8201, "step": 661 }, { "epoch": 0.07243284643580064, "grad_norm": 0.1501270979642868, "learning_rate": 0.0004640354718633676, "loss": 0.77, "step": 662 }, { "epoch": 0.07254226161168555, "grad_norm": 0.13518112897872925, "learning_rate": 0.00046398073133347936, "loss": 0.8186, "step": 663 }, { "epoch": 0.07265167678757044, "grad_norm": 0.15087081491947174, "learning_rate": 0.000463925990803591, "loss": 0.8167, "step": 664 }, { "epoch": 0.07276109196345533, "grad_norm": 0.16018567979335785, "learning_rate": 0.0004638712502737026, "loss": 0.8764, "step": 665 }, { "epoch": 0.07287050713934022, "grad_norm": 0.14538486301898956, "learning_rate": 0.00046381650974381436, "loss": 0.8312, "step": 666 }, { "epoch": 0.07297992231522513, "grad_norm": 0.13870789110660553, "learning_rate": 0.000463761769213926, "loss": 0.829, "step": 667 }, { "epoch": 0.07308933749111002, "grad_norm": 0.135314479470253, "learning_rate": 0.00046370702868403767, "loss": 0.8329, "step": 668 }, { "epoch": 0.07319875266699491, "grad_norm": 0.1305926889181137, "learning_rate": 0.00046365228815414936, "loss": 0.8522, "step": 669 }, { "epoch": 0.0733081678428798, "grad_norm": 0.12999588251113892, "learning_rate": 0.00046359754762426104, "loss": 0.7814, "step": 670 }, { "epoch": 0.0734175830187647, "grad_norm": 0.13360697031021118, "learning_rate": 0.00046354280709437267, "loss": 0.8358, "step": 671 }, { "epoch": 0.0735269981946496, "grad_norm": 0.1347617357969284, "learning_rate": 0.00046348806656448436, "loss": 0.8496, "step": 672 }, { "epoch": 0.07363641337053449, "grad_norm": 0.14312683045864105, "learning_rate": 0.00046343332603459604, "loss": 0.7242, "step": 673 }, { "epoch": 0.0737458285464194, "grad_norm": 0.13751371204853058, "learning_rate": 0.00046337858550470767, "loss": 0.8804, "step": 674 }, { "epoch": 0.07385524372230429, "grad_norm": 0.14755578339099884, "learning_rate": 0.0004633238449748194, "loss": 0.8014, "step": 675 }, { "epoch": 0.07396465889818918, "grad_norm": 0.1347828060388565, "learning_rate": 0.00046326910444493104, "loss": 0.779, "step": 676 }, { "epoch": 0.07407407407407407, "grad_norm": 0.12936298549175262, "learning_rate": 0.00046321436391504267, "loss": 0.7896, "step": 677 }, { "epoch": 0.07418348924995898, "grad_norm": 0.1454291194677353, "learning_rate": 0.00046315962338515436, "loss": 0.7488, "step": 678 }, { "epoch": 0.07429290442584387, "grad_norm": 0.1385572999715805, "learning_rate": 0.00046310488285526604, "loss": 0.8317, "step": 679 }, { "epoch": 0.07440231960172876, "grad_norm": 0.13784347474575043, "learning_rate": 0.0004630501423253777, "loss": 0.8172, "step": 680 }, { "epoch": 0.07451173477761365, "grad_norm": 0.1282973736524582, "learning_rate": 0.00046299540179548936, "loss": 0.7557, "step": 681 }, { "epoch": 0.07462114995349856, "grad_norm": 0.1393175721168518, "learning_rate": 0.0004629406612656011, "loss": 0.7678, "step": 682 }, { "epoch": 0.07473056512938345, "grad_norm": 0.12132387608289719, "learning_rate": 0.0004628859207357127, "loss": 0.8437, "step": 683 }, { "epoch": 0.07483998030526834, "grad_norm": 0.14124171435832977, "learning_rate": 0.00046283118020582436, "loss": 0.7693, "step": 684 }, { "epoch": 0.07494939548115323, "grad_norm": 0.1384229063987732, "learning_rate": 0.0004627764396759361, "loss": 0.7912, "step": 685 }, { "epoch": 0.07505881065703814, "grad_norm": 0.1357513964176178, "learning_rate": 0.0004627216991460477, "loss": 0.7479, "step": 686 }, { "epoch": 0.07516822583292303, "grad_norm": 0.13733328878879547, "learning_rate": 0.0004626669586161594, "loss": 0.7962, "step": 687 }, { "epoch": 0.07527764100880792, "grad_norm": 0.15147341787815094, "learning_rate": 0.0004626122180862711, "loss": 0.7487, "step": 688 }, { "epoch": 0.07538705618469281, "grad_norm": 0.14673854410648346, "learning_rate": 0.0004625574775563828, "loss": 0.8256, "step": 689 }, { "epoch": 0.07549647136057772, "grad_norm": 0.12854699790477753, "learning_rate": 0.0004625027370264944, "loss": 0.843, "step": 690 }, { "epoch": 0.07560588653646261, "grad_norm": 0.13233281672000885, "learning_rate": 0.0004624479964966061, "loss": 0.863, "step": 691 }, { "epoch": 0.0757153017123475, "grad_norm": 0.12444347888231277, "learning_rate": 0.0004623932559667178, "loss": 0.8002, "step": 692 }, { "epoch": 0.0758247168882324, "grad_norm": 0.12794019281864166, "learning_rate": 0.0004623385154368294, "loss": 0.7606, "step": 693 }, { "epoch": 0.0759341320641173, "grad_norm": 0.13981245458126068, "learning_rate": 0.0004622837749069411, "loss": 0.9011, "step": 694 }, { "epoch": 0.07604354724000219, "grad_norm": 0.1277012974023819, "learning_rate": 0.0004622290343770528, "loss": 0.7943, "step": 695 }, { "epoch": 0.07615296241588708, "grad_norm": 0.14286808669567108, "learning_rate": 0.00046217429384716447, "loss": 0.7895, "step": 696 }, { "epoch": 0.07626237759177198, "grad_norm": 0.15406154096126556, "learning_rate": 0.0004621195533172761, "loss": 0.8901, "step": 697 }, { "epoch": 0.07637179276765688, "grad_norm": 0.14177300035953522, "learning_rate": 0.0004620648127873878, "loss": 0.8557, "step": 698 }, { "epoch": 0.07648120794354177, "grad_norm": 0.1333388388156891, "learning_rate": 0.00046201007225749947, "loss": 0.7929, "step": 699 }, { "epoch": 0.07659062311942666, "grad_norm": 0.14906929433345795, "learning_rate": 0.0004619553317276111, "loss": 0.8368, "step": 700 }, { "epoch": 0.07670003829531157, "grad_norm": 0.1259603351354599, "learning_rate": 0.00046190059119772284, "loss": 0.78, "step": 701 }, { "epoch": 0.07680945347119646, "grad_norm": 0.2038787305355072, "learning_rate": 0.00046184585066783447, "loss": 0.7612, "step": 702 }, { "epoch": 0.07691886864708135, "grad_norm": 0.16181035339832306, "learning_rate": 0.00046179111013794615, "loss": 0.7352, "step": 703 }, { "epoch": 0.07702828382296624, "grad_norm": 0.14859457314014435, "learning_rate": 0.00046173636960805784, "loss": 0.8561, "step": 704 }, { "epoch": 0.07713769899885115, "grad_norm": 0.14329321682453156, "learning_rate": 0.00046168162907816947, "loss": 0.7385, "step": 705 }, { "epoch": 0.07724711417473604, "grad_norm": 0.14117203652858734, "learning_rate": 0.00046162688854828115, "loss": 0.8193, "step": 706 }, { "epoch": 0.07735652935062093, "grad_norm": 0.12848559021949768, "learning_rate": 0.00046157214801839284, "loss": 0.7918, "step": 707 }, { "epoch": 0.07746594452650582, "grad_norm": 0.14721111953258514, "learning_rate": 0.0004615174074885045, "loss": 0.8227, "step": 708 }, { "epoch": 0.07757535970239073, "grad_norm": 0.13443589210510254, "learning_rate": 0.00046146266695861615, "loss": 0.8232, "step": 709 }, { "epoch": 0.07768477487827562, "grad_norm": 0.13354553282260895, "learning_rate": 0.0004614079264287279, "loss": 0.8796, "step": 710 }, { "epoch": 0.07779419005416051, "grad_norm": 0.13339437544345856, "learning_rate": 0.0004613531858988395, "loss": 0.8113, "step": 711 }, { "epoch": 0.07790360523004541, "grad_norm": 0.12624484300613403, "learning_rate": 0.00046129844536895115, "loss": 0.86, "step": 712 }, { "epoch": 0.0780130204059303, "grad_norm": 0.1346779465675354, "learning_rate": 0.00046124370483906284, "loss": 0.7851, "step": 713 }, { "epoch": 0.0781224355818152, "grad_norm": 0.1257091611623764, "learning_rate": 0.0004611889643091745, "loss": 0.7497, "step": 714 }, { "epoch": 0.07823185075770009, "grad_norm": 0.15553726255893707, "learning_rate": 0.0004611342237792862, "loss": 0.7938, "step": 715 }, { "epoch": 0.078341265933585, "grad_norm": 0.14898237586021423, "learning_rate": 0.00046107948324939784, "loss": 0.832, "step": 716 }, { "epoch": 0.07845068110946989, "grad_norm": 0.13182316720485687, "learning_rate": 0.0004610247427195096, "loss": 0.8468, "step": 717 }, { "epoch": 0.07856009628535478, "grad_norm": 0.1355324238538742, "learning_rate": 0.0004609700021896212, "loss": 0.8122, "step": 718 }, { "epoch": 0.07866951146123967, "grad_norm": 0.13562671840190887, "learning_rate": 0.00046091526165973284, "loss": 0.8381, "step": 719 }, { "epoch": 0.07877892663712457, "grad_norm": 0.14802195131778717, "learning_rate": 0.0004608605211298446, "loss": 0.693, "step": 720 }, { "epoch": 0.07888834181300947, "grad_norm": 0.14064821600914001, "learning_rate": 0.0004608057805999562, "loss": 0.7486, "step": 721 }, { "epoch": 0.07899775698889436, "grad_norm": 0.1340484619140625, "learning_rate": 0.0004607510400700679, "loss": 0.7956, "step": 722 }, { "epoch": 0.07910717216477925, "grad_norm": 0.12158786505460739, "learning_rate": 0.0004606962995401796, "loss": 0.7932, "step": 723 }, { "epoch": 0.07921658734066415, "grad_norm": 0.12938861548900604, "learning_rate": 0.00046064155901029126, "loss": 0.7721, "step": 724 }, { "epoch": 0.07932600251654905, "grad_norm": 0.14314547181129456, "learning_rate": 0.0004605868184804029, "loss": 0.7932, "step": 725 }, { "epoch": 0.07943541769243394, "grad_norm": 0.1265585869550705, "learning_rate": 0.0004605320779505146, "loss": 0.8247, "step": 726 }, { "epoch": 0.07954483286831883, "grad_norm": 0.1409253627061844, "learning_rate": 0.00046047733742062626, "loss": 0.8864, "step": 727 }, { "epoch": 0.07965424804420373, "grad_norm": 0.12851783633232117, "learning_rate": 0.0004604225968907379, "loss": 0.8116, "step": 728 }, { "epoch": 0.07976366322008863, "grad_norm": 0.136226624250412, "learning_rate": 0.0004603678563608496, "loss": 0.8179, "step": 729 }, { "epoch": 0.07987307839597352, "grad_norm": 0.1396748274564743, "learning_rate": 0.00046031311583096126, "loss": 0.8422, "step": 730 }, { "epoch": 0.07998249357185842, "grad_norm": 0.13701552152633667, "learning_rate": 0.00046025837530107294, "loss": 0.8531, "step": 731 }, { "epoch": 0.08009190874774332, "grad_norm": 0.1328989863395691, "learning_rate": 0.0004602036347711846, "loss": 0.7655, "step": 732 }, { "epoch": 0.0802013239236282, "grad_norm": 0.13064415752887726, "learning_rate": 0.00046014889424129626, "loss": 0.7716, "step": 733 }, { "epoch": 0.0803107390995131, "grad_norm": 0.13497187197208405, "learning_rate": 0.00046009415371140794, "loss": 0.8002, "step": 734 }, { "epoch": 0.080420154275398, "grad_norm": 0.17116543650627136, "learning_rate": 0.0004600394131815196, "loss": 0.8126, "step": 735 }, { "epoch": 0.0805295694512829, "grad_norm": 0.1349925696849823, "learning_rate": 0.0004599846726516313, "loss": 0.8182, "step": 736 }, { "epoch": 0.08063898462716779, "grad_norm": 0.14530497789382935, "learning_rate": 0.00045992993212174294, "loss": 0.7823, "step": 737 }, { "epoch": 0.08074839980305268, "grad_norm": 0.13708613812923431, "learning_rate": 0.0004598751915918546, "loss": 0.7838, "step": 738 }, { "epoch": 0.08085781497893758, "grad_norm": 0.12676386535167694, "learning_rate": 0.0004598204510619663, "loss": 0.7625, "step": 739 }, { "epoch": 0.08096723015482248, "grad_norm": 0.13747204840183258, "learning_rate": 0.00045976571053207794, "loss": 0.7778, "step": 740 }, { "epoch": 0.08107664533070737, "grad_norm": 0.13029745221138, "learning_rate": 0.00045971097000218963, "loss": 0.7883, "step": 741 }, { "epoch": 0.08118606050659226, "grad_norm": 0.13534535467624664, "learning_rate": 0.0004596562294723013, "loss": 0.7992, "step": 742 }, { "epoch": 0.08129547568247716, "grad_norm": 0.12492301315069199, "learning_rate": 0.000459601488942413, "loss": 0.8075, "step": 743 }, { "epoch": 0.08140489085836206, "grad_norm": 0.1461811661720276, "learning_rate": 0.00045954674841252463, "loss": 0.7978, "step": 744 }, { "epoch": 0.08151430603424695, "grad_norm": 0.12807439267635345, "learning_rate": 0.0004594920078826363, "loss": 0.828, "step": 745 }, { "epoch": 0.08162372121013184, "grad_norm": 0.13595931231975555, "learning_rate": 0.000459437267352748, "loss": 0.8093, "step": 746 }, { "epoch": 0.08173313638601674, "grad_norm": 0.13752730190753937, "learning_rate": 0.00045938252682285963, "loss": 0.8755, "step": 747 }, { "epoch": 0.08184255156190164, "grad_norm": 0.116949662566185, "learning_rate": 0.0004593277862929713, "loss": 0.7302, "step": 748 }, { "epoch": 0.08195196673778653, "grad_norm": 0.13037700951099396, "learning_rate": 0.000459273045763083, "loss": 0.7818, "step": 749 }, { "epoch": 0.08206138191367143, "grad_norm": 0.13325439393520355, "learning_rate": 0.0004592183052331947, "loss": 0.798, "step": 750 }, { "epoch": 0.08217079708955632, "grad_norm": 0.1491130292415619, "learning_rate": 0.0004591635647033063, "loss": 0.7698, "step": 751 }, { "epoch": 0.08228021226544122, "grad_norm": 0.12912854552268982, "learning_rate": 0.000459108824173418, "loss": 0.7393, "step": 752 }, { "epoch": 0.08238962744132611, "grad_norm": 0.12378308922052383, "learning_rate": 0.0004590540836435297, "loss": 0.7588, "step": 753 }, { "epoch": 0.08249904261721101, "grad_norm": 0.1265900433063507, "learning_rate": 0.0004589993431136413, "loss": 0.8247, "step": 754 }, { "epoch": 0.0826084577930959, "grad_norm": 0.12409325689077377, "learning_rate": 0.00045894460258375305, "loss": 0.7868, "step": 755 }, { "epoch": 0.0827178729689808, "grad_norm": 0.13457505404949188, "learning_rate": 0.0004588898620538647, "loss": 0.7068, "step": 756 }, { "epoch": 0.08282728814486569, "grad_norm": 0.11714039742946625, "learning_rate": 0.00045883512152397637, "loss": 0.8502, "step": 757 }, { "epoch": 0.0829367033207506, "grad_norm": 0.1506553590297699, "learning_rate": 0.00045878038099408805, "loss": 0.8035, "step": 758 }, { "epoch": 0.08304611849663548, "grad_norm": 0.14593565464019775, "learning_rate": 0.0004587256404641997, "loss": 0.8372, "step": 759 }, { "epoch": 0.08315553367252038, "grad_norm": 0.12969183921813965, "learning_rate": 0.00045867089993431137, "loss": 0.8296, "step": 760 }, { "epoch": 0.08326494884840527, "grad_norm": 0.14958368241786957, "learning_rate": 0.00045861615940442305, "loss": 0.7974, "step": 761 }, { "epoch": 0.08337436402429017, "grad_norm": 0.15841010212898254, "learning_rate": 0.00045856141887453474, "loss": 0.8556, "step": 762 }, { "epoch": 0.08348377920017507, "grad_norm": 0.12097176164388657, "learning_rate": 0.00045850667834464637, "loss": 0.7399, "step": 763 }, { "epoch": 0.08359319437605996, "grad_norm": 0.1463671624660492, "learning_rate": 0.00045845193781475805, "loss": 0.7737, "step": 764 }, { "epoch": 0.08370260955194485, "grad_norm": 0.15766289830207825, "learning_rate": 0.00045839719728486974, "loss": 0.8425, "step": 765 }, { "epoch": 0.08381202472782975, "grad_norm": 0.13115522265434265, "learning_rate": 0.00045834245675498137, "loss": 0.7768, "step": 766 }, { "epoch": 0.08392143990371465, "grad_norm": 0.1365516483783722, "learning_rate": 0.00045828771622509305, "loss": 0.7871, "step": 767 }, { "epoch": 0.08403085507959954, "grad_norm": 0.12917247414588928, "learning_rate": 0.00045823297569520474, "loss": 0.727, "step": 768 }, { "epoch": 0.08414027025548444, "grad_norm": 0.12478167563676834, "learning_rate": 0.0004581782351653164, "loss": 0.8388, "step": 769 }, { "epoch": 0.08424968543136933, "grad_norm": 0.11924006044864655, "learning_rate": 0.00045812349463542805, "loss": 0.6959, "step": 770 }, { "epoch": 0.08435910060725423, "grad_norm": 0.12922175228595734, "learning_rate": 0.0004580687541055398, "loss": 0.8373, "step": 771 }, { "epoch": 0.08446851578313912, "grad_norm": 0.13486608862876892, "learning_rate": 0.0004580140135756514, "loss": 0.8139, "step": 772 }, { "epoch": 0.08457793095902402, "grad_norm": 0.1283293515443802, "learning_rate": 0.00045795927304576305, "loss": 0.7644, "step": 773 }, { "epoch": 0.08468734613490891, "grad_norm": 0.13156479597091675, "learning_rate": 0.0004579045325158748, "loss": 0.8672, "step": 774 }, { "epoch": 0.0847967613107938, "grad_norm": 0.1299055814743042, "learning_rate": 0.0004578497919859864, "loss": 0.8499, "step": 775 }, { "epoch": 0.0849061764866787, "grad_norm": 0.12746237218379974, "learning_rate": 0.0004577950514560981, "loss": 0.7914, "step": 776 }, { "epoch": 0.0850155916625636, "grad_norm": 0.12026140838861465, "learning_rate": 0.0004577403109262098, "loss": 0.757, "step": 777 }, { "epoch": 0.0851250068384485, "grad_norm": 0.12187577784061432, "learning_rate": 0.0004576855703963215, "loss": 0.7784, "step": 778 }, { "epoch": 0.08523442201433339, "grad_norm": 0.12545311450958252, "learning_rate": 0.0004576308298664331, "loss": 0.7363, "step": 779 }, { "epoch": 0.08534383719021828, "grad_norm": 0.13175129890441895, "learning_rate": 0.0004575760893365448, "loss": 0.8165, "step": 780 }, { "epoch": 0.08545325236610318, "grad_norm": 0.13319259881973267, "learning_rate": 0.0004575213488066565, "loss": 0.758, "step": 781 }, { "epoch": 0.08556266754198807, "grad_norm": 0.1395317167043686, "learning_rate": 0.0004574666082767681, "loss": 0.8706, "step": 782 }, { "epoch": 0.08567208271787297, "grad_norm": 0.16012714803218842, "learning_rate": 0.0004574118677468798, "loss": 0.7403, "step": 783 }, { "epoch": 0.08578149789375786, "grad_norm": 0.1328243911266327, "learning_rate": 0.0004573571272169915, "loss": 0.7532, "step": 784 }, { "epoch": 0.08589091306964276, "grad_norm": 0.12839773297309875, "learning_rate": 0.00045730238668710316, "loss": 0.8373, "step": 785 }, { "epoch": 0.08600032824552765, "grad_norm": 0.1196015253663063, "learning_rate": 0.0004572476461572148, "loss": 0.7828, "step": 786 }, { "epoch": 0.08610974342141255, "grad_norm": 0.12401208281517029, "learning_rate": 0.0004571929056273265, "loss": 0.7942, "step": 787 }, { "epoch": 0.08621915859729745, "grad_norm": 0.13554725050926208, "learning_rate": 0.00045713816509743816, "loss": 0.7973, "step": 788 }, { "epoch": 0.08632857377318234, "grad_norm": 0.1284472942352295, "learning_rate": 0.0004570834245675498, "loss": 0.8394, "step": 789 }, { "epoch": 0.08643798894906723, "grad_norm": 0.13878479599952698, "learning_rate": 0.00045702868403766153, "loss": 0.8059, "step": 790 }, { "epoch": 0.08654740412495213, "grad_norm": 0.133403941988945, "learning_rate": 0.00045697394350777316, "loss": 0.7456, "step": 791 }, { "epoch": 0.08665681930083703, "grad_norm": 0.1310684233903885, "learning_rate": 0.00045691920297788485, "loss": 0.8112, "step": 792 }, { "epoch": 0.08676623447672192, "grad_norm": 0.13146792352199554, "learning_rate": 0.00045686446244799653, "loss": 0.7989, "step": 793 }, { "epoch": 0.08687564965260682, "grad_norm": 0.12958961725234985, "learning_rate": 0.00045680972191810816, "loss": 0.7148, "step": 794 }, { "epoch": 0.0869850648284917, "grad_norm": 0.14529135823249817, "learning_rate": 0.00045675498138821985, "loss": 0.7541, "step": 795 }, { "epoch": 0.08709448000437661, "grad_norm": 0.12200675904750824, "learning_rate": 0.00045670024085833153, "loss": 0.7812, "step": 796 }, { "epoch": 0.0872038951802615, "grad_norm": 0.13477866351604462, "learning_rate": 0.0004566455003284432, "loss": 0.8254, "step": 797 }, { "epoch": 0.0873133103561464, "grad_norm": 0.14279486238956451, "learning_rate": 0.00045659075979855485, "loss": 0.8165, "step": 798 }, { "epoch": 0.08742272553203129, "grad_norm": 0.12621647119522095, "learning_rate": 0.0004565360192686665, "loss": 0.7018, "step": 799 }, { "epoch": 0.08753214070791619, "grad_norm": 0.13293705880641937, "learning_rate": 0.0004564812787387782, "loss": 0.7726, "step": 800 }, { "epoch": 0.08764155588380108, "grad_norm": 0.12643350660800934, "learning_rate": 0.00045642653820888985, "loss": 0.8035, "step": 801 }, { "epoch": 0.08775097105968598, "grad_norm": 0.12448395788669586, "learning_rate": 0.00045637179767900153, "loss": 0.756, "step": 802 }, { "epoch": 0.08786038623557087, "grad_norm": 0.12382738292217255, "learning_rate": 0.0004563170571491132, "loss": 0.7426, "step": 803 }, { "epoch": 0.08796980141145577, "grad_norm": 0.13370352983474731, "learning_rate": 0.0004562623166192249, "loss": 0.7884, "step": 804 }, { "epoch": 0.08807921658734066, "grad_norm": 0.12445847690105438, "learning_rate": 0.00045620757608933653, "loss": 0.8184, "step": 805 }, { "epoch": 0.08818863176322556, "grad_norm": 0.12635451555252075, "learning_rate": 0.0004561528355594482, "loss": 0.8078, "step": 806 }, { "epoch": 0.08829804693911046, "grad_norm": 0.12853336334228516, "learning_rate": 0.0004560980950295599, "loss": 0.7725, "step": 807 }, { "epoch": 0.08840746211499535, "grad_norm": 0.13253910839557648, "learning_rate": 0.00045604335449967153, "loss": 0.8211, "step": 808 }, { "epoch": 0.08851687729088024, "grad_norm": 0.13106602430343628, "learning_rate": 0.00045598861396978327, "loss": 0.7749, "step": 809 }, { "epoch": 0.08862629246676514, "grad_norm": 0.13255615532398224, "learning_rate": 0.0004559338734398949, "loss": 0.7923, "step": 810 }, { "epoch": 0.08873570764265004, "grad_norm": 0.1336010843515396, "learning_rate": 0.0004558791329100066, "loss": 0.756, "step": 811 }, { "epoch": 0.08884512281853493, "grad_norm": 0.14960592985153198, "learning_rate": 0.00045582439238011827, "loss": 0.6679, "step": 812 }, { "epoch": 0.08895453799441982, "grad_norm": 0.15116195380687714, "learning_rate": 0.0004557696518502299, "loss": 0.7654, "step": 813 }, { "epoch": 0.08906395317030472, "grad_norm": 0.13945285975933075, "learning_rate": 0.0004557149113203416, "loss": 0.7588, "step": 814 }, { "epoch": 0.08917336834618962, "grad_norm": 0.13368047773838043, "learning_rate": 0.00045566017079045327, "loss": 0.8254, "step": 815 }, { "epoch": 0.08928278352207451, "grad_norm": 0.15960678458213806, "learning_rate": 0.00045560543026056496, "loss": 0.7404, "step": 816 }, { "epoch": 0.0893921986979594, "grad_norm": 0.1291269063949585, "learning_rate": 0.0004555506897306766, "loss": 0.8311, "step": 817 }, { "epoch": 0.0895016138738443, "grad_norm": 0.14496581256389618, "learning_rate": 0.00045549594920078827, "loss": 0.8191, "step": 818 }, { "epoch": 0.0896110290497292, "grad_norm": 0.12724857032299042, "learning_rate": 0.00045544120867089996, "loss": 0.7858, "step": 819 }, { "epoch": 0.0897204442256141, "grad_norm": 0.15328818559646606, "learning_rate": 0.0004553864681410116, "loss": 0.8157, "step": 820 }, { "epoch": 0.08982985940149898, "grad_norm": 0.12095645070075989, "learning_rate": 0.00045533172761112327, "loss": 0.7306, "step": 821 }, { "epoch": 0.08993927457738388, "grad_norm": 0.13796503841876984, "learning_rate": 0.00045527698708123496, "loss": 0.7605, "step": 822 }, { "epoch": 0.09004868975326878, "grad_norm": 0.1254550665616989, "learning_rate": 0.00045522224655134664, "loss": 0.7448, "step": 823 }, { "epoch": 0.09015810492915367, "grad_norm": 0.16461820900440216, "learning_rate": 0.00045516750602145827, "loss": 0.836, "step": 824 }, { "epoch": 0.09026752010503857, "grad_norm": 0.1215628832578659, "learning_rate": 0.00045511276549157, "loss": 0.8003, "step": 825 }, { "epoch": 0.09037693528092347, "grad_norm": 0.12400868535041809, "learning_rate": 0.00045505802496168164, "loss": 0.7892, "step": 826 }, { "epoch": 0.09048635045680836, "grad_norm": 0.12768247723579407, "learning_rate": 0.00045500328443179327, "loss": 0.7247, "step": 827 }, { "epoch": 0.09059576563269325, "grad_norm": 0.13239333033561707, "learning_rate": 0.000454948543901905, "loss": 0.7582, "step": 828 }, { "epoch": 0.09070518080857815, "grad_norm": 0.12698513269424438, "learning_rate": 0.00045489380337201664, "loss": 0.7867, "step": 829 }, { "epoch": 0.09081459598446305, "grad_norm": 0.14535489678382874, "learning_rate": 0.0004548390628421283, "loss": 0.8315, "step": 830 }, { "epoch": 0.09092401116034794, "grad_norm": 0.12221015989780426, "learning_rate": 0.00045478432231224, "loss": 0.7008, "step": 831 }, { "epoch": 0.09103342633623283, "grad_norm": 0.1389721781015396, "learning_rate": 0.0004547295817823517, "loss": 0.8935, "step": 832 }, { "epoch": 0.09114284151211773, "grad_norm": 0.1352846473455429, "learning_rate": 0.0004546748412524633, "loss": 0.8277, "step": 833 }, { "epoch": 0.09125225668800263, "grad_norm": 0.12230843305587769, "learning_rate": 0.00045462010072257496, "loss": 0.8191, "step": 834 }, { "epoch": 0.09136167186388752, "grad_norm": 0.13898305594921112, "learning_rate": 0.0004545653601926867, "loss": 0.771, "step": 835 }, { "epoch": 0.09147108703977241, "grad_norm": 0.1404215693473816, "learning_rate": 0.0004545106196627983, "loss": 0.8107, "step": 836 }, { "epoch": 0.0915805022156573, "grad_norm": 0.1333947628736496, "learning_rate": 0.00045445587913291, "loss": 0.7718, "step": 837 }, { "epoch": 0.09168991739154221, "grad_norm": 0.1258363425731659, "learning_rate": 0.0004544011386030217, "loss": 0.758, "step": 838 }, { "epoch": 0.0917993325674271, "grad_norm": 0.1407511681318283, "learning_rate": 0.0004543463980731334, "loss": 0.8552, "step": 839 }, { "epoch": 0.091908747743312, "grad_norm": 0.1348382532596588, "learning_rate": 0.000454291657543245, "loss": 0.7539, "step": 840 }, { "epoch": 0.09201816291919689, "grad_norm": 0.130409836769104, "learning_rate": 0.0004542369170133567, "loss": 0.7938, "step": 841 }, { "epoch": 0.09212757809508179, "grad_norm": 0.1348240077495575, "learning_rate": 0.0004541821764834684, "loss": 0.7617, "step": 842 }, { "epoch": 0.09223699327096668, "grad_norm": 0.13386191427707672, "learning_rate": 0.00045412743595358, "loss": 0.7728, "step": 843 }, { "epoch": 0.09234640844685157, "grad_norm": 0.12400679290294647, "learning_rate": 0.00045407269542369175, "loss": 0.7927, "step": 844 }, { "epoch": 0.09245582362273648, "grad_norm": 0.11994948983192444, "learning_rate": 0.0004540179548938034, "loss": 0.7771, "step": 845 }, { "epoch": 0.09256523879862137, "grad_norm": 0.1302652209997177, "learning_rate": 0.00045396321436391506, "loss": 0.7494, "step": 846 }, { "epoch": 0.09267465397450626, "grad_norm": 0.12986743450164795, "learning_rate": 0.00045390847383402675, "loss": 0.8138, "step": 847 }, { "epoch": 0.09278406915039115, "grad_norm": 0.13425888121128082, "learning_rate": 0.0004538537333041384, "loss": 0.7756, "step": 848 }, { "epoch": 0.09289348432627606, "grad_norm": 0.12459810823202133, "learning_rate": 0.00045379899277425006, "loss": 0.7741, "step": 849 }, { "epoch": 0.09300289950216095, "grad_norm": 0.1383737325668335, "learning_rate": 0.00045374425224436175, "loss": 0.8325, "step": 850 }, { "epoch": 0.09311231467804584, "grad_norm": 0.13608716428279877, "learning_rate": 0.00045368951171447343, "loss": 0.7756, "step": 851 }, { "epoch": 0.09322172985393073, "grad_norm": 0.13889335095882416, "learning_rate": 0.00045363477118458506, "loss": 0.7732, "step": 852 }, { "epoch": 0.09333114502981564, "grad_norm": 0.1280822902917862, "learning_rate": 0.00045358003065469675, "loss": 0.8564, "step": 853 }, { "epoch": 0.09344056020570053, "grad_norm": 0.14072707295417786, "learning_rate": 0.00045352529012480843, "loss": 0.8155, "step": 854 }, { "epoch": 0.09354997538158542, "grad_norm": 0.13418373465538025, "learning_rate": 0.00045347054959492006, "loss": 0.8179, "step": 855 }, { "epoch": 0.09365939055747032, "grad_norm": 0.12718930840492249, "learning_rate": 0.00045341580906503175, "loss": 0.8311, "step": 856 }, { "epoch": 0.09376880573335522, "grad_norm": 0.13539299368858337, "learning_rate": 0.00045336106853514343, "loss": 0.7597, "step": 857 }, { "epoch": 0.09387822090924011, "grad_norm": 0.13393492996692657, "learning_rate": 0.0004533063280052551, "loss": 0.7633, "step": 858 }, { "epoch": 0.093987636085125, "grad_norm": 0.12403690814971924, "learning_rate": 0.00045325158747536675, "loss": 0.8246, "step": 859 }, { "epoch": 0.0940970512610099, "grad_norm": 0.14093652367591858, "learning_rate": 0.0004531968469454785, "loss": 0.8478, "step": 860 }, { "epoch": 0.0942064664368948, "grad_norm": 0.1299886852502823, "learning_rate": 0.0004531421064155901, "loss": 0.8033, "step": 861 }, { "epoch": 0.09431588161277969, "grad_norm": 0.1436314731836319, "learning_rate": 0.00045308736588570175, "loss": 0.756, "step": 862 }, { "epoch": 0.09442529678866458, "grad_norm": 0.14625628292560577, "learning_rate": 0.0004530326253558135, "loss": 0.8077, "step": 863 }, { "epoch": 0.09453471196454949, "grad_norm": 0.13311560451984406, "learning_rate": 0.0004529778848259251, "loss": 0.7904, "step": 864 }, { "epoch": 0.09464412714043438, "grad_norm": 0.13658975064754486, "learning_rate": 0.0004529231442960368, "loss": 0.7551, "step": 865 }, { "epoch": 0.09475354231631927, "grad_norm": 0.12488414347171783, "learning_rate": 0.0004528684037661485, "loss": 0.7791, "step": 866 }, { "epoch": 0.09486295749220416, "grad_norm": 0.16200841963291168, "learning_rate": 0.0004528136632362601, "loss": 0.769, "step": 867 }, { "epoch": 0.09497237266808907, "grad_norm": 0.13552823662757874, "learning_rate": 0.0004527589227063718, "loss": 0.8406, "step": 868 }, { "epoch": 0.09508178784397396, "grad_norm": 0.13387542963027954, "learning_rate": 0.00045270418217648343, "loss": 0.8225, "step": 869 }, { "epoch": 0.09519120301985885, "grad_norm": 0.13191357254981995, "learning_rate": 0.00045264944164659517, "loss": 0.7619, "step": 870 }, { "epoch": 0.09530061819574374, "grad_norm": 0.12455244362354279, "learning_rate": 0.0004525947011167068, "loss": 0.7855, "step": 871 }, { "epoch": 0.09541003337162865, "grad_norm": 0.1381181925535202, "learning_rate": 0.0004525399605868185, "loss": 0.8178, "step": 872 }, { "epoch": 0.09551944854751354, "grad_norm": 0.12198858708143234, "learning_rate": 0.00045248522005693017, "loss": 0.811, "step": 873 }, { "epoch": 0.09562886372339843, "grad_norm": 0.13506944477558136, "learning_rate": 0.0004524304795270418, "loss": 0.8515, "step": 874 }, { "epoch": 0.09573827889928332, "grad_norm": 0.11900053173303604, "learning_rate": 0.0004523757389971535, "loss": 0.7405, "step": 875 }, { "epoch": 0.09584769407516823, "grad_norm": 0.13938312232494354, "learning_rate": 0.00045232099846726517, "loss": 0.765, "step": 876 }, { "epoch": 0.09595710925105312, "grad_norm": 0.13506482541561127, "learning_rate": 0.00045226625793737686, "loss": 0.8249, "step": 877 }, { "epoch": 0.09606652442693801, "grad_norm": 0.13080205023288727, "learning_rate": 0.0004522115174074885, "loss": 0.835, "step": 878 }, { "epoch": 0.0961759396028229, "grad_norm": 0.12592864036560059, "learning_rate": 0.0004521567768776002, "loss": 0.7856, "step": 879 }, { "epoch": 0.09628535477870781, "grad_norm": 0.13010521233081818, "learning_rate": 0.00045210203634771186, "loss": 0.7482, "step": 880 }, { "epoch": 0.0963947699545927, "grad_norm": 0.1319974660873413, "learning_rate": 0.0004520472958178235, "loss": 0.769, "step": 881 }, { "epoch": 0.0965041851304776, "grad_norm": 0.13264569640159607, "learning_rate": 0.0004519925552879352, "loss": 0.7728, "step": 882 }, { "epoch": 0.0966136003063625, "grad_norm": 0.1272253692150116, "learning_rate": 0.00045193781475804686, "loss": 0.7635, "step": 883 }, { "epoch": 0.09672301548224739, "grad_norm": 0.13206911087036133, "learning_rate": 0.00045188307422815854, "loss": 0.8642, "step": 884 }, { "epoch": 0.09683243065813228, "grad_norm": 0.12514425814151764, "learning_rate": 0.0004518283336982702, "loss": 0.7729, "step": 885 }, { "epoch": 0.09694184583401717, "grad_norm": 0.13075041770935059, "learning_rate": 0.0004517735931683819, "loss": 0.8154, "step": 886 }, { "epoch": 0.09705126100990208, "grad_norm": 0.1345946490764618, "learning_rate": 0.00045171885263849354, "loss": 0.7227, "step": 887 }, { "epoch": 0.09716067618578697, "grad_norm": 0.13903819024562836, "learning_rate": 0.00045166411210860517, "loss": 0.8337, "step": 888 }, { "epoch": 0.09727009136167186, "grad_norm": 0.1348152756690979, "learning_rate": 0.0004516093715787169, "loss": 0.8058, "step": 889 }, { "epoch": 0.09737950653755675, "grad_norm": 0.1311562955379486, "learning_rate": 0.00045155463104882854, "loss": 0.812, "step": 890 }, { "epoch": 0.09748892171344166, "grad_norm": 0.12518268823623657, "learning_rate": 0.0004514998905189402, "loss": 0.7878, "step": 891 }, { "epoch": 0.09759833688932655, "grad_norm": 0.12454608082771301, "learning_rate": 0.0004514451499890519, "loss": 0.8559, "step": 892 }, { "epoch": 0.09770775206521144, "grad_norm": 0.12925975024700165, "learning_rate": 0.0004513904094591636, "loss": 0.7896, "step": 893 }, { "epoch": 0.09781716724109633, "grad_norm": 0.12917102873325348, "learning_rate": 0.0004513356689292752, "loss": 0.8312, "step": 894 }, { "epoch": 0.09792658241698124, "grad_norm": 0.13915930688381195, "learning_rate": 0.0004512809283993869, "loss": 0.8597, "step": 895 }, { "epoch": 0.09803599759286613, "grad_norm": 0.11723369359970093, "learning_rate": 0.0004512261878694986, "loss": 0.8087, "step": 896 }, { "epoch": 0.09814541276875102, "grad_norm": 0.13288523256778717, "learning_rate": 0.0004511714473396102, "loss": 0.8145, "step": 897 }, { "epoch": 0.09825482794463591, "grad_norm": 0.14985594153404236, "learning_rate": 0.00045111670680972197, "loss": 0.781, "step": 898 }, { "epoch": 0.09836424312052082, "grad_norm": 0.12683513760566711, "learning_rate": 0.0004510619662798336, "loss": 0.7186, "step": 899 }, { "epoch": 0.09847365829640571, "grad_norm": 0.15821225941181183, "learning_rate": 0.0004510072257499453, "loss": 0.7815, "step": 900 }, { "epoch": 0.0985830734722906, "grad_norm": 0.12333038449287415, "learning_rate": 0.00045095248522005697, "loss": 0.7847, "step": 901 }, { "epoch": 0.09869248864817551, "grad_norm": 0.14353390038013458, "learning_rate": 0.0004508977446901686, "loss": 0.8657, "step": 902 }, { "epoch": 0.0988019038240604, "grad_norm": 0.1506509929895401, "learning_rate": 0.0004508430041602803, "loss": 0.8579, "step": 903 }, { "epoch": 0.09891131899994529, "grad_norm": 0.13668878376483917, "learning_rate": 0.0004507882636303919, "loss": 0.8436, "step": 904 }, { "epoch": 0.09902073417583018, "grad_norm": 0.14859361946582794, "learning_rate": 0.00045073352310050365, "loss": 0.7659, "step": 905 }, { "epoch": 0.09913014935171509, "grad_norm": 0.12982383370399475, "learning_rate": 0.0004506787825706153, "loss": 0.8125, "step": 906 }, { "epoch": 0.09923956452759998, "grad_norm": 0.14861206710338593, "learning_rate": 0.00045062404204072697, "loss": 0.7861, "step": 907 }, { "epoch": 0.09934897970348487, "grad_norm": 0.17041583359241486, "learning_rate": 0.00045056930151083865, "loss": 0.8402, "step": 908 }, { "epoch": 0.09945839487936976, "grad_norm": 0.13023744523525238, "learning_rate": 0.0004505145609809503, "loss": 0.7616, "step": 909 }, { "epoch": 0.09956781005525467, "grad_norm": 0.18082945048809052, "learning_rate": 0.00045045982045106197, "loss": 0.7837, "step": 910 }, { "epoch": 0.09967722523113956, "grad_norm": 0.1330898255109787, "learning_rate": 0.00045040507992117365, "loss": 0.7268, "step": 911 }, { "epoch": 0.09978664040702445, "grad_norm": 0.1201239824295044, "learning_rate": 0.00045035033939128534, "loss": 0.7648, "step": 912 }, { "epoch": 0.09989605558290934, "grad_norm": 0.1424979865550995, "learning_rate": 0.00045029559886139697, "loss": 0.8071, "step": 913 }, { "epoch": 0.10000547075879425, "grad_norm": 0.14414188265800476, "learning_rate": 0.0004502408583315087, "loss": 0.7925, "step": 914 }, { "epoch": 0.10011488593467914, "grad_norm": 0.1314583122730255, "learning_rate": 0.00045018611780162034, "loss": 0.8373, "step": 915 }, { "epoch": 0.10022430111056403, "grad_norm": 0.1319127380847931, "learning_rate": 0.00045013137727173197, "loss": 0.7714, "step": 916 }, { "epoch": 0.10033371628644894, "grad_norm": 0.12201649695634842, "learning_rate": 0.0004500766367418437, "loss": 0.766, "step": 917 }, { "epoch": 0.10044313146233383, "grad_norm": 0.15227025747299194, "learning_rate": 0.00045002189621195534, "loss": 0.7894, "step": 918 }, { "epoch": 0.10055254663821872, "grad_norm": 0.13453732430934906, "learning_rate": 0.000449967155682067, "loss": 0.8109, "step": 919 }, { "epoch": 0.10066196181410361, "grad_norm": 0.14098048210144043, "learning_rate": 0.0004499124151521787, "loss": 0.7829, "step": 920 }, { "epoch": 0.10077137698998852, "grad_norm": 0.1319231241941452, "learning_rate": 0.0004498576746222904, "loss": 0.7628, "step": 921 }, { "epoch": 0.10088079216587341, "grad_norm": 0.13196790218353271, "learning_rate": 0.000449802934092402, "loss": 0.7714, "step": 922 }, { "epoch": 0.1009902073417583, "grad_norm": 0.14268504083156586, "learning_rate": 0.00044974819356251365, "loss": 0.7394, "step": 923 }, { "epoch": 0.10109962251764319, "grad_norm": 0.12704773247241974, "learning_rate": 0.0004496934530326254, "loss": 0.7387, "step": 924 }, { "epoch": 0.1012090376935281, "grad_norm": 0.13709773123264313, "learning_rate": 0.000449638712502737, "loss": 0.7454, "step": 925 }, { "epoch": 0.10131845286941299, "grad_norm": 0.1277209371328354, "learning_rate": 0.0004495839719728487, "loss": 0.7484, "step": 926 }, { "epoch": 0.10142786804529788, "grad_norm": 0.14560799300670624, "learning_rate": 0.0004495292314429604, "loss": 0.792, "step": 927 }, { "epoch": 0.10153728322118277, "grad_norm": 0.12648740410804749, "learning_rate": 0.000449474490913072, "loss": 0.8183, "step": 928 }, { "epoch": 0.10164669839706768, "grad_norm": 0.1352737694978714, "learning_rate": 0.0004494197503831837, "loss": 0.7679, "step": 929 }, { "epoch": 0.10175611357295257, "grad_norm": 0.15277786552906036, "learning_rate": 0.0004493650098532954, "loss": 0.7519, "step": 930 }, { "epoch": 0.10186552874883746, "grad_norm": 0.12293413281440735, "learning_rate": 0.0004493102693234071, "loss": 0.775, "step": 931 }, { "epoch": 0.10197494392472235, "grad_norm": 0.13168272376060486, "learning_rate": 0.0004492555287935187, "loss": 0.8015, "step": 932 }, { "epoch": 0.10208435910060726, "grad_norm": 0.1431524157524109, "learning_rate": 0.00044920078826363044, "loss": 0.8114, "step": 933 }, { "epoch": 0.10219377427649215, "grad_norm": 0.1353079378604889, "learning_rate": 0.0004491460477337421, "loss": 0.7533, "step": 934 }, { "epoch": 0.10230318945237704, "grad_norm": 0.13141769170761108, "learning_rate": 0.0004490913072038537, "loss": 0.8162, "step": 935 }, { "epoch": 0.10241260462826195, "grad_norm": 0.13079199194908142, "learning_rate": 0.00044903656667396544, "loss": 0.8132, "step": 936 }, { "epoch": 0.10252201980414684, "grad_norm": 0.13032720983028412, "learning_rate": 0.0004489818261440771, "loss": 0.8286, "step": 937 }, { "epoch": 0.10263143498003173, "grad_norm": 0.12879133224487305, "learning_rate": 0.00044892708561418876, "loss": 0.7371, "step": 938 }, { "epoch": 0.10274085015591662, "grad_norm": 0.12975215911865234, "learning_rate": 0.0004488723450843004, "loss": 0.7895, "step": 939 }, { "epoch": 0.10285026533180153, "grad_norm": 0.1332099735736847, "learning_rate": 0.00044881760455441213, "loss": 0.7853, "step": 940 }, { "epoch": 0.10295968050768642, "grad_norm": 0.1320764124393463, "learning_rate": 0.00044876286402452376, "loss": 0.8064, "step": 941 }, { "epoch": 0.10306909568357131, "grad_norm": 0.1274600327014923, "learning_rate": 0.0004487081234946354, "loss": 0.8307, "step": 942 }, { "epoch": 0.1031785108594562, "grad_norm": 0.12957219779491425, "learning_rate": 0.00044865338296474713, "loss": 0.8129, "step": 943 }, { "epoch": 0.10328792603534111, "grad_norm": 0.12994976341724396, "learning_rate": 0.00044859864243485876, "loss": 0.8133, "step": 944 }, { "epoch": 0.103397341211226, "grad_norm": 0.13373355567455292, "learning_rate": 0.00044854390190497044, "loss": 0.7286, "step": 945 }, { "epoch": 0.10350675638711089, "grad_norm": 0.13426929712295532, "learning_rate": 0.00044848916137508213, "loss": 0.808, "step": 946 }, { "epoch": 0.10361617156299578, "grad_norm": 0.14540019631385803, "learning_rate": 0.0004484344208451938, "loss": 0.8161, "step": 947 }, { "epoch": 0.10372558673888069, "grad_norm": 0.14169450104236603, "learning_rate": 0.00044837968031530544, "loss": 0.7757, "step": 948 }, { "epoch": 0.10383500191476558, "grad_norm": 0.13643689453601837, "learning_rate": 0.00044832493978541713, "loss": 0.785, "step": 949 }, { "epoch": 0.10394441709065047, "grad_norm": 0.14777769148349762, "learning_rate": 0.0004482701992555288, "loss": 0.7428, "step": 950 }, { "epoch": 0.10405383226653536, "grad_norm": 0.1227537989616394, "learning_rate": 0.00044821545872564044, "loss": 0.8217, "step": 951 }, { "epoch": 0.10416324744242027, "grad_norm": 0.13628049194812775, "learning_rate": 0.0004481607181957522, "loss": 0.79, "step": 952 }, { "epoch": 0.10427266261830516, "grad_norm": 0.12361959367990494, "learning_rate": 0.0004481059776658638, "loss": 0.7754, "step": 953 }, { "epoch": 0.10438207779419005, "grad_norm": 0.1461232304573059, "learning_rate": 0.0004480512371359755, "loss": 0.7361, "step": 954 }, { "epoch": 0.10449149297007496, "grad_norm": 0.1306305080652237, "learning_rate": 0.0004479964966060872, "loss": 0.7339, "step": 955 }, { "epoch": 0.10460090814595985, "grad_norm": 0.1357058584690094, "learning_rate": 0.0004479417560761988, "loss": 0.7876, "step": 956 }, { "epoch": 0.10471032332184474, "grad_norm": 0.1358271837234497, "learning_rate": 0.0004478870155463105, "loss": 0.7357, "step": 957 }, { "epoch": 0.10481973849772963, "grad_norm": 0.15451757609844208, "learning_rate": 0.00044783227501642213, "loss": 0.7493, "step": 958 }, { "epoch": 0.10492915367361454, "grad_norm": 0.13164077699184418, "learning_rate": 0.00044777753448653387, "loss": 0.8281, "step": 959 }, { "epoch": 0.10503856884949943, "grad_norm": 0.13828858733177185, "learning_rate": 0.0004477227939566455, "loss": 0.8481, "step": 960 }, { "epoch": 0.10514798402538432, "grad_norm": 0.13938112556934357, "learning_rate": 0.0004476680534267572, "loss": 0.9154, "step": 961 }, { "epoch": 0.10525739920126921, "grad_norm": 0.13653281331062317, "learning_rate": 0.00044761331289686887, "loss": 0.7995, "step": 962 }, { "epoch": 0.10536681437715412, "grad_norm": 0.12309497594833374, "learning_rate": 0.0004475585723669805, "loss": 0.7739, "step": 963 }, { "epoch": 0.10547622955303901, "grad_norm": 0.13955220580101013, "learning_rate": 0.0004475038318370922, "loss": 0.9002, "step": 964 }, { "epoch": 0.1055856447289239, "grad_norm": 0.12930788099765778, "learning_rate": 0.00044744909130720387, "loss": 0.7399, "step": 965 }, { "epoch": 0.10569505990480879, "grad_norm": 0.1366620808839798, "learning_rate": 0.00044739435077731555, "loss": 0.7229, "step": 966 }, { "epoch": 0.1058044750806937, "grad_norm": 0.1330488920211792, "learning_rate": 0.0004473396102474272, "loss": 0.7984, "step": 967 }, { "epoch": 0.10591389025657859, "grad_norm": 0.1287115067243576, "learning_rate": 0.0004472848697175389, "loss": 0.8171, "step": 968 }, { "epoch": 0.10602330543246348, "grad_norm": 0.1275072693824768, "learning_rate": 0.00044723012918765055, "loss": 0.7928, "step": 969 }, { "epoch": 0.10613272060834837, "grad_norm": 0.13001388311386108, "learning_rate": 0.0004471753886577622, "loss": 0.8268, "step": 970 }, { "epoch": 0.10624213578423328, "grad_norm": 0.14156094193458557, "learning_rate": 0.0004471206481278739, "loss": 0.7461, "step": 971 }, { "epoch": 0.10635155096011817, "grad_norm": 0.12408027052879333, "learning_rate": 0.00044706590759798555, "loss": 0.7529, "step": 972 }, { "epoch": 0.10646096613600306, "grad_norm": 0.1415116935968399, "learning_rate": 0.00044701116706809724, "loss": 0.7345, "step": 973 }, { "epoch": 0.10657038131188797, "grad_norm": 0.13719205558300018, "learning_rate": 0.00044695642653820887, "loss": 0.7634, "step": 974 }, { "epoch": 0.10667979648777286, "grad_norm": 0.1260707974433899, "learning_rate": 0.0004469016860083206, "loss": 0.751, "step": 975 }, { "epoch": 0.10678921166365775, "grad_norm": 0.13430964946746826, "learning_rate": 0.00044684694547843224, "loss": 0.8175, "step": 976 }, { "epoch": 0.10689862683954264, "grad_norm": 0.13415879011154175, "learning_rate": 0.00044679220494854387, "loss": 0.7737, "step": 977 }, { "epoch": 0.10700804201542755, "grad_norm": 0.13064701855182648, "learning_rate": 0.0004467374644186556, "loss": 0.8187, "step": 978 }, { "epoch": 0.10711745719131244, "grad_norm": 0.1309397965669632, "learning_rate": 0.00044668272388876724, "loss": 0.7711, "step": 979 }, { "epoch": 0.10722687236719733, "grad_norm": 0.13807454705238342, "learning_rate": 0.0004466279833588789, "loss": 0.844, "step": 980 }, { "epoch": 0.10733628754308222, "grad_norm": 0.12134860455989838, "learning_rate": 0.0004465732428289906, "loss": 0.7733, "step": 981 }, { "epoch": 0.10744570271896713, "grad_norm": 0.12328996509313583, "learning_rate": 0.0004465185022991023, "loss": 0.8046, "step": 982 }, { "epoch": 0.10755511789485202, "grad_norm": 0.13826784491539001, "learning_rate": 0.0004464637617692139, "loss": 0.7971, "step": 983 }, { "epoch": 0.10766453307073691, "grad_norm": 0.13583026826381683, "learning_rate": 0.0004464090212393256, "loss": 0.7911, "step": 984 }, { "epoch": 0.1077739482466218, "grad_norm": 0.13675393164157867, "learning_rate": 0.0004463542807094373, "loss": 0.7732, "step": 985 }, { "epoch": 0.1078833634225067, "grad_norm": 0.12852011620998383, "learning_rate": 0.0004462995401795489, "loss": 0.7934, "step": 986 }, { "epoch": 0.1079927785983916, "grad_norm": 0.13517022132873535, "learning_rate": 0.00044624479964966066, "loss": 0.7325, "step": 987 }, { "epoch": 0.10810219377427649, "grad_norm": 0.12992070615291595, "learning_rate": 0.0004461900591197723, "loss": 0.7478, "step": 988 }, { "epoch": 0.10821160895016138, "grad_norm": 0.15335342288017273, "learning_rate": 0.0004461353185898839, "loss": 0.8266, "step": 989 }, { "epoch": 0.10832102412604629, "grad_norm": 0.1259089559316635, "learning_rate": 0.0004460805780599956, "loss": 0.771, "step": 990 }, { "epoch": 0.10843043930193118, "grad_norm": 0.1366206407546997, "learning_rate": 0.0004460258375301073, "loss": 0.7729, "step": 991 }, { "epoch": 0.10853985447781607, "grad_norm": 0.1322328895330429, "learning_rate": 0.000445971097000219, "loss": 0.7515, "step": 992 }, { "epoch": 0.10864926965370098, "grad_norm": 0.14436480402946472, "learning_rate": 0.0004459163564703306, "loss": 0.7777, "step": 993 }, { "epoch": 0.10875868482958587, "grad_norm": 0.13576185703277588, "learning_rate": 0.00044586161594044235, "loss": 0.7794, "step": 994 }, { "epoch": 0.10886810000547076, "grad_norm": 0.13443642854690552, "learning_rate": 0.000445806875410554, "loss": 0.7907, "step": 995 }, { "epoch": 0.10897751518135565, "grad_norm": 0.1381470263004303, "learning_rate": 0.0004457521348806656, "loss": 0.6953, "step": 996 }, { "epoch": 0.10908693035724056, "grad_norm": 0.11961760371923447, "learning_rate": 0.00044569739435077735, "loss": 0.7909, "step": 997 }, { "epoch": 0.10919634553312545, "grad_norm": 0.13247431814670563, "learning_rate": 0.000445642653820889, "loss": 0.7734, "step": 998 }, { "epoch": 0.10930576070901034, "grad_norm": 0.12336277216672897, "learning_rate": 0.00044558791329100066, "loss": 0.8228, "step": 999 }, { "epoch": 0.10941517588489523, "grad_norm": 0.12705232203006744, "learning_rate": 0.00044553317276111235, "loss": 0.8246, "step": 1000 }, { "epoch": 0.10952459106078014, "grad_norm": 0.12456828355789185, "learning_rate": 0.00044547843223122403, "loss": 0.7289, "step": 1001 }, { "epoch": 0.10963400623666503, "grad_norm": 0.13424871861934662, "learning_rate": 0.00044542369170133566, "loss": 0.7155, "step": 1002 }, { "epoch": 0.10974342141254992, "grad_norm": 0.134383887052536, "learning_rate": 0.00044536895117144735, "loss": 0.7498, "step": 1003 }, { "epoch": 0.10985283658843481, "grad_norm": 0.12783612310886383, "learning_rate": 0.00044531421064155903, "loss": 0.7767, "step": 1004 }, { "epoch": 0.10996225176431972, "grad_norm": 0.13858047127723694, "learning_rate": 0.00044525947011167066, "loss": 0.7852, "step": 1005 }, { "epoch": 0.11007166694020461, "grad_norm": 0.12759718298912048, "learning_rate": 0.0004452047295817824, "loss": 0.7572, "step": 1006 }, { "epoch": 0.1101810821160895, "grad_norm": 0.12997287511825562, "learning_rate": 0.00044514998905189403, "loss": 0.7803, "step": 1007 }, { "epoch": 0.11029049729197439, "grad_norm": 0.12527695298194885, "learning_rate": 0.0004450952485220057, "loss": 0.7384, "step": 1008 }, { "epoch": 0.1103999124678593, "grad_norm": 0.13485771417617798, "learning_rate": 0.00044504050799211735, "loss": 0.6983, "step": 1009 }, { "epoch": 0.11050932764374419, "grad_norm": 0.12797512114048004, "learning_rate": 0.00044498576746222903, "loss": 0.7729, "step": 1010 }, { "epoch": 0.11061874281962908, "grad_norm": 0.12598799169063568, "learning_rate": 0.0004449310269323407, "loss": 0.7826, "step": 1011 }, { "epoch": 0.11072815799551398, "grad_norm": 0.1288139969110489, "learning_rate": 0.00044487628640245235, "loss": 0.7843, "step": 1012 }, { "epoch": 0.11083757317139888, "grad_norm": 0.1305469572544098, "learning_rate": 0.0004448215458725641, "loss": 0.7634, "step": 1013 }, { "epoch": 0.11094698834728377, "grad_norm": 0.12389127910137177, "learning_rate": 0.0004447668053426757, "loss": 0.8276, "step": 1014 }, { "epoch": 0.11105640352316866, "grad_norm": 0.13536781072616577, "learning_rate": 0.0004447120648127874, "loss": 0.8113, "step": 1015 }, { "epoch": 0.11116581869905356, "grad_norm": 0.1299169510602951, "learning_rate": 0.0004446573242828991, "loss": 0.7913, "step": 1016 }, { "epoch": 0.11127523387493846, "grad_norm": 0.1322363317012787, "learning_rate": 0.0004446025837530107, "loss": 0.7861, "step": 1017 }, { "epoch": 0.11138464905082335, "grad_norm": 0.13953304290771484, "learning_rate": 0.0004445478432231224, "loss": 0.7426, "step": 1018 }, { "epoch": 0.11149406422670824, "grad_norm": 0.13578711450099945, "learning_rate": 0.0004444931026932341, "loss": 0.8461, "step": 1019 }, { "epoch": 0.11160347940259314, "grad_norm": 0.13135883212089539, "learning_rate": 0.00044443836216334577, "loss": 0.755, "step": 1020 }, { "epoch": 0.11171289457847804, "grad_norm": 0.11800549179315567, "learning_rate": 0.0004443836216334574, "loss": 0.7312, "step": 1021 }, { "epoch": 0.11182230975436293, "grad_norm": 0.139238640666008, "learning_rate": 0.00044432888110356914, "loss": 0.785, "step": 1022 }, { "epoch": 0.11193172493024782, "grad_norm": 0.13069406151771545, "learning_rate": 0.00044427414057368077, "loss": 0.8341, "step": 1023 }, { "epoch": 0.11204114010613273, "grad_norm": 0.14362677931785583, "learning_rate": 0.0004442194000437924, "loss": 0.7662, "step": 1024 }, { "epoch": 0.11215055528201762, "grad_norm": 0.14535081386566162, "learning_rate": 0.0004441646595139041, "loss": 0.7539, "step": 1025 }, { "epoch": 0.11225997045790251, "grad_norm": 0.1405307501554489, "learning_rate": 0.00044410991898401577, "loss": 0.7457, "step": 1026 }, { "epoch": 0.1123693856337874, "grad_norm": 0.13889093697071075, "learning_rate": 0.00044405517845412745, "loss": 0.8255, "step": 1027 }, { "epoch": 0.1124788008096723, "grad_norm": 0.1214916929602623, "learning_rate": 0.0004440004379242391, "loss": 0.765, "step": 1028 }, { "epoch": 0.1125882159855572, "grad_norm": 0.13284996151924133, "learning_rate": 0.0004439456973943508, "loss": 0.8089, "step": 1029 }, { "epoch": 0.11269763116144209, "grad_norm": 0.12932129204273224, "learning_rate": 0.00044389095686446245, "loss": 0.7914, "step": 1030 }, { "epoch": 0.112807046337327, "grad_norm": 0.13474150002002716, "learning_rate": 0.0004438362163345741, "loss": 0.8124, "step": 1031 }, { "epoch": 0.11291646151321189, "grad_norm": 0.12624211609363556, "learning_rate": 0.0004437814758046858, "loss": 0.7398, "step": 1032 }, { "epoch": 0.11302587668909678, "grad_norm": 0.12032605707645416, "learning_rate": 0.00044372673527479745, "loss": 0.7627, "step": 1033 }, { "epoch": 0.11313529186498167, "grad_norm": 0.14337024092674255, "learning_rate": 0.00044367199474490914, "loss": 0.8248, "step": 1034 }, { "epoch": 0.11324470704086657, "grad_norm": 0.12730887532234192, "learning_rate": 0.0004436172542150208, "loss": 0.7589, "step": 1035 }, { "epoch": 0.11335412221675147, "grad_norm": 0.12947671115398407, "learning_rate": 0.0004435625136851325, "loss": 0.746, "step": 1036 }, { "epoch": 0.11346353739263636, "grad_norm": 0.13235624134540558, "learning_rate": 0.00044350777315524414, "loss": 0.7463, "step": 1037 }, { "epoch": 0.11357295256852125, "grad_norm": 0.1227552592754364, "learning_rate": 0.0004434530326253558, "loss": 0.7329, "step": 1038 }, { "epoch": 0.11368236774440615, "grad_norm": 0.13684602081775665, "learning_rate": 0.0004433982920954675, "loss": 0.7765, "step": 1039 }, { "epoch": 0.11379178292029105, "grad_norm": 0.1298614889383316, "learning_rate": 0.00044334355156557914, "loss": 0.7453, "step": 1040 }, { "epoch": 0.11390119809617594, "grad_norm": 0.12696249783039093, "learning_rate": 0.0004432888110356909, "loss": 0.7675, "step": 1041 }, { "epoch": 0.11401061327206083, "grad_norm": 0.13396140933036804, "learning_rate": 0.0004432340705058025, "loss": 0.7668, "step": 1042 }, { "epoch": 0.11412002844794573, "grad_norm": 0.13220083713531494, "learning_rate": 0.0004431793299759142, "loss": 0.769, "step": 1043 }, { "epoch": 0.11422944362383063, "grad_norm": 0.13527829945087433, "learning_rate": 0.0004431245894460258, "loss": 0.7677, "step": 1044 }, { "epoch": 0.11433885879971552, "grad_norm": 0.12606564164161682, "learning_rate": 0.0004430698489161375, "loss": 0.7885, "step": 1045 }, { "epoch": 0.11444827397560041, "grad_norm": 0.1336769461631775, "learning_rate": 0.0004430151083862492, "loss": 0.7435, "step": 1046 }, { "epoch": 0.11455768915148531, "grad_norm": 0.12855565547943115, "learning_rate": 0.0004429603678563608, "loss": 0.77, "step": 1047 }, { "epoch": 0.1146671043273702, "grad_norm": 0.13434453308582306, "learning_rate": 0.00044290562732647256, "loss": 0.7697, "step": 1048 }, { "epoch": 0.1147765195032551, "grad_norm": 0.14219850301742554, "learning_rate": 0.0004428508867965842, "loss": 0.7708, "step": 1049 }, { "epoch": 0.11488593467914, "grad_norm": 0.1333613395690918, "learning_rate": 0.0004427961462666959, "loss": 0.7225, "step": 1050 }, { "epoch": 0.1149953498550249, "grad_norm": 0.12290302664041519, "learning_rate": 0.00044274140573680756, "loss": 0.7492, "step": 1051 }, { "epoch": 0.11510476503090979, "grad_norm": 0.14548704028129578, "learning_rate": 0.0004426866652069192, "loss": 0.7527, "step": 1052 }, { "epoch": 0.11521418020679468, "grad_norm": 0.11695350706577301, "learning_rate": 0.0004426319246770309, "loss": 0.7478, "step": 1053 }, { "epoch": 0.11532359538267958, "grad_norm": 0.13109999895095825, "learning_rate": 0.00044257718414714256, "loss": 0.8121, "step": 1054 }, { "epoch": 0.11543301055856448, "grad_norm": 0.14163804054260254, "learning_rate": 0.00044252244361725425, "loss": 0.7955, "step": 1055 }, { "epoch": 0.11554242573444937, "grad_norm": 0.12898865342140198, "learning_rate": 0.0004424677030873659, "loss": 0.7475, "step": 1056 }, { "epoch": 0.11565184091033426, "grad_norm": 0.12029261887073517, "learning_rate": 0.00044241296255747756, "loss": 0.7192, "step": 1057 }, { "epoch": 0.11576125608621916, "grad_norm": 0.13619616627693176, "learning_rate": 0.00044235822202758925, "loss": 0.8009, "step": 1058 }, { "epoch": 0.11587067126210406, "grad_norm": 0.13375337421894073, "learning_rate": 0.0004423034814977009, "loss": 0.7869, "step": 1059 }, { "epoch": 0.11598008643798895, "grad_norm": 0.12112840265035629, "learning_rate": 0.00044224874096781256, "loss": 0.7485, "step": 1060 }, { "epoch": 0.11608950161387384, "grad_norm": 0.1314300298690796, "learning_rate": 0.00044219400043792425, "loss": 0.7571, "step": 1061 }, { "epoch": 0.11619891678975874, "grad_norm": 0.12144117802381516, "learning_rate": 0.00044213925990803593, "loss": 0.7869, "step": 1062 }, { "epoch": 0.11630833196564364, "grad_norm": 0.13495445251464844, "learning_rate": 0.00044208451937814756, "loss": 0.7773, "step": 1063 }, { "epoch": 0.11641774714152853, "grad_norm": 0.11821703612804413, "learning_rate": 0.00044202977884825925, "loss": 0.8025, "step": 1064 }, { "epoch": 0.11652716231741342, "grad_norm": 0.12675173580646515, "learning_rate": 0.00044197503831837093, "loss": 0.8253, "step": 1065 }, { "epoch": 0.11663657749329832, "grad_norm": 0.12554557621479034, "learning_rate": 0.00044192029778848256, "loss": 0.7861, "step": 1066 }, { "epoch": 0.11674599266918322, "grad_norm": 0.13366426527500153, "learning_rate": 0.0004418655572585943, "loss": 0.7871, "step": 1067 }, { "epoch": 0.11685540784506811, "grad_norm": 0.1291031390428543, "learning_rate": 0.00044181081672870593, "loss": 0.728, "step": 1068 }, { "epoch": 0.11696482302095301, "grad_norm": 0.1254938393831253, "learning_rate": 0.0004417560761988176, "loss": 0.7994, "step": 1069 }, { "epoch": 0.1170742381968379, "grad_norm": 0.13304096460342407, "learning_rate": 0.0004417013356689293, "loss": 0.74, "step": 1070 }, { "epoch": 0.1171836533727228, "grad_norm": 0.13056455552577972, "learning_rate": 0.00044164659513904093, "loss": 0.7247, "step": 1071 }, { "epoch": 0.11729306854860769, "grad_norm": 0.13367599248886108, "learning_rate": 0.0004415918546091526, "loss": 0.74, "step": 1072 }, { "epoch": 0.11740248372449259, "grad_norm": 0.13441628217697144, "learning_rate": 0.0004415371140792643, "loss": 0.7347, "step": 1073 }, { "epoch": 0.11751189890037748, "grad_norm": 0.12244869768619537, "learning_rate": 0.000441482373549376, "loss": 0.7392, "step": 1074 }, { "epoch": 0.11762131407626238, "grad_norm": 0.12894991040229797, "learning_rate": 0.0004414276330194876, "loss": 0.7525, "step": 1075 }, { "epoch": 0.11773072925214727, "grad_norm": 0.13036777079105377, "learning_rate": 0.00044137289248959936, "loss": 0.7296, "step": 1076 }, { "epoch": 0.11784014442803217, "grad_norm": 0.14531676471233368, "learning_rate": 0.000441318151959711, "loss": 0.7549, "step": 1077 }, { "epoch": 0.11794955960391706, "grad_norm": 0.1592281460762024, "learning_rate": 0.0004412634114298226, "loss": 0.8069, "step": 1078 }, { "epoch": 0.11805897477980196, "grad_norm": 0.1485714316368103, "learning_rate": 0.0004412086708999343, "loss": 0.8245, "step": 1079 }, { "epoch": 0.11816838995568685, "grad_norm": 0.14192278683185577, "learning_rate": 0.000441153930370046, "loss": 0.7955, "step": 1080 }, { "epoch": 0.11827780513157175, "grad_norm": 0.13488787412643433, "learning_rate": 0.00044109918984015767, "loss": 0.6834, "step": 1081 }, { "epoch": 0.11838722030745664, "grad_norm": 0.1197369247674942, "learning_rate": 0.0004410444493102693, "loss": 0.8, "step": 1082 }, { "epoch": 0.11849663548334154, "grad_norm": 0.15575090050697327, "learning_rate": 0.00044098970878038104, "loss": 0.7744, "step": 1083 }, { "epoch": 0.11860605065922643, "grad_norm": 0.12576475739479065, "learning_rate": 0.00044093496825049267, "loss": 0.8084, "step": 1084 }, { "epoch": 0.11871546583511133, "grad_norm": 0.13382244110107422, "learning_rate": 0.0004408802277206043, "loss": 0.7622, "step": 1085 }, { "epoch": 0.11882488101099623, "grad_norm": 0.12559066712856293, "learning_rate": 0.00044082548719071604, "loss": 0.7256, "step": 1086 }, { "epoch": 0.11893429618688112, "grad_norm": 0.13809314370155334, "learning_rate": 0.00044077074666082767, "loss": 0.7657, "step": 1087 }, { "epoch": 0.11904371136276602, "grad_norm": 0.131216362118721, "learning_rate": 0.00044071600613093936, "loss": 0.8098, "step": 1088 }, { "epoch": 0.11915312653865091, "grad_norm": 0.1170535609126091, "learning_rate": 0.00044066126560105104, "loss": 0.7474, "step": 1089 }, { "epoch": 0.1192625417145358, "grad_norm": 0.11713378876447678, "learning_rate": 0.0004406065250711627, "loss": 0.7953, "step": 1090 }, { "epoch": 0.1193719568904207, "grad_norm": 0.12003420293331146, "learning_rate": 0.00044055178454127436, "loss": 0.7615, "step": 1091 }, { "epoch": 0.1194813720663056, "grad_norm": 0.12679164111614227, "learning_rate": 0.00044049704401138604, "loss": 0.7631, "step": 1092 }, { "epoch": 0.1195907872421905, "grad_norm": 0.1327028125524521, "learning_rate": 0.0004404423034814977, "loss": 0.7701, "step": 1093 }, { "epoch": 0.11970020241807539, "grad_norm": 0.11966891586780548, "learning_rate": 0.00044038756295160936, "loss": 0.7642, "step": 1094 }, { "epoch": 0.11980961759396028, "grad_norm": 0.11959035694599152, "learning_rate": 0.00044033282242172104, "loss": 0.7533, "step": 1095 }, { "epoch": 0.11991903276984518, "grad_norm": 0.13304780423641205, "learning_rate": 0.0004402780818918327, "loss": 0.8226, "step": 1096 }, { "epoch": 0.12002844794573007, "grad_norm": 0.13597477972507477, "learning_rate": 0.0004402233413619444, "loss": 0.8356, "step": 1097 }, { "epoch": 0.12013786312161497, "grad_norm": 0.1298782229423523, "learning_rate": 0.00044016860083205604, "loss": 0.7086, "step": 1098 }, { "epoch": 0.12024727829749986, "grad_norm": 0.13741499185562134, "learning_rate": 0.0004401138603021677, "loss": 0.753, "step": 1099 }, { "epoch": 0.12035669347338476, "grad_norm": 0.13556331396102905, "learning_rate": 0.0004400591197722794, "loss": 0.7769, "step": 1100 }, { "epoch": 0.12046610864926965, "grad_norm": 0.12200785428285599, "learning_rate": 0.00044000437924239104, "loss": 0.8042, "step": 1101 }, { "epoch": 0.12057552382515455, "grad_norm": 0.12551499903202057, "learning_rate": 0.0004399496387125028, "loss": 0.7532, "step": 1102 }, { "epoch": 0.12068493900103944, "grad_norm": 0.12719234824180603, "learning_rate": 0.0004398948981826144, "loss": 0.7698, "step": 1103 }, { "epoch": 0.12079435417692434, "grad_norm": 0.12838374078273773, "learning_rate": 0.0004398401576527261, "loss": 0.7514, "step": 1104 }, { "epoch": 0.12090376935280923, "grad_norm": 0.1346876323223114, "learning_rate": 0.0004397854171228378, "loss": 0.8248, "step": 1105 }, { "epoch": 0.12101318452869413, "grad_norm": 0.1377728283405304, "learning_rate": 0.0004397306765929494, "loss": 0.8361, "step": 1106 }, { "epoch": 0.12112259970457903, "grad_norm": 0.12911072373390198, "learning_rate": 0.0004396759360630611, "loss": 0.7058, "step": 1107 }, { "epoch": 0.12123201488046392, "grad_norm": 0.13019248843193054, "learning_rate": 0.0004396211955331728, "loss": 0.7319, "step": 1108 }, { "epoch": 0.12134143005634881, "grad_norm": 0.13676296174526215, "learning_rate": 0.00043956645500328447, "loss": 0.809, "step": 1109 }, { "epoch": 0.1214508452322337, "grad_norm": 0.12073907256126404, "learning_rate": 0.0004395117144733961, "loss": 0.7566, "step": 1110 }, { "epoch": 0.12156026040811861, "grad_norm": 0.13152284920215607, "learning_rate": 0.00043945697394350784, "loss": 0.8395, "step": 1111 }, { "epoch": 0.1216696755840035, "grad_norm": 0.13532520830631256, "learning_rate": 0.00043940223341361947, "loss": 0.8327, "step": 1112 }, { "epoch": 0.1217790907598884, "grad_norm": 0.1292223185300827, "learning_rate": 0.0004393474928837311, "loss": 0.8071, "step": 1113 }, { "epoch": 0.12188850593577329, "grad_norm": 0.1269766092300415, "learning_rate": 0.0004392927523538428, "loss": 0.8504, "step": 1114 }, { "epoch": 0.12199792111165819, "grad_norm": 0.1321382373571396, "learning_rate": 0.00043923801182395447, "loss": 0.7638, "step": 1115 }, { "epoch": 0.12210733628754308, "grad_norm": 0.13590598106384277, "learning_rate": 0.00043918327129406615, "loss": 0.8095, "step": 1116 }, { "epoch": 0.12221675146342798, "grad_norm": 0.12761558592319489, "learning_rate": 0.0004391285307641778, "loss": 0.7502, "step": 1117 }, { "epoch": 0.12232616663931287, "grad_norm": 0.1326517015695572, "learning_rate": 0.00043907379023428947, "loss": 0.8202, "step": 1118 }, { "epoch": 0.12243558181519777, "grad_norm": 0.143679678440094, "learning_rate": 0.00043901904970440115, "loss": 0.7744, "step": 1119 }, { "epoch": 0.12254499699108266, "grad_norm": 0.15425191819667816, "learning_rate": 0.0004389643091745128, "loss": 0.9176, "step": 1120 }, { "epoch": 0.12265441216696756, "grad_norm": 0.14009669423103333, "learning_rate": 0.0004389095686446245, "loss": 0.7772, "step": 1121 }, { "epoch": 0.12276382734285245, "grad_norm": 0.12876300513744354, "learning_rate": 0.00043885482811473615, "loss": 0.807, "step": 1122 }, { "epoch": 0.12287324251873735, "grad_norm": 0.1545405387878418, "learning_rate": 0.00043880008758484784, "loss": 0.7989, "step": 1123 }, { "epoch": 0.12298265769462224, "grad_norm": 0.12171108275651932, "learning_rate": 0.0004387453470549595, "loss": 0.7716, "step": 1124 }, { "epoch": 0.12309207287050714, "grad_norm": 0.15100206434726715, "learning_rate": 0.00043869060652507115, "loss": 0.7507, "step": 1125 }, { "epoch": 0.12320148804639204, "grad_norm": 0.13011153042316437, "learning_rate": 0.00043863586599518284, "loss": 0.8026, "step": 1126 }, { "epoch": 0.12331090322227693, "grad_norm": 0.1497156172990799, "learning_rate": 0.0004385811254652945, "loss": 0.8125, "step": 1127 }, { "epoch": 0.12342031839816182, "grad_norm": 0.132900670170784, "learning_rate": 0.0004385263849354062, "loss": 0.796, "step": 1128 }, { "epoch": 0.12352973357404672, "grad_norm": 0.13192936778068542, "learning_rate": 0.00043847164440551784, "loss": 0.819, "step": 1129 }, { "epoch": 0.12363914874993162, "grad_norm": 0.1334887593984604, "learning_rate": 0.0004384169038756295, "loss": 0.8496, "step": 1130 }, { "epoch": 0.12374856392581651, "grad_norm": 0.12504062056541443, "learning_rate": 0.0004383621633457412, "loss": 0.7736, "step": 1131 }, { "epoch": 0.1238579791017014, "grad_norm": 0.13605064153671265, "learning_rate": 0.00043830742281585284, "loss": 0.7994, "step": 1132 }, { "epoch": 0.1239673942775863, "grad_norm": 0.13616980612277985, "learning_rate": 0.0004382526822859645, "loss": 0.8325, "step": 1133 }, { "epoch": 0.1240768094534712, "grad_norm": 0.13959208130836487, "learning_rate": 0.0004381979417560762, "loss": 0.8276, "step": 1134 }, { "epoch": 0.12418622462935609, "grad_norm": 0.1294364631175995, "learning_rate": 0.0004381432012261879, "loss": 0.8518, "step": 1135 }, { "epoch": 0.12429563980524098, "grad_norm": 0.12846042215824127, "learning_rate": 0.0004380884606962995, "loss": 0.776, "step": 1136 }, { "epoch": 0.12440505498112588, "grad_norm": 0.13048383593559265, "learning_rate": 0.00043803372016641126, "loss": 0.7514, "step": 1137 }, { "epoch": 0.12451447015701078, "grad_norm": 0.1378210186958313, "learning_rate": 0.0004379789796365229, "loss": 0.7708, "step": 1138 }, { "epoch": 0.12462388533289567, "grad_norm": 0.13358299434185028, "learning_rate": 0.0004379242391066345, "loss": 0.7403, "step": 1139 }, { "epoch": 0.12473330050878056, "grad_norm": 0.13265816867351532, "learning_rate": 0.00043786949857674626, "loss": 0.7653, "step": 1140 }, { "epoch": 0.12484271568466546, "grad_norm": 0.12494358420372009, "learning_rate": 0.0004378147580468579, "loss": 0.6986, "step": 1141 }, { "epoch": 0.12495213086055036, "grad_norm": 0.1496211290359497, "learning_rate": 0.0004377600175169696, "loss": 0.7527, "step": 1142 }, { "epoch": 0.12506154603643527, "grad_norm": 0.12276919186115265, "learning_rate": 0.00043770527698708126, "loss": 0.7954, "step": 1143 }, { "epoch": 0.12517096121232016, "grad_norm": 0.126662939786911, "learning_rate": 0.00043765053645719294, "loss": 0.734, "step": 1144 }, { "epoch": 0.12528037638820505, "grad_norm": 0.1426427811384201, "learning_rate": 0.0004375957959273046, "loss": 0.7681, "step": 1145 }, { "epoch": 0.12538979156408994, "grad_norm": 0.14840476214885712, "learning_rate": 0.00043754105539741626, "loss": 0.7638, "step": 1146 }, { "epoch": 0.12549920673997483, "grad_norm": 0.12506920099258423, "learning_rate": 0.00043748631486752794, "loss": 0.7476, "step": 1147 }, { "epoch": 0.12560862191585973, "grad_norm": 0.13591055572032928, "learning_rate": 0.0004374315743376396, "loss": 0.8024, "step": 1148 }, { "epoch": 0.12571803709174462, "grad_norm": 0.13331814110279083, "learning_rate": 0.00043737683380775126, "loss": 0.7886, "step": 1149 }, { "epoch": 0.1258274522676295, "grad_norm": 0.12547822296619415, "learning_rate": 0.00043732209327786294, "loss": 0.7316, "step": 1150 }, { "epoch": 0.12593686744351443, "grad_norm": 0.150065615773201, "learning_rate": 0.00043726735274797463, "loss": 0.7291, "step": 1151 }, { "epoch": 0.12604628261939932, "grad_norm": 0.130365788936615, "learning_rate": 0.00043721261221808626, "loss": 0.7436, "step": 1152 }, { "epoch": 0.1261556977952842, "grad_norm": 0.13981714844703674, "learning_rate": 0.00043715787168819794, "loss": 0.7459, "step": 1153 }, { "epoch": 0.1262651129711691, "grad_norm": 0.1366787701845169, "learning_rate": 0.00043710313115830963, "loss": 0.7913, "step": 1154 }, { "epoch": 0.126374528147054, "grad_norm": 0.1365724802017212, "learning_rate": 0.00043704839062842126, "loss": 0.7729, "step": 1155 }, { "epoch": 0.12648394332293889, "grad_norm": 0.16093815863132477, "learning_rate": 0.000436993650098533, "loss": 0.7787, "step": 1156 }, { "epoch": 0.12659335849882378, "grad_norm": 0.12994886934757233, "learning_rate": 0.00043693890956864463, "loss": 0.6733, "step": 1157 }, { "epoch": 0.12670277367470867, "grad_norm": 0.14282678067684174, "learning_rate": 0.0004368841690387563, "loss": 0.8059, "step": 1158 }, { "epoch": 0.1268121888505936, "grad_norm": 0.13004565238952637, "learning_rate": 0.000436829428508868, "loss": 0.817, "step": 1159 }, { "epoch": 0.12692160402647848, "grad_norm": 0.13264061510562897, "learning_rate": 0.00043677468797897963, "loss": 0.7658, "step": 1160 }, { "epoch": 0.12703101920236337, "grad_norm": 0.13007225096225739, "learning_rate": 0.0004367199474490913, "loss": 0.821, "step": 1161 }, { "epoch": 0.12714043437824826, "grad_norm": 0.12610454857349396, "learning_rate": 0.000436665206919203, "loss": 0.8383, "step": 1162 }, { "epoch": 0.12724984955413315, "grad_norm": 0.16056965291500092, "learning_rate": 0.0004366104663893147, "loss": 0.7602, "step": 1163 }, { "epoch": 0.12735926473001805, "grad_norm": 0.1486276537179947, "learning_rate": 0.0004365557258594263, "loss": 0.9063, "step": 1164 }, { "epoch": 0.12746867990590294, "grad_norm": 0.133860245347023, "learning_rate": 0.000436500985329538, "loss": 0.7382, "step": 1165 }, { "epoch": 0.12757809508178786, "grad_norm": 0.1574244648218155, "learning_rate": 0.0004364462447996497, "loss": 0.749, "step": 1166 }, { "epoch": 0.12768751025767275, "grad_norm": 0.13888847827911377, "learning_rate": 0.0004363915042697613, "loss": 0.7909, "step": 1167 }, { "epoch": 0.12779692543355764, "grad_norm": 0.12209299206733704, "learning_rate": 0.000436336763739873, "loss": 0.7222, "step": 1168 }, { "epoch": 0.12790634060944253, "grad_norm": 0.17961803078651428, "learning_rate": 0.0004362820232099847, "loss": 0.7682, "step": 1169 }, { "epoch": 0.12801575578532742, "grad_norm": 0.1341380476951599, "learning_rate": 0.00043622728268009637, "loss": 0.7638, "step": 1170 }, { "epoch": 0.12812517096121231, "grad_norm": 0.13182243704795837, "learning_rate": 0.000436172542150208, "loss": 0.7312, "step": 1171 }, { "epoch": 0.1282345861370972, "grad_norm": 0.13689130544662476, "learning_rate": 0.00043611780162031974, "loss": 0.7893, "step": 1172 }, { "epoch": 0.1283440013129821, "grad_norm": 0.14147675037384033, "learning_rate": 0.00043606306109043137, "loss": 0.8035, "step": 1173 }, { "epoch": 0.12845341648886702, "grad_norm": 0.1377492994070053, "learning_rate": 0.000436008320560543, "loss": 0.8243, "step": 1174 }, { "epoch": 0.1285628316647519, "grad_norm": 0.13435527682304382, "learning_rate": 0.00043595358003065474, "loss": 0.7738, "step": 1175 }, { "epoch": 0.1286722468406368, "grad_norm": 0.13645008206367493, "learning_rate": 0.00043589883950076637, "loss": 0.7853, "step": 1176 }, { "epoch": 0.1287816620165217, "grad_norm": 0.15436284244060516, "learning_rate": 0.00043584409897087805, "loss": 0.7809, "step": 1177 }, { "epoch": 0.12889107719240658, "grad_norm": 0.12589530646800995, "learning_rate": 0.00043578935844098974, "loss": 0.7636, "step": 1178 }, { "epoch": 0.12900049236829148, "grad_norm": 0.13672082126140594, "learning_rate": 0.00043573461791110137, "loss": 0.7314, "step": 1179 }, { "epoch": 0.12910990754417637, "grad_norm": 0.1406700611114502, "learning_rate": 0.00043567987738121305, "loss": 0.8231, "step": 1180 }, { "epoch": 0.12921932272006129, "grad_norm": 0.19643892347812653, "learning_rate": 0.00043562513685132474, "loss": 0.7587, "step": 1181 }, { "epoch": 0.12932873789594618, "grad_norm": 0.12397599220275879, "learning_rate": 0.0004355703963214364, "loss": 0.7762, "step": 1182 }, { "epoch": 0.12943815307183107, "grad_norm": 0.14394938945770264, "learning_rate": 0.00043551565579154805, "loss": 0.8138, "step": 1183 }, { "epoch": 0.12954756824771596, "grad_norm": 0.1412685364484787, "learning_rate": 0.00043546091526165974, "loss": 0.7117, "step": 1184 }, { "epoch": 0.12965698342360085, "grad_norm": 0.14082439243793488, "learning_rate": 0.0004354061747317714, "loss": 0.7806, "step": 1185 }, { "epoch": 0.12976639859948574, "grad_norm": 0.1373262256383896, "learning_rate": 0.00043535143420188305, "loss": 0.8075, "step": 1186 }, { "epoch": 0.12987581377537064, "grad_norm": 0.1362428218126297, "learning_rate": 0.00043529669367199474, "loss": 0.7201, "step": 1187 }, { "epoch": 0.12998522895125553, "grad_norm": 0.1372116059064865, "learning_rate": 0.0004352419531421064, "loss": 0.7999, "step": 1188 }, { "epoch": 0.13009464412714045, "grad_norm": 0.1493496149778366, "learning_rate": 0.0004351872126122181, "loss": 0.7812, "step": 1189 }, { "epoch": 0.13020405930302534, "grad_norm": 0.13253554701805115, "learning_rate": 0.00043513247208232974, "loss": 0.8844, "step": 1190 }, { "epoch": 0.13031347447891023, "grad_norm": 0.14732947945594788, "learning_rate": 0.0004350777315524415, "loss": 0.7237, "step": 1191 }, { "epoch": 0.13042288965479512, "grad_norm": 0.18449175357818604, "learning_rate": 0.0004350229910225531, "loss": 0.7629, "step": 1192 }, { "epoch": 0.13053230483068, "grad_norm": 0.14982518553733826, "learning_rate": 0.00043496825049266474, "loss": 0.7746, "step": 1193 }, { "epoch": 0.1306417200065649, "grad_norm": 0.39108192920684814, "learning_rate": 0.0004349135099627765, "loss": 0.7162, "step": 1194 }, { "epoch": 0.1307511351824498, "grad_norm": 0.20944374799728394, "learning_rate": 0.0004348587694328881, "loss": 0.8308, "step": 1195 }, { "epoch": 0.1308605503583347, "grad_norm": 0.1352107971906662, "learning_rate": 0.0004348040289029998, "loss": 0.7842, "step": 1196 }, { "epoch": 0.1309699655342196, "grad_norm": 0.1621432602405548, "learning_rate": 0.0004347492883731115, "loss": 0.7105, "step": 1197 }, { "epoch": 0.1310793807101045, "grad_norm": 0.2577078640460968, "learning_rate": 0.00043469454784322316, "loss": 0.7584, "step": 1198 }, { "epoch": 0.1311887958859894, "grad_norm": 0.14671602845191956, "learning_rate": 0.0004346398073133348, "loss": 0.7559, "step": 1199 }, { "epoch": 0.13129821106187428, "grad_norm": 0.15495435893535614, "learning_rate": 0.0004345850667834464, "loss": 0.7494, "step": 1200 }, { "epoch": 0.13140762623775917, "grad_norm": 0.1493402123451233, "learning_rate": 0.00043453032625355816, "loss": 0.8364, "step": 1201 }, { "epoch": 0.13151704141364406, "grad_norm": 0.14528483152389526, "learning_rate": 0.0004344755857236698, "loss": 0.745, "step": 1202 }, { "epoch": 0.13162645658952896, "grad_norm": 0.1387147307395935, "learning_rate": 0.0004344208451937815, "loss": 0.7793, "step": 1203 }, { "epoch": 0.13173587176541388, "grad_norm": 0.14273737370967865, "learning_rate": 0.00043436610466389316, "loss": 0.7683, "step": 1204 }, { "epoch": 0.13184528694129877, "grad_norm": 0.153502956032753, "learning_rate": 0.00043431136413400485, "loss": 0.8696, "step": 1205 }, { "epoch": 0.13195470211718366, "grad_norm": 0.13589438796043396, "learning_rate": 0.0004342566236041165, "loss": 0.7936, "step": 1206 }, { "epoch": 0.13206411729306855, "grad_norm": 0.15617847442626953, "learning_rate": 0.00043420188307422816, "loss": 0.7442, "step": 1207 }, { "epoch": 0.13217353246895344, "grad_norm": 0.15253420174121857, "learning_rate": 0.00043414714254433985, "loss": 0.8063, "step": 1208 }, { "epoch": 0.13228294764483833, "grad_norm": 0.13717269897460938, "learning_rate": 0.0004340924020144515, "loss": 0.7736, "step": 1209 }, { "epoch": 0.13239236282072323, "grad_norm": 0.14210516214370728, "learning_rate": 0.0004340376614845632, "loss": 0.7955, "step": 1210 }, { "epoch": 0.13250177799660812, "grad_norm": 0.1542780101299286, "learning_rate": 0.00043398292095467485, "loss": 0.7502, "step": 1211 }, { "epoch": 0.13261119317249304, "grad_norm": 0.14640051126480103, "learning_rate": 0.00043392818042478653, "loss": 0.7959, "step": 1212 }, { "epoch": 0.13272060834837793, "grad_norm": 0.14602446556091309, "learning_rate": 0.0004338734398948982, "loss": 0.8462, "step": 1213 }, { "epoch": 0.13283002352426282, "grad_norm": 0.13805072009563446, "learning_rate": 0.00043381869936500985, "loss": 0.8199, "step": 1214 }, { "epoch": 0.1329394387001477, "grad_norm": 0.1911824494600296, "learning_rate": 0.00043376395883512153, "loss": 0.7982, "step": 1215 }, { "epoch": 0.1330488538760326, "grad_norm": 0.13687466084957123, "learning_rate": 0.0004337092183052332, "loss": 0.7628, "step": 1216 }, { "epoch": 0.1331582690519175, "grad_norm": 0.16205120086669922, "learning_rate": 0.0004336544777753449, "loss": 0.7551, "step": 1217 }, { "epoch": 0.13326768422780239, "grad_norm": 0.13975821435451508, "learning_rate": 0.00043359973724545653, "loss": 0.7658, "step": 1218 }, { "epoch": 0.1333770994036873, "grad_norm": 0.15233905613422394, "learning_rate": 0.0004335449967155682, "loss": 0.7018, "step": 1219 }, { "epoch": 0.1334865145795722, "grad_norm": 0.15656650066375732, "learning_rate": 0.0004334902561856799, "loss": 0.8269, "step": 1220 }, { "epoch": 0.1335959297554571, "grad_norm": 0.14820407330989838, "learning_rate": 0.00043343551565579153, "loss": 0.7171, "step": 1221 }, { "epoch": 0.13370534493134198, "grad_norm": 0.1466151624917984, "learning_rate": 0.0004333807751259032, "loss": 0.8716, "step": 1222 }, { "epoch": 0.13381476010722687, "grad_norm": 0.1546681672334671, "learning_rate": 0.0004333260345960149, "loss": 0.8042, "step": 1223 }, { "epoch": 0.13392417528311176, "grad_norm": 0.15298831462860107, "learning_rate": 0.0004332712940661266, "loss": 0.7643, "step": 1224 }, { "epoch": 0.13403359045899665, "grad_norm": 0.14478518068790436, "learning_rate": 0.0004332165535362382, "loss": 0.7336, "step": 1225 }, { "epoch": 0.13414300563488155, "grad_norm": 0.14360474050045013, "learning_rate": 0.00043316181300634995, "loss": 0.821, "step": 1226 }, { "epoch": 0.13425242081076647, "grad_norm": 0.14628145098686218, "learning_rate": 0.0004331070724764616, "loss": 0.6808, "step": 1227 }, { "epoch": 0.13436183598665136, "grad_norm": 0.14383678138256073, "learning_rate": 0.0004330523319465732, "loss": 0.7476, "step": 1228 }, { "epoch": 0.13447125116253625, "grad_norm": 0.1475321501493454, "learning_rate": 0.00043299759141668495, "loss": 0.7676, "step": 1229 }, { "epoch": 0.13458066633842114, "grad_norm": 0.15485474467277527, "learning_rate": 0.0004329428508867966, "loss": 0.796, "step": 1230 }, { "epoch": 0.13469008151430603, "grad_norm": 0.141950324177742, "learning_rate": 0.00043288811035690827, "loss": 0.8137, "step": 1231 }, { "epoch": 0.13479949669019092, "grad_norm": 0.1389899104833603, "learning_rate": 0.00043283336982701995, "loss": 0.7659, "step": 1232 }, { "epoch": 0.13490891186607581, "grad_norm": 0.13521866500377655, "learning_rate": 0.00043277862929713164, "loss": 0.6593, "step": 1233 }, { "epoch": 0.1350183270419607, "grad_norm": 0.1500558704137802, "learning_rate": 0.00043272388876724327, "loss": 0.7849, "step": 1234 }, { "epoch": 0.13512774221784563, "grad_norm": 0.14103363454341888, "learning_rate": 0.0004326691482373549, "loss": 0.7456, "step": 1235 }, { "epoch": 0.13523715739373052, "grad_norm": 0.1372871845960617, "learning_rate": 0.00043261440770746664, "loss": 0.7289, "step": 1236 }, { "epoch": 0.1353465725696154, "grad_norm": 0.14821115136146545, "learning_rate": 0.00043255966717757827, "loss": 0.7715, "step": 1237 }, { "epoch": 0.1354559877455003, "grad_norm": 0.136243999004364, "learning_rate": 0.00043250492664768995, "loss": 0.7032, "step": 1238 }, { "epoch": 0.1355654029213852, "grad_norm": 0.13866601884365082, "learning_rate": 0.00043245018611780164, "loss": 0.7716, "step": 1239 }, { "epoch": 0.13567481809727008, "grad_norm": 0.15790672600269318, "learning_rate": 0.0004323954455879133, "loss": 0.785, "step": 1240 }, { "epoch": 0.13578423327315498, "grad_norm": 0.151424378156662, "learning_rate": 0.00043234070505802495, "loss": 0.7389, "step": 1241 }, { "epoch": 0.1358936484490399, "grad_norm": 0.14574922621250153, "learning_rate": 0.00043228596452813664, "loss": 0.781, "step": 1242 }, { "epoch": 0.13600306362492479, "grad_norm": 0.14764772355556488, "learning_rate": 0.0004322312239982483, "loss": 0.8053, "step": 1243 }, { "epoch": 0.13611247880080968, "grad_norm": 0.148560032248497, "learning_rate": 0.00043217648346835995, "loss": 0.7816, "step": 1244 }, { "epoch": 0.13622189397669457, "grad_norm": 0.151231586933136, "learning_rate": 0.0004321217429384717, "loss": 0.8124, "step": 1245 }, { "epoch": 0.13633130915257946, "grad_norm": 0.20962515473365784, "learning_rate": 0.0004320670024085833, "loss": 0.8109, "step": 1246 }, { "epoch": 0.13644072432846435, "grad_norm": 0.5634602904319763, "learning_rate": 0.00043201226187869495, "loss": 0.7749, "step": 1247 }, { "epoch": 0.13655013950434924, "grad_norm": 0.3542329668998718, "learning_rate": 0.0004319575213488067, "loss": 0.8194, "step": 1248 }, { "epoch": 0.13665955468023414, "grad_norm": 0.38452109694480896, "learning_rate": 0.0004319027808189183, "loss": 0.8377, "step": 1249 }, { "epoch": 0.13676896985611905, "grad_norm": 0.5101556777954102, "learning_rate": 0.00043184804028903, "loss": 0.8471, "step": 1250 }, { "epoch": 0.13687838503200395, "grad_norm": 0.2097257822751999, "learning_rate": 0.0004317932997591417, "loss": 0.798, "step": 1251 }, { "epoch": 0.13698780020788884, "grad_norm": 0.5947410464286804, "learning_rate": 0.0004317385592292534, "loss": 0.8077, "step": 1252 }, { "epoch": 0.13709721538377373, "grad_norm": 2.876098394393921, "learning_rate": 0.000431683818699365, "loss": 0.9649, "step": 1253 }, { "epoch": 0.13720663055965862, "grad_norm": 0.9928034543991089, "learning_rate": 0.00043162907816947664, "loss": 0.9554, "step": 1254 }, { "epoch": 0.1373160457355435, "grad_norm": 3.914745807647705, "learning_rate": 0.0004315743376395884, "loss": 1.4435, "step": 1255 }, { "epoch": 0.1374254609114284, "grad_norm": 2.670599937438965, "learning_rate": 0.0004315195971097, "loss": 1.1566, "step": 1256 }, { "epoch": 0.13753487608731332, "grad_norm": 5.5911126136779785, "learning_rate": 0.0004314648565798117, "loss": 1.208, "step": 1257 }, { "epoch": 0.13764429126319822, "grad_norm": 5.702223777770996, "learning_rate": 0.0004314101160499234, "loss": 1.2687, "step": 1258 }, { "epoch": 0.1377537064390831, "grad_norm": 53.0676155090332, "learning_rate": 0.00043135537552003506, "loss": 4.183, "step": 1259 }, { "epoch": 0.137863121614968, "grad_norm": 5.823371887207031, "learning_rate": 0.0004313006349901467, "loss": 3.1561, "step": 1260 }, { "epoch": 0.1379725367908529, "grad_norm": 244.98675537109375, "learning_rate": 0.0004312458944602584, "loss": 15.5391, "step": 1261 }, { "epoch": 0.13808195196673778, "grad_norm": 22.36630630493164, "learning_rate": 0.00043119115393037006, "loss": 3.8134, "step": 1262 }, { "epoch": 0.13819136714262267, "grad_norm": 9.112852096557617, "learning_rate": 0.0004311364134004817, "loss": 2.0215, "step": 1263 }, { "epoch": 0.13830078231850756, "grad_norm": 3.3677618503570557, "learning_rate": 0.00043108167287059343, "loss": 1.9234, "step": 1264 }, { "epoch": 0.13841019749439248, "grad_norm": 106.30290985107422, "learning_rate": 0.00043102693234070506, "loss": 11.353, "step": 1265 }, { "epoch": 0.13851961267027738, "grad_norm": 6.946611404418945, "learning_rate": 0.00043097219181081675, "loss": 2.734, "step": 1266 }, { "epoch": 0.13862902784616227, "grad_norm": 5.58245849609375, "learning_rate": 0.00043091745128092843, "loss": 1.8873, "step": 1267 }, { "epoch": 0.13873844302204716, "grad_norm": 5.026088237762451, "learning_rate": 0.00043086271075104006, "loss": 1.9779, "step": 1268 }, { "epoch": 0.13884785819793205, "grad_norm": 11.954253196716309, "learning_rate": 0.00043080797022115175, "loss": 1.6994, "step": 1269 }, { "epoch": 0.13895727337381694, "grad_norm": 6.284012794494629, "learning_rate": 0.0004307532296912634, "loss": 2.4946, "step": 1270 }, { "epoch": 0.13906668854970183, "grad_norm": 37.222957611083984, "learning_rate": 0.0004306984891613751, "loss": 4.0351, "step": 1271 }, { "epoch": 0.13917610372558673, "grad_norm": 14.617050170898438, "learning_rate": 0.00043064374863148675, "loss": 3.5739, "step": 1272 }, { "epoch": 0.13928551890147164, "grad_norm": 5.772739887237549, "learning_rate": 0.00043058900810159843, "loss": 2.9974, "step": 1273 }, { "epoch": 0.13939493407735654, "grad_norm": 272.21990966796875, "learning_rate": 0.0004305342675717101, "loss": 8.1033, "step": 1274 }, { "epoch": 0.13950434925324143, "grad_norm": 12.305122375488281, "learning_rate": 0.00043047952704182175, "loss": 4.1463, "step": 1275 }, { "epoch": 0.13961376442912632, "grad_norm": 92.43973541259766, "learning_rate": 0.00043042478651193343, "loss": 4.3805, "step": 1276 }, { "epoch": 0.1397231796050112, "grad_norm": 7.673314094543457, "learning_rate": 0.0004303700459820451, "loss": 3.3163, "step": 1277 }, { "epoch": 0.1398325947808961, "grad_norm": 9.355205535888672, "learning_rate": 0.0004303153054521568, "loss": 3.5456, "step": 1278 }, { "epoch": 0.139942009956781, "grad_norm": 18.197338104248047, "learning_rate": 0.00043026056492226843, "loss": 4.3408, "step": 1279 }, { "epoch": 0.1400514251326659, "grad_norm": 4.632465839385986, "learning_rate": 0.00043020582439238017, "loss": 3.4814, "step": 1280 }, { "epoch": 0.1401608403085508, "grad_norm": 14.701290130615234, "learning_rate": 0.0004301510838624918, "loss": 4.3399, "step": 1281 }, { "epoch": 0.1402702554844357, "grad_norm": 19.827333450317383, "learning_rate": 0.00043009634333260343, "loss": 4.0353, "step": 1282 }, { "epoch": 0.1403796706603206, "grad_norm": 6.4052934646606445, "learning_rate": 0.00043004160280271517, "loss": 6.0777, "step": 1283 }, { "epoch": 0.14048908583620548, "grad_norm": 29.91147232055664, "learning_rate": 0.0004299868622728268, "loss": 4.0952, "step": 1284 }, { "epoch": 0.14059850101209037, "grad_norm": 4.749367713928223, "learning_rate": 0.0004299321217429385, "loss": 3.7294, "step": 1285 }, { "epoch": 0.14070791618797526, "grad_norm": 54.281314849853516, "learning_rate": 0.00042987738121305017, "loss": 4.7893, "step": 1286 }, { "epoch": 0.14081733136386015, "grad_norm": 1.885614275932312, "learning_rate": 0.00042982264068316186, "loss": 4.0348, "step": 1287 }, { "epoch": 0.14092674653974507, "grad_norm": 8.117119789123535, "learning_rate": 0.0004297679001532735, "loss": 4.15, "step": 1288 }, { "epoch": 0.14103616171562997, "grad_norm": 2.734502077102661, "learning_rate": 0.0004297131596233851, "loss": 4.8581, "step": 1289 }, { "epoch": 0.14114557689151486, "grad_norm": 2.186641216278076, "learning_rate": 0.00042965841909349686, "loss": 3.7746, "step": 1290 }, { "epoch": 0.14125499206739975, "grad_norm": 0.6904694437980652, "learning_rate": 0.0004296036785636085, "loss": 3.5773, "step": 1291 }, { "epoch": 0.14136440724328464, "grad_norm": 0.6338698863983154, "learning_rate": 0.00042954893803372017, "loss": 3.4449, "step": 1292 }, { "epoch": 0.14147382241916953, "grad_norm": 0.5468335747718811, "learning_rate": 0.00042949419750383186, "loss": 3.4431, "step": 1293 }, { "epoch": 0.14158323759505442, "grad_norm": 1.5276789665222168, "learning_rate": 0.00042943945697394354, "loss": 3.4188, "step": 1294 }, { "epoch": 0.14169265277093934, "grad_norm": 1.203803539276123, "learning_rate": 0.00042938471644405517, "loss": 3.5158, "step": 1295 }, { "epoch": 0.14180206794682423, "grad_norm": 0.286347895860672, "learning_rate": 0.00042932997591416686, "loss": 3.3289, "step": 1296 }, { "epoch": 0.14191148312270913, "grad_norm": 2.824207305908203, "learning_rate": 0.00042927523538427854, "loss": 3.5101, "step": 1297 }, { "epoch": 0.14202089829859402, "grad_norm": 0.5233584642410278, "learning_rate": 0.00042922049485439017, "loss": 3.3802, "step": 1298 }, { "epoch": 0.1421303134744789, "grad_norm": 3.702470302581787, "learning_rate": 0.0004291657543245019, "loss": 3.3011, "step": 1299 }, { "epoch": 0.1422397286503638, "grad_norm": 0.4291185140609741, "learning_rate": 0.00042911101379461354, "loss": 3.4041, "step": 1300 }, { "epoch": 0.1423491438262487, "grad_norm": 0.40739575028419495, "learning_rate": 0.0004290562732647252, "loss": 3.238, "step": 1301 }, { "epoch": 0.14245855900213358, "grad_norm": 0.5334272980690002, "learning_rate": 0.0004290015327348369, "loss": 3.2924, "step": 1302 }, { "epoch": 0.1425679741780185, "grad_norm": 0.1978183090686798, "learning_rate": 0.00042894679220494854, "loss": 3.152, "step": 1303 }, { "epoch": 0.1426773893539034, "grad_norm": 0.20030373334884644, "learning_rate": 0.0004288920516750602, "loss": 3.116, "step": 1304 }, { "epoch": 0.14278680452978829, "grad_norm": 0.283561646938324, "learning_rate": 0.00042883731114517186, "loss": 3.1349, "step": 1305 }, { "epoch": 0.14289621970567318, "grad_norm": 0.30255720019340515, "learning_rate": 0.0004287825706152836, "loss": 3.0759, "step": 1306 }, { "epoch": 0.14300563488155807, "grad_norm": 0.292035311460495, "learning_rate": 0.0004287278300853952, "loss": 3.0756, "step": 1307 }, { "epoch": 0.14311505005744296, "grad_norm": 0.13852208852767944, "learning_rate": 0.00042867308955550686, "loss": 3.033, "step": 1308 }, { "epoch": 0.14322446523332785, "grad_norm": 0.538864016532898, "learning_rate": 0.0004286183490256186, "loss": 2.986, "step": 1309 }, { "epoch": 0.14333388040921277, "grad_norm": 0.10854928195476532, "learning_rate": 0.0004285636084957302, "loss": 3.02, "step": 1310 }, { "epoch": 0.14344329558509766, "grad_norm": 0.16174031794071198, "learning_rate": 0.0004285088679658419, "loss": 3.0307, "step": 1311 }, { "epoch": 0.14355271076098255, "grad_norm": 0.16765357553958893, "learning_rate": 0.0004284541274359536, "loss": 3.0381, "step": 1312 }, { "epoch": 0.14366212593686745, "grad_norm": 0.19234171509742737, "learning_rate": 0.0004283993869060653, "loss": 2.9761, "step": 1313 }, { "epoch": 0.14377154111275234, "grad_norm": 0.4381254315376282, "learning_rate": 0.0004283446463761769, "loss": 3.1107, "step": 1314 }, { "epoch": 0.14388095628863723, "grad_norm": 0.45916226506233215, "learning_rate": 0.0004282899058462886, "loss": 2.9573, "step": 1315 }, { "epoch": 0.14399037146452212, "grad_norm": 0.6974062323570251, "learning_rate": 0.0004282351653164003, "loss": 3.0076, "step": 1316 }, { "epoch": 0.144099786640407, "grad_norm": 0.7667591571807861, "learning_rate": 0.0004281804247865119, "loss": 2.9977, "step": 1317 }, { "epoch": 0.14420920181629193, "grad_norm": 2.336378812789917, "learning_rate": 0.00042812568425662365, "loss": 2.9613, "step": 1318 }, { "epoch": 0.14431861699217682, "grad_norm": 0.4108619689941406, "learning_rate": 0.0004280709437267353, "loss": 2.9401, "step": 1319 }, { "epoch": 0.14442803216806172, "grad_norm": 0.17185351252555847, "learning_rate": 0.00042801620319684697, "loss": 2.9692, "step": 1320 }, { "epoch": 0.1445374473439466, "grad_norm": 0.18790218234062195, "learning_rate": 0.00042796146266695865, "loss": 2.9818, "step": 1321 }, { "epoch": 0.1446468625198315, "grad_norm": 0.40254470705986023, "learning_rate": 0.0004279067221370703, "loss": 3.0378, "step": 1322 }, { "epoch": 0.1447562776957164, "grad_norm": 0.9063878655433655, "learning_rate": 0.00042785198160718197, "loss": 2.9714, "step": 1323 }, { "epoch": 0.14486569287160128, "grad_norm": 0.7669951319694519, "learning_rate": 0.0004277972410772936, "loss": 3.0587, "step": 1324 }, { "epoch": 0.14497510804748617, "grad_norm": 0.2354392558336258, "learning_rate": 0.00042774250054740533, "loss": 2.9961, "step": 1325 }, { "epoch": 0.1450845232233711, "grad_norm": 0.5630313158035278, "learning_rate": 0.00042768776001751697, "loss": 3.0593, "step": 1326 }, { "epoch": 0.14519393839925598, "grad_norm": 0.2833912670612335, "learning_rate": 0.00042763301948762865, "loss": 3.0119, "step": 1327 }, { "epoch": 0.14530335357514088, "grad_norm": 0.2547456622123718, "learning_rate": 0.00042757827895774033, "loss": 2.9823, "step": 1328 }, { "epoch": 0.14541276875102577, "grad_norm": 0.2314075529575348, "learning_rate": 0.00042752353842785197, "loss": 3.0873, "step": 1329 }, { "epoch": 0.14552218392691066, "grad_norm": 0.32047659158706665, "learning_rate": 0.00042746879789796365, "loss": 2.9469, "step": 1330 }, { "epoch": 0.14563159910279555, "grad_norm": 0.2595478892326355, "learning_rate": 0.00042741405736807533, "loss": 2.9171, "step": 1331 }, { "epoch": 0.14574101427868044, "grad_norm": 0.19989535212516785, "learning_rate": 0.000427359316838187, "loss": 2.9386, "step": 1332 }, { "epoch": 0.14585042945456536, "grad_norm": 0.38448506593704224, "learning_rate": 0.00042730457630829865, "loss": 2.9515, "step": 1333 }, { "epoch": 0.14595984463045025, "grad_norm": 0.3570884168148041, "learning_rate": 0.0004272498357784104, "loss": 2.9302, "step": 1334 }, { "epoch": 0.14606925980633514, "grad_norm": 6.872991561889648, "learning_rate": 0.000427195095248522, "loss": 2.8923, "step": 1335 }, { "epoch": 0.14617867498222004, "grad_norm": 1.0655744075775146, "learning_rate": 0.00042714035471863365, "loss": 2.9993, "step": 1336 }, { "epoch": 0.14628809015810493, "grad_norm": 0.501538872718811, "learning_rate": 0.0004270856141887454, "loss": 2.9074, "step": 1337 }, { "epoch": 0.14639750533398982, "grad_norm": 0.46240749955177307, "learning_rate": 0.000427030873658857, "loss": 2.9342, "step": 1338 }, { "epoch": 0.1465069205098747, "grad_norm": 0.34012240171432495, "learning_rate": 0.0004269761331289687, "loss": 2.9446, "step": 1339 }, { "epoch": 0.1466163356857596, "grad_norm": 0.35387521982192993, "learning_rate": 0.00042692139259908034, "loss": 3.0265, "step": 1340 }, { "epoch": 0.14672575086164452, "grad_norm": 0.3966999650001526, "learning_rate": 0.0004268666520691921, "loss": 3.0056, "step": 1341 }, { "epoch": 0.1468351660375294, "grad_norm": 0.3362230956554413, "learning_rate": 0.0004268119115393037, "loss": 2.968, "step": 1342 }, { "epoch": 0.1469445812134143, "grad_norm": 0.3393986225128174, "learning_rate": 0.00042675717100941534, "loss": 2.9579, "step": 1343 }, { "epoch": 0.1470539963892992, "grad_norm": 0.5290712714195251, "learning_rate": 0.0004267024304795271, "loss": 2.9396, "step": 1344 }, { "epoch": 0.1471634115651841, "grad_norm": 0.1749253273010254, "learning_rate": 0.0004266476899496387, "loss": 2.9368, "step": 1345 }, { "epoch": 0.14727282674106898, "grad_norm": 0.2809406518936157, "learning_rate": 0.0004265929494197504, "loss": 2.8704, "step": 1346 }, { "epoch": 0.14738224191695387, "grad_norm": 0.2897566258907318, "learning_rate": 0.0004265382088898621, "loss": 2.924, "step": 1347 }, { "epoch": 0.1474916570928388, "grad_norm": 0.4092259407043457, "learning_rate": 0.00042648346835997376, "loss": 2.8573, "step": 1348 }, { "epoch": 0.14760107226872368, "grad_norm": 0.21341341733932495, "learning_rate": 0.0004264287278300854, "loss": 2.9055, "step": 1349 }, { "epoch": 0.14771048744460857, "grad_norm": 0.3352910578250885, "learning_rate": 0.0004263739873001971, "loss": 2.8686, "step": 1350 }, { "epoch": 0.14781990262049347, "grad_norm": 0.29584747552871704, "learning_rate": 0.00042631924677030876, "loss": 2.8699, "step": 1351 }, { "epoch": 0.14792931779637836, "grad_norm": 0.21915782988071442, "learning_rate": 0.0004262645062404204, "loss": 2.9109, "step": 1352 }, { "epoch": 0.14803873297226325, "grad_norm": 0.2703619599342346, "learning_rate": 0.00042620976571053213, "loss": 2.854, "step": 1353 }, { "epoch": 0.14814814814814814, "grad_norm": 6.737955093383789, "learning_rate": 0.00042615502518064376, "loss": 2.9177, "step": 1354 }, { "epoch": 0.14825756332403303, "grad_norm": 0.24854503571987152, "learning_rate": 0.00042610028465075544, "loss": 2.8909, "step": 1355 }, { "epoch": 0.14836697849991795, "grad_norm": 0.27250033617019653, "learning_rate": 0.0004260455441208671, "loss": 2.8629, "step": 1356 }, { "epoch": 0.14847639367580284, "grad_norm": 0.40814289450645447, "learning_rate": 0.00042599080359097876, "loss": 2.8442, "step": 1357 }, { "epoch": 0.14858580885168773, "grad_norm": 0.9164649248123169, "learning_rate": 0.00042593606306109044, "loss": 2.8877, "step": 1358 }, { "epoch": 0.14869522402757263, "grad_norm": 0.23642268776893616, "learning_rate": 0.0004258813225312021, "loss": 2.8015, "step": 1359 }, { "epoch": 0.14880463920345752, "grad_norm": 0.5103562474250793, "learning_rate": 0.0004258265820013138, "loss": 2.884, "step": 1360 }, { "epoch": 0.1489140543793424, "grad_norm": 0.3587506413459778, "learning_rate": 0.00042577184147142544, "loss": 2.9004, "step": 1361 }, { "epoch": 0.1490234695552273, "grad_norm": 0.2382264882326126, "learning_rate": 0.00042571710094153713, "loss": 2.8062, "step": 1362 }, { "epoch": 0.1491328847311122, "grad_norm": 0.8094629645347595, "learning_rate": 0.0004256623604116488, "loss": 2.8043, "step": 1363 }, { "epoch": 0.1492422999069971, "grad_norm": 0.2808386981487274, "learning_rate": 0.00042560761988176044, "loss": 2.8689, "step": 1364 }, { "epoch": 0.149351715082882, "grad_norm": 0.3178032636642456, "learning_rate": 0.00042555287935187213, "loss": 2.7953, "step": 1365 }, { "epoch": 0.1494611302587669, "grad_norm": 0.17930878698825836, "learning_rate": 0.0004254981388219838, "loss": 2.7627, "step": 1366 }, { "epoch": 0.14957054543465179, "grad_norm": 0.4724622964859009, "learning_rate": 0.0004254433982920955, "loss": 2.7749, "step": 1367 }, { "epoch": 0.14967996061053668, "grad_norm": 0.35572484135627747, "learning_rate": 0.00042538865776220713, "loss": 2.8068, "step": 1368 }, { "epoch": 0.14978937578642157, "grad_norm": 0.31159594655036926, "learning_rate": 0.0004253339172323188, "loss": 2.8539, "step": 1369 }, { "epoch": 0.14989879096230646, "grad_norm": 0.6285644173622131, "learning_rate": 0.0004252791767024305, "loss": 2.7237, "step": 1370 }, { "epoch": 0.15000820613819138, "grad_norm": 0.8869861960411072, "learning_rate": 0.00042522443617254213, "loss": 2.8116, "step": 1371 }, { "epoch": 0.15011762131407627, "grad_norm": 0.3772529363632202, "learning_rate": 0.00042516969564265387, "loss": 2.8534, "step": 1372 }, { "epoch": 0.15022703648996116, "grad_norm": 0.4114645719528198, "learning_rate": 0.0004251149551127655, "loss": 2.9135, "step": 1373 }, { "epoch": 0.15033645166584605, "grad_norm": 0.35799479484558105, "learning_rate": 0.0004250602145828772, "loss": 2.7794, "step": 1374 }, { "epoch": 0.15044586684173095, "grad_norm": 0.26164180040359497, "learning_rate": 0.0004250054740529888, "loss": 2.7878, "step": 1375 }, { "epoch": 0.15055528201761584, "grad_norm": 0.25124114751815796, "learning_rate": 0.0004249507335231005, "loss": 2.7617, "step": 1376 }, { "epoch": 0.15066469719350073, "grad_norm": 0.3189118802547455, "learning_rate": 0.0004248959929932122, "loss": 2.7899, "step": 1377 }, { "epoch": 0.15077411236938562, "grad_norm": 0.17282496392726898, "learning_rate": 0.0004248412524633238, "loss": 2.752, "step": 1378 }, { "epoch": 0.15088352754527054, "grad_norm": 0.19283908605575562, "learning_rate": 0.00042478651193343555, "loss": 2.772, "step": 1379 }, { "epoch": 0.15099294272115543, "grad_norm": 0.2467368096113205, "learning_rate": 0.0004247317714035472, "loss": 2.7643, "step": 1380 }, { "epoch": 0.15110235789704032, "grad_norm": 0.3856501281261444, "learning_rate": 0.00042467703087365887, "loss": 2.6902, "step": 1381 }, { "epoch": 0.15121177307292522, "grad_norm": 0.31245726346969604, "learning_rate": 0.00042462229034377055, "loss": 2.754, "step": 1382 }, { "epoch": 0.1513211882488101, "grad_norm": 0.33093300461769104, "learning_rate": 0.0004245675498138822, "loss": 2.7068, "step": 1383 }, { "epoch": 0.151430603424695, "grad_norm": 0.4372984766960144, "learning_rate": 0.00042451280928399387, "loss": 2.7464, "step": 1384 }, { "epoch": 0.1515400186005799, "grad_norm": 1.6138362884521484, "learning_rate": 0.00042445806875410555, "loss": 2.7367, "step": 1385 }, { "epoch": 0.1516494337764648, "grad_norm": 0.7219127416610718, "learning_rate": 0.00042440332822421724, "loss": 2.8082, "step": 1386 }, { "epoch": 0.1517588489523497, "grad_norm": 0.4770384728908539, "learning_rate": 0.00042434858769432887, "loss": 2.7393, "step": 1387 }, { "epoch": 0.1518682641282346, "grad_norm": 0.9233636856079102, "learning_rate": 0.0004242938471644406, "loss": 2.7266, "step": 1388 }, { "epoch": 0.15197767930411948, "grad_norm": 0.3614331781864166, "learning_rate": 0.00042423910663455224, "loss": 2.708, "step": 1389 }, { "epoch": 0.15208709448000438, "grad_norm": 0.4371580183506012, "learning_rate": 0.00042418436610466387, "loss": 2.6863, "step": 1390 }, { "epoch": 0.15219650965588927, "grad_norm": 3.803858757019043, "learning_rate": 0.00042412962557477555, "loss": 2.7444, "step": 1391 }, { "epoch": 0.15230592483177416, "grad_norm": 0.5080681443214417, "learning_rate": 0.00042407488504488724, "loss": 2.7955, "step": 1392 }, { "epoch": 0.15241534000765905, "grad_norm": 0.5367249846458435, "learning_rate": 0.0004240201445149989, "loss": 2.8716, "step": 1393 }, { "epoch": 0.15252475518354397, "grad_norm": 0.3795190453529358, "learning_rate": 0.00042396540398511055, "loss": 2.7797, "step": 1394 }, { "epoch": 0.15263417035942886, "grad_norm": 0.47555217146873474, "learning_rate": 0.0004239106634552223, "loss": 2.7314, "step": 1395 }, { "epoch": 0.15274358553531375, "grad_norm": 0.5974704027175903, "learning_rate": 0.0004238559229253339, "loss": 2.7504, "step": 1396 }, { "epoch": 0.15285300071119864, "grad_norm": 1.3982996940612793, "learning_rate": 0.00042380118239544555, "loss": 2.9034, "step": 1397 }, { "epoch": 0.15296241588708354, "grad_norm": 0.4663357436656952, "learning_rate": 0.0004237464418655573, "loss": 2.7499, "step": 1398 }, { "epoch": 0.15307183106296843, "grad_norm": 0.398451566696167, "learning_rate": 0.0004236917013356689, "loss": 2.7927, "step": 1399 }, { "epoch": 0.15318124623885332, "grad_norm": 0.30768758058547974, "learning_rate": 0.0004236369608057806, "loss": 2.7812, "step": 1400 }, { "epoch": 0.1532906614147382, "grad_norm": 0.49977463483810425, "learning_rate": 0.0004235822202758923, "loss": 2.7565, "step": 1401 }, { "epoch": 0.15340007659062313, "grad_norm": 0.3828587532043457, "learning_rate": 0.000423527479746004, "loss": 2.6933, "step": 1402 }, { "epoch": 0.15350949176650802, "grad_norm": 0.6481732726097107, "learning_rate": 0.0004234727392161156, "loss": 2.8627, "step": 1403 }, { "epoch": 0.1536189069423929, "grad_norm": 0.35093799233436584, "learning_rate": 0.0004234179986862273, "loss": 2.6514, "step": 1404 }, { "epoch": 0.1537283221182778, "grad_norm": 0.3120945692062378, "learning_rate": 0.000423363258156339, "loss": 2.6294, "step": 1405 }, { "epoch": 0.1538377372941627, "grad_norm": 0.5273593068122864, "learning_rate": 0.0004233085176264506, "loss": 2.7407, "step": 1406 }, { "epoch": 0.1539471524700476, "grad_norm": 0.39234939217567444, "learning_rate": 0.00042325377709656235, "loss": 2.6839, "step": 1407 }, { "epoch": 0.15405656764593248, "grad_norm": 0.4503597617149353, "learning_rate": 0.000423199036566674, "loss": 2.6705, "step": 1408 }, { "epoch": 0.1541659828218174, "grad_norm": 1.1509003639221191, "learning_rate": 0.00042314429603678566, "loss": 2.7301, "step": 1409 }, { "epoch": 0.1542753979977023, "grad_norm": 0.42109042406082153, "learning_rate": 0.0004230895555068973, "loss": 2.6557, "step": 1410 }, { "epoch": 0.15438481317358718, "grad_norm": 0.4195028245449066, "learning_rate": 0.000423034814977009, "loss": 2.7663, "step": 1411 }, { "epoch": 0.15449422834947207, "grad_norm": 0.30895525217056274, "learning_rate": 0.00042298007444712066, "loss": 2.6641, "step": 1412 }, { "epoch": 0.15460364352535697, "grad_norm": 0.3087160587310791, "learning_rate": 0.0004229253339172323, "loss": 2.6535, "step": 1413 }, { "epoch": 0.15471305870124186, "grad_norm": 0.48939278721809387, "learning_rate": 0.00042287059338734403, "loss": 2.7024, "step": 1414 }, { "epoch": 0.15482247387712675, "grad_norm": 0.6009643077850342, "learning_rate": 0.00042281585285745566, "loss": 2.6602, "step": 1415 }, { "epoch": 0.15493188905301164, "grad_norm": 0.3475269377231598, "learning_rate": 0.00042276111232756735, "loss": 2.6417, "step": 1416 }, { "epoch": 0.15504130422889656, "grad_norm": 0.42946910858154297, "learning_rate": 0.00042270637179767903, "loss": 2.6272, "step": 1417 }, { "epoch": 0.15515071940478145, "grad_norm": 0.35592907667160034, "learning_rate": 0.00042265163126779066, "loss": 2.6308, "step": 1418 }, { "epoch": 0.15526013458066634, "grad_norm": 0.38300150632858276, "learning_rate": 0.00042259689073790235, "loss": 2.6547, "step": 1419 }, { "epoch": 0.15536954975655123, "grad_norm": 0.2311307191848755, "learning_rate": 0.00042254215020801403, "loss": 2.5913, "step": 1420 }, { "epoch": 0.15547896493243613, "grad_norm": 0.30274632573127747, "learning_rate": 0.0004224874096781257, "loss": 2.6753, "step": 1421 }, { "epoch": 0.15558838010832102, "grad_norm": 0.30768653750419617, "learning_rate": 0.00042243266914823735, "loss": 2.5992, "step": 1422 }, { "epoch": 0.1556977952842059, "grad_norm": 1.174759864807129, "learning_rate": 0.0004223779286183491, "loss": 2.6129, "step": 1423 }, { "epoch": 0.15580721046009083, "grad_norm": 0.648792564868927, "learning_rate": 0.0004223231880884607, "loss": 2.6024, "step": 1424 }, { "epoch": 0.15591662563597572, "grad_norm": 1.1695830821990967, "learning_rate": 0.00042226844755857235, "loss": 2.6511, "step": 1425 }, { "epoch": 0.1560260408118606, "grad_norm": 0.8282488584518433, "learning_rate": 0.00042221370702868403, "loss": 2.7717, "step": 1426 }, { "epoch": 0.1561354559877455, "grad_norm": 0.8721874952316284, "learning_rate": 0.0004221589664987957, "loss": 2.6339, "step": 1427 }, { "epoch": 0.1562448711636304, "grad_norm": 0.5445930361747742, "learning_rate": 0.0004221042259689074, "loss": 2.6734, "step": 1428 }, { "epoch": 0.15635428633951529, "grad_norm": 0.5415084362030029, "learning_rate": 0.00042204948543901903, "loss": 2.7048, "step": 1429 }, { "epoch": 0.15646370151540018, "grad_norm": 0.6199241280555725, "learning_rate": 0.00042199474490913077, "loss": 2.6377, "step": 1430 }, { "epoch": 0.15657311669128507, "grad_norm": 0.31615084409713745, "learning_rate": 0.0004219400043792424, "loss": 2.5974, "step": 1431 }, { "epoch": 0.15668253186717, "grad_norm": 0.3916900157928467, "learning_rate": 0.00042188526384935403, "loss": 2.5866, "step": 1432 }, { "epoch": 0.15679194704305488, "grad_norm": 2.2676942348480225, "learning_rate": 0.00042183052331946577, "loss": 2.6283, "step": 1433 }, { "epoch": 0.15690136221893977, "grad_norm": 0.7799410820007324, "learning_rate": 0.0004217757827895774, "loss": 2.7023, "step": 1434 }, { "epoch": 0.15701077739482466, "grad_norm": 2.9610345363616943, "learning_rate": 0.0004217210422596891, "loss": 2.7533, "step": 1435 }, { "epoch": 0.15712019257070955, "grad_norm": 1.8953630924224854, "learning_rate": 0.00042166630172980077, "loss": 2.8426, "step": 1436 }, { "epoch": 0.15722960774659445, "grad_norm": 0.35628998279571533, "learning_rate": 0.0004216115611999124, "loss": 2.7061, "step": 1437 }, { "epoch": 0.15733902292247934, "grad_norm": 1.3388895988464355, "learning_rate": 0.0004215568206700241, "loss": 2.7713, "step": 1438 }, { "epoch": 0.15744843809836423, "grad_norm": 0.3483014404773712, "learning_rate": 0.00042150208014013577, "loss": 2.6952, "step": 1439 }, { "epoch": 0.15755785327424915, "grad_norm": 0.6318539977073669, "learning_rate": 0.00042144733961024745, "loss": 2.7425, "step": 1440 }, { "epoch": 0.15766726845013404, "grad_norm": 0.41275665163993835, "learning_rate": 0.0004213925990803591, "loss": 2.7607, "step": 1441 }, { "epoch": 0.15777668362601893, "grad_norm": 0.48338428139686584, "learning_rate": 0.0004213378585504708, "loss": 2.7076, "step": 1442 }, { "epoch": 0.15788609880190382, "grad_norm": 0.5133205056190491, "learning_rate": 0.00042128311802058245, "loss": 2.7317, "step": 1443 }, { "epoch": 0.15799551397778872, "grad_norm": 0.8006664514541626, "learning_rate": 0.0004212283774906941, "loss": 2.6877, "step": 1444 }, { "epoch": 0.1581049291536736, "grad_norm": 0.3939412534236908, "learning_rate": 0.00042117363696080577, "loss": 2.6252, "step": 1445 }, { "epoch": 0.1582143443295585, "grad_norm": 0.38859090209007263, "learning_rate": 0.00042111889643091745, "loss": 2.6491, "step": 1446 }, { "epoch": 0.15832375950544342, "grad_norm": 0.3862842321395874, "learning_rate": 0.00042106415590102914, "loss": 2.6881, "step": 1447 }, { "epoch": 0.1584331746813283, "grad_norm": 0.4164084792137146, "learning_rate": 0.00042100941537114077, "loss": 2.5767, "step": 1448 }, { "epoch": 0.1585425898572132, "grad_norm": 0.580954909324646, "learning_rate": 0.0004209546748412525, "loss": 2.6938, "step": 1449 }, { "epoch": 0.1586520050330981, "grad_norm": 1.3408719301223755, "learning_rate": 0.00042089993431136414, "loss": 2.6131, "step": 1450 }, { "epoch": 0.15876142020898298, "grad_norm": 1.10221529006958, "learning_rate": 0.00042084519378147577, "loss": 2.6613, "step": 1451 }, { "epoch": 0.15887083538486788, "grad_norm": 0.3367241322994232, "learning_rate": 0.0004207904532515875, "loss": 2.6087, "step": 1452 }, { "epoch": 0.15898025056075277, "grad_norm": 0.3430436849594116, "learning_rate": 0.00042073571272169914, "loss": 2.5836, "step": 1453 }, { "epoch": 0.15908966573663766, "grad_norm": 0.763767659664154, "learning_rate": 0.0004206809721918108, "loss": 2.5926, "step": 1454 }, { "epoch": 0.15919908091252258, "grad_norm": 0.5347039699554443, "learning_rate": 0.0004206262316619225, "loss": 2.6063, "step": 1455 }, { "epoch": 0.15930849608840747, "grad_norm": 0.4092235565185547, "learning_rate": 0.0004205714911320342, "loss": 2.5935, "step": 1456 }, { "epoch": 0.15941791126429236, "grad_norm": 0.3270774185657501, "learning_rate": 0.0004205167506021458, "loss": 2.5123, "step": 1457 }, { "epoch": 0.15952732644017725, "grad_norm": 0.4106026291847229, "learning_rate": 0.0004204620100722575, "loss": 2.5387, "step": 1458 }, { "epoch": 0.15963674161606214, "grad_norm": 0.3910042643547058, "learning_rate": 0.0004204072695423692, "loss": 2.5428, "step": 1459 }, { "epoch": 0.15974615679194704, "grad_norm": 0.3977607488632202, "learning_rate": 0.0004203525290124808, "loss": 2.5491, "step": 1460 }, { "epoch": 0.15985557196783193, "grad_norm": 1.0908948183059692, "learning_rate": 0.0004202977884825925, "loss": 2.4886, "step": 1461 }, { "epoch": 0.15996498714371685, "grad_norm": 1.3532558679580688, "learning_rate": 0.0004202430479527042, "loss": 2.6316, "step": 1462 }, { "epoch": 0.16007440231960174, "grad_norm": 1.0530575513839722, "learning_rate": 0.0004201883074228159, "loss": 2.6052, "step": 1463 }, { "epoch": 0.16018381749548663, "grad_norm": 1.3277018070220947, "learning_rate": 0.0004201335668929275, "loss": 2.5754, "step": 1464 }, { "epoch": 0.16029323267137152, "grad_norm": 0.6497476100921631, "learning_rate": 0.0004200788263630392, "loss": 2.6146, "step": 1465 }, { "epoch": 0.1604026478472564, "grad_norm": 0.6633959412574768, "learning_rate": 0.0004200240858331509, "loss": 2.5966, "step": 1466 }, { "epoch": 0.1605120630231413, "grad_norm": 2.0498170852661133, "learning_rate": 0.0004199693453032625, "loss": 2.5745, "step": 1467 }, { "epoch": 0.1606214781990262, "grad_norm": 0.41927024722099304, "learning_rate": 0.00041991460477337425, "loss": 2.6285, "step": 1468 }, { "epoch": 0.1607308933749111, "grad_norm": 1.778193473815918, "learning_rate": 0.0004198598642434859, "loss": 2.6176, "step": 1469 }, { "epoch": 0.160840308550796, "grad_norm": 0.4256199598312378, "learning_rate": 0.00041980512371359756, "loss": 2.575, "step": 1470 }, { "epoch": 0.1609497237266809, "grad_norm": 0.4861196279525757, "learning_rate": 0.00041975038318370925, "loss": 2.6361, "step": 1471 }, { "epoch": 0.1610591389025658, "grad_norm": 0.472994327545166, "learning_rate": 0.0004196956426538209, "loss": 2.5599, "step": 1472 }, { "epoch": 0.16116855407845068, "grad_norm": 0.42564958333969116, "learning_rate": 0.00041964090212393256, "loss": 2.6018, "step": 1473 }, { "epoch": 0.16127796925433557, "grad_norm": 0.49066072702407837, "learning_rate": 0.00041958616159404425, "loss": 2.5382, "step": 1474 }, { "epoch": 0.16138738443022047, "grad_norm": 0.7271188497543335, "learning_rate": 0.00041953142106415593, "loss": 2.5694, "step": 1475 }, { "epoch": 0.16149679960610536, "grad_norm": 0.5893601775169373, "learning_rate": 0.00041947668053426756, "loss": 2.4516, "step": 1476 }, { "epoch": 0.16160621478199025, "grad_norm": 1.4951133728027344, "learning_rate": 0.0004194219400043793, "loss": 2.5415, "step": 1477 }, { "epoch": 0.16171562995787517, "grad_norm": 1.9824845790863037, "learning_rate": 0.00041936719947449093, "loss": 2.5512, "step": 1478 }, { "epoch": 0.16182504513376006, "grad_norm": 0.5421838760375977, "learning_rate": 0.00041931245894460256, "loss": 2.58, "step": 1479 }, { "epoch": 0.16193446030964495, "grad_norm": 6.233069896697998, "learning_rate": 0.00041925771841471425, "loss": 2.5778, "step": 1480 }, { "epoch": 0.16204387548552984, "grad_norm": 0.833074152469635, "learning_rate": 0.00041920297788482593, "loss": 2.5963, "step": 1481 }, { "epoch": 0.16215329066141473, "grad_norm": 0.757673442363739, "learning_rate": 0.0004191482373549376, "loss": 2.6016, "step": 1482 }, { "epoch": 0.16226270583729963, "grad_norm": 0.6771922707557678, "learning_rate": 0.00041909349682504925, "loss": 2.5781, "step": 1483 }, { "epoch": 0.16237212101318452, "grad_norm": 0.4034157395362854, "learning_rate": 0.000419038756295161, "loss": 2.526, "step": 1484 }, { "epoch": 0.16248153618906944, "grad_norm": 0.3139336109161377, "learning_rate": 0.0004189840157652726, "loss": 2.5311, "step": 1485 }, { "epoch": 0.16259095136495433, "grad_norm": 0.6241982579231262, "learning_rate": 0.00041892927523538425, "loss": 2.6007, "step": 1486 }, { "epoch": 0.16270036654083922, "grad_norm": 0.27057185769081116, "learning_rate": 0.000418874534705496, "loss": 2.5449, "step": 1487 }, { "epoch": 0.1628097817167241, "grad_norm": 0.7986899614334106, "learning_rate": 0.0004188197941756076, "loss": 2.5439, "step": 1488 }, { "epoch": 0.162919196892609, "grad_norm": 0.4457003176212311, "learning_rate": 0.0004187650536457193, "loss": 2.4988, "step": 1489 }, { "epoch": 0.1630286120684939, "grad_norm": 0.7477899193763733, "learning_rate": 0.000418710313115831, "loss": 2.5944, "step": 1490 }, { "epoch": 0.16313802724437879, "grad_norm": 0.45845240354537964, "learning_rate": 0.00041865557258594267, "loss": 2.5691, "step": 1491 }, { "epoch": 0.16324744242026368, "grad_norm": 0.4798039495944977, "learning_rate": 0.0004186008320560543, "loss": 2.6311, "step": 1492 }, { "epoch": 0.1633568575961486, "grad_norm": 2.0342509746551514, "learning_rate": 0.000418546091526166, "loss": 2.5289, "step": 1493 }, { "epoch": 0.1634662727720335, "grad_norm": 0.8433592915534973, "learning_rate": 0.00041849135099627767, "loss": 2.5337, "step": 1494 }, { "epoch": 0.16357568794791838, "grad_norm": 0.5579163432121277, "learning_rate": 0.0004184366104663893, "loss": 2.5738, "step": 1495 }, { "epoch": 0.16368510312380327, "grad_norm": 1.2545627355575562, "learning_rate": 0.000418381869936501, "loss": 2.583, "step": 1496 }, { "epoch": 0.16379451829968816, "grad_norm": 0.7874237895011902, "learning_rate": 0.00041832712940661267, "loss": 2.516, "step": 1497 }, { "epoch": 0.16390393347557305, "grad_norm": 0.8054859638214111, "learning_rate": 0.0004182723888767243, "loss": 2.5202, "step": 1498 }, { "epoch": 0.16401334865145795, "grad_norm": 0.8685526251792908, "learning_rate": 0.000418217648346836, "loss": 2.5796, "step": 1499 }, { "epoch": 0.16412276382734287, "grad_norm": 0.4534640312194824, "learning_rate": 0.00041816290781694767, "loss": 2.4732, "step": 1500 } ], "logging_steps": 1, "max_steps": 9139, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.292098917629952e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }