{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999709884243814, "eval_steps": 1000, "global_step": 17234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000580231512373437, "grad_norm": 4.24833869934082, "learning_rate": 1.0440835266821346e-05, "loss": 10.74, "step": 10 }, { "epoch": 0.001160463024746874, "grad_norm": 2.10276198387146, "learning_rate": 2.2041763341067284e-05, "loss": 9.9422, "step": 20 }, { "epoch": 0.001740694537120311, "grad_norm": 2.0392377376556396, "learning_rate": 3.364269141531322e-05, "loss": 9.4932, "step": 30 }, { "epoch": 0.002320926049493748, "grad_norm": 1.8358850479125977, "learning_rate": 4.5243619489559165e-05, "loss": 9.1248, "step": 40 }, { "epoch": 0.002901157561867185, "grad_norm": 1.6119049787521362, "learning_rate": 5.68445475638051e-05, "loss": 8.6786, "step": 50 }, { "epoch": 0.003481389074240622, "grad_norm": 1.438369870185852, "learning_rate": 6.844547563805105e-05, "loss": 8.2109, "step": 60 }, { "epoch": 0.004061620586614059, "grad_norm": 1.2206534147262573, "learning_rate": 8.004640371229699e-05, "loss": 7.7847, "step": 70 }, { "epoch": 0.004641852098987496, "grad_norm": 1.797905445098877, "learning_rate": 9.164733178654293e-05, "loss": 7.4453, "step": 80 }, { "epoch": 0.005222083611360933, "grad_norm": 0.9732237458229065, "learning_rate": 0.00010324825986078886, "loss": 7.2239, "step": 90 }, { "epoch": 0.00580231512373437, "grad_norm": 0.704902172088623, "learning_rate": 0.0001148491879350348, "loss": 7.0716, "step": 100 }, { "epoch": 0.006382546636107807, "grad_norm": 0.7608431577682495, "learning_rate": 0.00012645011600928075, "loss": 6.9281, "step": 110 }, { "epoch": 0.006962778148481244, "grad_norm": 0.5090209245681763, "learning_rate": 0.00013805104408352666, "loss": 6.7721, "step": 120 }, { "epoch": 0.007543009660854681, "grad_norm": 0.4300777018070221, "learning_rate": 0.00014965197215777263, "loss": 6.6322, "step": 130 }, { "epoch": 0.008123241173228117, "grad_norm": 0.8929088115692139, "learning_rate": 0.00016125290023201856, "loss": 6.5415, "step": 140 }, { "epoch": 0.008703472685601555, "grad_norm": 0.3701019287109375, "learning_rate": 0.0001728538283062645, "loss": 6.4223, "step": 150 }, { "epoch": 0.009283704197974993, "grad_norm": 0.5044598579406738, "learning_rate": 0.00018445475638051046, "loss": 6.3371, "step": 160 }, { "epoch": 0.009863935710348428, "grad_norm": 0.42209455370903015, "learning_rate": 0.00019605568445475637, "loss": 6.2547, "step": 170 }, { "epoch": 0.010444167222721866, "grad_norm": 0.3204844295978546, "learning_rate": 0.00020765661252900234, "loss": 6.1801, "step": 180 }, { "epoch": 0.011024398735095304, "grad_norm": 0.7174881100654602, "learning_rate": 0.00021925754060324827, "loss": 6.1174, "step": 190 }, { "epoch": 0.01160463024746874, "grad_norm": 0.37421539425849915, "learning_rate": 0.0002308584686774942, "loss": 6.0759, "step": 200 }, { "epoch": 0.012184861759842177, "grad_norm": 0.5762574672698975, "learning_rate": 0.00024245939675174015, "loss": 5.9975, "step": 210 }, { "epoch": 0.012765093272215615, "grad_norm": 0.29894348978996277, "learning_rate": 0.00025406032482598606, "loss": 5.9512, "step": 220 }, { "epoch": 0.01334532478458905, "grad_norm": 0.37387117743492126, "learning_rate": 0.000265661252900232, "loss": 5.9045, "step": 230 }, { "epoch": 0.013925556296962488, "grad_norm": 0.4004443883895874, "learning_rate": 0.000277262180974478, "loss": 5.872, "step": 240 }, { "epoch": 0.014505787809335926, "grad_norm": 0.43937498331069946, "learning_rate": 0.0002888631090487239, "loss": 5.8183, "step": 250 }, { "epoch": 0.015086019321709361, "grad_norm": 0.3658200204372406, "learning_rate": 0.0003004640371229698, "loss": 5.7888, "step": 260 }, { "epoch": 0.0156662508340828, "grad_norm": 0.3341761827468872, "learning_rate": 0.0003120649651972158, "loss": 5.7416, "step": 270 }, { "epoch": 0.016246482346456235, "grad_norm": 0.46115365624427795, "learning_rate": 0.00032366589327146174, "loss": 5.7067, "step": 280 }, { "epoch": 0.016826713858829674, "grad_norm": 0.3009159564971924, "learning_rate": 0.00033526682134570767, "loss": 5.665, "step": 290 }, { "epoch": 0.01740694537120311, "grad_norm": 0.4389355778694153, "learning_rate": 0.0003468677494199536, "loss": 5.6231, "step": 300 }, { "epoch": 0.017987176883576546, "grad_norm": 0.4746924042701721, "learning_rate": 0.00035846867749419955, "loss": 5.5983, "step": 310 }, { "epoch": 0.018567408395949985, "grad_norm": 0.33507513999938965, "learning_rate": 0.0003700696055684455, "loss": 5.5685, "step": 320 }, { "epoch": 0.01914763990832342, "grad_norm": 0.6776261329650879, "learning_rate": 0.0003816705336426914, "loss": 5.5087, "step": 330 }, { "epoch": 0.019727871420696857, "grad_norm": 0.4664747416973114, "learning_rate": 0.00039327146171693736, "loss": 5.4964, "step": 340 }, { "epoch": 0.020308102933070296, "grad_norm": 0.4787660539150238, "learning_rate": 0.0004048723897911833, "loss": 5.4473, "step": 350 }, { "epoch": 0.020888334445443732, "grad_norm": 0.41453346610069275, "learning_rate": 0.00041647331786542923, "loss": 5.4019, "step": 360 }, { "epoch": 0.021468565957817168, "grad_norm": 0.5684117078781128, "learning_rate": 0.0004280742459396752, "loss": 5.3821, "step": 370 }, { "epoch": 0.022048797470190607, "grad_norm": 0.34503793716430664, "learning_rate": 0.0004396751740139211, "loss": 5.3503, "step": 380 }, { "epoch": 0.022629028982564043, "grad_norm": 0.465599924325943, "learning_rate": 0.00045127610208816704, "loss": 5.3183, "step": 390 }, { "epoch": 0.02320926049493748, "grad_norm": 0.3912290036678314, "learning_rate": 0.000462877030162413, "loss": 5.2937, "step": 400 }, { "epoch": 0.023789492007310918, "grad_norm": 0.42982372641563416, "learning_rate": 0.00047447795823665897, "loss": 5.2526, "step": 410 }, { "epoch": 0.024369723519684354, "grad_norm": 0.6874813437461853, "learning_rate": 0.00048607888631090485, "loss": 5.2364, "step": 420 }, { "epoch": 0.02494995503205779, "grad_norm": 0.3257123529911041, "learning_rate": 0.0004976798143851508, "loss": 5.2107, "step": 430 }, { "epoch": 0.02553018654443123, "grad_norm": 0.43527376651763916, "learning_rate": 0.0005092807424593968, "loss": 5.1765, "step": 440 }, { "epoch": 0.026110418056804665, "grad_norm": 0.36336761713027954, "learning_rate": 0.0005208816705336427, "loss": 5.1535, "step": 450 }, { "epoch": 0.0266906495691781, "grad_norm": 0.5609498023986816, "learning_rate": 0.0005324825986078887, "loss": 5.1106, "step": 460 }, { "epoch": 0.02727088108155154, "grad_norm": 0.3533839285373688, "learning_rate": 0.0005440835266821345, "loss": 5.0758, "step": 470 }, { "epoch": 0.027851112593924976, "grad_norm": 0.3420592248439789, "learning_rate": 0.0005556844547563805, "loss": 5.0461, "step": 480 }, { "epoch": 0.028431344106298412, "grad_norm": 0.34275180101394653, "learning_rate": 0.0005672853828306265, "loss": 5.0096, "step": 490 }, { "epoch": 0.02901157561867185, "grad_norm": 0.36812710762023926, "learning_rate": 0.0005788863109048724, "loss": 4.989, "step": 500 }, { "epoch": 0.029591807131045287, "grad_norm": 0.30121806263923645, "learning_rate": 0.0005904872389791184, "loss": 4.947, "step": 510 }, { "epoch": 0.030172038643418723, "grad_norm": 0.30486181378364563, "learning_rate": 0.0006020881670533644, "loss": 4.9224, "step": 520 }, { "epoch": 0.030752270155792162, "grad_norm": 0.36893951892852783, "learning_rate": 0.0006136890951276102, "loss": 4.8849, "step": 530 }, { "epoch": 0.0313325016681656, "grad_norm": 0.3596450090408325, "learning_rate": 0.0006252900232018562, "loss": 4.8539, "step": 540 }, { "epoch": 0.031912733180539034, "grad_norm": 0.3460347354412079, "learning_rate": 0.000636890951276102, "loss": 4.8462, "step": 550 }, { "epoch": 0.03249296469291247, "grad_norm": 0.353222519159317, "learning_rate": 0.000648491879350348, "loss": 4.8129, "step": 560 }, { "epoch": 0.03307319620528591, "grad_norm": 0.23676836490631104, "learning_rate": 0.000660092807424594, "loss": 4.7744, "step": 570 }, { "epoch": 0.03365342771765935, "grad_norm": 0.3343792259693146, "learning_rate": 0.0006716937354988399, "loss": 4.7622, "step": 580 }, { "epoch": 0.034233659230032784, "grad_norm": 0.24879467487335205, "learning_rate": 0.0006832946635730859, "loss": 4.7259, "step": 590 }, { "epoch": 0.03481389074240622, "grad_norm": 0.3369862139225006, "learning_rate": 0.0006948955916473319, "loss": 4.6865, "step": 600 }, { "epoch": 0.035394122254779656, "grad_norm": 0.3400874435901642, "learning_rate": 0.0007064965197215777, "loss": 4.6676, "step": 610 }, { "epoch": 0.03597435376715309, "grad_norm": 0.30212903022766113, "learning_rate": 0.0007180974477958236, "loss": 4.6644, "step": 620 }, { "epoch": 0.036554585279526534, "grad_norm": 0.27961453795433044, "learning_rate": 0.0007296983758700696, "loss": 4.6348, "step": 630 }, { "epoch": 0.03713481679189997, "grad_norm": 0.33748018741607666, "learning_rate": 0.0007412993039443155, "loss": 4.5963, "step": 640 }, { "epoch": 0.037715048304273406, "grad_norm": 0.31729650497436523, "learning_rate": 0.0007529002320185615, "loss": 4.5758, "step": 650 }, { "epoch": 0.03829527981664684, "grad_norm": 0.22230634093284607, "learning_rate": 0.0007645011600928075, "loss": 4.5496, "step": 660 }, { "epoch": 0.03887551132902028, "grad_norm": 0.2796823978424072, "learning_rate": 0.0007761020881670534, "loss": 4.5275, "step": 670 }, { "epoch": 0.039455742841393714, "grad_norm": 0.30923864245414734, "learning_rate": 0.0007877030162412994, "loss": 4.5012, "step": 680 }, { "epoch": 0.040035974353767156, "grad_norm": 0.2574792504310608, "learning_rate": 0.0007993039443155452, "loss": 4.4765, "step": 690 }, { "epoch": 0.04061620586614059, "grad_norm": 0.26686057448387146, "learning_rate": 0.0008109048723897911, "loss": 4.4458, "step": 700 }, { "epoch": 0.04119643737851403, "grad_norm": 0.31044116616249084, "learning_rate": 0.0008225058004640371, "loss": 4.4281, "step": 710 }, { "epoch": 0.041776668890887464, "grad_norm": 0.27075859904289246, "learning_rate": 0.000834106728538283, "loss": 4.4045, "step": 720 }, { "epoch": 0.0423569004032609, "grad_norm": 0.25896894931793213, "learning_rate": 0.000845707656612529, "loss": 4.3825, "step": 730 }, { "epoch": 0.042937131915634336, "grad_norm": 0.20856112241744995, "learning_rate": 0.000857308584686775, "loss": 4.3483, "step": 740 }, { "epoch": 0.04351736342800778, "grad_norm": 0.3068506121635437, "learning_rate": 0.0008689095127610209, "loss": 4.3236, "step": 750 }, { "epoch": 0.044097594940381214, "grad_norm": 0.2396797239780426, "learning_rate": 0.0008805104408352669, "loss": 4.3045, "step": 760 }, { "epoch": 0.04467782645275465, "grad_norm": 0.32250258326530457, "learning_rate": 0.0008921113689095129, "loss": 4.2823, "step": 770 }, { "epoch": 0.045258057965128086, "grad_norm": 0.28663381934165955, "learning_rate": 0.0009037122969837586, "loss": 4.2603, "step": 780 }, { "epoch": 0.04583828947750152, "grad_norm": 0.26493045687675476, "learning_rate": 0.0009153132250580046, "loss": 4.2477, "step": 790 }, { "epoch": 0.04641852098987496, "grad_norm": 0.2331818789243698, "learning_rate": 0.0009269141531322506, "loss": 4.2247, "step": 800 }, { "epoch": 0.0469987525022484, "grad_norm": 0.31670576333999634, "learning_rate": 0.0009385150812064965, "loss": 4.1969, "step": 810 }, { "epoch": 0.047578984014621836, "grad_norm": 0.2539173662662506, "learning_rate": 0.0009501160092807425, "loss": 4.1818, "step": 820 }, { "epoch": 0.04815921552699527, "grad_norm": 0.24318990111351013, "learning_rate": 0.0009617169373549885, "loss": 4.1624, "step": 830 }, { "epoch": 0.04873944703936871, "grad_norm": 0.2574463486671448, "learning_rate": 0.0009733178654292344, "loss": 4.1577, "step": 840 }, { "epoch": 0.049319678551742144, "grad_norm": 0.28978243470191956, "learning_rate": 0.0009849187935034804, "loss": 4.1412, "step": 850 }, { "epoch": 0.04989991006411558, "grad_norm": 0.20802126824855804, "learning_rate": 0.0009965197215777261, "loss": 4.1262, "step": 860 }, { "epoch": 0.05048014157648902, "grad_norm": 0.1996484100818634, "learning_rate": 0.0009999995489420968, "loss": 4.1093, "step": 870 }, { "epoch": 0.05106037308886246, "grad_norm": 0.2888406813144684, "learning_rate": 0.0009999973396808558, "loss": 4.0934, "step": 880 }, { "epoch": 0.051640604601235894, "grad_norm": 0.23817220330238342, "learning_rate": 0.0009999932893770317, "loss": 4.0812, "step": 890 }, { "epoch": 0.05222083611360933, "grad_norm": 0.21273092925548553, "learning_rate": 0.0009999873980455383, "loss": 4.0675, "step": 900 }, { "epoch": 0.052801067625982766, "grad_norm": 0.2241075336933136, "learning_rate": 0.000999979665708068, "loss": 4.0543, "step": 910 }, { "epoch": 0.0533812991383562, "grad_norm": 0.2210296392440796, "learning_rate": 0.000999970092393092, "loss": 4.0374, "step": 920 }, { "epoch": 0.053961530650729644, "grad_norm": 0.21746701002120972, "learning_rate": 0.0009999586781358604, "loss": 4.0324, "step": 930 }, { "epoch": 0.05454176216310308, "grad_norm": 0.20182138681411743, "learning_rate": 0.0009999454229784018, "loss": 4.0211, "step": 940 }, { "epoch": 0.055121993675476516, "grad_norm": 0.20171615481376648, "learning_rate": 0.0009999303269695226, "loss": 4.0011, "step": 950 }, { "epoch": 0.05570222518784995, "grad_norm": 0.27165260910987854, "learning_rate": 0.0009999133901648083, "loss": 3.9968, "step": 960 }, { "epoch": 0.05628245670022339, "grad_norm": 0.2524718940258026, "learning_rate": 0.0009998946126266218, "loss": 3.9876, "step": 970 }, { "epoch": 0.056862688212596824, "grad_norm": 0.18922555446624756, "learning_rate": 0.0009998739944241041, "loss": 3.9756, "step": 980 }, { "epoch": 0.057442919724970266, "grad_norm": 0.2163887917995453, "learning_rate": 0.0009998515356331734, "loss": 3.953, "step": 990 }, { "epoch": 0.0580231512373437, "grad_norm": 0.24239082634449005, "learning_rate": 0.0009998272363365254, "loss": 3.9616, "step": 1000 }, { "epoch": 0.0580231512373437, "eval_loss": 3.8911185264587402, "eval_runtime": 3.2761, "eval_samples_per_second": 1321.706, "eval_steps_per_second": 10.378, "step": 1000 }, { "epoch": 0.05860338274971714, "grad_norm": 0.19143226742744446, "learning_rate": 0.000999801096623633, "loss": 3.9342, "step": 1010 }, { "epoch": 0.059183614262090574, "grad_norm": 0.23061209917068481, "learning_rate": 0.000999773116590745, "loss": 3.932, "step": 1020 }, { "epoch": 0.05976384577446401, "grad_norm": 0.20795617997646332, "learning_rate": 0.0009997432963408865, "loss": 3.9247, "step": 1030 }, { "epoch": 0.060344077286837446, "grad_norm": 0.22474446892738342, "learning_rate": 0.0009997116359838595, "loss": 3.9318, "step": 1040 }, { "epoch": 0.06092430879921089, "grad_norm": 0.1934744417667389, "learning_rate": 0.00099967813563624, "loss": 3.9166, "step": 1050 }, { "epoch": 0.061504540311584324, "grad_norm": 0.22670230269432068, "learning_rate": 0.0009996427954213807, "loss": 3.8964, "step": 1060 }, { "epoch": 0.06208477182395776, "grad_norm": 0.193163201212883, "learning_rate": 0.0009996056154694072, "loss": 3.9087, "step": 1070 }, { "epoch": 0.0626650033363312, "grad_norm": 0.26965293288230896, "learning_rate": 0.0009995665959172202, "loss": 3.8943, "step": 1080 }, { "epoch": 0.06324523484870463, "grad_norm": 0.16498436033725739, "learning_rate": 0.0009995257369084939, "loss": 3.8751, "step": 1090 }, { "epoch": 0.06382546636107807, "grad_norm": 0.18623340129852295, "learning_rate": 0.0009994830385936754, "loss": 3.8759, "step": 1100 }, { "epoch": 0.0644056978734515, "grad_norm": 0.18320336937904358, "learning_rate": 0.000999438501129984, "loss": 3.8592, "step": 1110 }, { "epoch": 0.06498592938582494, "grad_norm": 0.16004326939582825, "learning_rate": 0.0009993921246814119, "loss": 3.8453, "step": 1120 }, { "epoch": 0.06556616089819838, "grad_norm": 0.20819014310836792, "learning_rate": 0.0009993439094187217, "loss": 3.8493, "step": 1130 }, { "epoch": 0.06614639241057182, "grad_norm": 0.20609912276268005, "learning_rate": 0.0009992938555194472, "loss": 3.8399, "step": 1140 }, { "epoch": 0.06672662392294526, "grad_norm": 0.16247253119945526, "learning_rate": 0.0009992419631678921, "loss": 3.8425, "step": 1150 }, { "epoch": 0.0673068554353187, "grad_norm": 0.2205752432346344, "learning_rate": 0.0009991882325551295, "loss": 3.823, "step": 1160 }, { "epoch": 0.06788708694769213, "grad_norm": 0.18031305074691772, "learning_rate": 0.0009991326638790008, "loss": 3.8129, "step": 1170 }, { "epoch": 0.06846731846006557, "grad_norm": 0.21720442175865173, "learning_rate": 0.0009990752573441162, "loss": 3.8177, "step": 1180 }, { "epoch": 0.069047549972439, "grad_norm": 0.17516951262950897, "learning_rate": 0.000999016013161852, "loss": 3.8144, "step": 1190 }, { "epoch": 0.06962778148481244, "grad_norm": 0.1756078600883484, "learning_rate": 0.000998954931550352, "loss": 3.8041, "step": 1200 }, { "epoch": 0.07020801299718588, "grad_norm": 0.19864057004451752, "learning_rate": 0.000998892012734525, "loss": 3.8031, "step": 1210 }, { "epoch": 0.07078824450955931, "grad_norm": 0.20211303234100342, "learning_rate": 0.0009988272569460442, "loss": 3.8009, "step": 1220 }, { "epoch": 0.07136847602193275, "grad_norm": 0.16449439525604248, "learning_rate": 0.0009987606644233477, "loss": 3.7916, "step": 1230 }, { "epoch": 0.07194870753430618, "grad_norm": 0.24472416937351227, "learning_rate": 0.0009986922354116362, "loss": 3.7902, "step": 1240 }, { "epoch": 0.07252893904667962, "grad_norm": 0.15388889610767365, "learning_rate": 0.000998621970162872, "loss": 3.7747, "step": 1250 }, { "epoch": 0.07310917055905307, "grad_norm": 0.18148912489414215, "learning_rate": 0.0009985498689357797, "loss": 3.771, "step": 1260 }, { "epoch": 0.0736894020714265, "grad_norm": 0.20958511531352997, "learning_rate": 0.000998475931995843, "loss": 3.7728, "step": 1270 }, { "epoch": 0.07426963358379994, "grad_norm": 0.20867913961410522, "learning_rate": 0.000998400159615306, "loss": 3.7641, "step": 1280 }, { "epoch": 0.07484986509617338, "grad_norm": 0.2014162242412567, "learning_rate": 0.00099832255207317, "loss": 3.7605, "step": 1290 }, { "epoch": 0.07543009660854681, "grad_norm": 0.19448301196098328, "learning_rate": 0.0009982431096551947, "loss": 3.7562, "step": 1300 }, { "epoch": 0.07601032812092025, "grad_norm": 0.17145995795726776, "learning_rate": 0.0009981618326538948, "loss": 3.7583, "step": 1310 }, { "epoch": 0.07659055963329368, "grad_norm": 0.16092616319656372, "learning_rate": 0.000998078721368541, "loss": 3.7439, "step": 1320 }, { "epoch": 0.07717079114566712, "grad_norm": 0.22316782176494598, "learning_rate": 0.000997993776105158, "loss": 3.7373, "step": 1330 }, { "epoch": 0.07775102265804056, "grad_norm": 0.17662325501441956, "learning_rate": 0.0009979069971765226, "loss": 3.7337, "step": 1340 }, { "epoch": 0.07833125417041399, "grad_norm": 0.139273002743721, "learning_rate": 0.0009978183849021645, "loss": 3.7296, "step": 1350 }, { "epoch": 0.07891148568278743, "grad_norm": 0.2038998305797577, "learning_rate": 0.000997727939608363, "loss": 3.7353, "step": 1360 }, { "epoch": 0.07949171719516086, "grad_norm": 0.18561561405658722, "learning_rate": 0.0009976356616281474, "loss": 3.7318, "step": 1370 }, { "epoch": 0.08007194870753431, "grad_norm": 0.20797798037528992, "learning_rate": 0.0009975415513012946, "loss": 3.7242, "step": 1380 }, { "epoch": 0.08065218021990775, "grad_norm": 0.1697220355272293, "learning_rate": 0.0009974456089743289, "loss": 3.7066, "step": 1390 }, { "epoch": 0.08123241173228118, "grad_norm": 0.18265104293823242, "learning_rate": 0.0009973478350005199, "loss": 3.7159, "step": 1400 }, { "epoch": 0.08181264324465462, "grad_norm": 0.18804903328418732, "learning_rate": 0.0009972482297398817, "loss": 3.7042, "step": 1410 }, { "epoch": 0.08239287475702806, "grad_norm": 0.1842174530029297, "learning_rate": 0.0009971467935591713, "loss": 3.7103, "step": 1420 }, { "epoch": 0.08297310626940149, "grad_norm": 0.18509556353092194, "learning_rate": 0.000997043526831887, "loss": 3.7066, "step": 1430 }, { "epoch": 0.08355333778177493, "grad_norm": 0.19408974051475525, "learning_rate": 0.0009969384299382683, "loss": 3.6867, "step": 1440 }, { "epoch": 0.08413356929414836, "grad_norm": 0.18689195811748505, "learning_rate": 0.0009968315032652924, "loss": 3.6787, "step": 1450 }, { "epoch": 0.0847138008065218, "grad_norm": 0.17087410390377045, "learning_rate": 0.0009967227472066748, "loss": 3.6792, "step": 1460 }, { "epoch": 0.08529403231889524, "grad_norm": 0.1809985190629959, "learning_rate": 0.000996612162162867, "loss": 3.6975, "step": 1470 }, { "epoch": 0.08587426383126867, "grad_norm": 0.2063867449760437, "learning_rate": 0.000996499748541054, "loss": 3.6831, "step": 1480 }, { "epoch": 0.08645449534364211, "grad_norm": 0.191127747297287, "learning_rate": 0.0009963855067551552, "loss": 3.6779, "step": 1490 }, { "epoch": 0.08703472685601556, "grad_norm": 0.17394115030765533, "learning_rate": 0.0009962694372258206, "loss": 3.665, "step": 1500 }, { "epoch": 0.08761495836838899, "grad_norm": 0.17784751951694489, "learning_rate": 0.0009961515403804303, "loss": 3.6587, "step": 1510 }, { "epoch": 0.08819518988076243, "grad_norm": 0.2151278853416443, "learning_rate": 0.0009960318166530927, "loss": 3.6526, "step": 1520 }, { "epoch": 0.08877542139313586, "grad_norm": 0.20224444568157196, "learning_rate": 0.0009959102664846432, "loss": 3.664, "step": 1530 }, { "epoch": 0.0893556529055093, "grad_norm": 0.18003828823566437, "learning_rate": 0.0009957868903226425, "loss": 3.6487, "step": 1540 }, { "epoch": 0.08993588441788274, "grad_norm": 0.17379723489284515, "learning_rate": 0.0009956616886213742, "loss": 3.6553, "step": 1550 }, { "epoch": 0.09051611593025617, "grad_norm": 0.19469398260116577, "learning_rate": 0.0009955346618418443, "loss": 3.6518, "step": 1560 }, { "epoch": 0.09109634744262961, "grad_norm": 0.15254022181034088, "learning_rate": 0.0009954058104517788, "loss": 3.6517, "step": 1570 }, { "epoch": 0.09167657895500304, "grad_norm": 0.19052338600158691, "learning_rate": 0.0009952751349256218, "loss": 3.6479, "step": 1580 }, { "epoch": 0.09225681046737648, "grad_norm": 0.15514792501926422, "learning_rate": 0.0009951426357445343, "loss": 3.6307, "step": 1590 }, { "epoch": 0.09283704197974992, "grad_norm": 0.1835625171661377, "learning_rate": 0.0009950083133963923, "loss": 3.6349, "step": 1600 }, { "epoch": 0.09341727349212335, "grad_norm": 0.1494477540254593, "learning_rate": 0.0009948721683757846, "loss": 3.6373, "step": 1610 }, { "epoch": 0.0939975050044968, "grad_norm": 0.23332346975803375, "learning_rate": 0.0009947342011840114, "loss": 3.63, "step": 1620 }, { "epoch": 0.09457773651687024, "grad_norm": 0.1672951877117157, "learning_rate": 0.0009945944123290827, "loss": 3.6305, "step": 1630 }, { "epoch": 0.09515796802924367, "grad_norm": 0.1747497320175171, "learning_rate": 0.0009944528023257153, "loss": 3.6295, "step": 1640 }, { "epoch": 0.09573819954161711, "grad_norm": 0.15887069702148438, "learning_rate": 0.0009943093716953321, "loss": 3.6315, "step": 1650 }, { "epoch": 0.09631843105399054, "grad_norm": 0.1994808316230774, "learning_rate": 0.00099416412096606, "loss": 3.6236, "step": 1660 }, { "epoch": 0.09689866256636398, "grad_norm": 0.16880349814891815, "learning_rate": 0.0009940170506727273, "loss": 3.6111, "step": 1670 }, { "epoch": 0.09747889407873742, "grad_norm": 0.2141963392496109, "learning_rate": 0.000993868161356862, "loss": 3.6147, "step": 1680 }, { "epoch": 0.09805912559111085, "grad_norm": 0.19859230518341064, "learning_rate": 0.0009937174535666904, "loss": 3.6225, "step": 1690 }, { "epoch": 0.09863935710348429, "grad_norm": 0.21606995165348053, "learning_rate": 0.0009935649278571344, "loss": 3.6035, "step": 1700 }, { "epoch": 0.09921958861585772, "grad_norm": 0.1618034541606903, "learning_rate": 0.0009934105847898094, "loss": 3.6114, "step": 1710 }, { "epoch": 0.09979982012823116, "grad_norm": 0.19869261980056763, "learning_rate": 0.0009932544249330229, "loss": 3.6085, "step": 1720 }, { "epoch": 0.1003800516406046, "grad_norm": 0.20715004205703735, "learning_rate": 0.0009930964488617717, "loss": 3.6056, "step": 1730 }, { "epoch": 0.10096028315297804, "grad_norm": 0.14460305869579315, "learning_rate": 0.0009929366571577406, "loss": 3.6041, "step": 1740 }, { "epoch": 0.10154051466535148, "grad_norm": 0.1818549782037735, "learning_rate": 0.000992775050409299, "loss": 3.6026, "step": 1750 }, { "epoch": 0.10212074617772492, "grad_norm": 0.16325876116752625, "learning_rate": 0.0009926116292115, "loss": 3.5907, "step": 1760 }, { "epoch": 0.10270097769009835, "grad_norm": 0.1783674657344818, "learning_rate": 0.0009924463941660777, "loss": 3.5932, "step": 1770 }, { "epoch": 0.10328120920247179, "grad_norm": 0.23217235505580902, "learning_rate": 0.0009922793458814448, "loss": 3.5905, "step": 1780 }, { "epoch": 0.10386144071484522, "grad_norm": 0.18973788619041443, "learning_rate": 0.0009921104849726903, "loss": 3.6022, "step": 1790 }, { "epoch": 0.10444167222721866, "grad_norm": 0.17927169799804688, "learning_rate": 0.0009919398120615784, "loss": 3.5857, "step": 1800 }, { "epoch": 0.1050219037395921, "grad_norm": 0.1780613660812378, "learning_rate": 0.000991767327776544, "loss": 3.5834, "step": 1810 }, { "epoch": 0.10560213525196553, "grad_norm": 0.16915231943130493, "learning_rate": 0.0009915930327526925, "loss": 3.5848, "step": 1820 }, { "epoch": 0.10618236676433897, "grad_norm": 0.18921369314193726, "learning_rate": 0.0009914169276317966, "loss": 3.5855, "step": 1830 }, { "epoch": 0.1067625982767124, "grad_norm": 0.17203205823898315, "learning_rate": 0.0009912390130622935, "loss": 3.577, "step": 1840 }, { "epoch": 0.10734282978908584, "grad_norm": 0.23704229295253754, "learning_rate": 0.0009910592896992835, "loss": 3.5721, "step": 1850 }, { "epoch": 0.10792306130145929, "grad_norm": 0.15052714943885803, "learning_rate": 0.000990877758204527, "loss": 3.5778, "step": 1860 }, { "epoch": 0.10850329281383272, "grad_norm": 0.1933393031358719, "learning_rate": 0.0009906944192464417, "loss": 3.5789, "step": 1870 }, { "epoch": 0.10908352432620616, "grad_norm": 0.17307838797569275, "learning_rate": 0.000990509273500101, "loss": 3.5701, "step": 1880 }, { "epoch": 0.1096637558385796, "grad_norm": 0.20114563405513763, "learning_rate": 0.0009903223216472306, "loss": 3.5716, "step": 1890 }, { "epoch": 0.11024398735095303, "grad_norm": 0.20986546576023102, "learning_rate": 0.0009901335643762075, "loss": 3.5644, "step": 1900 }, { "epoch": 0.11082421886332647, "grad_norm": 0.1716219186782837, "learning_rate": 0.0009899430023820551, "loss": 3.5617, "step": 1910 }, { "epoch": 0.1114044503756999, "grad_norm": 0.1951904147863388, "learning_rate": 0.0009897506363664428, "loss": 3.5535, "step": 1920 }, { "epoch": 0.11198468188807334, "grad_norm": 0.1623149961233139, "learning_rate": 0.0009895564670376823, "loss": 3.5586, "step": 1930 }, { "epoch": 0.11256491340044678, "grad_norm": 0.1657789647579193, "learning_rate": 0.000989360495110726, "loss": 3.5623, "step": 1940 }, { "epoch": 0.11314514491282021, "grad_norm": 0.2033306509256363, "learning_rate": 0.0009891627213071625, "loss": 3.5404, "step": 1950 }, { "epoch": 0.11372537642519365, "grad_norm": 0.17595010995864868, "learning_rate": 0.0009889631463552157, "loss": 3.5493, "step": 1960 }, { "epoch": 0.11430560793756708, "grad_norm": 0.1695556640625, "learning_rate": 0.0009887617709897416, "loss": 3.5537, "step": 1970 }, { "epoch": 0.11488583944994053, "grad_norm": 0.14913249015808105, "learning_rate": 0.0009885585959522256, "loss": 3.5531, "step": 1980 }, { "epoch": 0.11546607096231397, "grad_norm": 0.1780378520488739, "learning_rate": 0.000988353621990779, "loss": 3.5458, "step": 1990 }, { "epoch": 0.1160463024746874, "grad_norm": 0.1777425855398178, "learning_rate": 0.0009881468498601379, "loss": 3.5512, "step": 2000 }, { "epoch": 0.1160463024746874, "eval_loss": 3.486086845397949, "eval_runtime": 3.248, "eval_samples_per_second": 1333.109, "eval_steps_per_second": 10.468, "step": 2000 }, { "epoch": 0.11662653398706084, "grad_norm": 0.1840885430574417, "learning_rate": 0.0009879382803216585, "loss": 3.5499, "step": 2010 }, { "epoch": 0.11720676549943428, "grad_norm": 0.15481173992156982, "learning_rate": 0.000987727914143316, "loss": 3.5435, "step": 2020 }, { "epoch": 0.11778699701180771, "grad_norm": 0.17583511769771576, "learning_rate": 0.0009875157520997005, "loss": 3.5421, "step": 2030 }, { "epoch": 0.11836722852418115, "grad_norm": 0.17113088071346283, "learning_rate": 0.000987301794972015, "loss": 3.5256, "step": 2040 }, { "epoch": 0.11894746003655458, "grad_norm": 0.19127151370048523, "learning_rate": 0.000987086043548072, "loss": 3.5307, "step": 2050 }, { "epoch": 0.11952769154892802, "grad_norm": 0.1801798790693283, "learning_rate": 0.000986868498622291, "loss": 3.5428, "step": 2060 }, { "epoch": 0.12010792306130146, "grad_norm": 0.17117474973201752, "learning_rate": 0.0009866491609956949, "loss": 3.5429, "step": 2070 }, { "epoch": 0.12068815457367489, "grad_norm": 0.18343955278396606, "learning_rate": 0.000986428031475908, "loss": 3.5305, "step": 2080 }, { "epoch": 0.12126838608604833, "grad_norm": 0.18340405821800232, "learning_rate": 0.0009862051108771523, "loss": 3.5239, "step": 2090 }, { "epoch": 0.12184861759842178, "grad_norm": 0.17750664055347443, "learning_rate": 0.000985980400020245, "loss": 3.5233, "step": 2100 }, { "epoch": 0.12242884911079521, "grad_norm": 0.18838383257389069, "learning_rate": 0.000985753899732595, "loss": 3.5299, "step": 2110 }, { "epoch": 0.12300908062316865, "grad_norm": 0.18249675631523132, "learning_rate": 0.0009855256108481996, "loss": 3.5363, "step": 2120 }, { "epoch": 0.12358931213554208, "grad_norm": 0.17180895805358887, "learning_rate": 0.0009852955342076431, "loss": 3.5211, "step": 2130 }, { "epoch": 0.12416954364791552, "grad_norm": 0.2116585075855255, "learning_rate": 0.0009850636706580911, "loss": 3.5278, "step": 2140 }, { "epoch": 0.12474977516028896, "grad_norm": 0.1602133959531784, "learning_rate": 0.0009848300210532899, "loss": 3.5184, "step": 2150 }, { "epoch": 0.1253300066726624, "grad_norm": 0.18009105324745178, "learning_rate": 0.0009845945862535618, "loss": 3.5151, "step": 2160 }, { "epoch": 0.12591023818503583, "grad_norm": 0.19407133758068085, "learning_rate": 0.0009843573671258024, "loss": 3.5237, "step": 2170 }, { "epoch": 0.12649046969740926, "grad_norm": 0.1537596881389618, "learning_rate": 0.000984118364543477, "loss": 3.5151, "step": 2180 }, { "epoch": 0.1270707012097827, "grad_norm": 0.20341388881206512, "learning_rate": 0.0009838775793866187, "loss": 3.5146, "step": 2190 }, { "epoch": 0.12765093272215614, "grad_norm": 0.19128254055976868, "learning_rate": 0.0009836350125418233, "loss": 3.5243, "step": 2200 }, { "epoch": 0.12823116423452957, "grad_norm": 0.18839573860168457, "learning_rate": 0.0009833906649022476, "loss": 3.5006, "step": 2210 }, { "epoch": 0.128811395746903, "grad_norm": 0.1921667903661728, "learning_rate": 0.0009831445373676049, "loss": 3.5139, "step": 2220 }, { "epoch": 0.12939162725927644, "grad_norm": 0.19188368320465088, "learning_rate": 0.000982896630844163, "loss": 3.5169, "step": 2230 }, { "epoch": 0.12997185877164988, "grad_norm": 0.17427149415016174, "learning_rate": 0.000982646946244739, "loss": 3.5046, "step": 2240 }, { "epoch": 0.13055209028402331, "grad_norm": 0.16430804133415222, "learning_rate": 0.0009823954844886983, "loss": 3.5112, "step": 2250 }, { "epoch": 0.13113232179639675, "grad_norm": 0.15311101078987122, "learning_rate": 0.0009821422465019496, "loss": 3.5017, "step": 2260 }, { "epoch": 0.13171255330877019, "grad_norm": 0.17969970405101776, "learning_rate": 0.000981887233216941, "loss": 3.4945, "step": 2270 }, { "epoch": 0.13229278482114365, "grad_norm": 0.16781874001026154, "learning_rate": 0.000981630445572659, "loss": 3.5051, "step": 2280 }, { "epoch": 0.13287301633351709, "grad_norm": 0.1790471076965332, "learning_rate": 0.0009813718845146215, "loss": 3.4946, "step": 2290 }, { "epoch": 0.13345324784589052, "grad_norm": 0.14774377644062042, "learning_rate": 0.0009811115509948784, "loss": 3.495, "step": 2300 }, { "epoch": 0.13403347935826396, "grad_norm": 0.18693110346794128, "learning_rate": 0.0009808494459720046, "loss": 3.5018, "step": 2310 }, { "epoch": 0.1346137108706374, "grad_norm": 0.163302943110466, "learning_rate": 0.000980585570411098, "loss": 3.4864, "step": 2320 }, { "epoch": 0.13519394238301083, "grad_norm": 0.17489096522331238, "learning_rate": 0.0009803199252837766, "loss": 3.4932, "step": 2330 }, { "epoch": 0.13577417389538426, "grad_norm": 0.1895146518945694, "learning_rate": 0.0009800525115681734, "loss": 3.4937, "step": 2340 }, { "epoch": 0.1363544054077577, "grad_norm": 0.19778315722942352, "learning_rate": 0.0009797833302489334, "loss": 3.4819, "step": 2350 }, { "epoch": 0.13693463692013114, "grad_norm": 0.17575684189796448, "learning_rate": 0.0009795123823172107, "loss": 3.4853, "step": 2360 }, { "epoch": 0.13751486843250457, "grad_norm": 0.1611810177564621, "learning_rate": 0.000979239668770664, "loss": 3.4912, "step": 2370 }, { "epoch": 0.138095099944878, "grad_norm": 0.19706352055072784, "learning_rate": 0.0009789651906134532, "loss": 3.4814, "step": 2380 }, { "epoch": 0.13867533145725144, "grad_norm": 0.15343667566776276, "learning_rate": 0.0009786889488562352, "loss": 3.4757, "step": 2390 }, { "epoch": 0.13925556296962488, "grad_norm": 0.1835697740316391, "learning_rate": 0.0009784109445161616, "loss": 3.48, "step": 2400 }, { "epoch": 0.13983579448199832, "grad_norm": 0.19989457726478577, "learning_rate": 0.0009781311786168732, "loss": 3.471, "step": 2410 }, { "epoch": 0.14041602599437175, "grad_norm": 0.1824110597372055, "learning_rate": 0.0009778496521884973, "loss": 3.4795, "step": 2420 }, { "epoch": 0.1409962575067452, "grad_norm": 0.17803703248500824, "learning_rate": 0.0009775663662676438, "loss": 3.4895, "step": 2430 }, { "epoch": 0.14157648901911862, "grad_norm": 0.17566360533237457, "learning_rate": 0.0009772813218974013, "loss": 3.4771, "step": 2440 }, { "epoch": 0.14215672053149206, "grad_norm": 0.17459633946418762, "learning_rate": 0.0009769945201273328, "loss": 3.4748, "step": 2450 }, { "epoch": 0.1427369520438655, "grad_norm": 0.17547911405563354, "learning_rate": 0.0009767059620134728, "loss": 3.4851, "step": 2460 }, { "epoch": 0.14331718355623893, "grad_norm": 0.1611047089099884, "learning_rate": 0.0009764156486183223, "loss": 3.4859, "step": 2470 }, { "epoch": 0.14389741506861237, "grad_norm": 0.22488833963871002, "learning_rate": 0.0009761235810108453, "loss": 3.4704, "step": 2480 }, { "epoch": 0.1444776465809858, "grad_norm": 0.1857168972492218, "learning_rate": 0.0009758297602664658, "loss": 3.4636, "step": 2490 }, { "epoch": 0.14505787809335924, "grad_norm": 0.18012335896492004, "learning_rate": 0.0009755341874670624, "loss": 3.4675, "step": 2500 }, { "epoch": 0.14563810960573267, "grad_norm": 0.14618253707885742, "learning_rate": 0.000975236863700965, "loss": 3.472, "step": 2510 }, { "epoch": 0.14621834111810614, "grad_norm": 0.20513653755187988, "learning_rate": 0.000974937790062951, "loss": 3.4772, "step": 2520 }, { "epoch": 0.14679857263047957, "grad_norm": 0.17752069234848022, "learning_rate": 0.0009746369676542408, "loss": 3.4674, "step": 2530 }, { "epoch": 0.147378804142853, "grad_norm": 0.1703299880027771, "learning_rate": 0.000974334397582494, "loss": 3.4631, "step": 2540 }, { "epoch": 0.14795903565522645, "grad_norm": 0.18766288459300995, "learning_rate": 0.0009740300809618055, "loss": 3.4696, "step": 2550 }, { "epoch": 0.14853926716759988, "grad_norm": 0.19645459949970245, "learning_rate": 0.0009737240189127005, "loss": 3.4686, "step": 2560 }, { "epoch": 0.14911949867997332, "grad_norm": 0.19703318178653717, "learning_rate": 0.0009734162125621322, "loss": 3.4645, "step": 2570 }, { "epoch": 0.14969973019234675, "grad_norm": 0.1729518324136734, "learning_rate": 0.0009731066630434753, "loss": 3.4623, "step": 2580 }, { "epoch": 0.1502799617047202, "grad_norm": 0.1629866361618042, "learning_rate": 0.0009727953714965238, "loss": 3.4587, "step": 2590 }, { "epoch": 0.15086019321709362, "grad_norm": 0.1935587227344513, "learning_rate": 0.0009724823390674857, "loss": 3.452, "step": 2600 }, { "epoch": 0.15144042472946706, "grad_norm": 0.1905779391527176, "learning_rate": 0.0009721675669089791, "loss": 3.4492, "step": 2610 }, { "epoch": 0.1520206562418405, "grad_norm": 0.1649785190820694, "learning_rate": 0.0009718510561800282, "loss": 3.4553, "step": 2620 }, { "epoch": 0.15260088775421393, "grad_norm": 0.18956676125526428, "learning_rate": 0.0009715328080460587, "loss": 3.4565, "step": 2630 }, { "epoch": 0.15318111926658737, "grad_norm": 0.1951354593038559, "learning_rate": 0.0009712128236788935, "loss": 3.4588, "step": 2640 }, { "epoch": 0.1537613507789608, "grad_norm": 0.19937202334403992, "learning_rate": 0.0009708911042567485, "loss": 3.4464, "step": 2650 }, { "epoch": 0.15434158229133424, "grad_norm": 0.1778160184621811, "learning_rate": 0.0009705676509642285, "loss": 3.4619, "step": 2660 }, { "epoch": 0.15492181380370768, "grad_norm": 0.19197410345077515, "learning_rate": 0.0009702424649923221, "loss": 3.4545, "step": 2670 }, { "epoch": 0.1555020453160811, "grad_norm": 0.20156262814998627, "learning_rate": 0.0009699155475383984, "loss": 3.4407, "step": 2680 }, { "epoch": 0.15608227682845455, "grad_norm": 0.1888488084077835, "learning_rate": 0.0009695868998062016, "loss": 3.4522, "step": 2690 }, { "epoch": 0.15666250834082798, "grad_norm": 0.15710744261741638, "learning_rate": 0.0009692565230058471, "loss": 3.4385, "step": 2700 }, { "epoch": 0.15724273985320142, "grad_norm": 0.17102789878845215, "learning_rate": 0.0009689244183538169, "loss": 3.4495, "step": 2710 }, { "epoch": 0.15782297136557485, "grad_norm": 0.19345730543136597, "learning_rate": 0.000968590587072955, "loss": 3.4449, "step": 2720 }, { "epoch": 0.1584032028779483, "grad_norm": 0.17778056859970093, "learning_rate": 0.0009682550303924633, "loss": 3.4424, "step": 2730 }, { "epoch": 0.15898343439032173, "grad_norm": 0.17961904406547546, "learning_rate": 0.0009679177495478966, "loss": 3.4457, "step": 2740 }, { "epoch": 0.15956366590269516, "grad_norm": 0.16124430298805237, "learning_rate": 0.0009675787457811583, "loss": 3.4388, "step": 2750 }, { "epoch": 0.16014389741506863, "grad_norm": 0.24593627452850342, "learning_rate": 0.0009672380203404957, "loss": 3.4491, "step": 2760 }, { "epoch": 0.16072412892744206, "grad_norm": 0.22059789299964905, "learning_rate": 0.0009668955744804957, "loss": 3.4452, "step": 2770 }, { "epoch": 0.1613043604398155, "grad_norm": 0.17355699837207794, "learning_rate": 0.0009665514094620798, "loss": 3.4334, "step": 2780 }, { "epoch": 0.16188459195218893, "grad_norm": 0.18848399817943573, "learning_rate": 0.0009662055265524996, "loss": 3.4445, "step": 2790 }, { "epoch": 0.16246482346456237, "grad_norm": 0.16156227886676788, "learning_rate": 0.0009658579270253321, "loss": 3.432, "step": 2800 }, { "epoch": 0.1630450549769358, "grad_norm": 0.17677579820156097, "learning_rate": 0.0009655086121604754, "loss": 3.4387, "step": 2810 }, { "epoch": 0.16362528648930924, "grad_norm": 0.1860220730304718, "learning_rate": 0.0009651575832441435, "loss": 3.4352, "step": 2820 }, { "epoch": 0.16420551800168268, "grad_norm": 0.1875036209821701, "learning_rate": 0.0009648048415688612, "loss": 3.4361, "step": 2830 }, { "epoch": 0.1647857495140561, "grad_norm": 0.18773604929447174, "learning_rate": 0.0009644503884334608, "loss": 3.4293, "step": 2840 }, { "epoch": 0.16536598102642955, "grad_norm": 0.17164553701877594, "learning_rate": 0.0009640942251430755, "loss": 3.4329, "step": 2850 }, { "epoch": 0.16594621253880298, "grad_norm": 0.22501535713672638, "learning_rate": 0.0009637363530091361, "loss": 3.4354, "step": 2860 }, { "epoch": 0.16652644405117642, "grad_norm": 0.16013330221176147, "learning_rate": 0.0009633767733493651, "loss": 3.4266, "step": 2870 }, { "epoch": 0.16710667556354986, "grad_norm": 0.17605000734329224, "learning_rate": 0.0009630154874877726, "loss": 3.4202, "step": 2880 }, { "epoch": 0.1676869070759233, "grad_norm": 0.19912280142307281, "learning_rate": 0.0009626524967546508, "loss": 3.4251, "step": 2890 }, { "epoch": 0.16826713858829673, "grad_norm": 0.19407010078430176, "learning_rate": 0.00096228780248657, "loss": 3.4242, "step": 2900 }, { "epoch": 0.16884737010067016, "grad_norm": 0.19874045252799988, "learning_rate": 0.0009619214060263723, "loss": 3.4326, "step": 2910 }, { "epoch": 0.1694276016130436, "grad_norm": 0.1755009889602661, "learning_rate": 0.000961553308723168, "loss": 3.4201, "step": 2920 }, { "epoch": 0.17000783312541703, "grad_norm": 0.16836123168468475, "learning_rate": 0.00096118351193233, "loss": 3.4244, "step": 2930 }, { "epoch": 0.17058806463779047, "grad_norm": 0.17263484001159668, "learning_rate": 0.0009608120170154886, "loss": 3.4245, "step": 2940 }, { "epoch": 0.1711682961501639, "grad_norm": 0.16616012156009674, "learning_rate": 0.0009604388253405272, "loss": 3.4149, "step": 2950 }, { "epoch": 0.17174852766253734, "grad_norm": 0.18633881211280823, "learning_rate": 0.0009600639382815768, "loss": 3.4247, "step": 2960 }, { "epoch": 0.17232875917491078, "grad_norm": 0.1888113021850586, "learning_rate": 0.0009596873572190104, "loss": 3.4185, "step": 2970 }, { "epoch": 0.17290899068728421, "grad_norm": 0.1729818731546402, "learning_rate": 0.0009593090835394392, "loss": 3.4188, "step": 2980 }, { "epoch": 0.17348922219965765, "grad_norm": 0.19416862726211548, "learning_rate": 0.0009589291186357066, "loss": 3.417, "step": 2990 }, { "epoch": 0.1740694537120311, "grad_norm": 0.16473758220672607, "learning_rate": 0.0009585474639068829, "loss": 3.4279, "step": 3000 }, { "epoch": 0.1740694537120311, "eval_loss": 3.3559625148773193, "eval_runtime": 3.2502, "eval_samples_per_second": 1332.237, "eval_steps_per_second": 10.461, "step": 3000 }, { "epoch": 0.17464968522440455, "grad_norm": 0.148405060172081, "learning_rate": 0.0009581641207582609, "loss": 3.4132, "step": 3010 }, { "epoch": 0.17522991673677799, "grad_norm": 0.17240671813488007, "learning_rate": 0.0009577790906013503, "loss": 3.4145, "step": 3020 }, { "epoch": 0.17581014824915142, "grad_norm": 0.1781778484582901, "learning_rate": 0.0009573923748538724, "loss": 3.4146, "step": 3030 }, { "epoch": 0.17639037976152486, "grad_norm": 0.1692868173122406, "learning_rate": 0.0009570039749397552, "loss": 3.4154, "step": 3040 }, { "epoch": 0.1769706112738983, "grad_norm": 0.18871796131134033, "learning_rate": 0.0009566138922891277, "loss": 3.4233, "step": 3050 }, { "epoch": 0.17755084278627173, "grad_norm": 0.16771915555000305, "learning_rate": 0.0009562221283383152, "loss": 3.4144, "step": 3060 }, { "epoch": 0.17813107429864516, "grad_norm": 0.17178234457969666, "learning_rate": 0.0009558286845298337, "loss": 3.4066, "step": 3070 }, { "epoch": 0.1787113058110186, "grad_norm": 0.17993003129959106, "learning_rate": 0.0009554335623123845, "loss": 3.4125, "step": 3080 }, { "epoch": 0.17929153732339204, "grad_norm": 0.1944742351770401, "learning_rate": 0.0009550367631408485, "loss": 3.4095, "step": 3090 }, { "epoch": 0.17987176883576547, "grad_norm": 0.21978144347667694, "learning_rate": 0.0009546382884762825, "loss": 3.4204, "step": 3100 }, { "epoch": 0.1804520003481389, "grad_norm": 0.19678272306919098, "learning_rate": 0.0009542381397859116, "loss": 3.3991, "step": 3110 }, { "epoch": 0.18103223186051234, "grad_norm": 0.16551128029823303, "learning_rate": 0.0009538363185431254, "loss": 3.4055, "step": 3120 }, { "epoch": 0.18161246337288578, "grad_norm": 0.18304401636123657, "learning_rate": 0.0009534328262274717, "loss": 3.4038, "step": 3130 }, { "epoch": 0.18219269488525922, "grad_norm": 0.1903512328863144, "learning_rate": 0.0009530276643246512, "loss": 3.4081, "step": 3140 }, { "epoch": 0.18277292639763265, "grad_norm": 0.19788897037506104, "learning_rate": 0.0009526208343265129, "loss": 3.3991, "step": 3150 }, { "epoch": 0.1833531579100061, "grad_norm": 0.17483383417129517, "learning_rate": 0.0009522123377310474, "loss": 3.4105, "step": 3160 }, { "epoch": 0.18393338942237952, "grad_norm": 0.18417778611183167, "learning_rate": 0.0009518021760423816, "loss": 3.3973, "step": 3170 }, { "epoch": 0.18451362093475296, "grad_norm": 0.17036600410938263, "learning_rate": 0.0009513903507707743, "loss": 3.403, "step": 3180 }, { "epoch": 0.1850938524471264, "grad_norm": 0.17953407764434814, "learning_rate": 0.0009509768634326089, "loss": 3.401, "step": 3190 }, { "epoch": 0.18567408395949983, "grad_norm": 0.18770861625671387, "learning_rate": 0.0009505617155503894, "loss": 3.4006, "step": 3200 }, { "epoch": 0.18625431547187327, "grad_norm": 0.20032437145709991, "learning_rate": 0.0009501449086527336, "loss": 3.4012, "step": 3210 }, { "epoch": 0.1868345469842467, "grad_norm": 0.19611109793186188, "learning_rate": 0.0009497264442743681, "loss": 3.3974, "step": 3220 }, { "epoch": 0.18741477849662014, "grad_norm": 0.17835482954978943, "learning_rate": 0.0009493063239561227, "loss": 3.3966, "step": 3230 }, { "epoch": 0.1879950100089936, "grad_norm": 0.17207197844982147, "learning_rate": 0.0009488845492449245, "loss": 3.3957, "step": 3240 }, { "epoch": 0.18857524152136704, "grad_norm": 0.15979701280593872, "learning_rate": 0.0009484611216937919, "loss": 3.3969, "step": 3250 }, { "epoch": 0.18915547303374047, "grad_norm": 0.19770529866218567, "learning_rate": 0.0009480360428618298, "loss": 3.3972, "step": 3260 }, { "epoch": 0.1897357045461139, "grad_norm": 0.161921888589859, "learning_rate": 0.0009476093143142231, "loss": 3.3782, "step": 3270 }, { "epoch": 0.19031593605848734, "grad_norm": 0.17377763986587524, "learning_rate": 0.0009471809376222304, "loss": 3.3959, "step": 3280 }, { "epoch": 0.19089616757086078, "grad_norm": 0.17865316569805145, "learning_rate": 0.00094675091436318, "loss": 3.3945, "step": 3290 }, { "epoch": 0.19147639908323422, "grad_norm": 0.17098024487495422, "learning_rate": 0.0009463192461204626, "loss": 3.3915, "step": 3300 }, { "epoch": 0.19205663059560765, "grad_norm": 0.18630030751228333, "learning_rate": 0.0009458859344835259, "loss": 3.3891, "step": 3310 }, { "epoch": 0.1926368621079811, "grad_norm": 0.17725063860416412, "learning_rate": 0.0009454509810478685, "loss": 3.3856, "step": 3320 }, { "epoch": 0.19321709362035452, "grad_norm": 0.15566089749336243, "learning_rate": 0.0009450143874150347, "loss": 3.3964, "step": 3330 }, { "epoch": 0.19379732513272796, "grad_norm": 0.16617019474506378, "learning_rate": 0.0009445761551926079, "loss": 3.3854, "step": 3340 }, { "epoch": 0.1943775566451014, "grad_norm": 0.1689499467611313, "learning_rate": 0.0009441362859942054, "loss": 3.3933, "step": 3350 }, { "epoch": 0.19495778815747483, "grad_norm": 0.16878637671470642, "learning_rate": 0.0009436947814394712, "loss": 3.3819, "step": 3360 }, { "epoch": 0.19553801966984827, "grad_norm": 0.20233598351478577, "learning_rate": 0.0009432516431540714, "loss": 3.3932, "step": 3370 }, { "epoch": 0.1961182511822217, "grad_norm": 0.14608658850193024, "learning_rate": 0.0009428068727696878, "loss": 3.3878, "step": 3380 }, { "epoch": 0.19669848269459514, "grad_norm": 0.16677936911582947, "learning_rate": 0.0009423604719240114, "loss": 3.3898, "step": 3390 }, { "epoch": 0.19727871420696858, "grad_norm": 0.17749983072280884, "learning_rate": 0.0009419124422607369, "loss": 3.3835, "step": 3400 }, { "epoch": 0.197858945719342, "grad_norm": 0.17364837229251862, "learning_rate": 0.0009414627854295566, "loss": 3.3873, "step": 3410 }, { "epoch": 0.19843917723171545, "grad_norm": 0.1968606561422348, "learning_rate": 0.0009410115030861536, "loss": 3.3834, "step": 3420 }, { "epoch": 0.19901940874408888, "grad_norm": 0.17494814097881317, "learning_rate": 0.0009405585968921974, "loss": 3.3768, "step": 3430 }, { "epoch": 0.19959964025646232, "grad_norm": 0.17461541295051575, "learning_rate": 0.0009401040685153357, "loss": 3.3673, "step": 3440 }, { "epoch": 0.20017987176883575, "grad_norm": 0.16065211594104767, "learning_rate": 0.0009396479196291896, "loss": 3.3831, "step": 3450 }, { "epoch": 0.2007601032812092, "grad_norm": 0.18772707879543304, "learning_rate": 0.000939190151913347, "loss": 3.381, "step": 3460 }, { "epoch": 0.20134033479358263, "grad_norm": 0.17115509510040283, "learning_rate": 0.000938730767053357, "loss": 3.3786, "step": 3470 }, { "epoch": 0.2019205663059561, "grad_norm": 0.16643331944942474, "learning_rate": 0.0009382697667407222, "loss": 3.381, "step": 3480 }, { "epoch": 0.20250079781832953, "grad_norm": 0.16961540281772614, "learning_rate": 0.0009378071526728944, "loss": 3.3798, "step": 3490 }, { "epoch": 0.20308102933070296, "grad_norm": 0.18497344851493835, "learning_rate": 0.000937342926553267, "loss": 3.3796, "step": 3500 }, { "epoch": 0.2036612608430764, "grad_norm": 0.17968598008155823, "learning_rate": 0.0009368770900911691, "loss": 3.3699, "step": 3510 }, { "epoch": 0.20424149235544983, "grad_norm": 0.19794964790344238, "learning_rate": 0.0009364096450018598, "loss": 3.3711, "step": 3520 }, { "epoch": 0.20482172386782327, "grad_norm": 0.16060756146907806, "learning_rate": 0.0009359405930065202, "loss": 3.3831, "step": 3530 }, { "epoch": 0.2054019553801967, "grad_norm": 0.16221892833709717, "learning_rate": 0.0009354699358322493, "loss": 3.3673, "step": 3540 }, { "epoch": 0.20598218689257014, "grad_norm": 0.1834096759557724, "learning_rate": 0.0009349976752120561, "loss": 3.3696, "step": 3550 }, { "epoch": 0.20656241840494358, "grad_norm": 0.1541026383638382, "learning_rate": 0.0009345238128848535, "loss": 3.3659, "step": 3560 }, { "epoch": 0.207142649917317, "grad_norm": 0.16199982166290283, "learning_rate": 0.0009340483505954524, "loss": 3.3728, "step": 3570 }, { "epoch": 0.20772288142969045, "grad_norm": 0.16066418588161469, "learning_rate": 0.0009335712900945547, "loss": 3.3695, "step": 3580 }, { "epoch": 0.20830311294206388, "grad_norm": 0.19340233504772186, "learning_rate": 0.0009330926331387472, "loss": 3.3751, "step": 3590 }, { "epoch": 0.20888334445443732, "grad_norm": 0.1752013862133026, "learning_rate": 0.0009326123814904949, "loss": 3.3665, "step": 3600 }, { "epoch": 0.20946357596681076, "grad_norm": 0.18730874359607697, "learning_rate": 0.0009321305369181345, "loss": 3.3656, "step": 3610 }, { "epoch": 0.2100438074791842, "grad_norm": 0.18153837323188782, "learning_rate": 0.0009316471011958685, "loss": 3.3761, "step": 3620 }, { "epoch": 0.21062403899155763, "grad_norm": 0.15640245378017426, "learning_rate": 0.0009311620761037578, "loss": 3.366, "step": 3630 }, { "epoch": 0.21120427050393106, "grad_norm": 0.14558587968349457, "learning_rate": 0.0009306754634277154, "loss": 3.3667, "step": 3640 }, { "epoch": 0.2117845020163045, "grad_norm": 0.17782875895500183, "learning_rate": 0.0009301872649595005, "loss": 3.3683, "step": 3650 }, { "epoch": 0.21236473352867793, "grad_norm": 0.19611169397830963, "learning_rate": 0.0009296974824967106, "loss": 3.3705, "step": 3660 }, { "epoch": 0.21294496504105137, "grad_norm": 0.17394675314426422, "learning_rate": 0.0009292061178427762, "loss": 3.3649, "step": 3670 }, { "epoch": 0.2135251965534248, "grad_norm": 0.1756591647863388, "learning_rate": 0.0009287131728069536, "loss": 3.3661, "step": 3680 }, { "epoch": 0.21410542806579824, "grad_norm": 0.19795431196689606, "learning_rate": 0.0009282186492043178, "loss": 3.3648, "step": 3690 }, { "epoch": 0.21468565957817168, "grad_norm": 0.19772683084011078, "learning_rate": 0.0009277225488557566, "loss": 3.3584, "step": 3700 }, { "epoch": 0.21526589109054511, "grad_norm": 0.18163970112800598, "learning_rate": 0.0009272248735879636, "loss": 3.3643, "step": 3710 }, { "epoch": 0.21584612260291858, "grad_norm": 0.1849670708179474, "learning_rate": 0.0009267256252334311, "loss": 3.3672, "step": 3720 }, { "epoch": 0.216426354115292, "grad_norm": 0.1637067347764969, "learning_rate": 0.0009262248056304439, "loss": 3.3708, "step": 3730 }, { "epoch": 0.21700658562766545, "grad_norm": 0.1776672899723053, "learning_rate": 0.0009257224166230722, "loss": 3.3561, "step": 3740 }, { "epoch": 0.21758681714003889, "grad_norm": 0.15775948762893677, "learning_rate": 0.0009252184600611651, "loss": 3.3573, "step": 3750 }, { "epoch": 0.21816704865241232, "grad_norm": 0.1565089225769043, "learning_rate": 0.0009247129378003432, "loss": 3.3654, "step": 3760 }, { "epoch": 0.21874728016478576, "grad_norm": 0.16912080347537994, "learning_rate": 0.0009242058517019926, "loss": 3.3494, "step": 3770 }, { "epoch": 0.2193275116771592, "grad_norm": 0.1838664710521698, "learning_rate": 0.0009236972036332574, "loss": 3.3694, "step": 3780 }, { "epoch": 0.21990774318953263, "grad_norm": 0.17254334688186646, "learning_rate": 0.0009231869954670331, "loss": 3.3601, "step": 3790 }, { "epoch": 0.22048797470190606, "grad_norm": 0.17754711210727692, "learning_rate": 0.0009226752290819595, "loss": 3.3586, "step": 3800 }, { "epoch": 0.2210682062142795, "grad_norm": 0.19803158938884735, "learning_rate": 0.0009221619063624143, "loss": 3.3603, "step": 3810 }, { "epoch": 0.22164843772665294, "grad_norm": 0.16345560550689697, "learning_rate": 0.0009216470291985053, "loss": 3.3511, "step": 3820 }, { "epoch": 0.22222866923902637, "grad_norm": 0.1706439107656479, "learning_rate": 0.0009211305994860641, "loss": 3.3578, "step": 3830 }, { "epoch": 0.2228089007513998, "grad_norm": 0.18484774231910706, "learning_rate": 0.0009206126191266393, "loss": 3.3567, "step": 3840 }, { "epoch": 0.22338913226377324, "grad_norm": 0.18500268459320068, "learning_rate": 0.0009200930900274884, "loss": 3.359, "step": 3850 }, { "epoch": 0.22396936377614668, "grad_norm": 0.18986065685749054, "learning_rate": 0.0009195720141015725, "loss": 3.3497, "step": 3860 }, { "epoch": 0.22454959528852012, "grad_norm": 0.17551766335964203, "learning_rate": 0.0009190493932675473, "loss": 3.3474, "step": 3870 }, { "epoch": 0.22512982680089355, "grad_norm": 0.19979430735111237, "learning_rate": 0.0009185252294497577, "loss": 3.3474, "step": 3880 }, { "epoch": 0.225710058313267, "grad_norm": 0.1754840612411499, "learning_rate": 0.0009179995245782297, "loss": 3.3426, "step": 3890 }, { "epoch": 0.22629028982564042, "grad_norm": 0.16811451315879822, "learning_rate": 0.0009174722805886638, "loss": 3.3523, "step": 3900 }, { "epoch": 0.22687052133801386, "grad_norm": 0.208675354719162, "learning_rate": 0.0009169434994224274, "loss": 3.3479, "step": 3910 }, { "epoch": 0.2274507528503873, "grad_norm": 0.1587597280740738, "learning_rate": 0.0009164131830265483, "loss": 3.3451, "step": 3920 }, { "epoch": 0.22803098436276073, "grad_norm": 0.1999424546957016, "learning_rate": 0.0009158813333537071, "loss": 3.3447, "step": 3930 }, { "epoch": 0.22861121587513417, "grad_norm": 0.1611049622297287, "learning_rate": 0.0009153479523622298, "loss": 3.3534, "step": 3940 }, { "epoch": 0.2291914473875076, "grad_norm": 0.18397875130176544, "learning_rate": 0.0009148130420160813, "loss": 3.346, "step": 3950 }, { "epoch": 0.22977167889988107, "grad_norm": 0.17322111129760742, "learning_rate": 0.0009142766042848574, "loss": 3.3534, "step": 3960 }, { "epoch": 0.2303519104122545, "grad_norm": 0.1563626080751419, "learning_rate": 0.000913738641143778, "loss": 3.3498, "step": 3970 }, { "epoch": 0.23093214192462794, "grad_norm": 0.18060965836048126, "learning_rate": 0.0009131991545736798, "loss": 3.3402, "step": 3980 }, { "epoch": 0.23151237343700137, "grad_norm": 0.18193970620632172, "learning_rate": 0.0009126581465610089, "loss": 3.3477, "step": 3990 }, { "epoch": 0.2320926049493748, "grad_norm": 0.17089489102363586, "learning_rate": 0.0009121156190978134, "loss": 3.3471, "step": 4000 }, { "epoch": 0.2320926049493748, "eval_loss": 3.2811107635498047, "eval_runtime": 3.2679, "eval_samples_per_second": 1325.027, "eval_steps_per_second": 10.404, "step": 4000 }, { "epoch": 0.23267283646174824, "grad_norm": 0.1482742726802826, "learning_rate": 0.0009115715741817364, "loss": 3.3448, "step": 4010 }, { "epoch": 0.23325306797412168, "grad_norm": 0.1840105950832367, "learning_rate": 0.000911026013816008, "loss": 3.3528, "step": 4020 }, { "epoch": 0.23383329948649512, "grad_norm": 0.159522145986557, "learning_rate": 0.0009104789400094387, "loss": 3.3452, "step": 4030 }, { "epoch": 0.23441353099886855, "grad_norm": 0.16177432239055634, "learning_rate": 0.0009099303547764118, "loss": 3.3407, "step": 4040 }, { "epoch": 0.234993762511242, "grad_norm": 0.1564028412103653, "learning_rate": 0.0009093802601368755, "loss": 3.3393, "step": 4050 }, { "epoch": 0.23557399402361542, "grad_norm": 0.18961066007614136, "learning_rate": 0.0009088286581163357, "loss": 3.3461, "step": 4060 }, { "epoch": 0.23615422553598886, "grad_norm": 0.15274052321910858, "learning_rate": 0.0009082755507458492, "loss": 3.339, "step": 4070 }, { "epoch": 0.2367344570483623, "grad_norm": 0.18588609993457794, "learning_rate": 0.0009077209400620148, "loss": 3.3366, "step": 4080 }, { "epoch": 0.23731468856073573, "grad_norm": 0.17476515471935272, "learning_rate": 0.0009071648281069673, "loss": 3.3353, "step": 4090 }, { "epoch": 0.23789492007310917, "grad_norm": 0.17199723422527313, "learning_rate": 0.0009066072169283695, "loss": 3.3329, "step": 4100 }, { "epoch": 0.2384751515854826, "grad_norm": 0.19376391172409058, "learning_rate": 0.0009060481085794037, "loss": 3.3347, "step": 4110 }, { "epoch": 0.23905538309785604, "grad_norm": 0.15385298430919647, "learning_rate": 0.0009054875051187657, "loss": 3.3387, "step": 4120 }, { "epoch": 0.23963561461022947, "grad_norm": 0.15670862793922424, "learning_rate": 0.000904925408610656, "loss": 3.3386, "step": 4130 }, { "epoch": 0.2402158461226029, "grad_norm": 0.16597050428390503, "learning_rate": 0.0009043618211247731, "loss": 3.3409, "step": 4140 }, { "epoch": 0.24079607763497635, "grad_norm": 0.15981680154800415, "learning_rate": 0.0009037967447363049, "loss": 3.338, "step": 4150 }, { "epoch": 0.24137630914734978, "grad_norm": 0.18593856692314148, "learning_rate": 0.0009032301815259221, "loss": 3.3384, "step": 4160 }, { "epoch": 0.24195654065972322, "grad_norm": 0.16867688298225403, "learning_rate": 0.0009026621335797696, "loss": 3.3342, "step": 4170 }, { "epoch": 0.24253677217209665, "grad_norm": 0.16456478834152222, "learning_rate": 0.0009020926029894594, "loss": 3.3346, "step": 4180 }, { "epoch": 0.2431170036844701, "grad_norm": 0.1605817675590515, "learning_rate": 0.0009015215918520629, "loss": 3.3316, "step": 4190 }, { "epoch": 0.24369723519684355, "grad_norm": 0.18208837509155273, "learning_rate": 0.0009009491022701028, "loss": 3.3329, "step": 4200 }, { "epoch": 0.244277466709217, "grad_norm": 0.1611657440662384, "learning_rate": 0.000900375136351546, "loss": 3.3293, "step": 4210 }, { "epoch": 0.24485769822159043, "grad_norm": 0.17636191844940186, "learning_rate": 0.0008997996962097947, "loss": 3.3449, "step": 4220 }, { "epoch": 0.24543792973396386, "grad_norm": 0.19152578711509705, "learning_rate": 0.0008992227839636804, "loss": 3.3272, "step": 4230 }, { "epoch": 0.2460181612463373, "grad_norm": 0.16381146013736725, "learning_rate": 0.0008986444017374538, "loss": 3.3223, "step": 4240 }, { "epoch": 0.24659839275871073, "grad_norm": 0.16677068173885345, "learning_rate": 0.0008980645516607793, "loss": 3.3294, "step": 4250 }, { "epoch": 0.24717862427108417, "grad_norm": 0.16276830434799194, "learning_rate": 0.0008974832358687253, "loss": 3.3337, "step": 4260 }, { "epoch": 0.2477588557834576, "grad_norm": 0.17291460931301117, "learning_rate": 0.0008969004565017577, "loss": 3.3255, "step": 4270 }, { "epoch": 0.24833908729583104, "grad_norm": 0.17407308518886566, "learning_rate": 0.0008963162157057309, "loss": 3.3329, "step": 4280 }, { "epoch": 0.24891931880820448, "grad_norm": 0.15487220883369446, "learning_rate": 0.0008957305156318811, "loss": 3.3245, "step": 4290 }, { "epoch": 0.2494995503205779, "grad_norm": 0.14908728003501892, "learning_rate": 0.000895143358436817, "loss": 3.3281, "step": 4300 }, { "epoch": 0.2500797818329513, "grad_norm": 0.20198597013950348, "learning_rate": 0.000894554746282513, "loss": 3.325, "step": 4310 }, { "epoch": 0.2506600133453248, "grad_norm": 0.19431117177009583, "learning_rate": 0.0008939646813363007, "loss": 3.322, "step": 4320 }, { "epoch": 0.25124024485769825, "grad_norm": 0.1598384529352188, "learning_rate": 0.000893373165770861, "loss": 3.3353, "step": 4330 }, { "epoch": 0.25182047637007166, "grad_norm": 0.1565127670764923, "learning_rate": 0.0008927802017642164, "loss": 3.3201, "step": 4340 }, { "epoch": 0.2524007078824451, "grad_norm": 0.1654927283525467, "learning_rate": 0.0008921857914997222, "loss": 3.3326, "step": 4350 }, { "epoch": 0.2529809393948185, "grad_norm": 0.15852831304073334, "learning_rate": 0.0008915899371660595, "loss": 3.328, "step": 4360 }, { "epoch": 0.253561170907192, "grad_norm": 0.16426099836826324, "learning_rate": 0.0008909926409572263, "loss": 3.3326, "step": 4370 }, { "epoch": 0.2541414024195654, "grad_norm": 0.16855676472187042, "learning_rate": 0.0008903939050725297, "loss": 3.3289, "step": 4380 }, { "epoch": 0.25472163393193886, "grad_norm": 0.1595139056444168, "learning_rate": 0.0008897937317165781, "loss": 3.3324, "step": 4390 }, { "epoch": 0.25530186544431227, "grad_norm": 0.15312696993350983, "learning_rate": 0.0008891921230992725, "loss": 3.3294, "step": 4400 }, { "epoch": 0.25588209695668573, "grad_norm": 0.16672903299331665, "learning_rate": 0.000888589081435799, "loss": 3.3217, "step": 4410 }, { "epoch": 0.25646232846905914, "grad_norm": 0.1601061373949051, "learning_rate": 0.0008879846089466202, "loss": 3.3153, "step": 4420 }, { "epoch": 0.2570425599814326, "grad_norm": 0.16103419661521912, "learning_rate": 0.0008873787078574671, "loss": 3.3176, "step": 4430 }, { "epoch": 0.257622791493806, "grad_norm": 0.15978160500526428, "learning_rate": 0.0008867713803993309, "loss": 3.3316, "step": 4440 }, { "epoch": 0.2582030230061795, "grad_norm": 0.15198193490505219, "learning_rate": 0.0008861626288084549, "loss": 3.3205, "step": 4450 }, { "epoch": 0.2587832545185529, "grad_norm": 0.17815245687961578, "learning_rate": 0.0008855524553263263, "loss": 3.3159, "step": 4460 }, { "epoch": 0.25936348603092635, "grad_norm": 0.17297674715518951, "learning_rate": 0.0008849408621996679, "loss": 3.3131, "step": 4470 }, { "epoch": 0.25994371754329976, "grad_norm": 0.17088572680950165, "learning_rate": 0.0008843278516804294, "loss": 3.3178, "step": 4480 }, { "epoch": 0.2605239490556732, "grad_norm": 0.14876043796539307, "learning_rate": 0.00088371342602578, "loss": 3.3113, "step": 4490 }, { "epoch": 0.26110418056804663, "grad_norm": 0.16213169693946838, "learning_rate": 0.0008830975874980991, "loss": 3.3168, "step": 4500 }, { "epoch": 0.2616844120804201, "grad_norm": 0.1761493980884552, "learning_rate": 0.0008824803383649688, "loss": 3.319, "step": 4510 }, { "epoch": 0.2622646435927935, "grad_norm": 0.15832360088825226, "learning_rate": 0.0008818616808991651, "loss": 3.3202, "step": 4520 }, { "epoch": 0.26284487510516696, "grad_norm": 0.1700046807527542, "learning_rate": 0.0008812416173786495, "loss": 3.3119, "step": 4530 }, { "epoch": 0.26342510661754037, "grad_norm": 0.17616261541843414, "learning_rate": 0.0008806201500865609, "loss": 3.3133, "step": 4540 }, { "epoch": 0.26400533812991384, "grad_norm": 0.17251409590244293, "learning_rate": 0.0008799972813112072, "loss": 3.3148, "step": 4550 }, { "epoch": 0.2645855696422873, "grad_norm": 0.16835258901119232, "learning_rate": 0.0008793730133460561, "loss": 3.3188, "step": 4560 }, { "epoch": 0.2651658011546607, "grad_norm": 0.16114716231822968, "learning_rate": 0.0008787473484897276, "loss": 3.3227, "step": 4570 }, { "epoch": 0.26574603266703417, "grad_norm": 0.16803395748138428, "learning_rate": 0.0008781202890459856, "loss": 3.322, "step": 4580 }, { "epoch": 0.2663262641794076, "grad_norm": 0.1682870239019394, "learning_rate": 0.0008774918373237284, "loss": 3.3142, "step": 4590 }, { "epoch": 0.26690649569178104, "grad_norm": 0.15376168489456177, "learning_rate": 0.0008768619956369813, "loss": 3.3131, "step": 4600 }, { "epoch": 0.26748672720415445, "grad_norm": 0.14724121987819672, "learning_rate": 0.0008762307663048871, "loss": 3.3105, "step": 4610 }, { "epoch": 0.2680669587165279, "grad_norm": 0.1947721391916275, "learning_rate": 0.0008755981516516987, "loss": 3.3177, "step": 4620 }, { "epoch": 0.2686471902289013, "grad_norm": 0.15725037455558777, "learning_rate": 0.0008749641540067691, "loss": 3.308, "step": 4630 }, { "epoch": 0.2692274217412748, "grad_norm": 0.1616797149181366, "learning_rate": 0.0008743287757045443, "loss": 3.3158, "step": 4640 }, { "epoch": 0.2698076532536482, "grad_norm": 0.16861191391944885, "learning_rate": 0.0008736920190845536, "loss": 3.3113, "step": 4650 }, { "epoch": 0.27038788476602166, "grad_norm": 0.16433420777320862, "learning_rate": 0.0008730538864914019, "loss": 3.3168, "step": 4660 }, { "epoch": 0.27096811627839507, "grad_norm": 0.1651991754770279, "learning_rate": 0.00087241438027476, "loss": 3.3016, "step": 4670 }, { "epoch": 0.27154834779076853, "grad_norm": 0.18741054832935333, "learning_rate": 0.0008717735027893568, "loss": 3.3121, "step": 4680 }, { "epoch": 0.27212857930314194, "grad_norm": 0.15020763874053955, "learning_rate": 0.0008711312563949703, "loss": 3.309, "step": 4690 }, { "epoch": 0.2727088108155154, "grad_norm": 0.17015768587589264, "learning_rate": 0.000870487643456419, "loss": 3.3225, "step": 4700 }, { "epoch": 0.2732890423278888, "grad_norm": 0.19843073189258575, "learning_rate": 0.0008698426663435533, "loss": 3.3058, "step": 4710 }, { "epoch": 0.2738692738402623, "grad_norm": 0.17581596970558167, "learning_rate": 0.0008691963274312464, "loss": 3.3086, "step": 4720 }, { "epoch": 0.2744495053526357, "grad_norm": 0.17392052710056305, "learning_rate": 0.000868548629099386, "loss": 3.311, "step": 4730 }, { "epoch": 0.27502973686500914, "grad_norm": 0.16661454737186432, "learning_rate": 0.0008678995737328651, "loss": 3.3108, "step": 4740 }, { "epoch": 0.27560996837738255, "grad_norm": 0.1836322695016861, "learning_rate": 0.0008672491637215735, "loss": 3.3042, "step": 4750 }, { "epoch": 0.276190199889756, "grad_norm": 0.14587625861167908, "learning_rate": 0.0008665974014603891, "loss": 3.3202, "step": 4760 }, { "epoch": 0.2767704314021294, "grad_norm": 0.1731816828250885, "learning_rate": 0.0008659442893491689, "loss": 3.295, "step": 4770 }, { "epoch": 0.2773506629145029, "grad_norm": 0.1422199159860611, "learning_rate": 0.0008652898297927398, "loss": 3.3102, "step": 4780 }, { "epoch": 0.2779308944268763, "grad_norm": 0.1520106941461563, "learning_rate": 0.0008646340252008908, "loss": 3.3186, "step": 4790 }, { "epoch": 0.27851112593924976, "grad_norm": 0.16036173701286316, "learning_rate": 0.000863976877988363, "loss": 3.3145, "step": 4800 }, { "epoch": 0.2790913574516232, "grad_norm": 0.15433992445468903, "learning_rate": 0.0008633183905748411, "loss": 3.3073, "step": 4810 }, { "epoch": 0.27967158896399663, "grad_norm": 0.15274393558502197, "learning_rate": 0.0008626585653849449, "loss": 3.3005, "step": 4820 }, { "epoch": 0.2802518204763701, "grad_norm": 0.1466848999261856, "learning_rate": 0.0008619974048482198, "loss": 3.3096, "step": 4830 }, { "epoch": 0.2808320519887435, "grad_norm": 0.17398701608181, "learning_rate": 0.0008613349113991283, "loss": 3.2977, "step": 4840 }, { "epoch": 0.28141228350111697, "grad_norm": 0.1633315235376358, "learning_rate": 0.0008606710874770405, "loss": 3.3048, "step": 4850 }, { "epoch": 0.2819925150134904, "grad_norm": 0.15874172747135162, "learning_rate": 0.0008600059355262259, "loss": 3.3, "step": 4860 }, { "epoch": 0.28257274652586384, "grad_norm": 0.16768650710582733, "learning_rate": 0.0008593394579958433, "loss": 3.2971, "step": 4870 }, { "epoch": 0.28315297803823725, "grad_norm": 0.17810046672821045, "learning_rate": 0.0008586716573399329, "loss": 3.3043, "step": 4880 }, { "epoch": 0.2837332095506107, "grad_norm": 0.16520732641220093, "learning_rate": 0.0008580025360174069, "loss": 3.3097, "step": 4890 }, { "epoch": 0.2843134410629841, "grad_norm": 0.15736092627048492, "learning_rate": 0.0008573320964920397, "loss": 3.2936, "step": 4900 }, { "epoch": 0.2848936725753576, "grad_norm": 0.16211272776126862, "learning_rate": 0.0008566603412324602, "loss": 3.3037, "step": 4910 }, { "epoch": 0.285473904087731, "grad_norm": 0.1499512791633606, "learning_rate": 0.0008559872727121416, "loss": 3.2995, "step": 4920 }, { "epoch": 0.28605413560010445, "grad_norm": 0.17990955710411072, "learning_rate": 0.0008553128934093926, "loss": 3.3008, "step": 4930 }, { "epoch": 0.28663436711247786, "grad_norm": 0.16948296129703522, "learning_rate": 0.0008546372058073484, "loss": 3.2988, "step": 4940 }, { "epoch": 0.2872145986248513, "grad_norm": 0.16478213667869568, "learning_rate": 0.0008539602123939616, "loss": 3.2981, "step": 4950 }, { "epoch": 0.28779483013722473, "grad_norm": 0.17590472102165222, "learning_rate": 0.0008532819156619928, "loss": 3.2979, "step": 4960 }, { "epoch": 0.2883750616495982, "grad_norm": 0.16984987258911133, "learning_rate": 0.0008526023181090019, "loss": 3.3093, "step": 4970 }, { "epoch": 0.2889552931619716, "grad_norm": 0.18633520603179932, "learning_rate": 0.0008519214222373379, "loss": 3.3027, "step": 4980 }, { "epoch": 0.28953552467434507, "grad_norm": 0.1713830530643463, "learning_rate": 0.000851239230554131, "loss": 3.3019, "step": 4990 }, { "epoch": 0.2901157561867185, "grad_norm": 0.16138288378715515, "learning_rate": 0.0008505557455712825, "loss": 3.2957, "step": 5000 }, { "epoch": 0.2901157561867185, "eval_loss": 3.232105016708374, "eval_runtime": 3.247, "eval_samples_per_second": 1333.559, "eval_steps_per_second": 10.471, "step": 5000 }, { "epoch": 0.29069598769909194, "grad_norm": 0.14849427342414856, "learning_rate": 0.0008498709698054553, "loss": 3.297, "step": 5010 }, { "epoch": 0.29127621921146535, "grad_norm": 0.15944212675094604, "learning_rate": 0.0008491849057780658, "loss": 3.2875, "step": 5020 }, { "epoch": 0.2918564507238388, "grad_norm": 0.14453737437725067, "learning_rate": 0.0008484975560152737, "loss": 3.2919, "step": 5030 }, { "epoch": 0.2924366822362123, "grad_norm": 0.18005253374576569, "learning_rate": 0.0008478089230479726, "loss": 3.2981, "step": 5040 }, { "epoch": 0.2930169137485857, "grad_norm": 0.16119055449962616, "learning_rate": 0.0008471190094117814, "loss": 3.2942, "step": 5050 }, { "epoch": 0.29359714526095915, "grad_norm": 0.15280525386333466, "learning_rate": 0.0008464278176470342, "loss": 3.2958, "step": 5060 }, { "epoch": 0.29417737677333256, "grad_norm": 0.15936006605625153, "learning_rate": 0.0008457353502987718, "loss": 3.294, "step": 5070 }, { "epoch": 0.294757608285706, "grad_norm": 0.15230213105678558, "learning_rate": 0.0008450416099167313, "loss": 3.3008, "step": 5080 }, { "epoch": 0.2953378397980794, "grad_norm": 0.15988774597644806, "learning_rate": 0.0008443465990553374, "loss": 3.2902, "step": 5090 }, { "epoch": 0.2959180713104529, "grad_norm": 0.17253676056861877, "learning_rate": 0.0008436503202736928, "loss": 3.2986, "step": 5100 }, { "epoch": 0.2964983028228263, "grad_norm": 0.15170855820178986, "learning_rate": 0.0008429527761355693, "loss": 3.2877, "step": 5110 }, { "epoch": 0.29707853433519976, "grad_norm": 0.16487392783164978, "learning_rate": 0.0008422539692093974, "loss": 3.2846, "step": 5120 }, { "epoch": 0.29765876584757317, "grad_norm": 0.14367428421974182, "learning_rate": 0.000841553902068257, "loss": 3.2906, "step": 5130 }, { "epoch": 0.29823899735994663, "grad_norm": 0.17368683218955994, "learning_rate": 0.0008408525772898692, "loss": 3.3027, "step": 5140 }, { "epoch": 0.29881922887232004, "grad_norm": 0.1599467247724533, "learning_rate": 0.000840149997456585, "loss": 3.2852, "step": 5150 }, { "epoch": 0.2993994603846935, "grad_norm": 0.15667842328548431, "learning_rate": 0.0008394461651553768, "loss": 3.2898, "step": 5160 }, { "epoch": 0.2999796918970669, "grad_norm": 0.15665532648563385, "learning_rate": 0.000838741082977829, "loss": 3.2994, "step": 5170 }, { "epoch": 0.3005599234094404, "grad_norm": 0.17890366911888123, "learning_rate": 0.0008380347535201283, "loss": 3.2879, "step": 5180 }, { "epoch": 0.3011401549218138, "grad_norm": 0.15844030678272247, "learning_rate": 0.0008373271793830536, "loss": 3.2948, "step": 5190 }, { "epoch": 0.30172038643418725, "grad_norm": 0.14670881628990173, "learning_rate": 0.0008366183631719668, "loss": 3.2901, "step": 5200 }, { "epoch": 0.30230061794656066, "grad_norm": 0.15351887047290802, "learning_rate": 0.0008359083074968039, "loss": 3.2899, "step": 5210 }, { "epoch": 0.3028808494589341, "grad_norm": 0.14965134859085083, "learning_rate": 0.0008351970149720636, "loss": 3.2885, "step": 5220 }, { "epoch": 0.30346108097130753, "grad_norm": 0.16267924010753632, "learning_rate": 0.0008344844882167999, "loss": 3.2937, "step": 5230 }, { "epoch": 0.304041312483681, "grad_norm": 0.14958548545837402, "learning_rate": 0.0008337707298546112, "loss": 3.2887, "step": 5240 }, { "epoch": 0.3046215439960544, "grad_norm": 0.1530071198940277, "learning_rate": 0.0008330557425136299, "loss": 3.2865, "step": 5250 }, { "epoch": 0.30520177550842786, "grad_norm": 0.17143143713474274, "learning_rate": 0.0008323395288265149, "loss": 3.2861, "step": 5260 }, { "epoch": 0.3057820070208013, "grad_norm": 0.15875962376594543, "learning_rate": 0.0008316220914304398, "loss": 3.2919, "step": 5270 }, { "epoch": 0.30636223853317474, "grad_norm": 0.14534059166908264, "learning_rate": 0.0008309034329670841, "loss": 3.2813, "step": 5280 }, { "epoch": 0.3069424700455482, "grad_norm": 0.1608089655637741, "learning_rate": 0.0008301835560826236, "loss": 3.2866, "step": 5290 }, { "epoch": 0.3075227015579216, "grad_norm": 0.15711037814617157, "learning_rate": 0.0008294624634277208, "loss": 3.2924, "step": 5300 }, { "epoch": 0.30810293307029507, "grad_norm": 0.1359197199344635, "learning_rate": 0.0008287401576575139, "loss": 3.2906, "step": 5310 }, { "epoch": 0.3086831645826685, "grad_norm": 0.1708402931690216, "learning_rate": 0.0008280166414316086, "loss": 3.2919, "step": 5320 }, { "epoch": 0.30926339609504194, "grad_norm": 0.16216853260993958, "learning_rate": 0.0008272919174140674, "loss": 3.278, "step": 5330 }, { "epoch": 0.30984362760741535, "grad_norm": 0.16680875420570374, "learning_rate": 0.0008265659882734002, "loss": 3.2745, "step": 5340 }, { "epoch": 0.3104238591197888, "grad_norm": 0.1761893630027771, "learning_rate": 0.0008258388566825539, "loss": 3.2768, "step": 5350 }, { "epoch": 0.3110040906321622, "grad_norm": 0.17635513842105865, "learning_rate": 0.0008251105253189034, "loss": 3.2908, "step": 5360 }, { "epoch": 0.3115843221445357, "grad_norm": 0.1403694897890091, "learning_rate": 0.0008243809968642411, "loss": 3.2896, "step": 5370 }, { "epoch": 0.3121645536569091, "grad_norm": 0.15853242576122284, "learning_rate": 0.0008236502740047669, "loss": 3.2876, "step": 5380 }, { "epoch": 0.31274478516928256, "grad_norm": 0.14447931945323944, "learning_rate": 0.0008229183594310791, "loss": 3.2749, "step": 5390 }, { "epoch": 0.31332501668165597, "grad_norm": 0.14274190366268158, "learning_rate": 0.0008221852558381639, "loss": 3.2826, "step": 5400 }, { "epoch": 0.31390524819402943, "grad_norm": 0.15020518004894257, "learning_rate": 0.0008214509659253855, "loss": 3.2768, "step": 5410 }, { "epoch": 0.31448547970640284, "grad_norm": 0.16364452242851257, "learning_rate": 0.0008207154923964761, "loss": 3.2796, "step": 5420 }, { "epoch": 0.3150657112187763, "grad_norm": 0.15643912553787231, "learning_rate": 0.0008199788379595266, "loss": 3.2897, "step": 5430 }, { "epoch": 0.3156459427311497, "grad_norm": 0.14374196529388428, "learning_rate": 0.0008192410053269757, "loss": 3.2829, "step": 5440 }, { "epoch": 0.3162261742435232, "grad_norm": 0.1532783806324005, "learning_rate": 0.0008185019972156003, "loss": 3.2775, "step": 5450 }, { "epoch": 0.3168064057558966, "grad_norm": 0.15971648693084717, "learning_rate": 0.0008177618163465054, "loss": 3.2815, "step": 5460 }, { "epoch": 0.31738663726827004, "grad_norm": 0.18425996601581573, "learning_rate": 0.0008170204654451154, "loss": 3.2777, "step": 5470 }, { "epoch": 0.31796686878064345, "grad_norm": 0.13549089431762695, "learning_rate": 0.0008162779472411612, "loss": 3.2782, "step": 5480 }, { "epoch": 0.3185471002930169, "grad_norm": 0.14235620200634003, "learning_rate": 0.0008155342644686729, "loss": 3.2755, "step": 5490 }, { "epoch": 0.3191273318053903, "grad_norm": 0.1648331880569458, "learning_rate": 0.0008147894198659683, "loss": 3.2767, "step": 5500 }, { "epoch": 0.3197075633177638, "grad_norm": 0.15751086175441742, "learning_rate": 0.0008140434161756433, "loss": 3.2789, "step": 5510 }, { "epoch": 0.32028779483013725, "grad_norm": 0.16342034935951233, "learning_rate": 0.0008132962561445616, "loss": 3.2693, "step": 5520 }, { "epoch": 0.32086802634251066, "grad_norm": 0.1530640870332718, "learning_rate": 0.0008125479425238447, "loss": 3.2773, "step": 5530 }, { "epoch": 0.3214482578548841, "grad_norm": 0.1614234745502472, "learning_rate": 0.0008117984780688619, "loss": 3.276, "step": 5540 }, { "epoch": 0.32202848936725753, "grad_norm": 0.16489213705062866, "learning_rate": 0.0008110478655392195, "loss": 3.2802, "step": 5550 }, { "epoch": 0.322608720879631, "grad_norm": 0.16258342564105988, "learning_rate": 0.0008102961076987519, "loss": 3.2755, "step": 5560 }, { "epoch": 0.3231889523920044, "grad_norm": 0.14555875957012177, "learning_rate": 0.0008095432073155098, "loss": 3.2775, "step": 5570 }, { "epoch": 0.32376918390437787, "grad_norm": 0.1406964659690857, "learning_rate": 0.0008087891671617515, "loss": 3.2611, "step": 5580 }, { "epoch": 0.3243494154167513, "grad_norm": 0.162723109126091, "learning_rate": 0.0008080339900139317, "loss": 3.2648, "step": 5590 }, { "epoch": 0.32492964692912474, "grad_norm": 0.1660017967224121, "learning_rate": 0.0008072776786526921, "loss": 3.2704, "step": 5600 }, { "epoch": 0.32550987844149815, "grad_norm": 0.13984504342079163, "learning_rate": 0.0008065202358628501, "loss": 3.2757, "step": 5610 }, { "epoch": 0.3260901099538716, "grad_norm": 0.17101338505744934, "learning_rate": 0.0008057616644333894, "loss": 3.2742, "step": 5620 }, { "epoch": 0.326670341466245, "grad_norm": 0.15518838167190552, "learning_rate": 0.0008050019671574496, "loss": 3.2676, "step": 5630 }, { "epoch": 0.3272505729786185, "grad_norm": 0.17470310628414154, "learning_rate": 0.0008042411468323154, "loss": 3.2731, "step": 5640 }, { "epoch": 0.3278308044909919, "grad_norm": 0.14603078365325928, "learning_rate": 0.0008034792062594072, "loss": 3.2727, "step": 5650 }, { "epoch": 0.32841103600336535, "grad_norm": 0.14128392934799194, "learning_rate": 0.00080271614824427, "loss": 3.2689, "step": 5660 }, { "epoch": 0.32899126751573876, "grad_norm": 0.16043803095817566, "learning_rate": 0.0008019519755965629, "loss": 3.2574, "step": 5670 }, { "epoch": 0.3295714990281122, "grad_norm": 0.1634809821844101, "learning_rate": 0.0008011866911300504, "loss": 3.2706, "step": 5680 }, { "epoch": 0.33015173054048563, "grad_norm": 0.1542406529188156, "learning_rate": 0.0008004202976625895, "loss": 3.2894, "step": 5690 }, { "epoch": 0.3307319620528591, "grad_norm": 0.15284700691699982, "learning_rate": 0.0007996527980161214, "loss": 3.2814, "step": 5700 }, { "epoch": 0.3313121935652325, "grad_norm": 0.1456363946199417, "learning_rate": 0.0007988841950166602, "loss": 3.2727, "step": 5710 }, { "epoch": 0.33189242507760597, "grad_norm": 0.173149973154068, "learning_rate": 0.0007981144914942827, "loss": 3.2607, "step": 5720 }, { "epoch": 0.3324726565899794, "grad_norm": 0.1568511724472046, "learning_rate": 0.0007973436902831179, "loss": 3.2638, "step": 5730 }, { "epoch": 0.33305288810235284, "grad_norm": 0.16106663644313812, "learning_rate": 0.0007965717942213365, "loss": 3.2652, "step": 5740 }, { "epoch": 0.3336331196147263, "grad_norm": 0.14553207159042358, "learning_rate": 0.0007957988061511408, "loss": 3.2771, "step": 5750 }, { "epoch": 0.3342133511270997, "grad_norm": 0.15205144882202148, "learning_rate": 0.0007950247289187538, "loss": 3.2729, "step": 5760 }, { "epoch": 0.3347935826394732, "grad_norm": 0.15027263760566711, "learning_rate": 0.0007942495653744089, "loss": 3.2727, "step": 5770 }, { "epoch": 0.3353738141518466, "grad_norm": 0.1563851535320282, "learning_rate": 0.0007934733183723395, "loss": 3.2653, "step": 5780 }, { "epoch": 0.33595404566422005, "grad_norm": 0.14442172646522522, "learning_rate": 0.0007926959907707683, "loss": 3.2754, "step": 5790 }, { "epoch": 0.33653427717659345, "grad_norm": 0.13226284086704254, "learning_rate": 0.0007919175854318971, "loss": 3.2605, "step": 5800 }, { "epoch": 0.3371145086889669, "grad_norm": 0.1466459184885025, "learning_rate": 0.0007911381052218961, "loss": 3.2638, "step": 5810 }, { "epoch": 0.3376947402013403, "grad_norm": 0.15801192820072174, "learning_rate": 0.0007903575530108926, "loss": 3.2604, "step": 5820 }, { "epoch": 0.3382749717137138, "grad_norm": 0.14419734477996826, "learning_rate": 0.000789575931672962, "loss": 3.2674, "step": 5830 }, { "epoch": 0.3388552032260872, "grad_norm": 0.16942447423934937, "learning_rate": 0.0007887932440861158, "loss": 3.2634, "step": 5840 }, { "epoch": 0.33943543473846066, "grad_norm": 0.15958154201507568, "learning_rate": 0.0007880094931322916, "loss": 3.2687, "step": 5850 }, { "epoch": 0.34001566625083407, "grad_norm": 0.14885276556015015, "learning_rate": 0.0007872246816973428, "loss": 3.2665, "step": 5860 }, { "epoch": 0.34059589776320753, "grad_norm": 0.15011096000671387, "learning_rate": 0.0007864388126710268, "loss": 3.2697, "step": 5870 }, { "epoch": 0.34117612927558094, "grad_norm": 0.1571209579706192, "learning_rate": 0.0007856518889469961, "loss": 3.2688, "step": 5880 }, { "epoch": 0.3417563607879544, "grad_norm": 0.1479196548461914, "learning_rate": 0.0007848639134227864, "loss": 3.2688, "step": 5890 }, { "epoch": 0.3423365923003278, "grad_norm": 0.1649380475282669, "learning_rate": 0.0007840748889998057, "loss": 3.2629, "step": 5900 }, { "epoch": 0.3429168238127013, "grad_norm": 0.1554240733385086, "learning_rate": 0.000783284818583325, "loss": 3.2653, "step": 5910 }, { "epoch": 0.3434970553250747, "grad_norm": 0.14077123999595642, "learning_rate": 0.000782493705082466, "loss": 3.2631, "step": 5920 }, { "epoch": 0.34407728683744815, "grad_norm": 0.16318172216415405, "learning_rate": 0.0007817015514101917, "loss": 3.2486, "step": 5930 }, { "epoch": 0.34465751834982156, "grad_norm": 0.16839627921581268, "learning_rate": 0.0007809083604832948, "loss": 3.265, "step": 5940 }, { "epoch": 0.345237749862195, "grad_norm": 0.14004649221897125, "learning_rate": 0.0007801141352223873, "loss": 3.261, "step": 5950 }, { "epoch": 0.34581798137456843, "grad_norm": 0.1588735729455948, "learning_rate": 0.0007793188785518901, "loss": 3.2614, "step": 5960 }, { "epoch": 0.3463982128869419, "grad_norm": 0.15165585279464722, "learning_rate": 0.0007785225934000213, "loss": 3.2654, "step": 5970 }, { "epoch": 0.3469784443993153, "grad_norm": 0.14551375806331635, "learning_rate": 0.0007777252826987864, "loss": 3.2593, "step": 5980 }, { "epoch": 0.34755867591168876, "grad_norm": 0.13176442682743073, "learning_rate": 0.0007769269493839669, "loss": 3.2519, "step": 5990 }, { "epoch": 0.3481389074240622, "grad_norm": 0.1538584679365158, "learning_rate": 0.0007761275963951096, "loss": 3.2677, "step": 6000 }, { "epoch": 0.3481389074240622, "eval_loss": 3.1944527626037598, "eval_runtime": 3.2607, "eval_samples_per_second": 1327.935, "eval_steps_per_second": 10.427, "step": 6000 }, { "epoch": 0.34871913893643564, "grad_norm": 0.1548028141260147, "learning_rate": 0.0007753272266755161, "loss": 3.2613, "step": 6010 }, { "epoch": 0.3492993704488091, "grad_norm": 0.15565787255764008, "learning_rate": 0.0007745258431722313, "loss": 3.2622, "step": 6020 }, { "epoch": 0.3498796019611825, "grad_norm": 0.15149720013141632, "learning_rate": 0.0007737234488360334, "loss": 3.2608, "step": 6030 }, { "epoch": 0.35045983347355597, "grad_norm": 0.14522841572761536, "learning_rate": 0.0007729200466214225, "loss": 3.2509, "step": 6040 }, { "epoch": 0.3510400649859294, "grad_norm": 0.1358969360589981, "learning_rate": 0.0007721156394866096, "loss": 3.2631, "step": 6050 }, { "epoch": 0.35162029649830284, "grad_norm": 0.13670052587985992, "learning_rate": 0.0007713102303935058, "loss": 3.2643, "step": 6060 }, { "epoch": 0.35220052801067625, "grad_norm": 0.16404469311237335, "learning_rate": 0.0007705038223077121, "loss": 3.2435, "step": 6070 }, { "epoch": 0.3527807595230497, "grad_norm": 0.14754830300807953, "learning_rate": 0.0007696964181985076, "loss": 3.264, "step": 6080 }, { "epoch": 0.3533609910354231, "grad_norm": 0.16356825828552246, "learning_rate": 0.0007688880210388384, "loss": 3.2629, "step": 6090 }, { "epoch": 0.3539412225477966, "grad_norm": 0.1487089991569519, "learning_rate": 0.0007680786338053079, "loss": 3.255, "step": 6100 }, { "epoch": 0.35452145406017, "grad_norm": 0.16473692655563354, "learning_rate": 0.0007672682594781645, "loss": 3.2539, "step": 6110 }, { "epoch": 0.35510168557254346, "grad_norm": 0.14944276213645935, "learning_rate": 0.0007664569010412914, "loss": 3.2526, "step": 6120 }, { "epoch": 0.35568191708491687, "grad_norm": 0.15919911861419678, "learning_rate": 0.0007656445614821954, "loss": 3.2613, "step": 6130 }, { "epoch": 0.35626214859729033, "grad_norm": 0.15756233036518097, "learning_rate": 0.000764831243791996, "loss": 3.2484, "step": 6140 }, { "epoch": 0.35684238010966374, "grad_norm": 0.14643166959285736, "learning_rate": 0.0007640169509654136, "loss": 3.2552, "step": 6150 }, { "epoch": 0.3574226116220372, "grad_norm": 0.15033231675624847, "learning_rate": 0.0007632016860007603, "loss": 3.2531, "step": 6160 }, { "epoch": 0.3580028431344106, "grad_norm": 0.14909091591835022, "learning_rate": 0.000762385451899927, "loss": 3.2651, "step": 6170 }, { "epoch": 0.3585830746467841, "grad_norm": 0.14181995391845703, "learning_rate": 0.0007615682516683728, "loss": 3.2596, "step": 6180 }, { "epoch": 0.3591633061591575, "grad_norm": 0.1447875052690506, "learning_rate": 0.0007607500883151148, "loss": 3.2588, "step": 6190 }, { "epoch": 0.35974353767153094, "grad_norm": 0.15402406454086304, "learning_rate": 0.0007599309648527162, "loss": 3.2478, "step": 6200 }, { "epoch": 0.36032376918390435, "grad_norm": 0.16491296887397766, "learning_rate": 0.0007591108842972754, "loss": 3.2442, "step": 6210 }, { "epoch": 0.3609040006962778, "grad_norm": 0.14670206606388092, "learning_rate": 0.0007582898496684148, "loss": 3.2601, "step": 6220 }, { "epoch": 0.3614842322086513, "grad_norm": 0.12047087401151657, "learning_rate": 0.0007574678639892702, "loss": 3.2531, "step": 6230 }, { "epoch": 0.3620644637210247, "grad_norm": 0.1422395259141922, "learning_rate": 0.0007566449302864784, "loss": 3.2565, "step": 6240 }, { "epoch": 0.36264469523339815, "grad_norm": 0.16634182631969452, "learning_rate": 0.0007558210515901683, "loss": 3.2521, "step": 6250 }, { "epoch": 0.36322492674577156, "grad_norm": 0.14773225784301758, "learning_rate": 0.0007549962309339467, "loss": 3.2571, "step": 6260 }, { "epoch": 0.363805158258145, "grad_norm": 0.1608228087425232, "learning_rate": 0.0007541704713548905, "loss": 3.2466, "step": 6270 }, { "epoch": 0.36438538977051843, "grad_norm": 0.1616470217704773, "learning_rate": 0.0007533437758935324, "loss": 3.2559, "step": 6280 }, { "epoch": 0.3649656212828919, "grad_norm": 0.14817708730697632, "learning_rate": 0.0007525161475938518, "loss": 3.2579, "step": 6290 }, { "epoch": 0.3655458527952653, "grad_norm": 0.1556018441915512, "learning_rate": 0.0007516875895032628, "loss": 3.2521, "step": 6300 }, { "epoch": 0.36612608430763877, "grad_norm": 0.13821960985660553, "learning_rate": 0.0007508581046726032, "loss": 3.256, "step": 6310 }, { "epoch": 0.3667063158200122, "grad_norm": 0.13531796634197235, "learning_rate": 0.0007500276961561232, "loss": 3.2476, "step": 6320 }, { "epoch": 0.36728654733238564, "grad_norm": 0.13882015645503998, "learning_rate": 0.0007491963670114737, "loss": 3.2507, "step": 6330 }, { "epoch": 0.36786677884475905, "grad_norm": 0.13630333542823792, "learning_rate": 0.0007483641202996957, "loss": 3.2536, "step": 6340 }, { "epoch": 0.3684470103571325, "grad_norm": 0.12747836112976074, "learning_rate": 0.0007475309590852089, "loss": 3.2559, "step": 6350 }, { "epoch": 0.3690272418695059, "grad_norm": 0.15810616314411163, "learning_rate": 0.0007466968864357998, "loss": 3.2431, "step": 6360 }, { "epoch": 0.3696074733818794, "grad_norm": 0.1615232676267624, "learning_rate": 0.0007458619054226117, "loss": 3.2513, "step": 6370 }, { "epoch": 0.3701877048942528, "grad_norm": 0.12830163538455963, "learning_rate": 0.000745026019120132, "loss": 3.2539, "step": 6380 }, { "epoch": 0.37076793640662625, "grad_norm": 0.16822132468223572, "learning_rate": 0.0007441892306061817, "loss": 3.2442, "step": 6390 }, { "epoch": 0.37134816791899966, "grad_norm": 0.14407211542129517, "learning_rate": 0.0007433515429619038, "loss": 3.2533, "step": 6400 }, { "epoch": 0.3719283994313731, "grad_norm": 0.13332654535770416, "learning_rate": 0.0007425129592717516, "loss": 3.247, "step": 6410 }, { "epoch": 0.37250863094374653, "grad_norm": 0.15194551646709442, "learning_rate": 0.0007416734826234786, "loss": 3.2469, "step": 6420 }, { "epoch": 0.37308886245612, "grad_norm": 0.13437363505363464, "learning_rate": 0.0007408331161081255, "loss": 3.246, "step": 6430 }, { "epoch": 0.3736690939684934, "grad_norm": 0.1475239098072052, "learning_rate": 0.00073999186282001, "loss": 3.2452, "step": 6440 }, { "epoch": 0.37424932548086687, "grad_norm": 0.1388455033302307, "learning_rate": 0.0007391497258567146, "loss": 3.2484, "step": 6450 }, { "epoch": 0.3748295569932403, "grad_norm": 0.14330509305000305, "learning_rate": 0.000738306708319076, "loss": 3.2499, "step": 6460 }, { "epoch": 0.37540978850561374, "grad_norm": 0.13358131051063538, "learning_rate": 0.0007374628133111728, "loss": 3.2416, "step": 6470 }, { "epoch": 0.3759900200179872, "grad_norm": 0.15574291348457336, "learning_rate": 0.0007366180439403152, "loss": 3.2499, "step": 6480 }, { "epoch": 0.3765702515303606, "grad_norm": 0.15618012845516205, "learning_rate": 0.0007357724033170323, "loss": 3.2408, "step": 6490 }, { "epoch": 0.3771504830427341, "grad_norm": 0.12743791937828064, "learning_rate": 0.0007349258945550615, "loss": 3.2478, "step": 6500 }, { "epoch": 0.3777307145551075, "grad_norm": 0.1619246006011963, "learning_rate": 0.000734078520771337, "loss": 3.2358, "step": 6510 }, { "epoch": 0.37831094606748095, "grad_norm": 0.1590278297662735, "learning_rate": 0.0007332302850859773, "loss": 3.2425, "step": 6520 }, { "epoch": 0.37889117757985435, "grad_norm": 0.16503369808197021, "learning_rate": 0.0007323811906222755, "loss": 3.2411, "step": 6530 }, { "epoch": 0.3794714090922278, "grad_norm": 0.1441235989332199, "learning_rate": 0.0007315312405066861, "loss": 3.245, "step": 6540 }, { "epoch": 0.3800516406046012, "grad_norm": 0.16268372535705566, "learning_rate": 0.0007306804378688147, "loss": 3.2475, "step": 6550 }, { "epoch": 0.3806318721169747, "grad_norm": 0.17126062512397766, "learning_rate": 0.0007298287858414057, "loss": 3.2395, "step": 6560 }, { "epoch": 0.3812121036293481, "grad_norm": 0.14614002406597137, "learning_rate": 0.0007289762875603308, "loss": 3.2465, "step": 6570 }, { "epoch": 0.38179233514172156, "grad_norm": 0.1300090104341507, "learning_rate": 0.0007281229461645782, "loss": 3.2534, "step": 6580 }, { "epoch": 0.38237256665409497, "grad_norm": 0.16573797166347504, "learning_rate": 0.0007272687647962403, "loss": 3.2395, "step": 6590 }, { "epoch": 0.38295279816646843, "grad_norm": 0.17565912008285522, "learning_rate": 0.0007264137466005025, "loss": 3.2412, "step": 6600 }, { "epoch": 0.38353302967884184, "grad_norm": 0.14961925148963928, "learning_rate": 0.0007255578947256312, "loss": 3.2339, "step": 6610 }, { "epoch": 0.3841132611912153, "grad_norm": 0.1480415016412735, "learning_rate": 0.0007247012123229627, "loss": 3.2358, "step": 6620 }, { "epoch": 0.3846934927035887, "grad_norm": 0.14414618909358978, "learning_rate": 0.0007238437025468913, "loss": 3.2367, "step": 6630 }, { "epoch": 0.3852737242159622, "grad_norm": 0.14013369381427765, "learning_rate": 0.0007229853685548578, "loss": 3.2453, "step": 6640 }, { "epoch": 0.3858539557283356, "grad_norm": 0.13546213507652283, "learning_rate": 0.0007221262135073381, "loss": 3.2346, "step": 6650 }, { "epoch": 0.38643418724070905, "grad_norm": 0.16352064907550812, "learning_rate": 0.0007212662405678309, "loss": 3.24, "step": 6660 }, { "epoch": 0.38701441875308246, "grad_norm": 0.14588786661624908, "learning_rate": 0.0007204054529028467, "loss": 3.2478, "step": 6670 }, { "epoch": 0.3875946502654559, "grad_norm": 0.151209756731987, "learning_rate": 0.0007195438536818957, "loss": 3.2306, "step": 6680 }, { "epoch": 0.38817488177782933, "grad_norm": 0.14419269561767578, "learning_rate": 0.0007186814460774769, "loss": 3.2372, "step": 6690 }, { "epoch": 0.3887551132902028, "grad_norm": 0.14094632863998413, "learning_rate": 0.0007178182332650649, "loss": 3.2323, "step": 6700 }, { "epoch": 0.38933534480257626, "grad_norm": 0.1500055193901062, "learning_rate": 0.0007169542184231001, "loss": 3.2421, "step": 6710 }, { "epoch": 0.38991557631494966, "grad_norm": 0.14962860941886902, "learning_rate": 0.0007160894047329756, "loss": 3.2392, "step": 6720 }, { "epoch": 0.3904958078273231, "grad_norm": 0.14648567140102386, "learning_rate": 0.0007152237953790258, "loss": 3.2358, "step": 6730 }, { "epoch": 0.39107603933969654, "grad_norm": 0.14237669110298157, "learning_rate": 0.0007143573935485153, "loss": 3.2479, "step": 6740 }, { "epoch": 0.39165627085207, "grad_norm": 0.12649035453796387, "learning_rate": 0.0007134902024316264, "loss": 3.2412, "step": 6750 }, { "epoch": 0.3922365023644434, "grad_norm": 0.13935695588588715, "learning_rate": 0.0007126222252214473, "loss": 3.2341, "step": 6760 }, { "epoch": 0.39281673387681687, "grad_norm": 0.15621213614940643, "learning_rate": 0.0007117534651139612, "loss": 3.2332, "step": 6770 }, { "epoch": 0.3933969653891903, "grad_norm": 0.1531130075454712, "learning_rate": 0.0007108839253080338, "loss": 3.2251, "step": 6780 }, { "epoch": 0.39397719690156374, "grad_norm": 0.14018535614013672, "learning_rate": 0.0007100136090054019, "loss": 3.2377, "step": 6790 }, { "epoch": 0.39455742841393715, "grad_norm": 0.1587972193956375, "learning_rate": 0.0007091425194106611, "loss": 3.2356, "step": 6800 }, { "epoch": 0.3951376599263106, "grad_norm": 0.13827280700206757, "learning_rate": 0.0007082706597312549, "loss": 3.2345, "step": 6810 }, { "epoch": 0.395717891438684, "grad_norm": 0.13535282015800476, "learning_rate": 0.0007073980331774615, "loss": 3.2347, "step": 6820 }, { "epoch": 0.3962981229510575, "grad_norm": 0.15061281621456146, "learning_rate": 0.0007065246429623835, "loss": 3.2345, "step": 6830 }, { "epoch": 0.3968783544634309, "grad_norm": 0.1398342251777649, "learning_rate": 0.0007056504923019352, "loss": 3.231, "step": 6840 }, { "epoch": 0.39745858597580436, "grad_norm": 0.14031299948692322, "learning_rate": 0.0007047755844148307, "loss": 3.2212, "step": 6850 }, { "epoch": 0.39803881748817777, "grad_norm": 0.1403796672821045, "learning_rate": 0.0007038999225225729, "loss": 3.2346, "step": 6860 }, { "epoch": 0.39861904900055123, "grad_norm": 0.13849115371704102, "learning_rate": 0.0007030235098494403, "loss": 3.2424, "step": 6870 }, { "epoch": 0.39919928051292464, "grad_norm": 0.14095619320869446, "learning_rate": 0.0007021463496224762, "loss": 3.2299, "step": 6880 }, { "epoch": 0.3997795120252981, "grad_norm": 0.14068861305713654, "learning_rate": 0.0007012684450714765, "loss": 3.2338, "step": 6890 }, { "epoch": 0.4003597435376715, "grad_norm": 0.14077772200107574, "learning_rate": 0.0007003897994289777, "loss": 3.2323, "step": 6900 }, { "epoch": 0.40093997505004497, "grad_norm": 0.14695732295513153, "learning_rate": 0.0006995104159302452, "loss": 3.2343, "step": 6910 }, { "epoch": 0.4015202065624184, "grad_norm": 0.14510050415992737, "learning_rate": 0.0006986302978132611, "loss": 3.2269, "step": 6920 }, { "epoch": 0.40210043807479184, "grad_norm": 0.14484266936779022, "learning_rate": 0.0006977494483187126, "loss": 3.2255, "step": 6930 }, { "epoch": 0.40268066958716525, "grad_norm": 0.14667174220085144, "learning_rate": 0.0006968678706899795, "loss": 3.2269, "step": 6940 }, { "epoch": 0.4032609010995387, "grad_norm": 0.15151144564151764, "learning_rate": 0.0006959855681731233, "loss": 3.2294, "step": 6950 }, { "epoch": 0.4038411326119122, "grad_norm": 0.1448170691728592, "learning_rate": 0.000695102544016874, "loss": 3.2299, "step": 6960 }, { "epoch": 0.4044213641242856, "grad_norm": 0.12772366404533386, "learning_rate": 0.0006942188014726194, "loss": 3.2285, "step": 6970 }, { "epoch": 0.40500159563665905, "grad_norm": 0.15471121668815613, "learning_rate": 0.000693334343794392, "loss": 3.227, "step": 6980 }, { "epoch": 0.40558182714903246, "grad_norm": 0.15615463256835938, "learning_rate": 0.0006924491742388573, "loss": 3.228, "step": 6990 }, { "epoch": 0.4061620586614059, "grad_norm": 0.12857802212238312, "learning_rate": 0.0006915632960653029, "loss": 3.225, "step": 7000 }, { "epoch": 0.4061620586614059, "eval_loss": 3.165278911590576, "eval_runtime": 3.2586, "eval_samples_per_second": 1328.797, "eval_steps_per_second": 10.434, "step": 7000 }, { "epoch": 0.40674229017377933, "grad_norm": 0.14582431316375732, "learning_rate": 0.0006906767125356246, "loss": 3.235, "step": 7010 }, { "epoch": 0.4073225216861528, "grad_norm": 0.1576128900051117, "learning_rate": 0.000689789426914316, "loss": 3.2256, "step": 7020 }, { "epoch": 0.4079027531985262, "grad_norm": 0.16213494539260864, "learning_rate": 0.0006889014424684557, "loss": 3.2409, "step": 7030 }, { "epoch": 0.40848298471089967, "grad_norm": 0.13359089195728302, "learning_rate": 0.0006880127624676955, "loss": 3.2328, "step": 7040 }, { "epoch": 0.4090632162232731, "grad_norm": 0.1388418972492218, "learning_rate": 0.0006871233901842481, "loss": 3.2191, "step": 7050 }, { "epoch": 0.40964344773564654, "grad_norm": 0.1342374086380005, "learning_rate": 0.0006862333288928755, "loss": 3.2348, "step": 7060 }, { "epoch": 0.41022367924801995, "grad_norm": 0.15014256536960602, "learning_rate": 0.0006853425818708767, "loss": 3.2239, "step": 7070 }, { "epoch": 0.4108039107603934, "grad_norm": 0.1368698626756668, "learning_rate": 0.0006844511523980755, "loss": 3.2385, "step": 7080 }, { "epoch": 0.4113841422727668, "grad_norm": 0.15549789369106293, "learning_rate": 0.0006835590437568084, "loss": 3.2344, "step": 7090 }, { "epoch": 0.4119643737851403, "grad_norm": 0.13888388872146606, "learning_rate": 0.0006826662592319131, "loss": 3.2258, "step": 7100 }, { "epoch": 0.4125446052975137, "grad_norm": 0.12590526044368744, "learning_rate": 0.0006817728021107159, "loss": 3.221, "step": 7110 }, { "epoch": 0.41312483680988715, "grad_norm": 0.12910686433315277, "learning_rate": 0.0006808786756830192, "loss": 3.2283, "step": 7120 }, { "epoch": 0.41370506832226056, "grad_norm": 0.13956746459007263, "learning_rate": 0.0006799838832410903, "loss": 3.2201, "step": 7130 }, { "epoch": 0.414285299834634, "grad_norm": 0.1646030694246292, "learning_rate": 0.0006790884280796486, "loss": 3.2191, "step": 7140 }, { "epoch": 0.41486553134700743, "grad_norm": 0.13890932500362396, "learning_rate": 0.0006781923134958539, "loss": 3.2257, "step": 7150 }, { "epoch": 0.4154457628593809, "grad_norm": 0.13294340670108795, "learning_rate": 0.0006772955427892939, "loss": 3.2317, "step": 7160 }, { "epoch": 0.4160259943717543, "grad_norm": 0.12860235571861267, "learning_rate": 0.0006763981192619726, "loss": 3.2154, "step": 7170 }, { "epoch": 0.41660622588412777, "grad_norm": 0.14738686382770538, "learning_rate": 0.0006755000462182972, "loss": 3.2332, "step": 7180 }, { "epoch": 0.41718645739650123, "grad_norm": 0.13093027472496033, "learning_rate": 0.0006746013269650666, "loss": 3.2351, "step": 7190 }, { "epoch": 0.41776668890887464, "grad_norm": 0.13175268471240997, "learning_rate": 0.0006737019648114593, "loss": 3.2294, "step": 7200 }, { "epoch": 0.4183469204212481, "grad_norm": 0.13433928787708282, "learning_rate": 0.000672801963069021, "loss": 3.2273, "step": 7210 }, { "epoch": 0.4189271519336215, "grad_norm": 0.14208847284317017, "learning_rate": 0.0006719013250516526, "loss": 3.2272, "step": 7220 }, { "epoch": 0.419507383445995, "grad_norm": 0.13174398243427277, "learning_rate": 0.0006710000540755973, "loss": 3.2153, "step": 7230 }, { "epoch": 0.4200876149583684, "grad_norm": 0.14360399544239044, "learning_rate": 0.0006700981534594296, "loss": 3.223, "step": 7240 }, { "epoch": 0.42066784647074185, "grad_norm": 0.1482868790626526, "learning_rate": 0.0006691956265240417, "loss": 3.218, "step": 7250 }, { "epoch": 0.42124807798311525, "grad_norm": 0.13119544088840485, "learning_rate": 0.0006682924765926323, "loss": 3.2294, "step": 7260 }, { "epoch": 0.4218283094954887, "grad_norm": 0.13039755821228027, "learning_rate": 0.0006673887069906945, "loss": 3.227, "step": 7270 }, { "epoch": 0.4224085410078621, "grad_norm": 0.12415551394224167, "learning_rate": 0.0006664843210460025, "loss": 3.2142, "step": 7280 }, { "epoch": 0.4229887725202356, "grad_norm": 0.13810203969478607, "learning_rate": 0.0006655793220885997, "loss": 3.2275, "step": 7290 }, { "epoch": 0.423569004032609, "grad_norm": 0.13545836508274078, "learning_rate": 0.0006646737134507874, "loss": 3.2113, "step": 7300 }, { "epoch": 0.42414923554498246, "grad_norm": 0.12676437199115753, "learning_rate": 0.0006637674984671113, "loss": 3.2183, "step": 7310 }, { "epoch": 0.42472946705735587, "grad_norm": 0.12899167835712433, "learning_rate": 0.0006628606804743502, "loss": 3.2237, "step": 7320 }, { "epoch": 0.42530969856972933, "grad_norm": 0.13533097505569458, "learning_rate": 0.0006619532628115027, "loss": 3.2025, "step": 7330 }, { "epoch": 0.42588993008210274, "grad_norm": 0.12174040824174881, "learning_rate": 0.0006610452488197758, "loss": 3.2141, "step": 7340 }, { "epoch": 0.4264701615944762, "grad_norm": 0.14033706486225128, "learning_rate": 0.000660136641842572, "loss": 3.2309, "step": 7350 }, { "epoch": 0.4270503931068496, "grad_norm": 0.1348879039287567, "learning_rate": 0.0006592274452254775, "loss": 3.2207, "step": 7360 }, { "epoch": 0.4276306246192231, "grad_norm": 0.13258253037929535, "learning_rate": 0.0006583176623162494, "loss": 3.2273, "step": 7370 }, { "epoch": 0.4282108561315965, "grad_norm": 0.14150184392929077, "learning_rate": 0.0006574072964648038, "loss": 3.2205, "step": 7380 }, { "epoch": 0.42879108764396995, "grad_norm": 0.1396942138671875, "learning_rate": 0.0006564963510232031, "loss": 3.2179, "step": 7390 }, { "epoch": 0.42937131915634336, "grad_norm": 0.13543544709682465, "learning_rate": 0.0006555848293456438, "loss": 3.217, "step": 7400 }, { "epoch": 0.4299515506687168, "grad_norm": 0.1295756995677948, "learning_rate": 0.0006546727347884441, "loss": 3.2206, "step": 7410 }, { "epoch": 0.43053178218109023, "grad_norm": 0.15362213551998138, "learning_rate": 0.000653760070710032, "loss": 3.2212, "step": 7420 }, { "epoch": 0.4311120136934637, "grad_norm": 0.1392851173877716, "learning_rate": 0.0006528468404709319, "loss": 3.2175, "step": 7430 }, { "epoch": 0.43169224520583716, "grad_norm": 0.12892089784145355, "learning_rate": 0.0006519330474337534, "loss": 3.218, "step": 7440 }, { "epoch": 0.43227247671821056, "grad_norm": 0.1390940397977829, "learning_rate": 0.0006510186949631782, "loss": 3.2171, "step": 7450 }, { "epoch": 0.432852708230584, "grad_norm": 0.13330115377902985, "learning_rate": 0.0006501037864259478, "loss": 3.2293, "step": 7460 }, { "epoch": 0.43343293974295743, "grad_norm": 0.1433860808610916, "learning_rate": 0.0006491883251908513, "loss": 3.2099, "step": 7470 }, { "epoch": 0.4340131712553309, "grad_norm": 0.13837961852550507, "learning_rate": 0.000648272314628713, "loss": 3.2139, "step": 7480 }, { "epoch": 0.4345934027677043, "grad_norm": 0.13204647600650787, "learning_rate": 0.0006473557581123797, "loss": 3.2267, "step": 7490 }, { "epoch": 0.43517363428007777, "grad_norm": 0.13703928887844086, "learning_rate": 0.0006464386590167082, "loss": 3.2131, "step": 7500 }, { "epoch": 0.4357538657924512, "grad_norm": 0.13702446222305298, "learning_rate": 0.0006455210207185539, "loss": 3.2238, "step": 7510 }, { "epoch": 0.43633409730482464, "grad_norm": 0.13354191184043884, "learning_rate": 0.0006446028465967568, "loss": 3.2131, "step": 7520 }, { "epoch": 0.43691432881719805, "grad_norm": 0.14659354090690613, "learning_rate": 0.0006436841400321304, "loss": 3.2243, "step": 7530 }, { "epoch": 0.4374945603295715, "grad_norm": 0.13794207572937012, "learning_rate": 0.0006427649044074484, "loss": 3.2229, "step": 7540 }, { "epoch": 0.4380747918419449, "grad_norm": 0.146932452917099, "learning_rate": 0.0006418451431074329, "loss": 3.2117, "step": 7550 }, { "epoch": 0.4386550233543184, "grad_norm": 0.13247311115264893, "learning_rate": 0.0006409248595187409, "loss": 3.223, "step": 7560 }, { "epoch": 0.4392352548666918, "grad_norm": 0.13509587943553925, "learning_rate": 0.0006400040570299535, "loss": 3.2165, "step": 7570 }, { "epoch": 0.43981548637906526, "grad_norm": 0.1419142633676529, "learning_rate": 0.0006390827390315614, "loss": 3.2125, "step": 7580 }, { "epoch": 0.44039571789143866, "grad_norm": 0.14198483526706696, "learning_rate": 0.0006381609089159545, "loss": 3.2188, "step": 7590 }, { "epoch": 0.44097594940381213, "grad_norm": 0.12889783084392548, "learning_rate": 0.0006372385700774075, "loss": 3.2026, "step": 7600 }, { "epoch": 0.44155618091618554, "grad_norm": 0.13903778791427612, "learning_rate": 0.0006363157259120689, "loss": 3.2068, "step": 7610 }, { "epoch": 0.442136412428559, "grad_norm": 0.13285143673419952, "learning_rate": 0.0006353923798179472, "loss": 3.2067, "step": 7620 }, { "epoch": 0.4427166439409324, "grad_norm": 0.13808636367321014, "learning_rate": 0.0006344685351948998, "loss": 3.2131, "step": 7630 }, { "epoch": 0.44329687545330587, "grad_norm": 0.12950055301189423, "learning_rate": 0.0006335441954446191, "loss": 3.2128, "step": 7640 }, { "epoch": 0.4438771069656793, "grad_norm": 0.1326564997434616, "learning_rate": 0.0006326193639706214, "loss": 3.2228, "step": 7650 }, { "epoch": 0.44445733847805274, "grad_norm": 0.12820059061050415, "learning_rate": 0.0006316940441782325, "loss": 3.2105, "step": 7660 }, { "epoch": 0.4450375699904262, "grad_norm": 0.14834731817245483, "learning_rate": 0.000630768239474577, "loss": 3.2162, "step": 7670 }, { "epoch": 0.4456178015027996, "grad_norm": 0.1407567858695984, "learning_rate": 0.0006298419532685649, "loss": 3.2075, "step": 7680 }, { "epoch": 0.4461980330151731, "grad_norm": 0.18233934044837952, "learning_rate": 0.0006289151889708788, "loss": 3.2209, "step": 7690 }, { "epoch": 0.4467782645275465, "grad_norm": 0.13413317501544952, "learning_rate": 0.0006279879499939625, "loss": 3.2062, "step": 7700 }, { "epoch": 0.44735849603991995, "grad_norm": 0.14402654767036438, "learning_rate": 0.0006270602397520065, "loss": 3.2056, "step": 7710 }, { "epoch": 0.44793872755229336, "grad_norm": 0.14101460576057434, "learning_rate": 0.0006261320616609372, "loss": 3.2135, "step": 7720 }, { "epoch": 0.4485189590646668, "grad_norm": 0.15453755855560303, "learning_rate": 0.0006252034191384035, "loss": 3.2165, "step": 7730 }, { "epoch": 0.44909919057704023, "grad_norm": 0.13480693101882935, "learning_rate": 0.0006242743156037646, "loss": 3.2139, "step": 7740 }, { "epoch": 0.4496794220894137, "grad_norm": 0.13115455210208893, "learning_rate": 0.0006233447544780772, "loss": 3.2135, "step": 7750 }, { "epoch": 0.4502596536017871, "grad_norm": 0.14997157454490662, "learning_rate": 0.0006224147391840824, "loss": 3.1969, "step": 7760 }, { "epoch": 0.45083988511416057, "grad_norm": 0.13748539984226227, "learning_rate": 0.0006214842731461942, "loss": 3.2268, "step": 7770 }, { "epoch": 0.451420116626534, "grad_norm": 0.12151113897562027, "learning_rate": 0.0006205533597904857, "loss": 3.2063, "step": 7780 }, { "epoch": 0.45200034813890744, "grad_norm": 0.13322634994983673, "learning_rate": 0.0006196220025446778, "loss": 3.2066, "step": 7790 }, { "epoch": 0.45258057965128085, "grad_norm": 0.1378646045923233, "learning_rate": 0.0006186902048381252, "loss": 3.1976, "step": 7800 }, { "epoch": 0.4531608111636543, "grad_norm": 0.14197058975696564, "learning_rate": 0.0006177579701018048, "loss": 3.2056, "step": 7810 }, { "epoch": 0.4537410426760277, "grad_norm": 0.13627830147743225, "learning_rate": 0.0006168253017683025, "loss": 3.2052, "step": 7820 }, { "epoch": 0.4543212741884012, "grad_norm": 0.15169207751750946, "learning_rate": 0.0006158922032718006, "loss": 3.211, "step": 7830 }, { "epoch": 0.4549015057007746, "grad_norm": 0.1337585300207138, "learning_rate": 0.0006149586780480659, "loss": 3.2157, "step": 7840 }, { "epoch": 0.45548173721314805, "grad_norm": 0.1394774168729782, "learning_rate": 0.0006140247295344359, "loss": 3.2174, "step": 7850 }, { "epoch": 0.45606196872552146, "grad_norm": 0.12764208018779755, "learning_rate": 0.0006130903611698067, "loss": 3.2102, "step": 7860 }, { "epoch": 0.4566422002378949, "grad_norm": 0.13290008902549744, "learning_rate": 0.0006121555763946207, "loss": 3.2041, "step": 7870 }, { "epoch": 0.45722243175026833, "grad_norm": 0.14185406267642975, "learning_rate": 0.0006112203786508533, "loss": 3.2152, "step": 7880 }, { "epoch": 0.4578026632626418, "grad_norm": 0.12418293952941895, "learning_rate": 0.0006102847713820006, "loss": 3.2028, "step": 7890 }, { "epoch": 0.4583828947750152, "grad_norm": 0.138755664229393, "learning_rate": 0.0006093487580330666, "loss": 3.2043, "step": 7900 }, { "epoch": 0.45896312628738867, "grad_norm": 0.13823552429676056, "learning_rate": 0.0006084123420505503, "loss": 3.2043, "step": 7910 }, { "epoch": 0.45954335779976213, "grad_norm": 0.1277630627155304, "learning_rate": 0.0006074755268824335, "loss": 3.2068, "step": 7920 }, { "epoch": 0.46012358931213554, "grad_norm": 0.14666809141635895, "learning_rate": 0.0006065383159781682, "loss": 3.2156, "step": 7930 }, { "epoch": 0.460703820824509, "grad_norm": 0.12684592604637146, "learning_rate": 0.0006056007127886626, "loss": 3.2059, "step": 7940 }, { "epoch": 0.4612840523368824, "grad_norm": 0.12497347593307495, "learning_rate": 0.0006046627207662702, "loss": 3.2043, "step": 7950 }, { "epoch": 0.4618642838492559, "grad_norm": 0.12166955322027206, "learning_rate": 0.0006037243433647757, "loss": 3.2039, "step": 7960 }, { "epoch": 0.4624445153616293, "grad_norm": 0.12836964428424835, "learning_rate": 0.000602785584039383, "loss": 3.1986, "step": 7970 }, { "epoch": 0.46302474687400275, "grad_norm": 0.1306101679801941, "learning_rate": 0.0006018464462467023, "loss": 3.2028, "step": 7980 }, { "epoch": 0.46360497838637615, "grad_norm": 0.13166449964046478, "learning_rate": 0.0006009069334447374, "loss": 3.2017, "step": 7990 }, { "epoch": 0.4641852098987496, "grad_norm": 0.1289730966091156, "learning_rate": 0.0005999670490928729, "loss": 3.2051, "step": 8000 }, { "epoch": 0.4641852098987496, "eval_loss": 3.13897442817688, "eval_runtime": 3.2533, "eval_samples_per_second": 1330.958, "eval_steps_per_second": 10.451, "step": 8000 }, { "epoch": 0.464765441411123, "grad_norm": 0.12794232368469238, "learning_rate": 0.0005990267966518613, "loss": 3.2052, "step": 8010 }, { "epoch": 0.4653456729234965, "grad_norm": 0.13217690587043762, "learning_rate": 0.0005980861795838108, "loss": 3.2057, "step": 8020 }, { "epoch": 0.4659259044358699, "grad_norm": 0.14063167572021484, "learning_rate": 0.0005971452013521717, "loss": 3.202, "step": 8030 }, { "epoch": 0.46650613594824336, "grad_norm": 0.1315622478723526, "learning_rate": 0.0005962038654217244, "loss": 3.202, "step": 8040 }, { "epoch": 0.46708636746061677, "grad_norm": 0.14890199899673462, "learning_rate": 0.0005952621752585667, "loss": 3.2069, "step": 8050 }, { "epoch": 0.46766659897299023, "grad_norm": 0.13835932314395905, "learning_rate": 0.0005943201343301005, "loss": 3.2079, "step": 8060 }, { "epoch": 0.46824683048536364, "grad_norm": 0.13147889077663422, "learning_rate": 0.0005933777461050187, "loss": 3.2082, "step": 8070 }, { "epoch": 0.4688270619977371, "grad_norm": 0.12799794971942902, "learning_rate": 0.0005924350140532939, "loss": 3.1974, "step": 8080 }, { "epoch": 0.4694072935101105, "grad_norm": 0.12934145331382751, "learning_rate": 0.000591491941646164, "loss": 3.2048, "step": 8090 }, { "epoch": 0.469987525022484, "grad_norm": 0.13451933860778809, "learning_rate": 0.0005905485323561207, "loss": 3.1955, "step": 8100 }, { "epoch": 0.4705677565348574, "grad_norm": 0.15659664571285248, "learning_rate": 0.0005896047896568955, "loss": 3.1993, "step": 8110 }, { "epoch": 0.47114798804723085, "grad_norm": 0.14385788142681122, "learning_rate": 0.0005886607170234482, "loss": 3.2043, "step": 8120 }, { "epoch": 0.47172821955960426, "grad_norm": 0.13023056089878082, "learning_rate": 0.0005877163179319527, "loss": 3.2048, "step": 8130 }, { "epoch": 0.4723084510719777, "grad_norm": 0.1275002360343933, "learning_rate": 0.0005867715958597859, "loss": 3.2101, "step": 8140 }, { "epoch": 0.4728886825843512, "grad_norm": 0.13934627175331116, "learning_rate": 0.000585826554285513, "loss": 3.204, "step": 8150 }, { "epoch": 0.4734689140967246, "grad_norm": 0.1253582239151001, "learning_rate": 0.0005848811966888763, "loss": 3.2038, "step": 8160 }, { "epoch": 0.47404914560909805, "grad_norm": 0.13219626247882843, "learning_rate": 0.0005839355265507817, "loss": 3.2011, "step": 8170 }, { "epoch": 0.47462937712147146, "grad_norm": 0.13276910781860352, "learning_rate": 0.0005829895473532852, "loss": 3.2011, "step": 8180 }, { "epoch": 0.4752096086338449, "grad_norm": 0.146236851811409, "learning_rate": 0.0005820432625795819, "loss": 3.1997, "step": 8190 }, { "epoch": 0.47578984014621833, "grad_norm": 0.13150210678577423, "learning_rate": 0.0005810966757139909, "loss": 3.1945, "step": 8200 }, { "epoch": 0.4763700716585918, "grad_norm": 0.14235766232013702, "learning_rate": 0.0005801497902419444, "loss": 3.2039, "step": 8210 }, { "epoch": 0.4769503031709652, "grad_norm": 0.13625676929950714, "learning_rate": 0.0005792026096499741, "loss": 3.1921, "step": 8220 }, { "epoch": 0.47753053468333867, "grad_norm": 0.12872271239757538, "learning_rate": 0.0005782551374256981, "loss": 3.1912, "step": 8230 }, { "epoch": 0.4781107661957121, "grad_norm": 0.14330317080020905, "learning_rate": 0.0005773073770578081, "loss": 3.1958, "step": 8240 }, { "epoch": 0.47869099770808554, "grad_norm": 0.128121480345726, "learning_rate": 0.0005763593320360575, "loss": 3.1934, "step": 8250 }, { "epoch": 0.47927122922045895, "grad_norm": 0.13301797211170197, "learning_rate": 0.000575411005851247, "loss": 3.1976, "step": 8260 }, { "epoch": 0.4798514607328324, "grad_norm": 0.12738023698329926, "learning_rate": 0.0005744624019952131, "loss": 3.1995, "step": 8270 }, { "epoch": 0.4804316922452058, "grad_norm": 0.13468343019485474, "learning_rate": 0.0005735135239608146, "loss": 3.2016, "step": 8280 }, { "epoch": 0.4810119237575793, "grad_norm": 0.14049942791461945, "learning_rate": 0.0005725643752419198, "loss": 3.2005, "step": 8290 }, { "epoch": 0.4815921552699527, "grad_norm": 0.12929829955101013, "learning_rate": 0.0005716149593333938, "loss": 3.2072, "step": 8300 }, { "epoch": 0.48217238678232616, "grad_norm": 0.1476507931947708, "learning_rate": 0.0005706652797310851, "loss": 3.2013, "step": 8310 }, { "epoch": 0.48275261829469956, "grad_norm": 0.15121173858642578, "learning_rate": 0.000569715339931814, "loss": 3.1976, "step": 8320 }, { "epoch": 0.48333284980707303, "grad_norm": 0.12322133034467697, "learning_rate": 0.000568765143433358, "loss": 3.1905, "step": 8330 }, { "epoch": 0.48391308131944644, "grad_norm": 0.1308000385761261, "learning_rate": 0.0005678146937344402, "loss": 3.1951, "step": 8340 }, { "epoch": 0.4844933128318199, "grad_norm": 0.14018505811691284, "learning_rate": 0.000566863994334716, "loss": 3.1983, "step": 8350 }, { "epoch": 0.4850735443441933, "grad_norm": 0.12124442309141159, "learning_rate": 0.0005659130487347602, "loss": 3.1969, "step": 8360 }, { "epoch": 0.48565377585656677, "grad_norm": 0.13091051578521729, "learning_rate": 0.000564961860436054, "loss": 3.1932, "step": 8370 }, { "epoch": 0.4862340073689402, "grad_norm": 0.12319710105657578, "learning_rate": 0.0005640104329409727, "loss": 3.1944, "step": 8380 }, { "epoch": 0.48681423888131364, "grad_norm": 0.12845876812934875, "learning_rate": 0.0005630587697527716, "loss": 3.1929, "step": 8390 }, { "epoch": 0.4873944703936871, "grad_norm": 0.14527294039726257, "learning_rate": 0.0005621068743755743, "loss": 3.1932, "step": 8400 }, { "epoch": 0.4879747019060605, "grad_norm": 0.1430954933166504, "learning_rate": 0.0005611547503143595, "loss": 3.1963, "step": 8410 }, { "epoch": 0.488554933418434, "grad_norm": 0.12142278254032135, "learning_rate": 0.0005602024010749475, "loss": 3.1912, "step": 8420 }, { "epoch": 0.4891351649308074, "grad_norm": 0.12531523406505585, "learning_rate": 0.0005592498301639884, "loss": 3.1936, "step": 8430 }, { "epoch": 0.48971539644318085, "grad_norm": 0.11823923885822296, "learning_rate": 0.0005582970410889476, "loss": 3.2031, "step": 8440 }, { "epoch": 0.49029562795555426, "grad_norm": 0.1265026479959488, "learning_rate": 0.0005573440373580946, "loss": 3.1863, "step": 8450 }, { "epoch": 0.4908758594679277, "grad_norm": 0.1225002259016037, "learning_rate": 0.0005563908224804887, "loss": 3.1978, "step": 8460 }, { "epoch": 0.49145609098030113, "grad_norm": 0.13115541636943817, "learning_rate": 0.000555437399965967, "loss": 3.1945, "step": 8470 }, { "epoch": 0.4920363224926746, "grad_norm": 0.11067093908786774, "learning_rate": 0.0005544837733251313, "loss": 3.195, "step": 8480 }, { "epoch": 0.492616554005048, "grad_norm": 0.13542482256889343, "learning_rate": 0.0005535299460693346, "loss": 3.1976, "step": 8490 }, { "epoch": 0.49319678551742147, "grad_norm": 0.12914744019508362, "learning_rate": 0.000552575921710669, "loss": 3.1817, "step": 8500 }, { "epoch": 0.4937770170297949, "grad_norm": 0.14814750850200653, "learning_rate": 0.0005516217037619517, "loss": 3.1952, "step": 8510 }, { "epoch": 0.49435724854216834, "grad_norm": 0.13739536702632904, "learning_rate": 0.0005506672957367135, "loss": 3.1946, "step": 8520 }, { "epoch": 0.49493748005454175, "grad_norm": 0.12713497877120972, "learning_rate": 0.0005497127011491846, "loss": 3.193, "step": 8530 }, { "epoch": 0.4955177115669152, "grad_norm": 0.13317294418811798, "learning_rate": 0.0005487579235142823, "loss": 3.1951, "step": 8540 }, { "epoch": 0.4960979430792886, "grad_norm": 0.13274219632148743, "learning_rate": 0.000547802966347598, "loss": 3.1799, "step": 8550 }, { "epoch": 0.4966781745916621, "grad_norm": 0.137456014752388, "learning_rate": 0.0005468478331653838, "loss": 3.1907, "step": 8560 }, { "epoch": 0.4972584061040355, "grad_norm": 0.12658333778381348, "learning_rate": 0.0005458925274845402, "loss": 3.1906, "step": 8570 }, { "epoch": 0.49783863761640895, "grad_norm": 0.15250617265701294, "learning_rate": 0.000544937052822603, "loss": 3.1905, "step": 8580 }, { "epoch": 0.49841886912878236, "grad_norm": 0.12137165665626526, "learning_rate": 0.0005439814126977296, "loss": 3.195, "step": 8590 }, { "epoch": 0.4989991006411558, "grad_norm": 0.12580706179141998, "learning_rate": 0.0005430256106286874, "loss": 3.1851, "step": 8600 }, { "epoch": 0.49957933215352923, "grad_norm": 0.1316945105791092, "learning_rate": 0.0005420696501348397, "loss": 3.1827, "step": 8610 }, { "epoch": 0.5001595636659026, "grad_norm": 0.12646295130252838, "learning_rate": 0.0005411135347361329, "loss": 3.1911, "step": 8620 }, { "epoch": 0.5007397951782762, "grad_norm": 0.1217370554804802, "learning_rate": 0.0005401572679530844, "loss": 3.1963, "step": 8630 }, { "epoch": 0.5013200266906496, "grad_norm": 0.13218845427036285, "learning_rate": 0.0005392008533067684, "loss": 3.1959, "step": 8640 }, { "epoch": 0.501900258203023, "grad_norm": 0.13100461661815643, "learning_rate": 0.000538244294318804, "loss": 3.1984, "step": 8650 }, { "epoch": 0.5024804897153965, "grad_norm": 0.12846529483795166, "learning_rate": 0.0005372875945113417, "loss": 3.1873, "step": 8660 }, { "epoch": 0.5030607212277699, "grad_norm": 0.14111390709877014, "learning_rate": 0.0005363307574070503, "loss": 3.1974, "step": 8670 }, { "epoch": 0.5036409527401433, "grad_norm": 0.1287715584039688, "learning_rate": 0.0005353737865291039, "loss": 3.1913, "step": 8680 }, { "epoch": 0.5042211842525167, "grad_norm": 0.1581788808107376, "learning_rate": 0.0005344166854011702, "loss": 3.1833, "step": 8690 }, { "epoch": 0.5048014157648902, "grad_norm": 0.1236489862203598, "learning_rate": 0.0005334594575473952, "loss": 3.1933, "step": 8700 }, { "epoch": 0.5053816472772636, "grad_norm": 0.12864017486572266, "learning_rate": 0.0005325021064923924, "loss": 3.1913, "step": 8710 }, { "epoch": 0.505961878789637, "grad_norm": 0.12411046773195267, "learning_rate": 0.0005315446357612288, "loss": 3.1871, "step": 8720 }, { "epoch": 0.5065421103020105, "grad_norm": 0.12000168114900589, "learning_rate": 0.0005305870488794117, "loss": 3.1815, "step": 8730 }, { "epoch": 0.507122341814384, "grad_norm": 0.1342875063419342, "learning_rate": 0.0005296293493728764, "loss": 3.187, "step": 8740 }, { "epoch": 0.5077025733267574, "grad_norm": 0.12101846933364868, "learning_rate": 0.0005286715407679729, "loss": 3.1871, "step": 8750 }, { "epoch": 0.5082828048391308, "grad_norm": 0.13273586332798004, "learning_rate": 0.0005277136265914528, "loss": 3.193, "step": 8760 }, { "epoch": 0.5088630363515042, "grad_norm": 0.12270906567573547, "learning_rate": 0.0005267556103704562, "loss": 3.178, "step": 8770 }, { "epoch": 0.5094432678638777, "grad_norm": 0.1259816586971283, "learning_rate": 0.0005257974956324994, "loss": 3.187, "step": 8780 }, { "epoch": 0.5100234993762511, "grad_norm": 0.12720678746700287, "learning_rate": 0.0005248392859054612, "loss": 3.1837, "step": 8790 }, { "epoch": 0.5106037308886245, "grad_norm": 0.13317066431045532, "learning_rate": 0.0005238809847175704, "loss": 3.1873, "step": 8800 }, { "epoch": 0.511183962400998, "grad_norm": 0.12688656151294708, "learning_rate": 0.000522922595597392, "loss": 3.1829, "step": 8810 }, { "epoch": 0.5117641939133715, "grad_norm": 0.1185089647769928, "learning_rate": 0.0005219641220738154, "loss": 3.1864, "step": 8820 }, { "epoch": 0.5123444254257449, "grad_norm": 0.12334717810153961, "learning_rate": 0.0005210055676760403, "loss": 3.1924, "step": 8830 }, { "epoch": 0.5129246569381183, "grad_norm": 0.130776509642601, "learning_rate": 0.0005200469359335645, "loss": 3.1864, "step": 8840 }, { "epoch": 0.5135048884504917, "grad_norm": 0.1395205855369568, "learning_rate": 0.0005190882303761707, "loss": 3.1894, "step": 8850 }, { "epoch": 0.5140851199628652, "grad_norm": 0.13597504794597626, "learning_rate": 0.000518129454533913, "loss": 3.197, "step": 8860 }, { "epoch": 0.5146653514752386, "grad_norm": 0.1248147115111351, "learning_rate": 0.0005171706119371045, "loss": 3.1865, "step": 8870 }, { "epoch": 0.515245582987612, "grad_norm": 0.12766875326633453, "learning_rate": 0.0005162117061163039, "loss": 3.1843, "step": 8880 }, { "epoch": 0.5158258144999855, "grad_norm": 0.125563845038414, "learning_rate": 0.0005152527406023033, "loss": 3.1827, "step": 8890 }, { "epoch": 0.516406046012359, "grad_norm": 0.12935802340507507, "learning_rate": 0.0005142937189261138, "loss": 3.1825, "step": 8900 }, { "epoch": 0.5169862775247324, "grad_norm": 0.12293805927038193, "learning_rate": 0.0005133346446189541, "loss": 3.1909, "step": 8910 }, { "epoch": 0.5175665090371058, "grad_norm": 0.12657864391803741, "learning_rate": 0.0005123755212122359, "loss": 3.172, "step": 8920 }, { "epoch": 0.5181467405494793, "grad_norm": 0.12287136912345886, "learning_rate": 0.0005114163522375522, "loss": 3.1968, "step": 8930 }, { "epoch": 0.5187269720618527, "grad_norm": 0.13364015519618988, "learning_rate": 0.0005104571412266636, "loss": 3.1799, "step": 8940 }, { "epoch": 0.5193072035742261, "grad_norm": 0.14035052061080933, "learning_rate": 0.0005094978917114853, "loss": 3.1776, "step": 8950 }, { "epoch": 0.5198874350865995, "grad_norm": 0.12295843660831451, "learning_rate": 0.000508538607224075, "loss": 3.1805, "step": 8960 }, { "epoch": 0.520467666598973, "grad_norm": 0.11810554563999176, "learning_rate": 0.0005075792912966184, "loss": 3.1785, "step": 8970 }, { "epoch": 0.5210478981113464, "grad_norm": 0.14744389057159424, "learning_rate": 0.0005066199474614173, "loss": 3.1906, "step": 8980 }, { "epoch": 0.5216281296237198, "grad_norm": 0.13044193387031555, "learning_rate": 0.000505660579250876, "loss": 3.1766, "step": 8990 }, { "epoch": 0.5222083611360933, "grad_norm": 0.1182679831981659, "learning_rate": 0.000504701190197489, "loss": 3.1816, "step": 9000 }, { "epoch": 0.5222083611360933, "eval_loss": 3.1160783767700195, "eval_runtime": 3.2455, "eval_samples_per_second": 1334.151, "eval_steps_per_second": 10.476, "step": 9000 }, { "epoch": 0.5227885926484668, "grad_norm": 0.13336721062660217, "learning_rate": 0.0005037417838338272, "loss": 3.1825, "step": 9010 }, { "epoch": 0.5233688241608402, "grad_norm": 0.11967909336090088, "learning_rate": 0.0005027823636925254, "loss": 3.1839, "step": 9020 }, { "epoch": 0.5239490556732136, "grad_norm": 0.1373438537120819, "learning_rate": 0.0005018229333062689, "loss": 3.1859, "step": 9030 }, { "epoch": 0.524529287185587, "grad_norm": 0.12351592630147934, "learning_rate": 0.0005008634962077811, "loss": 3.1889, "step": 9040 }, { "epoch": 0.5251095186979605, "grad_norm": 0.13033995032310486, "learning_rate": 0.0004999040559298097, "loss": 3.1879, "step": 9050 }, { "epoch": 0.5256897502103339, "grad_norm": 0.11891571432352066, "learning_rate": 0.0004989446160051145, "loss": 3.1905, "step": 9060 }, { "epoch": 0.5262699817227073, "grad_norm": 0.12579171359539032, "learning_rate": 0.0004979851799664539, "loss": 3.1708, "step": 9070 }, { "epoch": 0.5268502132350807, "grad_norm": 0.12359123677015305, "learning_rate": 0.0004970257513465714, "loss": 3.1824, "step": 9080 }, { "epoch": 0.5274304447474543, "grad_norm": 0.1159052848815918, "learning_rate": 0.0004960663336781842, "loss": 3.18, "step": 9090 }, { "epoch": 0.5280106762598277, "grad_norm": 0.11939360946416855, "learning_rate": 0.0004951069304939684, "loss": 3.1806, "step": 9100 }, { "epoch": 0.5285909077722011, "grad_norm": 0.12974359095096588, "learning_rate": 0.0004941475453265471, "loss": 3.1774, "step": 9110 }, { "epoch": 0.5291711392845746, "grad_norm": 0.13163182139396667, "learning_rate": 0.0004931881817084771, "loss": 3.1888, "step": 9120 }, { "epoch": 0.529751370796948, "grad_norm": 0.1382340043783188, "learning_rate": 0.0004922288431722355, "loss": 3.1814, "step": 9130 }, { "epoch": 0.5303316023093214, "grad_norm": 0.12294802814722061, "learning_rate": 0.0004912695332502076, "loss": 3.1793, "step": 9140 }, { "epoch": 0.5309118338216948, "grad_norm": 0.13962669670581818, "learning_rate": 0.0004903102554746727, "loss": 3.1819, "step": 9150 }, { "epoch": 0.5314920653340683, "grad_norm": 0.12373016029596329, "learning_rate": 0.0004893510133777922, "loss": 3.1747, "step": 9160 }, { "epoch": 0.5320722968464418, "grad_norm": 0.12787871062755585, "learning_rate": 0.0004883918104915962, "loss": 3.1756, "step": 9170 }, { "epoch": 0.5326525283588152, "grad_norm": 0.12098101526498795, "learning_rate": 0.0004874326503479698, "loss": 3.1826, "step": 9180 }, { "epoch": 0.5332327598711886, "grad_norm": 0.13320770859718323, "learning_rate": 0.0004864735364786415, "loss": 3.1798, "step": 9190 }, { "epoch": 0.5338129913835621, "grad_norm": 0.13723470270633698, "learning_rate": 0.00048551447241516866, "loss": 3.1811, "step": 9200 }, { "epoch": 0.5343932228959355, "grad_norm": 0.12632976472377777, "learning_rate": 0.00048455546168892614, "loss": 3.1935, "step": 9210 }, { "epoch": 0.5349734544083089, "grad_norm": 0.12484107166528702, "learning_rate": 0.00048359650783109145, "loss": 3.1719, "step": 9220 }, { "epoch": 0.5355536859206823, "grad_norm": 0.13436180353164673, "learning_rate": 0.0004826376143726332, "loss": 3.1862, "step": 9230 }, { "epoch": 0.5361339174330558, "grad_norm": 0.13016556203365326, "learning_rate": 0.00048167878484429793, "loss": 3.1812, "step": 9240 }, { "epoch": 0.5367141489454292, "grad_norm": 0.12285098433494568, "learning_rate": 0.00048072002277659595, "loss": 3.1799, "step": 9250 }, { "epoch": 0.5372943804578026, "grad_norm": 0.12092869728803635, "learning_rate": 0.0004797613316997899, "loss": 3.178, "step": 9260 }, { "epoch": 0.537874611970176, "grad_norm": 0.11524718254804611, "learning_rate": 0.0004788027151438806, "loss": 3.1737, "step": 9270 }, { "epoch": 0.5384548434825496, "grad_norm": 0.12745259702205658, "learning_rate": 0.0004778441766385947, "loss": 3.1746, "step": 9280 }, { "epoch": 0.539035074994923, "grad_norm": 0.12326768040657043, "learning_rate": 0.00047688571971337155, "loss": 3.1752, "step": 9290 }, { "epoch": 0.5396153065072964, "grad_norm": 0.11677366495132446, "learning_rate": 0.00047592734789734967, "loss": 3.1702, "step": 9300 }, { "epoch": 0.5401955380196698, "grad_norm": 0.1259879618883133, "learning_rate": 0.0004749690647193547, "loss": 3.174, "step": 9310 }, { "epoch": 0.5407757695320433, "grad_norm": 0.1180575042963028, "learning_rate": 0.00047401087370788547, "loss": 3.1738, "step": 9320 }, { "epoch": 0.5413560010444167, "grad_norm": 0.12572011351585388, "learning_rate": 0.00047305277839110207, "loss": 3.1795, "step": 9330 }, { "epoch": 0.5419362325567901, "grad_norm": 0.12240619957447052, "learning_rate": 0.0004720947822968113, "loss": 3.1814, "step": 9340 }, { "epoch": 0.5425164640691637, "grad_norm": 0.12223079055547714, "learning_rate": 0.00047113688895245536, "loss": 3.1693, "step": 9350 }, { "epoch": 0.5430966955815371, "grad_norm": 0.11417195945978165, "learning_rate": 0.00047017910188509805, "loss": 3.1765, "step": 9360 }, { "epoch": 0.5436769270939105, "grad_norm": 0.1286236047744751, "learning_rate": 0.00046922142462141146, "loss": 3.1799, "step": 9370 }, { "epoch": 0.5442571586062839, "grad_norm": 0.12266254425048828, "learning_rate": 0.0004682638606876639, "loss": 3.1679, "step": 9380 }, { "epoch": 0.5448373901186574, "grad_norm": 0.12598057091236115, "learning_rate": 0.00046730641360970564, "loss": 3.1589, "step": 9390 }, { "epoch": 0.5454176216310308, "grad_norm": 0.11109986901283264, "learning_rate": 0.0004663490869129574, "loss": 3.1772, "step": 9400 }, { "epoch": 0.5459978531434042, "grad_norm": 0.12376119196414948, "learning_rate": 0.0004653918841223964, "loss": 3.1748, "step": 9410 }, { "epoch": 0.5465780846557776, "grad_norm": 0.14834947884082794, "learning_rate": 0.0004644348087625434, "loss": 3.1799, "step": 9420 }, { "epoch": 0.5471583161681511, "grad_norm": 0.12348010390996933, "learning_rate": 0.00046347786435745053, "loss": 3.1679, "step": 9430 }, { "epoch": 0.5477385476805245, "grad_norm": 0.12390238046646118, "learning_rate": 0.00046252105443068676, "loss": 3.1809, "step": 9440 }, { "epoch": 0.548318779192898, "grad_norm": 0.12994909286499023, "learning_rate": 0.0004615643825053269, "loss": 3.1774, "step": 9450 }, { "epoch": 0.5488990107052714, "grad_norm": 0.13186348974704742, "learning_rate": 0.000460607852103937, "loss": 3.1627, "step": 9460 }, { "epoch": 0.5494792422176449, "grad_norm": 0.11778120696544647, "learning_rate": 0.00045965146674856216, "loss": 3.1642, "step": 9470 }, { "epoch": 0.5500594737300183, "grad_norm": 0.13027390837669373, "learning_rate": 0.0004586952299607139, "loss": 3.1745, "step": 9480 }, { "epoch": 0.5506397052423917, "grad_norm": 0.13938818871974945, "learning_rate": 0.00045773914526135555, "loss": 3.177, "step": 9490 }, { "epoch": 0.5512199367547651, "grad_norm": 0.13590595126152039, "learning_rate": 0.0004567832161708918, "loss": 3.1794, "step": 9500 }, { "epoch": 0.5518001682671386, "grad_norm": 0.13180691003799438, "learning_rate": 0.00045582744620915313, "loss": 3.1752, "step": 9510 }, { "epoch": 0.552380399779512, "grad_norm": 0.13000676035881042, "learning_rate": 0.0004548718388953849, "loss": 3.1737, "step": 9520 }, { "epoch": 0.5529606312918854, "grad_norm": 0.1294243186712265, "learning_rate": 0.00045391639774823345, "loss": 3.1729, "step": 9530 }, { "epoch": 0.5535408628042588, "grad_norm": 0.12174411118030548, "learning_rate": 0.000452961126285733, "loss": 3.173, "step": 9540 }, { "epoch": 0.5541210943166324, "grad_norm": 0.11989938467741013, "learning_rate": 0.0004520060280252934, "loss": 3.172, "step": 9550 }, { "epoch": 0.5547013258290058, "grad_norm": 0.12082493305206299, "learning_rate": 0.0004510511064836862, "loss": 3.1676, "step": 9560 }, { "epoch": 0.5552815573413792, "grad_norm": 0.12731657922267914, "learning_rate": 0.00045009636517703275, "loss": 3.1816, "step": 9570 }, { "epoch": 0.5558617888537526, "grad_norm": 0.11837522685527802, "learning_rate": 0.0004491418076207903, "loss": 3.1749, "step": 9580 }, { "epoch": 0.5564420203661261, "grad_norm": 0.11386506259441376, "learning_rate": 0.00044818743732974003, "loss": 3.1577, "step": 9590 }, { "epoch": 0.5570222518784995, "grad_norm": 0.12620550394058228, "learning_rate": 0.00044723325781797346, "loss": 3.1755, "step": 9600 }, { "epoch": 0.5576024833908729, "grad_norm": 0.11217296868562698, "learning_rate": 0.0004462792725988791, "loss": 3.1599, "step": 9610 }, { "epoch": 0.5581827149032464, "grad_norm": 0.12458086013793945, "learning_rate": 0.0004453254851851308, "loss": 3.1749, "step": 9620 }, { "epoch": 0.5587629464156199, "grad_norm": 0.1312059760093689, "learning_rate": 0.0004443718990886734, "loss": 3.1693, "step": 9630 }, { "epoch": 0.5593431779279933, "grad_norm": 0.128337562084198, "learning_rate": 0.00044341851782071106, "loss": 3.1755, "step": 9640 }, { "epoch": 0.5599234094403667, "grad_norm": 0.11443614959716797, "learning_rate": 0.00044246534489169367, "loss": 3.1716, "step": 9650 }, { "epoch": 0.5605036409527402, "grad_norm": 0.12734359502792358, "learning_rate": 0.00044151238381130324, "loss": 3.1717, "step": 9660 }, { "epoch": 0.5610838724651136, "grad_norm": 0.1212453842163086, "learning_rate": 0.0004405596380884428, "loss": 3.1642, "step": 9670 }, { "epoch": 0.561664103977487, "grad_norm": 0.1245492622256279, "learning_rate": 0.0004396071112312216, "loss": 3.175, "step": 9680 }, { "epoch": 0.5622443354898604, "grad_norm": 0.11448545008897781, "learning_rate": 0.0004386548067469437, "loss": 3.1716, "step": 9690 }, { "epoch": 0.5628245670022339, "grad_norm": 0.12811312079429626, "learning_rate": 0.00043770272814209343, "loss": 3.1614, "step": 9700 }, { "epoch": 0.5634047985146073, "grad_norm": 0.11437219381332397, "learning_rate": 0.0004367508789223243, "loss": 3.1724, "step": 9710 }, { "epoch": 0.5639850300269807, "grad_norm": 0.11365852504968643, "learning_rate": 0.00043579926259244487, "loss": 3.1707, "step": 9720 }, { "epoch": 0.5645652615393542, "grad_norm": 0.12131789326667786, "learning_rate": 0.0004348478826564059, "loss": 3.1694, "step": 9730 }, { "epoch": 0.5651454930517277, "grad_norm": 0.11996260285377502, "learning_rate": 0.0004338967426172884, "loss": 3.1579, "step": 9740 }, { "epoch": 0.5657257245641011, "grad_norm": 0.12249016016721725, "learning_rate": 0.00043294584597728915, "loss": 3.1685, "step": 9750 }, { "epoch": 0.5663059560764745, "grad_norm": 0.12243705987930298, "learning_rate": 0.0004319951962377094, "loss": 3.1719, "step": 9760 }, { "epoch": 0.5668861875888479, "grad_norm": 0.12344249337911606, "learning_rate": 0.00043104479689894137, "loss": 3.1779, "step": 9770 }, { "epoch": 0.5674664191012214, "grad_norm": 0.11184128373861313, "learning_rate": 0.00043009465146045444, "loss": 3.1705, "step": 9780 }, { "epoch": 0.5680466506135948, "grad_norm": 0.12422725558280945, "learning_rate": 0.0004291447634207841, "loss": 3.1702, "step": 9790 }, { "epoch": 0.5686268821259682, "grad_norm": 0.13139618933200836, "learning_rate": 0.0004281951362775173, "loss": 3.1658, "step": 9800 }, { "epoch": 0.5692071136383416, "grad_norm": 0.14261464774608612, "learning_rate": 0.000427245773527281, "loss": 3.165, "step": 9810 }, { "epoch": 0.5697873451507152, "grad_norm": 0.11359596252441406, "learning_rate": 0.0004262966786657279, "loss": 3.1698, "step": 9820 }, { "epoch": 0.5703675766630886, "grad_norm": 0.13556736707687378, "learning_rate": 0.0004253478551875249, "loss": 3.168, "step": 9830 }, { "epoch": 0.570947808175462, "grad_norm": 0.12147964537143707, "learning_rate": 0.00042439930658633965, "loss": 3.1672, "step": 9840 }, { "epoch": 0.5715280396878355, "grad_norm": 0.12102476507425308, "learning_rate": 0.00042345103635482706, "loss": 3.1628, "step": 9850 }, { "epoch": 0.5721082712002089, "grad_norm": 0.11904580146074295, "learning_rate": 0.0004225030479846179, "loss": 3.1644, "step": 9860 }, { "epoch": 0.5726885027125823, "grad_norm": 0.1157933697104454, "learning_rate": 0.00042155534496630427, "loss": 3.1663, "step": 9870 }, { "epoch": 0.5732687342249557, "grad_norm": 0.12185543030500412, "learning_rate": 0.00042060793078942804, "loss": 3.1785, "step": 9880 }, { "epoch": 0.5738489657373292, "grad_norm": 0.1252318024635315, "learning_rate": 0.00041966080894246773, "loss": 3.159, "step": 9890 }, { "epoch": 0.5744291972497026, "grad_norm": 0.12326642870903015, "learning_rate": 0.00041871398291282484, "loss": 3.1576, "step": 9900 }, { "epoch": 0.5750094287620761, "grad_norm": 0.1266362965106964, "learning_rate": 0.0004177674561868123, "loss": 3.16, "step": 9910 }, { "epoch": 0.5755896602744495, "grad_norm": 0.1305086612701416, "learning_rate": 0.00041682123224964047, "loss": 3.1697, "step": 9920 }, { "epoch": 0.576169891786823, "grad_norm": 0.12299249321222305, "learning_rate": 0.0004158753145854051, "loss": 3.1663, "step": 9930 }, { "epoch": 0.5767501232991964, "grad_norm": 0.11096496134996414, "learning_rate": 0.00041492970667707403, "loss": 3.1663, "step": 9940 }, { "epoch": 0.5773303548115698, "grad_norm": 0.10970742255449295, "learning_rate": 0.00041398441200647467, "loss": 3.1617, "step": 9950 }, { "epoch": 0.5779105863239432, "grad_norm": 0.12066974490880966, "learning_rate": 0.0004130394340542813, "loss": 3.1656, "step": 9960 }, { "epoch": 0.5784908178363167, "grad_norm": 0.10806959867477417, "learning_rate": 0.0004120947763000012, "loss": 3.1649, "step": 9970 }, { "epoch": 0.5790710493486901, "grad_norm": 0.11969128251075745, "learning_rate": 0.0004111504422219637, "loss": 3.1675, "step": 9980 }, { "epoch": 0.5796512808610635, "grad_norm": 0.11461341381072998, "learning_rate": 0.0004102064352973054, "loss": 3.1631, "step": 9990 }, { "epoch": 0.580231512373437, "grad_norm": 0.1177605539560318, "learning_rate": 0.00040926275900195886, "loss": 3.1583, "step": 10000 }, { "epoch": 0.580231512373437, "eval_loss": 3.0971479415893555, "eval_runtime": 3.2713, "eval_samples_per_second": 1323.637, "eval_steps_per_second": 10.393, "step": 10000 }, { "epoch": 0.5808117438858105, "grad_norm": 0.13472320139408112, "learning_rate": 0.00040831941681063926, "loss": 3.1596, "step": 10010 }, { "epoch": 0.5813919753981839, "grad_norm": 0.12773457169532776, "learning_rate": 0.000407376412196831, "loss": 3.1751, "step": 10020 }, { "epoch": 0.5819722069105573, "grad_norm": 0.11364042013883591, "learning_rate": 0.0004064337486327761, "loss": 3.1541, "step": 10030 }, { "epoch": 0.5825524384229307, "grad_norm": 0.1128978356719017, "learning_rate": 0.00040549142958946037, "loss": 3.1594, "step": 10040 }, { "epoch": 0.5831326699353042, "grad_norm": 0.11539763957262039, "learning_rate": 0.00040454945853660157, "loss": 3.1708, "step": 10050 }, { "epoch": 0.5837129014476776, "grad_norm": 0.13058942556381226, "learning_rate": 0.00040360783894263536, "loss": 3.1611, "step": 10060 }, { "epoch": 0.584293132960051, "grad_norm": 0.13269387185573578, "learning_rate": 0.00040266657427470395, "loss": 3.1631, "step": 10070 }, { "epoch": 0.5848733644724246, "grad_norm": 0.11398264765739441, "learning_rate": 0.00040172566799864264, "loss": 3.1593, "step": 10080 }, { "epoch": 0.585453595984798, "grad_norm": 0.12349914014339447, "learning_rate": 0.00040078512357896647, "loss": 3.1585, "step": 10090 }, { "epoch": 0.5860338274971714, "grad_norm": 0.12374427914619446, "learning_rate": 0.0003998449444788589, "loss": 3.1654, "step": 10100 }, { "epoch": 0.5866140590095448, "grad_norm": 0.11344794183969498, "learning_rate": 0.0003989051341601576, "loss": 3.1564, "step": 10110 }, { "epoch": 0.5871942905219183, "grad_norm": 0.11296453326940536, "learning_rate": 0.0003979656960833428, "loss": 3.1632, "step": 10120 }, { "epoch": 0.5877745220342917, "grad_norm": 0.11938530951738358, "learning_rate": 0.00039702663370752393, "loss": 3.1687, "step": 10130 }, { "epoch": 0.5883547535466651, "grad_norm": 0.12476367503404617, "learning_rate": 0.00039608795049042686, "loss": 3.1605, "step": 10140 }, { "epoch": 0.5889349850590385, "grad_norm": 0.1283896565437317, "learning_rate": 0.0003951496498883817, "loss": 3.154, "step": 10150 }, { "epoch": 0.589515216571412, "grad_norm": 0.11707280576229095, "learning_rate": 0.00039421173535630937, "loss": 3.1675, "step": 10160 }, { "epoch": 0.5900954480837854, "grad_norm": 0.11196309328079224, "learning_rate": 0.0003932742103477098, "loss": 3.1597, "step": 10170 }, { "epoch": 0.5906756795961589, "grad_norm": 0.13069289922714233, "learning_rate": 0.0003923370783146477, "loss": 3.162, "step": 10180 }, { "epoch": 0.5912559111085323, "grad_norm": 0.11600931733846664, "learning_rate": 0.0003914003427077418, "loss": 3.1611, "step": 10190 }, { "epoch": 0.5918361426209058, "grad_norm": 0.11921602487564087, "learning_rate": 0.00039046400697615076, "loss": 3.1603, "step": 10200 }, { "epoch": 0.5924163741332792, "grad_norm": 0.10909148305654526, "learning_rate": 0.0003895280745675606, "loss": 3.1651, "step": 10210 }, { "epoch": 0.5929966056456526, "grad_norm": 0.1261613517999649, "learning_rate": 0.0003885925489281729, "loss": 3.164, "step": 10220 }, { "epoch": 0.593576837158026, "grad_norm": 0.1152707114815712, "learning_rate": 0.00038765743350269047, "loss": 3.1569, "step": 10230 }, { "epoch": 0.5941570686703995, "grad_norm": 0.13062123954296112, "learning_rate": 0.0003867227317343066, "loss": 3.1526, "step": 10240 }, { "epoch": 0.5947373001827729, "grad_norm": 0.13169212639331818, "learning_rate": 0.0003857884470646912, "loss": 3.1584, "step": 10250 }, { "epoch": 0.5953175316951463, "grad_norm": 0.1235685646533966, "learning_rate": 0.0003848545829339781, "loss": 3.1635, "step": 10260 }, { "epoch": 0.5958977632075197, "grad_norm": 0.11871648579835892, "learning_rate": 0.00038392114278075316, "loss": 3.1547, "step": 10270 }, { "epoch": 0.5964779947198933, "grad_norm": 0.11664935946464539, "learning_rate": 0.0003829881300420404, "loss": 3.1553, "step": 10280 }, { "epoch": 0.5970582262322667, "grad_norm": 0.10464397817850113, "learning_rate": 0.0003820555481532908, "loss": 3.1465, "step": 10290 }, { "epoch": 0.5976384577446401, "grad_norm": 0.11757074296474457, "learning_rate": 0.0003811234005483683, "loss": 3.1576, "step": 10300 }, { "epoch": 0.5982186892570136, "grad_norm": 0.12942548096179962, "learning_rate": 0.0003801916906595382, "loss": 3.1582, "step": 10310 }, { "epoch": 0.598798920769387, "grad_norm": 0.13089211285114288, "learning_rate": 0.000379260421917454, "loss": 3.149, "step": 10320 }, { "epoch": 0.5993791522817604, "grad_norm": 0.123594731092453, "learning_rate": 0.0003783295977511445, "loss": 3.1622, "step": 10330 }, { "epoch": 0.5999593837941338, "grad_norm": 0.12618903815746307, "learning_rate": 0.0003773992215880022, "loss": 3.1599, "step": 10340 }, { "epoch": 0.6005396153065073, "grad_norm": 0.11297423392534256, "learning_rate": 0.00037646929685376904, "loss": 3.1575, "step": 10350 }, { "epoch": 0.6011198468188808, "grad_norm": 0.12514062225818634, "learning_rate": 0.0003755398269725256, "loss": 3.1549, "step": 10360 }, { "epoch": 0.6017000783312542, "grad_norm": 0.11910570412874222, "learning_rate": 0.00037461081536667743, "loss": 3.1615, "step": 10370 }, { "epoch": 0.6022803098436276, "grad_norm": 0.11765125393867493, "learning_rate": 0.0003736822654569425, "loss": 3.1613, "step": 10380 }, { "epoch": 0.6028605413560011, "grad_norm": 0.10604594647884369, "learning_rate": 0.00037275418066233903, "loss": 3.1475, "step": 10390 }, { "epoch": 0.6034407728683745, "grad_norm": 0.1241423636674881, "learning_rate": 0.00037182656440017207, "loss": 3.1537, "step": 10400 }, { "epoch": 0.6040210043807479, "grad_norm": 0.13135185837745667, "learning_rate": 0.0003708994200860221, "loss": 3.1423, "step": 10410 }, { "epoch": 0.6046012358931213, "grad_norm": 0.11381290853023529, "learning_rate": 0.0003699727511337316, "loss": 3.157, "step": 10420 }, { "epoch": 0.6051814674054948, "grad_norm": 0.11703768372535706, "learning_rate": 0.0003690465609553927, "loss": 3.15, "step": 10430 }, { "epoch": 0.6057616989178682, "grad_norm": 0.11526386439800262, "learning_rate": 0.0003681208529613348, "loss": 3.1625, "step": 10440 }, { "epoch": 0.6063419304302416, "grad_norm": 0.1294795721769333, "learning_rate": 0.00036719563056011146, "loss": 3.1577, "step": 10450 }, { "epoch": 0.6069221619426151, "grad_norm": 0.12788033485412598, "learning_rate": 0.0003662708971584887, "loss": 3.1549, "step": 10460 }, { "epoch": 0.6075023934549886, "grad_norm": 0.11444190889596939, "learning_rate": 0.00036534665616143157, "loss": 3.158, "step": 10470 }, { "epoch": 0.608082624967362, "grad_norm": 0.12848497927188873, "learning_rate": 0.00036442291097209245, "loss": 3.1534, "step": 10480 }, { "epoch": 0.6086628564797354, "grad_norm": 0.13192491233348846, "learning_rate": 0.000363499664991798, "loss": 3.1647, "step": 10490 }, { "epoch": 0.6092430879921088, "grad_norm": 0.1181025505065918, "learning_rate": 0.0003625769216200362, "loss": 3.1556, "step": 10500 }, { "epoch": 0.6098233195044823, "grad_norm": 0.11332180351018906, "learning_rate": 0.00036165468425444514, "loss": 3.1531, "step": 10510 }, { "epoch": 0.6104035510168557, "grad_norm": 0.11427458375692368, "learning_rate": 0.00036073295629079926, "loss": 3.1441, "step": 10520 }, { "epoch": 0.6109837825292291, "grad_norm": 0.1351877599954605, "learning_rate": 0.00035981174112299774, "loss": 3.1592, "step": 10530 }, { "epoch": 0.6115640140416027, "grad_norm": 0.11437386274337769, "learning_rate": 0.000358891042143051, "loss": 3.1508, "step": 10540 }, { "epoch": 0.6121442455539761, "grad_norm": 0.1317347139120102, "learning_rate": 0.00035797086274106917, "loss": 3.1602, "step": 10550 }, { "epoch": 0.6127244770663495, "grad_norm": 0.12212193757295609, "learning_rate": 0.00035705120630524946, "loss": 3.1562, "step": 10560 }, { "epoch": 0.6133047085787229, "grad_norm": 0.10987838357686996, "learning_rate": 0.00035613207622186297, "loss": 3.1498, "step": 10570 }, { "epoch": 0.6138849400910964, "grad_norm": 0.1109929159283638, "learning_rate": 0.00035521347587524324, "loss": 3.1592, "step": 10580 }, { "epoch": 0.6144651716034698, "grad_norm": 0.11722821742296219, "learning_rate": 0.00035429540864777254, "loss": 3.1588, "step": 10590 }, { "epoch": 0.6150454031158432, "grad_norm": 0.11384609341621399, "learning_rate": 0.00035337787791987085, "loss": 3.1563, "step": 10600 }, { "epoch": 0.6156256346282166, "grad_norm": 0.13255846500396729, "learning_rate": 0.0003524608870699826, "loss": 3.1546, "step": 10610 }, { "epoch": 0.6162058661405901, "grad_norm": 0.12805138528347015, "learning_rate": 0.00035154443947456364, "loss": 3.1468, "step": 10620 }, { "epoch": 0.6167860976529635, "grad_norm": 0.11819039285182953, "learning_rate": 0.0003506285385080705, "loss": 3.1436, "step": 10630 }, { "epoch": 0.617366329165337, "grad_norm": 0.11611706018447876, "learning_rate": 0.0003497131875429462, "loss": 3.153, "step": 10640 }, { "epoch": 0.6179465606777104, "grad_norm": 0.12574134767055511, "learning_rate": 0.0003487983899496092, "loss": 3.1676, "step": 10650 }, { "epoch": 0.6185267921900839, "grad_norm": 0.13298243284225464, "learning_rate": 0.00034788414909643975, "loss": 3.1448, "step": 10660 }, { "epoch": 0.6191070237024573, "grad_norm": 0.11737950146198273, "learning_rate": 0.00034697046834976847, "loss": 3.1603, "step": 10670 }, { "epoch": 0.6196872552148307, "grad_norm": 0.11029376089572906, "learning_rate": 0.0003460573510738638, "loss": 3.1523, "step": 10680 }, { "epoch": 0.6202674867272041, "grad_norm": 0.12390248477458954, "learning_rate": 0.000345144800630919, "loss": 3.1591, "step": 10690 }, { "epoch": 0.6208477182395776, "grad_norm": 0.11781900376081467, "learning_rate": 0.00034423282038104064, "loss": 3.1617, "step": 10700 }, { "epoch": 0.621427949751951, "grad_norm": 0.12515197694301605, "learning_rate": 0.0003433214136822352, "loss": 3.1418, "step": 10710 }, { "epoch": 0.6220081812643244, "grad_norm": 0.10986288636922836, "learning_rate": 0.0003424105838903978, "loss": 3.1374, "step": 10720 }, { "epoch": 0.6225884127766979, "grad_norm": 0.12500767409801483, "learning_rate": 0.00034150033435929926, "loss": 3.1508, "step": 10730 }, { "epoch": 0.6231686442890714, "grad_norm": 0.11399463564157486, "learning_rate": 0.0003405906684405735, "loss": 3.155, "step": 10740 }, { "epoch": 0.6237488758014448, "grad_norm": 0.1243964433670044, "learning_rate": 0.000339681589483706, "loss": 3.149, "step": 10750 }, { "epoch": 0.6243291073138182, "grad_norm": 0.1269841343164444, "learning_rate": 0.0003387731008360203, "loss": 3.157, "step": 10760 }, { "epoch": 0.6249093388261916, "grad_norm": 0.11488167196512222, "learning_rate": 0.0003378652058426672, "loss": 3.1591, "step": 10770 }, { "epoch": 0.6254895703385651, "grad_norm": 0.12310460954904556, "learning_rate": 0.00033695790784661085, "loss": 3.1493, "step": 10780 }, { "epoch": 0.6260698018509385, "grad_norm": 0.11951915174722672, "learning_rate": 0.0003360512101886176, "loss": 3.1519, "step": 10790 }, { "epoch": 0.6266500333633119, "grad_norm": 0.11739303171634674, "learning_rate": 0.0003351451162072435, "loss": 3.1517, "step": 10800 }, { "epoch": 0.6272302648756855, "grad_norm": 0.12451887875795364, "learning_rate": 0.000334239629238821, "loss": 3.1437, "step": 10810 }, { "epoch": 0.6278104963880589, "grad_norm": 0.10753390938043594, "learning_rate": 0.0003333347526174484, "loss": 3.1474, "step": 10820 }, { "epoch": 0.6283907279004323, "grad_norm": 0.12157886475324631, "learning_rate": 0.00033243048967497596, "loss": 3.1502, "step": 10830 }, { "epoch": 0.6289709594128057, "grad_norm": 0.13651184737682343, "learning_rate": 0.0003315268437409946, "loss": 3.1553, "step": 10840 }, { "epoch": 0.6295511909251792, "grad_norm": 0.12725335359573364, "learning_rate": 0.00033062381814282367, "loss": 3.141, "step": 10850 }, { "epoch": 0.6301314224375526, "grad_norm": 0.11685140430927277, "learning_rate": 0.00032972141620549747, "loss": 3.1451, "step": 10860 }, { "epoch": 0.630711653949926, "grad_norm": 0.1115005612373352, "learning_rate": 0.00032881964125175487, "loss": 3.1482, "step": 10870 }, { "epoch": 0.6312918854622994, "grad_norm": 0.11986386775970459, "learning_rate": 0.00032791849660202547, "loss": 3.1434, "step": 10880 }, { "epoch": 0.6318721169746729, "grad_norm": 0.11233355104923248, "learning_rate": 0.00032701798557441833, "loss": 3.1418, "step": 10890 }, { "epoch": 0.6324523484870463, "grad_norm": 0.11507276445627213, "learning_rate": 0.0003261181114847094, "loss": 3.1415, "step": 10900 }, { "epoch": 0.6330325799994198, "grad_norm": 0.1157032698392868, "learning_rate": 0.00032521887764632937, "loss": 3.149, "step": 10910 }, { "epoch": 0.6336128115117932, "grad_norm": 0.12391894310712814, "learning_rate": 0.0003243202873703516, "loss": 3.1476, "step": 10920 }, { "epoch": 0.6341930430241667, "grad_norm": 0.11616963148117065, "learning_rate": 0.00032342234396547933, "loss": 3.1522, "step": 10930 }, { "epoch": 0.6347732745365401, "grad_norm": 0.113109290599823, "learning_rate": 0.00032252505073803437, "loss": 3.1398, "step": 10940 }, { "epoch": 0.6353535060489135, "grad_norm": 0.1344575732946396, "learning_rate": 0.00032162841099194427, "loss": 3.1388, "step": 10950 }, { "epoch": 0.6359337375612869, "grad_norm": 0.1219155341386795, "learning_rate": 0.0003207324280287307, "loss": 3.1499, "step": 10960 }, { "epoch": 0.6365139690736604, "grad_norm": 0.11315035074949265, "learning_rate": 0.0003198371051474969, "loss": 3.152, "step": 10970 }, { "epoch": 0.6370942005860338, "grad_norm": 0.1105627492070198, "learning_rate": 0.000318942445644915, "loss": 3.1512, "step": 10980 }, { "epoch": 0.6376744320984072, "grad_norm": 0.11000196635723114, "learning_rate": 0.00031804845281521553, "loss": 3.1464, "step": 10990 }, { "epoch": 0.6382546636107806, "grad_norm": 0.11353638023138046, "learning_rate": 0.0003171551299501734, "loss": 3.1464, "step": 11000 }, { "epoch": 0.6382546636107806, "eval_loss": 3.079380750656128, "eval_runtime": 3.2712, "eval_samples_per_second": 1323.663, "eval_steps_per_second": 10.394, "step": 11000 }, { "epoch": 0.6388348951231542, "grad_norm": 0.11272257566452026, "learning_rate": 0.0003162624803390973, "loss": 3.1544, "step": 11010 }, { "epoch": 0.6394151266355276, "grad_norm": 0.11919167637825012, "learning_rate": 0.00031537050726881635, "loss": 3.1495, "step": 11020 }, { "epoch": 0.639995358147901, "grad_norm": 0.11367520689964294, "learning_rate": 0.00031447921402366874, "loss": 3.1422, "step": 11030 }, { "epoch": 0.6405755896602745, "grad_norm": 0.11759908497333527, "learning_rate": 0.0003135886038854899, "loss": 3.1414, "step": 11040 }, { "epoch": 0.6411558211726479, "grad_norm": 0.11302473396062851, "learning_rate": 0.0003126986801335995, "loss": 3.1471, "step": 11050 }, { "epoch": 0.6417360526850213, "grad_norm": 0.12279005348682404, "learning_rate": 0.0003118094460447901, "loss": 3.1427, "step": 11060 }, { "epoch": 0.6423162841973947, "grad_norm": 0.11281714588403702, "learning_rate": 0.0003109209048933145, "loss": 3.1327, "step": 11070 }, { "epoch": 0.6428965157097682, "grad_norm": 0.10893500596284866, "learning_rate": 0.0003100330599508745, "loss": 3.1472, "step": 11080 }, { "epoch": 0.6434767472221417, "grad_norm": 0.12594285607337952, "learning_rate": 0.0003091459144866083, "loss": 3.146, "step": 11090 }, { "epoch": 0.6440569787345151, "grad_norm": 0.1286400705575943, "learning_rate": 0.0003082594717670781, "loss": 3.1457, "step": 11100 }, { "epoch": 0.6446372102468885, "grad_norm": 0.12681199610233307, "learning_rate": 0.0003073737350562594, "loss": 3.1349, "step": 11110 }, { "epoch": 0.645217441759262, "grad_norm": 0.12026621401309967, "learning_rate": 0.00030648870761552693, "loss": 3.1425, "step": 11120 }, { "epoch": 0.6457976732716354, "grad_norm": 0.11217381060123444, "learning_rate": 0.00030560439270364495, "loss": 3.1424, "step": 11130 }, { "epoch": 0.6463779047840088, "grad_norm": 0.11966854333877563, "learning_rate": 0.00030472079357675316, "loss": 3.1477, "step": 11140 }, { "epoch": 0.6469581362963822, "grad_norm": 0.10743203014135361, "learning_rate": 0.0003038379134883563, "loss": 3.1472, "step": 11150 }, { "epoch": 0.6475383678087557, "grad_norm": 0.11516842246055603, "learning_rate": 0.0003029557556893117, "loss": 3.1363, "step": 11160 }, { "epoch": 0.6481185993211291, "grad_norm": 0.11448100209236145, "learning_rate": 0.00030207432342781615, "loss": 3.1397, "step": 11170 }, { "epoch": 0.6486988308335025, "grad_norm": 0.12240401655435562, "learning_rate": 0.0003011936199493962, "loss": 3.1451, "step": 11180 }, { "epoch": 0.649279062345876, "grad_norm": 0.1107584685087204, "learning_rate": 0.0003003136484968937, "loss": 3.1516, "step": 11190 }, { "epoch": 0.6498592938582495, "grad_norm": 0.11135096102952957, "learning_rate": 0.0002994344123104561, "loss": 3.1423, "step": 11200 }, { "epoch": 0.6504395253706229, "grad_norm": 0.11470366269350052, "learning_rate": 0.0002985559146275231, "loss": 3.1441, "step": 11210 }, { "epoch": 0.6510197568829963, "grad_norm": 0.11569292098283768, "learning_rate": 0.0002976781586828151, "loss": 3.149, "step": 11220 }, { "epoch": 0.6515999883953697, "grad_norm": 0.12388614565134048, "learning_rate": 0.0002968011477083217, "loss": 3.1319, "step": 11230 }, { "epoch": 0.6521802199077432, "grad_norm": 0.12496737390756607, "learning_rate": 0.00029592488493328885, "loss": 3.1391, "step": 11240 }, { "epoch": 0.6527604514201166, "grad_norm": 0.1108599305152893, "learning_rate": 0.00029504937358420803, "loss": 3.1453, "step": 11250 }, { "epoch": 0.65334068293249, "grad_norm": 0.11209242045879364, "learning_rate": 0.0002941746168848037, "loss": 3.1468, "step": 11260 }, { "epoch": 0.6539209144448636, "grad_norm": 0.10576393455266953, "learning_rate": 0.0002933006180560217, "loss": 3.1327, "step": 11270 }, { "epoch": 0.654501145957237, "grad_norm": 0.11058243364095688, "learning_rate": 0.00029242738031601745, "loss": 3.1378, "step": 11280 }, { "epoch": 0.6550813774696104, "grad_norm": 0.10569418221712112, "learning_rate": 0.00029155490688014343, "loss": 3.1402, "step": 11290 }, { "epoch": 0.6556616089819838, "grad_norm": 0.11297528445720673, "learning_rate": 0.0002906832009609384, "loss": 3.1453, "step": 11300 }, { "epoch": 0.6562418404943573, "grad_norm": 0.11635693162679672, "learning_rate": 0.00028981226576811506, "loss": 3.1323, "step": 11310 }, { "epoch": 0.6568220720067307, "grad_norm": 0.11293961852788925, "learning_rate": 0.0002889421045085475, "loss": 3.151, "step": 11320 }, { "epoch": 0.6574023035191041, "grad_norm": 0.11303776502609253, "learning_rate": 0.0002880727203862612, "loss": 3.1461, "step": 11330 }, { "epoch": 0.6579825350314775, "grad_norm": 0.10966860502958298, "learning_rate": 0.0002872041166024194, "loss": 3.1441, "step": 11340 }, { "epoch": 0.658562766543851, "grad_norm": 0.11160997301340103, "learning_rate": 0.00028633629635531224, "loss": 3.1488, "step": 11350 }, { "epoch": 0.6591429980562244, "grad_norm": 0.1070476621389389, "learning_rate": 0.0002854692628403446, "loss": 3.1413, "step": 11360 }, { "epoch": 0.6597232295685979, "grad_norm": 0.11823021620512009, "learning_rate": 0.0002846030192500249, "loss": 3.145, "step": 11370 }, { "epoch": 0.6603034610809713, "grad_norm": 0.11843527853488922, "learning_rate": 0.0002837375687739525, "loss": 3.1374, "step": 11380 }, { "epoch": 0.6608836925933448, "grad_norm": 0.118824802339077, "learning_rate": 0.00028287291459880716, "loss": 3.157, "step": 11390 }, { "epoch": 0.6614639241057182, "grad_norm": 0.11628689616918564, "learning_rate": 0.0002820090599083358, "loss": 3.1352, "step": 11400 }, { "epoch": 0.6620441556180916, "grad_norm": 0.11970434337854385, "learning_rate": 0.0002811460078833421, "loss": 3.1468, "step": 11410 }, { "epoch": 0.662624387130465, "grad_norm": 0.10809943079948425, "learning_rate": 0.00028028376170167383, "loss": 3.1405, "step": 11420 }, { "epoch": 0.6632046186428385, "grad_norm": 0.10611239075660706, "learning_rate": 0.00027942232453821193, "loss": 3.1449, "step": 11430 }, { "epoch": 0.6637848501552119, "grad_norm": 0.11383804678916931, "learning_rate": 0.0002785616995648579, "loss": 3.1525, "step": 11440 }, { "epoch": 0.6643650816675853, "grad_norm": 0.11580588668584824, "learning_rate": 0.0002777018899505236, "loss": 3.1335, "step": 11450 }, { "epoch": 0.6649453131799588, "grad_norm": 0.1111442893743515, "learning_rate": 0.0002768428988611178, "loss": 3.1467, "step": 11460 }, { "epoch": 0.6655255446923323, "grad_norm": 0.11236603558063507, "learning_rate": 0.0002759847294595357, "loss": 3.1369, "step": 11470 }, { "epoch": 0.6661057762047057, "grad_norm": 0.12457659840583801, "learning_rate": 0.00027512738490564697, "loss": 3.1346, "step": 11480 }, { "epoch": 0.6666860077170791, "grad_norm": 0.11812961846590042, "learning_rate": 0.0002742708683562841, "loss": 3.1479, "step": 11490 }, { "epoch": 0.6672662392294526, "grad_norm": 0.1041463240981102, "learning_rate": 0.0002734151829652304, "loss": 3.1363, "step": 11500 }, { "epoch": 0.667846470741826, "grad_norm": 0.11501579731702805, "learning_rate": 0.0002725603318832097, "loss": 3.1286, "step": 11510 }, { "epoch": 0.6684267022541994, "grad_norm": 0.11745285987854004, "learning_rate": 0.00027170631825787294, "loss": 3.1406, "step": 11520 }, { "epoch": 0.6690069337665728, "grad_norm": 0.1133042722940445, "learning_rate": 0.00027085314523378777, "loss": 3.1506, "step": 11530 }, { "epoch": 0.6695871652789464, "grad_norm": 0.11351029574871063, "learning_rate": 0.00027000081595242667, "loss": 3.135, "step": 11540 }, { "epoch": 0.6701673967913198, "grad_norm": 0.11267419159412384, "learning_rate": 0.0002691493335521551, "loss": 3.131, "step": 11550 }, { "epoch": 0.6707476283036932, "grad_norm": 0.10249326378107071, "learning_rate": 0.00026829870116822085, "loss": 3.1318, "step": 11560 }, { "epoch": 0.6713278598160666, "grad_norm": 0.11103315651416779, "learning_rate": 0.0002674489219327413, "loss": 3.1344, "step": 11570 }, { "epoch": 0.6719080913284401, "grad_norm": 0.11441586166620255, "learning_rate": 0.0002665999989746926, "loss": 3.1352, "step": 11580 }, { "epoch": 0.6724883228408135, "grad_norm": 0.11026325076818466, "learning_rate": 0.00026575193541989795, "loss": 3.1315, "step": 11590 }, { "epoch": 0.6730685543531869, "grad_norm": 0.1115291491150856, "learning_rate": 0.00026490473439101615, "loss": 3.1339, "step": 11600 }, { "epoch": 0.6736487858655603, "grad_norm": 0.10771960020065308, "learning_rate": 0.0002640583990075306, "loss": 3.1238, "step": 11610 }, { "epoch": 0.6742290173779338, "grad_norm": 0.11078547686338425, "learning_rate": 0.00026321293238573614, "loss": 3.1365, "step": 11620 }, { "epoch": 0.6748092488903072, "grad_norm": 0.1105736717581749, "learning_rate": 0.00026236833763872993, "loss": 3.1466, "step": 11630 }, { "epoch": 0.6753894804026807, "grad_norm": 0.10596097260713577, "learning_rate": 0.0002615246178763983, "loss": 3.1442, "step": 11640 }, { "epoch": 0.6759697119150541, "grad_norm": 0.11247435957193375, "learning_rate": 0.00026068177620540536, "loss": 3.1439, "step": 11650 }, { "epoch": 0.6765499434274276, "grad_norm": 0.11171044409275055, "learning_rate": 0.00025983981572918314, "loss": 3.1451, "step": 11660 }, { "epoch": 0.677130174939801, "grad_norm": 0.12344854325056076, "learning_rate": 0.0002589987395479175, "loss": 3.1372, "step": 11670 }, { "epoch": 0.6777104064521744, "grad_norm": 0.1220024824142456, "learning_rate": 0.00025815855075853977, "loss": 3.1366, "step": 11680 }, { "epoch": 0.6782906379645478, "grad_norm": 0.10972929000854492, "learning_rate": 0.0002573192524547128, "loss": 3.1299, "step": 11690 }, { "epoch": 0.6788708694769213, "grad_norm": 0.11175742000341415, "learning_rate": 0.00025648084772682056, "loss": 3.1375, "step": 11700 }, { "epoch": 0.6794511009892947, "grad_norm": 0.12257856875658035, "learning_rate": 0.00025564333966195785, "loss": 3.1402, "step": 11710 }, { "epoch": 0.6800313325016681, "grad_norm": 0.11670278757810593, "learning_rate": 0.0002548067313439162, "loss": 3.1357, "step": 11720 }, { "epoch": 0.6806115640140415, "grad_norm": 0.11864405870437622, "learning_rate": 0.0002539710258531759, "loss": 3.136, "step": 11730 }, { "epoch": 0.6811917955264151, "grad_norm": 0.10666168481111526, "learning_rate": 0.00025313622626689134, "loss": 3.1374, "step": 11740 }, { "epoch": 0.6817720270387885, "grad_norm": 0.10843271762132645, "learning_rate": 0.00025230233565888267, "loss": 3.1343, "step": 11750 }, { "epoch": 0.6823522585511619, "grad_norm": 0.10990433394908905, "learning_rate": 0.00025146935709962216, "loss": 3.1386, "step": 11760 }, { "epoch": 0.6829324900635354, "grad_norm": 0.10423313081264496, "learning_rate": 0.00025063729365622407, "loss": 3.1382, "step": 11770 }, { "epoch": 0.6835127215759088, "grad_norm": 0.11088298261165619, "learning_rate": 0.00024980614839243364, "loss": 3.1191, "step": 11780 }, { "epoch": 0.6840929530882822, "grad_norm": 0.11372388154268265, "learning_rate": 0.00024897592436861406, "loss": 3.1294, "step": 11790 }, { "epoch": 0.6846731846006556, "grad_norm": 0.10824663192033768, "learning_rate": 0.0002481466246417377, "loss": 3.1291, "step": 11800 }, { "epoch": 0.6852534161130291, "grad_norm": 0.10850938409566879, "learning_rate": 0.00024731825226537293, "loss": 3.1438, "step": 11810 }, { "epoch": 0.6858336476254026, "grad_norm": 0.1074269562959671, "learning_rate": 0.00024649081028967334, "loss": 3.1336, "step": 11820 }, { "epoch": 0.686413879137776, "grad_norm": 0.11285442113876343, "learning_rate": 0.00024566430176136756, "loss": 3.1326, "step": 11830 }, { "epoch": 0.6869941106501494, "grad_norm": 0.11877676844596863, "learning_rate": 0.0002448387297237459, "loss": 3.1333, "step": 11840 }, { "epoch": 0.6875743421625229, "grad_norm": 0.1159949079155922, "learning_rate": 0.00024401409721665148, "loss": 3.1271, "step": 11850 }, { "epoch": 0.6881545736748963, "grad_norm": 0.11141040176153183, "learning_rate": 0.00024319040727646752, "loss": 3.1315, "step": 11860 }, { "epoch": 0.6887348051872697, "grad_norm": 0.1103438287973404, "learning_rate": 0.0002423676629361064, "loss": 3.1271, "step": 11870 }, { "epoch": 0.6893150366996431, "grad_norm": 0.12033682316541672, "learning_rate": 0.00024154586722499965, "loss": 3.1317, "step": 11880 }, { "epoch": 0.6898952682120166, "grad_norm": 0.10661648213863373, "learning_rate": 0.00024072502316908428, "loss": 3.1272, "step": 11890 }, { "epoch": 0.69047549972439, "grad_norm": 0.1170666292309761, "learning_rate": 0.00023990513379079477, "loss": 3.1398, "step": 11900 }, { "epoch": 0.6910557312367634, "grad_norm": 0.11095455288887024, "learning_rate": 0.00023908620210904947, "loss": 3.1298, "step": 11910 }, { "epoch": 0.6916359627491369, "grad_norm": 0.1100478321313858, "learning_rate": 0.00023826823113924035, "loss": 3.1286, "step": 11920 }, { "epoch": 0.6922161942615104, "grad_norm": 0.11419103294610977, "learning_rate": 0.00023745122389322293, "loss": 3.1343, "step": 11930 }, { "epoch": 0.6927964257738838, "grad_norm": 0.11160432547330856, "learning_rate": 0.00023663518337930256, "loss": 3.1402, "step": 11940 }, { "epoch": 0.6933766572862572, "grad_norm": 0.10984364151954651, "learning_rate": 0.00023582011260222664, "loss": 3.1351, "step": 11950 }, { "epoch": 0.6939568887986306, "grad_norm": 0.11625155061483383, "learning_rate": 0.00023500601456317083, "loss": 3.134, "step": 11960 }, { "epoch": 0.6945371203110041, "grad_norm": 0.1080445721745491, "learning_rate": 0.00023419289225972946, "loss": 3.1311, "step": 11970 }, { "epoch": 0.6951173518233775, "grad_norm": 0.10590895265340805, "learning_rate": 0.00023338074868590393, "loss": 3.1371, "step": 11980 }, { "epoch": 0.6956975833357509, "grad_norm": 0.11543317884206772, "learning_rate": 0.0002325695868320919, "loss": 3.1316, "step": 11990 }, { "epoch": 0.6962778148481245, "grad_norm": 0.11939459294080734, "learning_rate": 0.0002317594096850768, "loss": 3.1365, "step": 12000 }, { "epoch": 0.6962778148481245, "eval_loss": 3.064452648162842, "eval_runtime": 3.2623, "eval_samples_per_second": 1327.301, "eval_steps_per_second": 10.422, "step": 12000 }, { "epoch": 0.6968580463604979, "grad_norm": 0.10952937602996826, "learning_rate": 0.00023095022022801503, "loss": 3.1378, "step": 12010 }, { "epoch": 0.6974382778728713, "grad_norm": 0.11545655131340027, "learning_rate": 0.00023014202144042744, "loss": 3.1373, "step": 12020 }, { "epoch": 0.6980185093852447, "grad_norm": 0.10757040232419968, "learning_rate": 0.00022933481629818653, "loss": 3.137, "step": 12030 }, { "epoch": 0.6985987408976182, "grad_norm": 0.11333664506673813, "learning_rate": 0.00022852860777350593, "loss": 3.1328, "step": 12040 }, { "epoch": 0.6991789724099916, "grad_norm": 0.10705193877220154, "learning_rate": 0.00022772339883493048, "loss": 3.1283, "step": 12050 }, { "epoch": 0.699759203922365, "grad_norm": 0.11158863455057144, "learning_rate": 0.00022691919244732307, "loss": 3.1303, "step": 12060 }, { "epoch": 0.7003394354347384, "grad_norm": 0.11993270367383957, "learning_rate": 0.00022611599157185648, "loss": 3.1262, "step": 12070 }, { "epoch": 0.7009196669471119, "grad_norm": 0.10471105575561523, "learning_rate": 0.00022531379916600026, "loss": 3.1397, "step": 12080 }, { "epoch": 0.7014998984594853, "grad_norm": 0.11472434550523758, "learning_rate": 0.00022451261818351082, "loss": 3.1334, "step": 12090 }, { "epoch": 0.7020801299718588, "grad_norm": 0.12063375115394592, "learning_rate": 0.0002237124515744206, "loss": 3.1311, "step": 12100 }, { "epoch": 0.7026603614842322, "grad_norm": 0.11272242665290833, "learning_rate": 0.00022291330228502658, "loss": 3.13, "step": 12110 }, { "epoch": 0.7032405929966057, "grad_norm": 0.10762272030115128, "learning_rate": 0.00022211517325788056, "loss": 3.1255, "step": 12120 }, { "epoch": 0.7038208245089791, "grad_norm": 0.10291790962219238, "learning_rate": 0.00022131806743177707, "loss": 3.1284, "step": 12130 }, { "epoch": 0.7044010560213525, "grad_norm": 0.10812744498252869, "learning_rate": 0.00022052198774174327, "loss": 3.1348, "step": 12140 }, { "epoch": 0.7049812875337259, "grad_norm": 0.10778633505105972, "learning_rate": 0.00021972693711902792, "loss": 3.1342, "step": 12150 }, { "epoch": 0.7055615190460994, "grad_norm": 0.10863006114959717, "learning_rate": 0.00021893291849109053, "loss": 3.1319, "step": 12160 }, { "epoch": 0.7061417505584728, "grad_norm": 0.11223878711462021, "learning_rate": 0.00021813993478159128, "loss": 3.1299, "step": 12170 }, { "epoch": 0.7067219820708462, "grad_norm": 0.11001647263765335, "learning_rate": 0.000217347988910379, "loss": 3.1256, "step": 12180 }, { "epoch": 0.7073022135832197, "grad_norm": 0.11175502091646194, "learning_rate": 0.00021655708379348144, "loss": 3.1374, "step": 12190 }, { "epoch": 0.7078824450955932, "grad_norm": 0.10740119218826294, "learning_rate": 0.00021576722234309403, "loss": 3.1284, "step": 12200 }, { "epoch": 0.7084626766079666, "grad_norm": 0.1120908334851265, "learning_rate": 0.00021497840746756942, "loss": 3.1225, "step": 12210 }, { "epoch": 0.70904290812034, "grad_norm": 0.110707126557827, "learning_rate": 0.00021419064207140639, "loss": 3.1256, "step": 12220 }, { "epoch": 0.7096231396327135, "grad_norm": 0.11498415470123291, "learning_rate": 0.00021340392905524002, "loss": 3.1249, "step": 12230 }, { "epoch": 0.7102033711450869, "grad_norm": 0.11245319992303848, "learning_rate": 0.00021261827131582989, "loss": 3.135, "step": 12240 }, { "epoch": 0.7107836026574603, "grad_norm": 0.10842925310134888, "learning_rate": 0.00021183367174605006, "loss": 3.121, "step": 12250 }, { "epoch": 0.7113638341698337, "grad_norm": 0.10173554718494415, "learning_rate": 0.00021105013323487843, "loss": 3.1246, "step": 12260 }, { "epoch": 0.7119440656822072, "grad_norm": 0.10313431173563004, "learning_rate": 0.00021026765866738578, "loss": 3.1298, "step": 12270 }, { "epoch": 0.7125242971945807, "grad_norm": 0.1053348183631897, "learning_rate": 0.00020948625092472535, "loss": 3.1264, "step": 12280 }, { "epoch": 0.7131045287069541, "grad_norm": 0.12006527930498123, "learning_rate": 0.00020870591288412254, "loss": 3.1306, "step": 12290 }, { "epoch": 0.7136847602193275, "grad_norm": 0.1115618497133255, "learning_rate": 0.00020792664741886368, "loss": 3.1264, "step": 12300 }, { "epoch": 0.714264991731701, "grad_norm": 0.10678742080926895, "learning_rate": 0.00020714845739828585, "loss": 3.1337, "step": 12310 }, { "epoch": 0.7148452232440744, "grad_norm": 0.11345722526311874, "learning_rate": 0.00020637134568776615, "loss": 3.1283, "step": 12320 }, { "epoch": 0.7154254547564478, "grad_norm": 0.11007855087518692, "learning_rate": 0.00020559531514871145, "loss": 3.124, "step": 12330 }, { "epoch": 0.7160056862688212, "grad_norm": 0.1093551367521286, "learning_rate": 0.00020482036863854708, "loss": 3.1251, "step": 12340 }, { "epoch": 0.7165859177811947, "grad_norm": 0.10831980407238007, "learning_rate": 0.00020404650901070787, "loss": 3.122, "step": 12350 }, { "epoch": 0.7171661492935681, "grad_norm": 0.1150059625506401, "learning_rate": 0.00020327373911462572, "loss": 3.1253, "step": 12360 }, { "epoch": 0.7177463808059416, "grad_norm": 0.10329602658748627, "learning_rate": 0.00020250206179572034, "loss": 3.1315, "step": 12370 }, { "epoch": 0.718326612318315, "grad_norm": 0.10848727822303772, "learning_rate": 0.00020173147989538853, "loss": 3.1334, "step": 12380 }, { "epoch": 0.7189068438306885, "grad_norm": 0.10476922988891602, "learning_rate": 0.00020096199625099337, "loss": 3.1208, "step": 12390 }, { "epoch": 0.7194870753430619, "grad_norm": 0.10537194460630417, "learning_rate": 0.00020019361369585454, "loss": 3.1265, "step": 12400 }, { "epoch": 0.7200673068554353, "grad_norm": 0.10518410056829453, "learning_rate": 0.00019942633505923703, "loss": 3.124, "step": 12410 }, { "epoch": 0.7206475383678087, "grad_norm": 0.10767358541488647, "learning_rate": 0.000198660163166341, "loss": 3.1174, "step": 12420 }, { "epoch": 0.7212277698801822, "grad_norm": 0.10856916010379791, "learning_rate": 0.0001978951008382918, "loss": 3.124, "step": 12430 }, { "epoch": 0.7218080013925556, "grad_norm": 0.1153908297419548, "learning_rate": 0.0001971311508921288, "loss": 3.1287, "step": 12440 }, { "epoch": 0.722388232904929, "grad_norm": 0.10942938178777695, "learning_rate": 0.00019636831614079625, "loss": 3.118, "step": 12450 }, { "epoch": 0.7229684644173026, "grad_norm": 0.1065671443939209, "learning_rate": 0.00019560659939313096, "loss": 3.1286, "step": 12460 }, { "epoch": 0.723548695929676, "grad_norm": 0.10312582552433014, "learning_rate": 0.0001948460034538543, "loss": 3.1235, "step": 12470 }, { "epoch": 0.7241289274420494, "grad_norm": 0.1132279559969902, "learning_rate": 0.00019408653112355995, "loss": 3.128, "step": 12480 }, { "epoch": 0.7247091589544228, "grad_norm": 0.10551323741674423, "learning_rate": 0.00019332818519870453, "loss": 3.1256, "step": 12490 }, { "epoch": 0.7252893904667963, "grad_norm": 0.11312738060951233, "learning_rate": 0.00019257096847159766, "loss": 3.1083, "step": 12500 }, { "epoch": 0.7258696219791697, "grad_norm": 0.113512322306633, "learning_rate": 0.00019181488373038992, "loss": 3.1143, "step": 12510 }, { "epoch": 0.7264498534915431, "grad_norm": 0.10724562406539917, "learning_rate": 0.00019105993375906512, "loss": 3.1284, "step": 12520 }, { "epoch": 0.7270300850039165, "grad_norm": 0.10460355132818222, "learning_rate": 0.00019030612133742787, "loss": 3.1162, "step": 12530 }, { "epoch": 0.72761031651629, "grad_norm": 0.10553659498691559, "learning_rate": 0.00018955344924109435, "loss": 3.1269, "step": 12540 }, { "epoch": 0.7281905480286635, "grad_norm": 0.10612872242927551, "learning_rate": 0.00018880192024148268, "loss": 3.1362, "step": 12550 }, { "epoch": 0.7287707795410369, "grad_norm": 0.11399485170841217, "learning_rate": 0.00018805153710580054, "loss": 3.135, "step": 12560 }, { "epoch": 0.7293510110534103, "grad_norm": 0.11703281104564667, "learning_rate": 0.00018730230259703795, "loss": 3.1188, "step": 12570 }, { "epoch": 0.7299312425657838, "grad_norm": 0.11565428972244263, "learning_rate": 0.00018655421947395425, "loss": 3.1244, "step": 12580 }, { "epoch": 0.7305114740781572, "grad_norm": 0.10253434628248215, "learning_rate": 0.00018580729049107026, "loss": 3.1183, "step": 12590 }, { "epoch": 0.7310917055905306, "grad_norm": 0.101934053003788, "learning_rate": 0.0001850615183986567, "loss": 3.1192, "step": 12600 }, { "epoch": 0.731671937102904, "grad_norm": 0.10347875952720642, "learning_rate": 0.0001843169059427243, "loss": 3.1201, "step": 12610 }, { "epoch": 0.7322521686152775, "grad_norm": 0.10719276964664459, "learning_rate": 0.00018357345586501468, "loss": 3.1261, "step": 12620 }, { "epoch": 0.7328324001276509, "grad_norm": 0.10952641069889069, "learning_rate": 0.00018283117090298813, "loss": 3.1286, "step": 12630 }, { "epoch": 0.7334126316400243, "grad_norm": 0.10987886041402817, "learning_rate": 0.00018209005378981626, "loss": 3.1325, "step": 12640 }, { "epoch": 0.7339928631523978, "grad_norm": 0.1137159988284111, "learning_rate": 0.00018135010725436968, "loss": 3.1282, "step": 12650 }, { "epoch": 0.7345730946647713, "grad_norm": 0.10682724416255951, "learning_rate": 0.00018061133402120895, "loss": 3.1168, "step": 12660 }, { "epoch": 0.7351533261771447, "grad_norm": 0.11520636081695557, "learning_rate": 0.00017987373681057495, "loss": 3.1311, "step": 12670 }, { "epoch": 0.7357335576895181, "grad_norm": 0.107805997133255, "learning_rate": 0.00017913731833837715, "loss": 3.1157, "step": 12680 }, { "epoch": 0.7363137892018915, "grad_norm": 0.10552658140659332, "learning_rate": 0.00017840208131618618, "loss": 3.1206, "step": 12690 }, { "epoch": 0.736894020714265, "grad_norm": 0.10237275809049606, "learning_rate": 0.0001776680284512215, "loss": 3.1185, "step": 12700 }, { "epoch": 0.7374742522266384, "grad_norm": 0.10909226536750793, "learning_rate": 0.00017693516244634246, "loss": 3.1108, "step": 12710 }, { "epoch": 0.7380544837390118, "grad_norm": 0.10805969685316086, "learning_rate": 0.00017620348600003898, "loss": 3.1244, "step": 12720 }, { "epoch": 0.7386347152513854, "grad_norm": 0.1112237498164177, "learning_rate": 0.00017547300180641978, "loss": 3.1242, "step": 12730 }, { "epoch": 0.7392149467637588, "grad_norm": 0.10815447568893433, "learning_rate": 0.00017474371255520466, "loss": 3.115, "step": 12740 }, { "epoch": 0.7397951782761322, "grad_norm": 0.10469721257686615, "learning_rate": 0.00017401562093171286, "loss": 3.1276, "step": 12750 }, { "epoch": 0.7403754097885056, "grad_norm": 0.10945837199687958, "learning_rate": 0.00017328872961685382, "loss": 3.1234, "step": 12760 }, { "epoch": 0.7409556413008791, "grad_norm": 0.11551317572593689, "learning_rate": 0.00017256304128711807, "loss": 3.1234, "step": 12770 }, { "epoch": 0.7415358728132525, "grad_norm": 0.10662870854139328, "learning_rate": 0.0001718385586145654, "loss": 3.1193, "step": 12780 }, { "epoch": 0.7421161043256259, "grad_norm": 0.0992654412984848, "learning_rate": 0.00017111528426681728, "loss": 3.12, "step": 12790 }, { "epoch": 0.7426963358379993, "grad_norm": 0.10338784754276276, "learning_rate": 0.00017039322090704555, "loss": 3.1162, "step": 12800 }, { "epoch": 0.7432765673503728, "grad_norm": 0.11413225531578064, "learning_rate": 0.00016967237119396318, "loss": 3.1261, "step": 12810 }, { "epoch": 0.7438567988627462, "grad_norm": 0.12023269385099411, "learning_rate": 0.00016895273778181426, "loss": 3.1234, "step": 12820 }, { "epoch": 0.7444370303751197, "grad_norm": 0.10315112769603729, "learning_rate": 0.00016823432332036426, "loss": 3.1175, "step": 12830 }, { "epoch": 0.7450172618874931, "grad_norm": 0.11159830540418625, "learning_rate": 0.00016751713045489098, "loss": 3.129, "step": 12840 }, { "epoch": 0.7455974933998666, "grad_norm": 0.10925300419330597, "learning_rate": 0.000166801161826173, "loss": 3.1211, "step": 12850 }, { "epoch": 0.74617772491224, "grad_norm": 0.10250292718410492, "learning_rate": 0.00016608642007048235, "loss": 3.1262, "step": 12860 }, { "epoch": 0.7467579564246134, "grad_norm": 0.10880285501480103, "learning_rate": 0.00016537290781957288, "loss": 3.1129, "step": 12870 }, { "epoch": 0.7473381879369868, "grad_norm": 0.10200098901987076, "learning_rate": 0.00016466062770067124, "loss": 3.1227, "step": 12880 }, { "epoch": 0.7479184194493603, "grad_norm": 0.10475246608257294, "learning_rate": 0.000163949582336468, "loss": 3.1347, "step": 12890 }, { "epoch": 0.7484986509617337, "grad_norm": 0.10829997807741165, "learning_rate": 0.00016323977434510594, "loss": 3.1228, "step": 12900 }, { "epoch": 0.7490788824741071, "grad_norm": 0.10326212644577026, "learning_rate": 0.000162531206340173, "loss": 3.1217, "step": 12910 }, { "epoch": 0.7496591139864806, "grad_norm": 0.10354658216238022, "learning_rate": 0.0001618238809306906, "loss": 3.1181, "step": 12920 }, { "epoch": 0.7502393454988541, "grad_norm": 0.10224345326423645, "learning_rate": 0.00016111780072110504, "loss": 3.12, "step": 12930 }, { "epoch": 0.7508195770112275, "grad_norm": 0.10535353422164917, "learning_rate": 0.00016041296831127756, "loss": 3.1297, "step": 12940 }, { "epoch": 0.7513998085236009, "grad_norm": 0.11045780032873154, "learning_rate": 0.0001597093862964748, "loss": 3.1183, "step": 12950 }, { "epoch": 0.7519800400359744, "grad_norm": 0.1009131371974945, "learning_rate": 0.00015900705726735976, "loss": 3.1174, "step": 12960 }, { "epoch": 0.7525602715483478, "grad_norm": 0.10427884012460709, "learning_rate": 0.00015830598380998134, "loss": 3.1101, "step": 12970 }, { "epoch": 0.7531405030607212, "grad_norm": 0.11507980525493622, "learning_rate": 0.0001576061685057655, "loss": 3.1247, "step": 12980 }, { "epoch": 0.7537207345730946, "grad_norm": 0.10372275114059448, "learning_rate": 0.00015690761393150537, "loss": 3.1183, "step": 12990 }, { "epoch": 0.7543009660854681, "grad_norm": 0.09975118935108185, "learning_rate": 0.00015621032265935203, "loss": 3.1256, "step": 13000 }, { "epoch": 0.7543009660854681, "eval_loss": 3.050916910171509, "eval_runtime": 3.2621, "eval_samples_per_second": 1327.375, "eval_steps_per_second": 10.423, "step": 13000 }, { "epoch": 0.7548811975978416, "grad_norm": 0.10384030640125275, "learning_rate": 0.00015551429725680531, "loss": 3.1167, "step": 13010 }, { "epoch": 0.755461429110215, "grad_norm": 0.10713458061218262, "learning_rate": 0.00015481954028670342, "loss": 3.1206, "step": 13020 }, { "epoch": 0.7560416606225884, "grad_norm": 0.10227679461240768, "learning_rate": 0.0001541260543072144, "loss": 3.1142, "step": 13030 }, { "epoch": 0.7566218921349619, "grad_norm": 0.10371891409158707, "learning_rate": 0.00015343384187182612, "loss": 3.12, "step": 13040 }, { "epoch": 0.7572021236473353, "grad_norm": 0.10193309932947159, "learning_rate": 0.00015274290552933745, "loss": 3.1191, "step": 13050 }, { "epoch": 0.7577823551597087, "grad_norm": 0.10207639634609222, "learning_rate": 0.00015205324782384817, "loss": 3.1159, "step": 13060 }, { "epoch": 0.7583625866720821, "grad_norm": 0.1024574562907219, "learning_rate": 0.00015136487129475046, "loss": 3.1155, "step": 13070 }, { "epoch": 0.7589428181844556, "grad_norm": 0.10583309829235077, "learning_rate": 0.00015067777847671876, "loss": 3.1178, "step": 13080 }, { "epoch": 0.759523049696829, "grad_norm": 0.10229542851448059, "learning_rate": 0.00014999197189970065, "loss": 3.1168, "step": 13090 }, { "epoch": 0.7601032812092025, "grad_norm": 0.09870131313800812, "learning_rate": 0.00014930745408890794, "loss": 3.1121, "step": 13100 }, { "epoch": 0.7606835127215759, "grad_norm": 0.10222458094358444, "learning_rate": 0.00014862422756480687, "loss": 3.128, "step": 13110 }, { "epoch": 0.7612637442339494, "grad_norm": 0.10863006114959717, "learning_rate": 0.00014794229484310883, "loss": 3.1115, "step": 13120 }, { "epoch": 0.7618439757463228, "grad_norm": 0.10503353178501129, "learning_rate": 0.00014726165843476202, "loss": 3.1222, "step": 13130 }, { "epoch": 0.7624242072586962, "grad_norm": 0.09862679988145828, "learning_rate": 0.0001465823208459407, "loss": 3.1138, "step": 13140 }, { "epoch": 0.7630044387710696, "grad_norm": 0.10148298740386963, "learning_rate": 0.00014590428457803706, "loss": 3.1158, "step": 13150 }, { "epoch": 0.7635846702834431, "grad_norm": 0.11013112962245941, "learning_rate": 0.00014522755212765176, "loss": 3.1157, "step": 13160 }, { "epoch": 0.7641649017958165, "grad_norm": 0.10017859935760498, "learning_rate": 0.00014455212598658447, "loss": 3.1264, "step": 13170 }, { "epoch": 0.7647451333081899, "grad_norm": 0.10435356944799423, "learning_rate": 0.00014387800864182487, "loss": 3.1072, "step": 13180 }, { "epoch": 0.7653253648205635, "grad_norm": 0.10140141099691391, "learning_rate": 0.00014320520257554397, "loss": 3.1145, "step": 13190 }, { "epoch": 0.7659055963329369, "grad_norm": 0.09953157603740692, "learning_rate": 0.000142533710265084, "loss": 3.1244, "step": 13200 }, { "epoch": 0.7664858278453103, "grad_norm": 0.10731340944766998, "learning_rate": 0.00014186353418295006, "loss": 3.1164, "step": 13210 }, { "epoch": 0.7670660593576837, "grad_norm": 0.097932830452919, "learning_rate": 0.0001411946767968006, "loss": 3.1192, "step": 13220 }, { "epoch": 0.7676462908700572, "grad_norm": 0.10346731543540955, "learning_rate": 0.00014052714056943849, "loss": 3.1211, "step": 13230 }, { "epoch": 0.7682265223824306, "grad_norm": 0.10606466978788376, "learning_rate": 0.0001398609279588024, "loss": 3.1154, "step": 13240 }, { "epoch": 0.768806753894804, "grad_norm": 0.10412232577800751, "learning_rate": 0.00013919604141795667, "loss": 3.1164, "step": 13250 }, { "epoch": 0.7693869854071774, "grad_norm": 0.09943995624780655, "learning_rate": 0.0001385324833950833, "loss": 3.1195, "step": 13260 }, { "epoch": 0.769967216919551, "grad_norm": 0.098769411444664, "learning_rate": 0.00013787025633347239, "loss": 3.1183, "step": 13270 }, { "epoch": 0.7705474484319244, "grad_norm": 0.11205046623945236, "learning_rate": 0.00013720936267151324, "loss": 3.12, "step": 13280 }, { "epoch": 0.7711276799442978, "grad_norm": 0.10622609406709671, "learning_rate": 0.00013654980484268598, "loss": 3.1139, "step": 13290 }, { "epoch": 0.7717079114566712, "grad_norm": 0.10065792500972748, "learning_rate": 0.00013589158527555094, "loss": 3.1104, "step": 13300 }, { "epoch": 0.7722881429690447, "grad_norm": 0.11066281795501709, "learning_rate": 0.0001352347063937422, "loss": 3.1149, "step": 13310 }, { "epoch": 0.7728683744814181, "grad_norm": 0.10775715857744217, "learning_rate": 0.0001345791706159562, "loss": 3.1172, "step": 13320 }, { "epoch": 0.7734486059937915, "grad_norm": 0.0999317467212677, "learning_rate": 0.0001339249803559444, "loss": 3.118, "step": 13330 }, { "epoch": 0.7740288375061649, "grad_norm": 0.10435137152671814, "learning_rate": 0.0001332721380225042, "loss": 3.1238, "step": 13340 }, { "epoch": 0.7746090690185384, "grad_norm": 0.10001866519451141, "learning_rate": 0.00013262064601946895, "loss": 3.1035, "step": 13350 }, { "epoch": 0.7751893005309118, "grad_norm": 0.10652778297662735, "learning_rate": 0.00013197050674570077, "loss": 3.1129, "step": 13360 }, { "epoch": 0.7757695320432852, "grad_norm": 0.09261602908372879, "learning_rate": 0.00013132172259508058, "loss": 3.1256, "step": 13370 }, { "epoch": 0.7763497635556587, "grad_norm": 0.1106601282954216, "learning_rate": 0.0001306742959564995, "loss": 3.1256, "step": 13380 }, { "epoch": 0.7769299950680322, "grad_norm": 0.11139431595802307, "learning_rate": 0.0001300282292138502, "loss": 3.1171, "step": 13390 }, { "epoch": 0.7775102265804056, "grad_norm": 0.09999184310436249, "learning_rate": 0.00012938352474601805, "loss": 3.1173, "step": 13400 }, { "epoch": 0.778090458092779, "grad_norm": 0.10087510198354721, "learning_rate": 0.0001287401849268728, "loss": 3.1224, "step": 13410 }, { "epoch": 0.7786706896051525, "grad_norm": 0.10008762031793594, "learning_rate": 0.0001280982121252585, "loss": 3.117, "step": 13420 }, { "epoch": 0.7792509211175259, "grad_norm": 0.10388967394828796, "learning_rate": 0.0001274576087049868, "loss": 3.105, "step": 13430 }, { "epoch": 0.7798311526298993, "grad_norm": 0.10136213898658752, "learning_rate": 0.0001268183770248263, "loss": 3.1128, "step": 13440 }, { "epoch": 0.7804113841422727, "grad_norm": 0.09719151258468628, "learning_rate": 0.0001261805194384949, "loss": 3.1094, "step": 13450 }, { "epoch": 0.7809916156546463, "grad_norm": 0.10660874843597412, "learning_rate": 0.00012554403829465155, "loss": 3.1207, "step": 13460 }, { "epoch": 0.7815718471670197, "grad_norm": 0.10400804132223129, "learning_rate": 0.00012490893593688584, "loss": 3.1109, "step": 13470 }, { "epoch": 0.7821520786793931, "grad_norm": 0.10217728465795517, "learning_rate": 0.00012427521470371173, "loss": 3.1128, "step": 13480 }, { "epoch": 0.7827323101917665, "grad_norm": 0.10613488405942917, "learning_rate": 0.0001236428769285569, "loss": 3.1091, "step": 13490 }, { "epoch": 0.78331254170414, "grad_norm": 0.09936373680830002, "learning_rate": 0.00012301192493975526, "loss": 3.1107, "step": 13500 }, { "epoch": 0.7838927732165134, "grad_norm": 0.1033458337187767, "learning_rate": 0.00012238236106053852, "loss": 3.1209, "step": 13510 }, { "epoch": 0.7844730047288868, "grad_norm": 0.10691102594137192, "learning_rate": 0.00012175418760902617, "loss": 3.1077, "step": 13520 }, { "epoch": 0.7850532362412602, "grad_norm": 0.11017080396413803, "learning_rate": 0.00012112740689821921, "loss": 3.119, "step": 13530 }, { "epoch": 0.7856334677536337, "grad_norm": 0.10583353787660599, "learning_rate": 0.00012050202123598974, "loss": 3.1136, "step": 13540 }, { "epoch": 0.7862136992660071, "grad_norm": 0.10265874862670898, "learning_rate": 0.00011987803292507305, "loss": 3.1122, "step": 13550 }, { "epoch": 0.7867939307783806, "grad_norm": 0.09812895208597183, "learning_rate": 0.00011925544426305996, "loss": 3.11, "step": 13560 }, { "epoch": 0.787374162290754, "grad_norm": 0.10639332979917526, "learning_rate": 0.00011863425754238655, "loss": 3.1162, "step": 13570 }, { "epoch": 0.7879543938031275, "grad_norm": 0.10056709498167038, "learning_rate": 0.00011801447505032786, "loss": 3.1108, "step": 13580 }, { "epoch": 0.7885346253155009, "grad_norm": 0.1005856990814209, "learning_rate": 0.00011739609906898774, "loss": 3.1051, "step": 13590 }, { "epoch": 0.7891148568278743, "grad_norm": 0.10471539199352264, "learning_rate": 0.00011677913187529126, "loss": 3.1174, "step": 13600 }, { "epoch": 0.7896950883402477, "grad_norm": 0.10265914350748062, "learning_rate": 0.0001161635757409767, "loss": 3.1132, "step": 13610 }, { "epoch": 0.7902753198526212, "grad_norm": 0.10047253966331482, "learning_rate": 0.00011554943293258557, "loss": 3.1144, "step": 13620 }, { "epoch": 0.7908555513649946, "grad_norm": 0.10369555652141571, "learning_rate": 0.00011493670571145665, "loss": 3.1165, "step": 13630 }, { "epoch": 0.791435782877368, "grad_norm": 0.10994482040405273, "learning_rate": 0.0001143253963337152, "loss": 3.1099, "step": 13640 }, { "epoch": 0.7920160143897415, "grad_norm": 0.10117276012897491, "learning_rate": 0.00011371550705026673, "loss": 3.1207, "step": 13650 }, { "epoch": 0.792596245902115, "grad_norm": 0.10518882423639297, "learning_rate": 0.00011310704010678747, "loss": 3.0989, "step": 13660 }, { "epoch": 0.7931764774144884, "grad_norm": 0.10278623551130295, "learning_rate": 0.00011249999774371621, "loss": 3.1032, "step": 13670 }, { "epoch": 0.7937567089268618, "grad_norm": 0.10157769918441772, "learning_rate": 0.00011189438219624698, "loss": 3.1141, "step": 13680 }, { "epoch": 0.7943369404392353, "grad_norm": 0.10142907500267029, "learning_rate": 0.00011129019569431908, "loss": 3.1123, "step": 13690 }, { "epoch": 0.7949171719516087, "grad_norm": 0.10015449672937393, "learning_rate": 0.00011068744046261098, "loss": 3.1125, "step": 13700 }, { "epoch": 0.7954974034639821, "grad_norm": 0.10350023210048676, "learning_rate": 0.00011008611872053037, "loss": 3.1038, "step": 13710 }, { "epoch": 0.7960776349763555, "grad_norm": 0.10056246072053909, "learning_rate": 0.00010948623268220676, "loss": 3.1087, "step": 13720 }, { "epoch": 0.796657866488729, "grad_norm": 0.09896915405988693, "learning_rate": 0.00010888778455648391, "loss": 3.1132, "step": 13730 }, { "epoch": 0.7972380980011025, "grad_norm": 0.10385739803314209, "learning_rate": 0.00010829077654690983, "loss": 3.1183, "step": 13740 }, { "epoch": 0.7978183295134759, "grad_norm": 0.09953512251377106, "learning_rate": 0.000107695210851731, "loss": 3.1125, "step": 13750 }, { "epoch": 0.7983985610258493, "grad_norm": 0.09491749107837677, "learning_rate": 0.00010710108966388266, "loss": 3.1131, "step": 13760 }, { "epoch": 0.7989787925382228, "grad_norm": 0.09977880120277405, "learning_rate": 0.00010650841517098115, "loss": 3.121, "step": 13770 }, { "epoch": 0.7995590240505962, "grad_norm": 0.10586149990558624, "learning_rate": 0.00010591718955531605, "loss": 3.1175, "step": 13780 }, { "epoch": 0.8001392555629696, "grad_norm": 0.10209766030311584, "learning_rate": 0.0001053274149938419, "loss": 3.1164, "step": 13790 }, { "epoch": 0.800719487075343, "grad_norm": 0.10039076209068298, "learning_rate": 0.0001047390936581707, "loss": 3.1094, "step": 13800 }, { "epoch": 0.8012997185877165, "grad_norm": 0.10035811364650726, "learning_rate": 0.00010415222771456307, "loss": 3.1173, "step": 13810 }, { "epoch": 0.8018799501000899, "grad_norm": 0.09645077586174011, "learning_rate": 0.00010356681932392093, "loss": 3.1097, "step": 13820 }, { "epoch": 0.8024601816124634, "grad_norm": 0.10422459989786148, "learning_rate": 0.0001029828706417793, "loss": 3.1142, "step": 13830 }, { "epoch": 0.8030404131248368, "grad_norm": 0.10029245167970657, "learning_rate": 0.0001024003838182982, "loss": 3.1054, "step": 13840 }, { "epoch": 0.8036206446372103, "grad_norm": 0.09637358039617538, "learning_rate": 0.00010181936099825551, "loss": 3.1093, "step": 13850 }, { "epoch": 0.8042008761495837, "grad_norm": 0.10224739462137222, "learning_rate": 0.00010123980432103791, "loss": 3.1085, "step": 13860 }, { "epoch": 0.8047811076619571, "grad_norm": 0.09893719106912613, "learning_rate": 0.00010066171592063377, "loss": 3.1045, "step": 13870 }, { "epoch": 0.8053613391743305, "grad_norm": 0.09696366637945175, "learning_rate": 0.00010008509792562525, "loss": 3.1068, "step": 13880 }, { "epoch": 0.805941570686704, "grad_norm": 0.09792386740446091, "learning_rate": 9.950995245918016e-05, "loss": 3.1193, "step": 13890 }, { "epoch": 0.8065218021990774, "grad_norm": 0.09721978008747101, "learning_rate": 9.893628163904417e-05, "loss": 3.1135, "step": 13900 }, { "epoch": 0.8071020337114508, "grad_norm": 0.102999746799469, "learning_rate": 9.836408757753363e-05, "loss": 3.1162, "step": 13910 }, { "epoch": 0.8076822652238244, "grad_norm": 0.10018763691186905, "learning_rate": 9.779337238152697e-05, "loss": 3.1185, "step": 13920 }, { "epoch": 0.8082624967361978, "grad_norm": 0.09860005974769592, "learning_rate": 9.722413815245717e-05, "loss": 3.1131, "step": 13930 }, { "epoch": 0.8088427282485712, "grad_norm": 0.09546367824077606, "learning_rate": 9.665638698630442e-05, "loss": 3.1123, "step": 13940 }, { "epoch": 0.8094229597609446, "grad_norm": 0.09807273745536804, "learning_rate": 9.6090120973588e-05, "loss": 3.1063, "step": 13950 }, { "epoch": 0.8100031912733181, "grad_norm": 0.10019023716449738, "learning_rate": 9.552534219935844e-05, "loss": 3.1155, "step": 13960 }, { "epoch": 0.8105834227856915, "grad_norm": 0.1006435975432396, "learning_rate": 9.496205274319069e-05, "loss": 3.106, "step": 13970 }, { "epoch": 0.8111636542980649, "grad_norm": 0.10411754250526428, "learning_rate": 9.44002546791754e-05, "loss": 3.1155, "step": 13980 }, { "epoch": 0.8117438858104383, "grad_norm": 0.1065743938088417, "learning_rate": 9.38399500759119e-05, "loss": 3.1075, "step": 13990 }, { "epoch": 0.8123241173228118, "grad_norm": 0.09786444157361984, "learning_rate": 9.328114099650042e-05, "loss": 3.1073, "step": 14000 }, { "epoch": 0.8123241173228118, "eval_loss": 3.041724681854248, "eval_runtime": 3.2609, "eval_samples_per_second": 1327.855, "eval_steps_per_second": 10.427, "step": 14000 }, { "epoch": 0.8129043488351853, "grad_norm": 0.09952949732542038, "learning_rate": 9.272382949853453e-05, "loss": 3.1102, "step": 14010 }, { "epoch": 0.8134845803475587, "grad_norm": 0.0931655615568161, "learning_rate": 9.216801763409343e-05, "loss": 3.1085, "step": 14020 }, { "epoch": 0.8140648118599321, "grad_norm": 0.09814907610416412, "learning_rate": 9.161370744973491e-05, "loss": 3.1011, "step": 14030 }, { "epoch": 0.8146450433723056, "grad_norm": 0.09730423241853714, "learning_rate": 9.106090098648696e-05, "loss": 3.1048, "step": 14040 }, { "epoch": 0.815225274884679, "grad_norm": 0.0960068479180336, "learning_rate": 9.05096002798409e-05, "loss": 3.1144, "step": 14050 }, { "epoch": 0.8158055063970524, "grad_norm": 0.0986780971288681, "learning_rate": 8.995980735974369e-05, "loss": 3.1092, "step": 14060 }, { "epoch": 0.8163857379094258, "grad_norm": 0.09585044533014297, "learning_rate": 8.941152425059034e-05, "loss": 3.1125, "step": 14070 }, { "epoch": 0.8169659694217993, "grad_norm": 0.10179416090250015, "learning_rate": 8.886475297121693e-05, "loss": 3.1041, "step": 14080 }, { "epoch": 0.8175462009341727, "grad_norm": 0.0965443029999733, "learning_rate": 8.831949553489249e-05, "loss": 3.1132, "step": 14090 }, { "epoch": 0.8181264324465461, "grad_norm": 0.0964798629283905, "learning_rate": 8.777575394931198e-05, "loss": 3.1103, "step": 14100 }, { "epoch": 0.8187066639589196, "grad_norm": 0.11066204309463501, "learning_rate": 8.723353021658892e-05, "loss": 3.105, "step": 14110 }, { "epoch": 0.8192868954712931, "grad_norm": 0.09815254807472229, "learning_rate": 8.669282633324776e-05, "loss": 3.1088, "step": 14120 }, { "epoch": 0.8198671269836665, "grad_norm": 0.10448320209980011, "learning_rate": 8.615364429021722e-05, "loss": 3.0998, "step": 14130 }, { "epoch": 0.8204473584960399, "grad_norm": 0.10330579429864883, "learning_rate": 8.56159860728215e-05, "loss": 3.1101, "step": 14140 }, { "epoch": 0.8210275900084134, "grad_norm": 0.10102424770593643, "learning_rate": 8.507985366077493e-05, "loss": 3.1033, "step": 14150 }, { "epoch": 0.8216078215207868, "grad_norm": 0.09644920378923416, "learning_rate": 8.454524902817312e-05, "loss": 3.1087, "step": 14160 }, { "epoch": 0.8221880530331602, "grad_norm": 0.09765134006738663, "learning_rate": 8.401217414348611e-05, "loss": 3.0975, "step": 14170 }, { "epoch": 0.8227682845455336, "grad_norm": 0.09793014079332352, "learning_rate": 8.348063096955188e-05, "loss": 3.116, "step": 14180 }, { "epoch": 0.8233485160579072, "grad_norm": 0.09745863080024719, "learning_rate": 8.295062146356763e-05, "loss": 3.1123, "step": 14190 }, { "epoch": 0.8239287475702806, "grad_norm": 0.10549872368574142, "learning_rate": 8.242214757708416e-05, "loss": 3.1137, "step": 14200 }, { "epoch": 0.824508979082654, "grad_norm": 0.10081037133932114, "learning_rate": 8.18952112559977e-05, "loss": 3.1119, "step": 14210 }, { "epoch": 0.8250892105950274, "grad_norm": 0.10452466458082199, "learning_rate": 8.136981444054281e-05, "loss": 3.108, "step": 14220 }, { "epoch": 0.8256694421074009, "grad_norm": 0.10242857784032822, "learning_rate": 8.084595906528574e-05, "loss": 3.1052, "step": 14230 }, { "epoch": 0.8262496736197743, "grad_norm": 0.10497426986694336, "learning_rate": 8.032364705911665e-05, "loss": 3.1, "step": 14240 }, { "epoch": 0.8268299051321477, "grad_norm": 0.0955236405134201, "learning_rate": 7.980288034524353e-05, "loss": 3.1138, "step": 14250 }, { "epoch": 0.8274101366445211, "grad_norm": 0.10172264277935028, "learning_rate": 7.928366084118338e-05, "loss": 3.0993, "step": 14260 }, { "epoch": 0.8279903681568946, "grad_norm": 0.09582705050706863, "learning_rate": 7.87659904587572e-05, "loss": 3.1224, "step": 14270 }, { "epoch": 0.828570599669268, "grad_norm": 0.10169988870620728, "learning_rate": 7.824987110408149e-05, "loss": 3.1154, "step": 14280 }, { "epoch": 0.8291508311816415, "grad_norm": 0.09703046083450317, "learning_rate": 7.773530467756168e-05, "loss": 3.1, "step": 14290 }, { "epoch": 0.8297310626940149, "grad_norm": 0.09659979492425919, "learning_rate": 7.722229307388551e-05, "loss": 3.1027, "step": 14300 }, { "epoch": 0.8303112942063884, "grad_norm": 0.10455524176359177, "learning_rate": 7.671083818201502e-05, "loss": 3.1086, "step": 14310 }, { "epoch": 0.8308915257187618, "grad_norm": 0.09594230353832245, "learning_rate": 7.620094188518112e-05, "loss": 3.098, "step": 14320 }, { "epoch": 0.8314717572311352, "grad_norm": 0.0956626608967781, "learning_rate": 7.569260606087518e-05, "loss": 3.0967, "step": 14330 }, { "epoch": 0.8320519887435086, "grad_norm": 0.09703920781612396, "learning_rate": 7.518583258084288e-05, "loss": 3.1088, "step": 14340 }, { "epoch": 0.8326322202558821, "grad_norm": 0.09883217513561249, "learning_rate": 7.468062331107761e-05, "loss": 3.1125, "step": 14350 }, { "epoch": 0.8332124517682555, "grad_norm": 0.09501095116138458, "learning_rate": 7.417698011181234e-05, "loss": 3.1007, "step": 14360 }, { "epoch": 0.8337926832806289, "grad_norm": 0.09835023432970047, "learning_rate": 7.367490483751448e-05, "loss": 3.103, "step": 14370 }, { "epoch": 0.8343729147930025, "grad_norm": 0.10247199982404709, "learning_rate": 7.317439933687764e-05, "loss": 3.1054, "step": 14380 }, { "epoch": 0.8349531463053759, "grad_norm": 0.10141029953956604, "learning_rate": 7.267546545281544e-05, "loss": 3.1124, "step": 14390 }, { "epoch": 0.8355333778177493, "grad_norm": 0.09786387532949448, "learning_rate": 7.217810502245498e-05, "loss": 3.1143, "step": 14400 }, { "epoch": 0.8361136093301227, "grad_norm": 0.10012129694223404, "learning_rate": 7.168231987712903e-05, "loss": 3.1133, "step": 14410 }, { "epoch": 0.8366938408424962, "grad_norm": 0.10030212253332138, "learning_rate": 7.118811184237078e-05, "loss": 3.1001, "step": 14420 }, { "epoch": 0.8372740723548696, "grad_norm": 0.09827015548944473, "learning_rate": 7.069548273790588e-05, "loss": 3.1031, "step": 14430 }, { "epoch": 0.837854303867243, "grad_norm": 0.09829414635896683, "learning_rate": 7.020443437764629e-05, "loss": 3.1095, "step": 14440 }, { "epoch": 0.8384345353796164, "grad_norm": 0.0953378975391388, "learning_rate": 6.971496856968351e-05, "loss": 3.1009, "step": 14450 }, { "epoch": 0.83901476689199, "grad_norm": 0.09821732342243195, "learning_rate": 6.922708711628183e-05, "loss": 3.1148, "step": 14460 }, { "epoch": 0.8395949984043634, "grad_norm": 0.09834583848714828, "learning_rate": 6.874079181387221e-05, "loss": 3.1015, "step": 14470 }, { "epoch": 0.8401752299167368, "grad_norm": 0.09869256615638733, "learning_rate": 6.825608445304443e-05, "loss": 3.1101, "step": 14480 }, { "epoch": 0.8407554614291102, "grad_norm": 0.10293745249509811, "learning_rate": 6.777296681854206e-05, "loss": 3.1056, "step": 14490 }, { "epoch": 0.8413356929414837, "grad_norm": 0.09749376773834229, "learning_rate": 6.72914406892548e-05, "loss": 3.1106, "step": 14500 }, { "epoch": 0.8419159244538571, "grad_norm": 0.09520737081766129, "learning_rate": 6.681150783821222e-05, "loss": 3.1085, "step": 14510 }, { "epoch": 0.8424961559662305, "grad_norm": 0.09628592431545258, "learning_rate": 6.633317003257755e-05, "loss": 3.1083, "step": 14520 }, { "epoch": 0.8430763874786039, "grad_norm": 0.09358352422714233, "learning_rate": 6.585642903364036e-05, "loss": 3.1113, "step": 14530 }, { "epoch": 0.8436566189909774, "grad_norm": 0.09465614706277847, "learning_rate": 6.538128659681131e-05, "loss": 3.1141, "step": 14540 }, { "epoch": 0.8442368505033508, "grad_norm": 0.09725864231586456, "learning_rate": 6.490774447161441e-05, "loss": 3.1104, "step": 14550 }, { "epoch": 0.8448170820157243, "grad_norm": 0.09526196122169495, "learning_rate": 6.443580440168146e-05, "loss": 3.1165, "step": 14560 }, { "epoch": 0.8453973135280977, "grad_norm": 0.09678266197443008, "learning_rate": 6.396546812474519e-05, "loss": 3.1012, "step": 14570 }, { "epoch": 0.8459775450404712, "grad_norm": 0.0973704531788826, "learning_rate": 6.349673737263295e-05, "loss": 3.1026, "step": 14580 }, { "epoch": 0.8465577765528446, "grad_norm": 0.09803210198879242, "learning_rate": 6.302961387126066e-05, "loss": 3.1056, "step": 14590 }, { "epoch": 0.847138008065218, "grad_norm": 0.09953798353672028, "learning_rate": 6.256409934062595e-05, "loss": 3.1067, "step": 14600 }, { "epoch": 0.8477182395775914, "grad_norm": 0.10524503886699677, "learning_rate": 6.2100195494802e-05, "loss": 3.1031, "step": 14610 }, { "epoch": 0.8482984710899649, "grad_norm": 0.09302034974098206, "learning_rate": 6.163790404193148e-05, "loss": 3.1096, "step": 14620 }, { "epoch": 0.8488787026023383, "grad_norm": 0.09579528868198395, "learning_rate": 6.117722668421971e-05, "loss": 3.1069, "step": 14630 }, { "epoch": 0.8494589341147117, "grad_norm": 0.09332197159528732, "learning_rate": 6.071816511792932e-05, "loss": 3.1117, "step": 14640 }, { "epoch": 0.8500391656270853, "grad_norm": 0.09204788506031036, "learning_rate": 6.0260721033372876e-05, "loss": 3.0956, "step": 14650 }, { "epoch": 0.8506193971394587, "grad_norm": 0.09581390768289566, "learning_rate": 5.980489611490747e-05, "loss": 3.098, "step": 14660 }, { "epoch": 0.8511996286518321, "grad_norm": 0.09121184796094894, "learning_rate": 5.935069204092819e-05, "loss": 3.112, "step": 14670 }, { "epoch": 0.8517798601642055, "grad_norm": 0.09455008059740067, "learning_rate": 5.889811048386201e-05, "loss": 3.1009, "step": 14680 }, { "epoch": 0.852360091676579, "grad_norm": 0.09240598976612091, "learning_rate": 5.8447153110161524e-05, "loss": 3.1075, "step": 14690 }, { "epoch": 0.8529403231889524, "grad_norm": 0.09437933564186096, "learning_rate": 5.7997821580299256e-05, "loss": 3.0997, "step": 14700 }, { "epoch": 0.8535205547013258, "grad_norm": 0.0974365696310997, "learning_rate": 5.755011754876088e-05, "loss": 3.0986, "step": 14710 }, { "epoch": 0.8541007862136992, "grad_norm": 0.0968250036239624, "learning_rate": 5.710404266403951e-05, "loss": 3.1132, "step": 14720 }, { "epoch": 0.8546810177260727, "grad_norm": 0.09723575413227081, "learning_rate": 5.665959856862962e-05, "loss": 3.1009, "step": 14730 }, { "epoch": 0.8552612492384462, "grad_norm": 0.09329159557819366, "learning_rate": 5.621678689902077e-05, "loss": 3.1138, "step": 14740 }, { "epoch": 0.8558414807508196, "grad_norm": 0.09336376190185547, "learning_rate": 5.57756092856922e-05, "loss": 3.0973, "step": 14750 }, { "epoch": 0.856421712263193, "grad_norm": 0.0969925969839096, "learning_rate": 5.5336067353105976e-05, "loss": 3.0939, "step": 14760 }, { "epoch": 0.8570019437755665, "grad_norm": 0.09060470759868622, "learning_rate": 5.489816271970149e-05, "loss": 3.1113, "step": 14770 }, { "epoch": 0.8575821752879399, "grad_norm": 0.09199319034814835, "learning_rate": 5.4461896997889505e-05, "loss": 3.1083, "step": 14780 }, { "epoch": 0.8581624068003133, "grad_norm": 0.09637030959129333, "learning_rate": 5.402727179404615e-05, "loss": 3.1091, "step": 14790 }, { "epoch": 0.8587426383126867, "grad_norm": 0.0952460765838623, "learning_rate": 5.359428870850691e-05, "loss": 3.1057, "step": 14800 }, { "epoch": 0.8593228698250602, "grad_norm": 0.09476283192634583, "learning_rate": 5.316294933556076e-05, "loss": 3.1085, "step": 14810 }, { "epoch": 0.8599031013374336, "grad_norm": 0.0928397923707962, "learning_rate": 5.273325526344469e-05, "loss": 3.0943, "step": 14820 }, { "epoch": 0.860483332849807, "grad_norm": 0.09518643468618393, "learning_rate": 5.230520807433714e-05, "loss": 3.0964, "step": 14830 }, { "epoch": 0.8610635643621805, "grad_norm": 0.09664203971624374, "learning_rate": 5.187880934435274e-05, "loss": 3.1037, "step": 14840 }, { "epoch": 0.861643795874554, "grad_norm": 0.0946909636259079, "learning_rate": 5.145406064353631e-05, "loss": 3.0976, "step": 14850 }, { "epoch": 0.8622240273869274, "grad_norm": 0.09507571905851364, "learning_rate": 5.10309635358569e-05, "loss": 3.0998, "step": 14860 }, { "epoch": 0.8628042588993008, "grad_norm": 0.09289544820785522, "learning_rate": 5.060951957920257e-05, "loss": 3.1094, "step": 14870 }, { "epoch": 0.8633844904116743, "grad_norm": 0.09258411824703217, "learning_rate": 5.018973032537411e-05, "loss": 3.1052, "step": 14880 }, { "epoch": 0.8639647219240477, "grad_norm": 0.09440912306308746, "learning_rate": 4.977159732007941e-05, "loss": 3.1092, "step": 14890 }, { "epoch": 0.8645449534364211, "grad_norm": 0.09835471212863922, "learning_rate": 4.935512210292814e-05, "loss": 3.0988, "step": 14900 }, { "epoch": 0.8651251849487945, "grad_norm": 0.09666918218135834, "learning_rate": 4.894030620742545e-05, "loss": 3.1009, "step": 14910 }, { "epoch": 0.865705416461168, "grad_norm": 0.0979539081454277, "learning_rate": 4.8527151160967286e-05, "loss": 3.0995, "step": 14920 }, { "epoch": 0.8662856479735415, "grad_norm": 0.0943804681301117, "learning_rate": 4.81156584848334e-05, "loss": 3.1054, "step": 14930 }, { "epoch": 0.8668658794859149, "grad_norm": 0.09513070434331894, "learning_rate": 4.770582969418319e-05, "loss": 3.1108, "step": 14940 }, { "epoch": 0.8674461109982883, "grad_norm": 0.09076978266239166, "learning_rate": 4.7297666298049156e-05, "loss": 3.1028, "step": 14950 }, { "epoch": 0.8680263425106618, "grad_norm": 0.09252411872148514, "learning_rate": 4.6891169799331614e-05, "loss": 3.117, "step": 14960 }, { "epoch": 0.8686065740230352, "grad_norm": 0.09229396283626556, "learning_rate": 4.648634169479343e-05, "loss": 3.1078, "step": 14970 }, { "epoch": 0.8691868055354086, "grad_norm": 0.09328366816043854, "learning_rate": 4.60831834750538e-05, "loss": 3.1087, "step": 14980 }, { "epoch": 0.869767037047782, "grad_norm": 0.09177900850772858, "learning_rate": 4.568169662458377e-05, "loss": 3.0944, "step": 14990 }, { "epoch": 0.8703472685601555, "grad_norm": 0.09426326304674149, "learning_rate": 4.528188262169991e-05, "loss": 3.108, "step": 15000 }, { "epoch": 0.8703472685601555, "eval_loss": 3.0348994731903076, "eval_runtime": 3.2633, "eval_samples_per_second": 1326.859, "eval_steps_per_second": 10.419, "step": 15000 }, { "epoch": 0.870927500072529, "grad_norm": 0.0925714373588562, "learning_rate": 4.488374293855918e-05, "loss": 3.104, "step": 15010 }, { "epoch": 0.8715077315849024, "grad_norm": 0.0921747237443924, "learning_rate": 4.448727904115379e-05, "loss": 3.1142, "step": 15020 }, { "epoch": 0.8720879630972758, "grad_norm": 0.09229473024606705, "learning_rate": 4.4092492389305074e-05, "loss": 3.0982, "step": 15030 }, { "epoch": 0.8726681946096493, "grad_norm": 0.09434866905212402, "learning_rate": 4.369938443665922e-05, "loss": 3.1127, "step": 15040 }, { "epoch": 0.8732484261220227, "grad_norm": 0.09186001121997833, "learning_rate": 4.330795663068044e-05, "loss": 3.1025, "step": 15050 }, { "epoch": 0.8738286576343961, "grad_norm": 0.09200981259346008, "learning_rate": 4.291821041264721e-05, "loss": 3.0938, "step": 15060 }, { "epoch": 0.8744088891467695, "grad_norm": 0.09246696531772614, "learning_rate": 4.253014721764592e-05, "loss": 3.1122, "step": 15070 }, { "epoch": 0.874989120659143, "grad_norm": 0.09366760402917862, "learning_rate": 4.214376847456575e-05, "loss": 3.1114, "step": 15080 }, { "epoch": 0.8755693521715164, "grad_norm": 0.09217476844787598, "learning_rate": 4.1759075606093934e-05, "loss": 3.1152, "step": 15090 }, { "epoch": 0.8761495836838898, "grad_norm": 0.09385888278484344, "learning_rate": 4.137607002870969e-05, "loss": 3.1151, "step": 15100 }, { "epoch": 0.8767298151962634, "grad_norm": 0.09338943660259247, "learning_rate": 4.099475315267981e-05, "loss": 3.1108, "step": 15110 }, { "epoch": 0.8773100467086368, "grad_norm": 0.09102658182382584, "learning_rate": 4.0615126382052945e-05, "loss": 3.106, "step": 15120 }, { "epoch": 0.8778902782210102, "grad_norm": 0.09203559905290604, "learning_rate": 4.023719111465457e-05, "loss": 3.1, "step": 15130 }, { "epoch": 0.8784705097333836, "grad_norm": 0.09662605822086334, "learning_rate": 3.986094874208218e-05, "loss": 3.1095, "step": 15140 }, { "epoch": 0.8790507412457571, "grad_norm": 0.09100698679685593, "learning_rate": 3.9486400649699216e-05, "loss": 3.0917, "step": 15150 }, { "epoch": 0.8796309727581305, "grad_norm": 0.09504050016403198, "learning_rate": 3.911354821663127e-05, "loss": 3.1041, "step": 15160 }, { "epoch": 0.8802112042705039, "grad_norm": 0.09688866138458252, "learning_rate": 3.874239281576003e-05, "loss": 3.0942, "step": 15170 }, { "epoch": 0.8807914357828773, "grad_norm": 0.0897068902850151, "learning_rate": 3.837293581371837e-05, "loss": 3.1024, "step": 15180 }, { "epoch": 0.8813716672952508, "grad_norm": 0.0906522199511528, "learning_rate": 3.800517857088604e-05, "loss": 3.103, "step": 15190 }, { "epoch": 0.8819518988076243, "grad_norm": 0.0898643210530281, "learning_rate": 3.763912244138334e-05, "loss": 3.11, "step": 15200 }, { "epoch": 0.8825321303199977, "grad_norm": 0.09033368527889252, "learning_rate": 3.727476877306751e-05, "loss": 3.1093, "step": 15210 }, { "epoch": 0.8831123618323711, "grad_norm": 0.0916651263833046, "learning_rate": 3.691211890752688e-05, "loss": 3.1059, "step": 15220 }, { "epoch": 0.8836925933447446, "grad_norm": 0.09232784807682037, "learning_rate": 3.6551174180076195e-05, "loss": 3.1066, "step": 15230 }, { "epoch": 0.884272824857118, "grad_norm": 0.09441141784191132, "learning_rate": 3.619193591975195e-05, "loss": 3.1105, "step": 15240 }, { "epoch": 0.8848530563694914, "grad_norm": 0.09673616290092468, "learning_rate": 3.583440544930672e-05, "loss": 3.0993, "step": 15250 }, { "epoch": 0.8854332878818648, "grad_norm": 0.09390676021575928, "learning_rate": 3.547858408520538e-05, "loss": 3.1056, "step": 15260 }, { "epoch": 0.8860135193942383, "grad_norm": 0.09317633509635925, "learning_rate": 3.512447313761946e-05, "loss": 3.0977, "step": 15270 }, { "epoch": 0.8865937509066117, "grad_norm": 0.09400813281536102, "learning_rate": 3.477207391042253e-05, "loss": 3.0963, "step": 15280 }, { "epoch": 0.8871739824189852, "grad_norm": 0.09057381004095078, "learning_rate": 3.442138770118547e-05, "loss": 3.1024, "step": 15290 }, { "epoch": 0.8877542139313586, "grad_norm": 0.09377612918615341, "learning_rate": 3.4072415801171484e-05, "loss": 3.0959, "step": 15300 }, { "epoch": 0.8883344454437321, "grad_norm": 0.09093187749385834, "learning_rate": 3.3725159495332e-05, "loss": 3.0976, "step": 15310 }, { "epoch": 0.8889146769561055, "grad_norm": 0.09208898991346359, "learning_rate": 3.3379620062300774e-05, "loss": 3.1007, "step": 15320 }, { "epoch": 0.8894949084684789, "grad_norm": 0.09365664422512054, "learning_rate": 3.303579877439039e-05, "loss": 3.1053, "step": 15330 }, { "epoch": 0.8900751399808524, "grad_norm": 0.09230521321296692, "learning_rate": 3.269369689758683e-05, "loss": 3.1055, "step": 15340 }, { "epoch": 0.8906553714932258, "grad_norm": 0.0912981778383255, "learning_rate": 3.235331569154493e-05, "loss": 3.0972, "step": 15350 }, { "epoch": 0.8912356030055992, "grad_norm": 0.08950291574001312, "learning_rate": 3.2014656409584174e-05, "loss": 3.0999, "step": 15360 }, { "epoch": 0.8918158345179726, "grad_norm": 0.09219230711460114, "learning_rate": 3.167772029868321e-05, "loss": 3.1019, "step": 15370 }, { "epoch": 0.8923960660303462, "grad_norm": 0.09009566158056259, "learning_rate": 3.134250859947635e-05, "loss": 3.0978, "step": 15380 }, { "epoch": 0.8929762975427196, "grad_norm": 0.0936865359544754, "learning_rate": 3.1009022546248045e-05, "loss": 3.1021, "step": 15390 }, { "epoch": 0.893556529055093, "grad_norm": 0.09219173341989517, "learning_rate": 3.0677263366928944e-05, "loss": 3.0984, "step": 15400 }, { "epoch": 0.8941367605674664, "grad_norm": 0.09084775298833847, "learning_rate": 3.0347232283091107e-05, "loss": 3.1039, "step": 15410 }, { "epoch": 0.8947169920798399, "grad_norm": 0.0911671370267868, "learning_rate": 3.001893050994342e-05, "loss": 3.0934, "step": 15420 }, { "epoch": 0.8952972235922133, "grad_norm": 0.08928447961807251, "learning_rate": 2.9692359256327628e-05, "loss": 3.1013, "step": 15430 }, { "epoch": 0.8958774551045867, "grad_norm": 0.09330154210329056, "learning_rate": 2.936751972471313e-05, "loss": 3.0978, "step": 15440 }, { "epoch": 0.8964576866169601, "grad_norm": 0.09146152436733246, "learning_rate": 2.904441311119321e-05, "loss": 3.0998, "step": 15450 }, { "epoch": 0.8970379181293336, "grad_norm": 0.09131080657243729, "learning_rate": 2.87230406054802e-05, "loss": 3.1012, "step": 15460 }, { "epoch": 0.897618149641707, "grad_norm": 0.09509788453578949, "learning_rate": 2.8403403390901305e-05, "loss": 3.102, "step": 15470 }, { "epoch": 0.8981983811540805, "grad_norm": 0.09333484619855881, "learning_rate": 2.8085502644394355e-05, "loss": 3.1051, "step": 15480 }, { "epoch": 0.8987786126664539, "grad_norm": 0.09036233276128769, "learning_rate": 2.7769339536503125e-05, "loss": 3.1117, "step": 15490 }, { "epoch": 0.8993588441788274, "grad_norm": 0.08942391723394394, "learning_rate": 2.745491523137328e-05, "loss": 3.1117, "step": 15500 }, { "epoch": 0.8999390756912008, "grad_norm": 0.0910114273428917, "learning_rate": 2.7142230886748053e-05, "loss": 3.0984, "step": 15510 }, { "epoch": 0.9005193072035742, "grad_norm": 0.09009117633104324, "learning_rate": 2.683128765396403e-05, "loss": 3.0985, "step": 15520 }, { "epoch": 0.9010995387159476, "grad_norm": 0.0904909297823906, "learning_rate": 2.652208667794659e-05, "loss": 3.0974, "step": 15530 }, { "epoch": 0.9016797702283211, "grad_norm": 0.09261862933635712, "learning_rate": 2.6214629097206345e-05, "loss": 3.1042, "step": 15540 }, { "epoch": 0.9022600017406945, "grad_norm": 0.09034094959497452, "learning_rate": 2.5908916043834218e-05, "loss": 3.1026, "step": 15550 }, { "epoch": 0.902840233253068, "grad_norm": 0.09254541248083115, "learning_rate": 2.560494864349766e-05, "loss": 3.0954, "step": 15560 }, { "epoch": 0.9034204647654415, "grad_norm": 0.08995792269706726, "learning_rate": 2.530272801543654e-05, "loss": 3.1003, "step": 15570 }, { "epoch": 0.9040006962778149, "grad_norm": 0.08986230194568634, "learning_rate": 2.5002255272458806e-05, "loss": 3.0967, "step": 15580 }, { "epoch": 0.9045809277901883, "grad_norm": 0.08791092783212662, "learning_rate": 2.4703531520936572e-05, "loss": 3.0929, "step": 15590 }, { "epoch": 0.9051611593025617, "grad_norm": 0.09303991496562958, "learning_rate": 2.440655786080209e-05, "loss": 3.0981, "step": 15600 }, { "epoch": 0.9057413908149352, "grad_norm": 0.09381508827209473, "learning_rate": 2.4111335385543387e-05, "loss": 3.0977, "step": 15610 }, { "epoch": 0.9063216223273086, "grad_norm": 0.09249861538410187, "learning_rate": 2.3817865182200638e-05, "loss": 3.0969, "step": 15620 }, { "epoch": 0.906901853839682, "grad_norm": 0.09130273759365082, "learning_rate": 2.352614833136174e-05, "loss": 3.1012, "step": 15630 }, { "epoch": 0.9074820853520554, "grad_norm": 0.08810003846883774, "learning_rate": 2.3236185907158814e-05, "loss": 3.0956, "step": 15640 }, { "epoch": 0.908062316864429, "grad_norm": 0.09278014302253723, "learning_rate": 2.2947978977263807e-05, "loss": 3.1024, "step": 15650 }, { "epoch": 0.9086425483768024, "grad_norm": 0.09021242707967758, "learning_rate": 2.266152860288484e-05, "loss": 3.0915, "step": 15660 }, { "epoch": 0.9092227798891758, "grad_norm": 0.08989161998033524, "learning_rate": 2.2376835838762265e-05, "loss": 3.0851, "step": 15670 }, { "epoch": 0.9098030114015492, "grad_norm": 0.09114709496498108, "learning_rate": 2.2093901733164612e-05, "loss": 3.1014, "step": 15680 }, { "epoch": 0.9103832429139227, "grad_norm": 0.08926232159137726, "learning_rate": 2.1812727327884918e-05, "loss": 3.0965, "step": 15690 }, { "epoch": 0.9109634744262961, "grad_norm": 0.09116176515817642, "learning_rate": 2.1533313658236688e-05, "loss": 3.1009, "step": 15700 }, { "epoch": 0.9115437059386695, "grad_norm": 0.08799432218074799, "learning_rate": 2.1255661753050492e-05, "loss": 3.1023, "step": 15710 }, { "epoch": 0.9121239374510429, "grad_norm": 0.08743447065353394, "learning_rate": 2.097977263466966e-05, "loss": 3.0984, "step": 15720 }, { "epoch": 0.9127041689634164, "grad_norm": 0.09166787564754486, "learning_rate": 2.0705647318946806e-05, "loss": 3.097, "step": 15730 }, { "epoch": 0.9132844004757898, "grad_norm": 0.09091733396053314, "learning_rate": 2.0433286815240092e-05, "loss": 3.1049, "step": 15740 }, { "epoch": 0.9138646319881633, "grad_norm": 0.08930478990077972, "learning_rate": 2.0162692126409365e-05, "loss": 3.0977, "step": 15750 }, { "epoch": 0.9144448635005367, "grad_norm": 0.08997286111116409, "learning_rate": 1.989386424881273e-05, "loss": 3.1036, "step": 15760 }, { "epoch": 0.9150250950129102, "grad_norm": 0.08813077956438065, "learning_rate": 1.9626804172302447e-05, "loss": 3.1003, "step": 15770 }, { "epoch": 0.9156053265252836, "grad_norm": 0.08898695558309555, "learning_rate": 1.936151288022181e-05, "loss": 3.1065, "step": 15780 }, { "epoch": 0.916185558037657, "grad_norm": 0.0892912819981575, "learning_rate": 1.9097991349401156e-05, "loss": 3.1047, "step": 15790 }, { "epoch": 0.9167657895500304, "grad_norm": 0.09048785269260406, "learning_rate": 1.8836240550154205e-05, "loss": 3.1035, "step": 15800 }, { "epoch": 0.9173460210624039, "grad_norm": 0.08921167254447937, "learning_rate": 1.8576261446275057e-05, "loss": 3.1013, "step": 15810 }, { "epoch": 0.9179262525747773, "grad_norm": 0.08912645280361176, "learning_rate": 1.8318054995033805e-05, "loss": 3.0982, "step": 15820 }, { "epoch": 0.9185064840871507, "grad_norm": 0.09258027374744415, "learning_rate": 1.8061622147173716e-05, "loss": 3.1059, "step": 15830 }, { "epoch": 0.9190867155995243, "grad_norm": 0.08841285109519958, "learning_rate": 1.7806963846907498e-05, "loss": 3.095, "step": 15840 }, { "epoch": 0.9196669471118977, "grad_norm": 0.09142499417066574, "learning_rate": 1.7554081031913528e-05, "loss": 3.1007, "step": 15850 }, { "epoch": 0.9202471786242711, "grad_norm": 0.08727526664733887, "learning_rate": 1.7302974633332968e-05, "loss": 3.0974, "step": 15860 }, { "epoch": 0.9208274101366445, "grad_norm": 0.09390713274478912, "learning_rate": 1.7053645575765718e-05, "loss": 3.0998, "step": 15870 }, { "epoch": 0.921407641649018, "grad_norm": 0.09036395698785782, "learning_rate": 1.6806094777267744e-05, "loss": 3.0948, "step": 15880 }, { "epoch": 0.9219878731613914, "grad_norm": 0.08717726916074753, "learning_rate": 1.656032314934669e-05, "loss": 3.0995, "step": 15890 }, { "epoch": 0.9225681046737648, "grad_norm": 0.08829261362552643, "learning_rate": 1.631633159695972e-05, "loss": 3.0997, "step": 15900 }, { "epoch": 0.9231483361861382, "grad_norm": 0.09095877408981323, "learning_rate": 1.6074121018509137e-05, "loss": 3.099, "step": 15910 }, { "epoch": 0.9237285676985117, "grad_norm": 0.08995683491230011, "learning_rate": 1.5833692305839642e-05, "loss": 3.0973, "step": 15920 }, { "epoch": 0.9243087992108852, "grad_norm": 0.08980005234479904, "learning_rate": 1.5595046344235143e-05, "loss": 3.1039, "step": 15930 }, { "epoch": 0.9248890307232586, "grad_norm": 0.08741045743227005, "learning_rate": 1.535818401241479e-05, "loss": 3.1075, "step": 15940 }, { "epoch": 0.925469262235632, "grad_norm": 0.0891076922416687, "learning_rate": 1.512310618253071e-05, "loss": 3.0986, "step": 15950 }, { "epoch": 0.9260494937480055, "grad_norm": 0.0910383015871048, "learning_rate": 1.4889813720164013e-05, "loss": 3.1035, "step": 15960 }, { "epoch": 0.9266297252603789, "grad_norm": 0.08668383955955505, "learning_rate": 1.4658307484321953e-05, "loss": 3.1023, "step": 15970 }, { "epoch": 0.9272099567727523, "grad_norm": 0.08511517196893692, "learning_rate": 1.4428588327434933e-05, "loss": 3.0929, "step": 15980 }, { "epoch": 0.9277901882851257, "grad_norm": 0.08581209927797318, "learning_rate": 1.4200657095352676e-05, "loss": 3.1002, "step": 15990 }, { "epoch": 0.9283704197974992, "grad_norm": 0.08814380317926407, "learning_rate": 1.397451462734206e-05, "loss": 3.098, "step": 16000 }, { "epoch": 0.9283704197974992, "eval_loss": 3.031247854232788, "eval_runtime": 3.2581, "eval_samples_per_second": 1328.977, "eval_steps_per_second": 10.435, "step": 16000 }, { "epoch": 0.9289506513098726, "grad_norm": 0.08873005956411362, "learning_rate": 1.3750161756083234e-05, "loss": 3.1036, "step": 16010 }, { "epoch": 0.929530882822246, "grad_norm": 0.08824755996465683, "learning_rate": 1.3527599307667005e-05, "loss": 3.0983, "step": 16020 }, { "epoch": 0.9301111143346195, "grad_norm": 0.08939357846975327, "learning_rate": 1.3306828101591728e-05, "loss": 3.1033, "step": 16030 }, { "epoch": 0.930691345846993, "grad_norm": 0.08686842769384384, "learning_rate": 1.3087848950759873e-05, "loss": 3.0965, "step": 16040 }, { "epoch": 0.9312715773593664, "grad_norm": 0.08945832401514053, "learning_rate": 1.2870662661475852e-05, "loss": 3.0951, "step": 16050 }, { "epoch": 0.9318518088717398, "grad_norm": 0.088344506919384, "learning_rate": 1.2655270033442189e-05, "loss": 3.1039, "step": 16060 }, { "epoch": 0.9324320403841133, "grad_norm": 0.08870889991521835, "learning_rate": 1.2441671859757143e-05, "loss": 3.0998, "step": 16070 }, { "epoch": 0.9330122718964867, "grad_norm": 0.08570938557386398, "learning_rate": 1.2229868926911636e-05, "loss": 3.0957, "step": 16080 }, { "epoch": 0.9335925034088601, "grad_norm": 0.0883183628320694, "learning_rate": 1.201986201478611e-05, "loss": 3.1008, "step": 16090 }, { "epoch": 0.9341727349212335, "grad_norm": 0.09012133628129959, "learning_rate": 1.1811651896648178e-05, "loss": 3.0915, "step": 16100 }, { "epoch": 0.9347529664336071, "grad_norm": 0.09092947840690613, "learning_rate": 1.1605239339149199e-05, "loss": 3.0886, "step": 16110 }, { "epoch": 0.9353331979459805, "grad_norm": 0.08645268529653549, "learning_rate": 1.140062510232187e-05, "loss": 3.09, "step": 16120 }, { "epoch": 0.9359134294583539, "grad_norm": 0.08784764260053635, "learning_rate": 1.1197809939577197e-05, "loss": 3.0997, "step": 16130 }, { "epoch": 0.9364936609707273, "grad_norm": 0.0903283953666687, "learning_rate": 1.0996794597701865e-05, "loss": 3.1108, "step": 16140 }, { "epoch": 0.9370738924831008, "grad_norm": 0.0857265442609787, "learning_rate": 1.0797579816855585e-05, "loss": 3.1028, "step": 16150 }, { "epoch": 0.9376541239954742, "grad_norm": 0.09120402485132217, "learning_rate": 1.0600166330567761e-05, "loss": 3.0891, "step": 16160 }, { "epoch": 0.9382343555078476, "grad_norm": 0.09053795039653778, "learning_rate": 1.0404554865735771e-05, "loss": 3.1013, "step": 16170 }, { "epoch": 0.938814587020221, "grad_norm": 0.08675380051136017, "learning_rate": 1.0210746142621408e-05, "loss": 3.1088, "step": 16180 }, { "epoch": 0.9393948185325945, "grad_norm": 0.08844220638275146, "learning_rate": 1.0018740874848664e-05, "loss": 3.1075, "step": 16190 }, { "epoch": 0.939975050044968, "grad_norm": 0.09051468223333359, "learning_rate": 9.828539769401235e-06, "loss": 3.0957, "step": 16200 }, { "epoch": 0.9405552815573414, "grad_norm": 0.08637753129005432, "learning_rate": 9.640143526619239e-06, "loss": 3.103, "step": 16210 }, { "epoch": 0.9411355130697148, "grad_norm": 0.09072989225387573, "learning_rate": 9.45355284019761e-06, "loss": 3.1031, "step": 16220 }, { "epoch": 0.9417157445820883, "grad_norm": 0.08837340027093887, "learning_rate": 9.268768397182715e-06, "loss": 3.0989, "step": 16230 }, { "epoch": 0.9422959760944617, "grad_norm": 0.08710675686597824, "learning_rate": 9.085790877970234e-06, "loss": 3.0922, "step": 16240 }, { "epoch": 0.9428762076068351, "grad_norm": 0.08685487508773804, "learning_rate": 8.904620956302512e-06, "loss": 3.0877, "step": 16250 }, { "epoch": 0.9434564391192085, "grad_norm": 0.08923321217298508, "learning_rate": 8.725259299266209e-06, "loss": 3.0994, "step": 16260 }, { "epoch": 0.944036670631582, "grad_norm": 0.08733417093753815, "learning_rate": 8.547706567289814e-06, "loss": 3.0953, "step": 16270 }, { "epoch": 0.9446169021439554, "grad_norm": 0.08618912100791931, "learning_rate": 8.371963414140982e-06, "loss": 3.1021, "step": 16280 }, { "epoch": 0.9451971336563288, "grad_norm": 0.09028159826993942, "learning_rate": 8.198030486924468e-06, "loss": 3.1022, "step": 16290 }, { "epoch": 0.9457773651687024, "grad_norm": 0.09073447436094284, "learning_rate": 8.025908426079532e-06, "loss": 3.1016, "step": 16300 }, { "epoch": 0.9463575966810758, "grad_norm": 0.08651082217693329, "learning_rate": 7.85559786537754e-06, "loss": 3.1053, "step": 16310 }, { "epoch": 0.9469378281934492, "grad_norm": 0.08602554351091385, "learning_rate": 7.687099431919974e-06, "loss": 3.0999, "step": 16320 }, { "epoch": 0.9475180597058226, "grad_norm": 0.08610852062702179, "learning_rate": 7.520413746135657e-06, "loss": 3.1059, "step": 16330 }, { "epoch": 0.9480982912181961, "grad_norm": 0.08780808746814728, "learning_rate": 7.355541421778689e-06, "loss": 3.1046, "step": 16340 }, { "epoch": 0.9486785227305695, "grad_norm": 0.08994690328836441, "learning_rate": 7.1924830659262916e-06, "loss": 3.1094, "step": 16350 }, { "epoch": 0.9492587542429429, "grad_norm": 0.08709990233182907, "learning_rate": 7.03123927897642e-06, "loss": 3.105, "step": 16360 }, { "epoch": 0.9498389857553163, "grad_norm": 0.08651293069124222, "learning_rate": 6.871810654645483e-06, "loss": 3.0934, "step": 16370 }, { "epoch": 0.9504192172676899, "grad_norm": 0.08726444095373154, "learning_rate": 6.7141977799665685e-06, "loss": 3.0952, "step": 16380 }, { "epoch": 0.9509994487800633, "grad_norm": 0.08613457530736923, "learning_rate": 6.558401235286615e-06, "loss": 3.088, "step": 16390 }, { "epoch": 0.9515796802924367, "grad_norm": 0.088069386780262, "learning_rate": 6.404421594264909e-06, "loss": 3.0973, "step": 16400 }, { "epoch": 0.9521599118048101, "grad_norm": 0.08668968081474304, "learning_rate": 6.252259423870643e-06, "loss": 3.1089, "step": 16410 }, { "epoch": 0.9527401433171836, "grad_norm": 0.08948860317468643, "learning_rate": 6.10191528438081e-06, "loss": 3.0993, "step": 16420 }, { "epoch": 0.953320374829557, "grad_norm": 0.08769353479146957, "learning_rate": 5.953389729378256e-06, "loss": 3.1144, "step": 16430 }, { "epoch": 0.9539006063419304, "grad_norm": 0.08861144632101059, "learning_rate": 5.806683305749682e-06, "loss": 3.1077, "step": 16440 }, { "epoch": 0.9544808378543038, "grad_norm": 0.09008525311946869, "learning_rate": 5.661796553683541e-06, "loss": 3.101, "step": 16450 }, { "epoch": 0.9550610693666773, "grad_norm": 0.08720948547124863, "learning_rate": 5.518730006668027e-06, "loss": 3.1043, "step": 16460 }, { "epoch": 0.9556413008790507, "grad_norm": 0.08759420365095139, "learning_rate": 5.377484191489035e-06, "loss": 3.1016, "step": 16470 }, { "epoch": 0.9562215323914242, "grad_norm": 0.08649755269289017, "learning_rate": 5.238059628228598e-06, "loss": 3.0915, "step": 16480 }, { "epoch": 0.9568017639037976, "grad_norm": 0.0874122902750969, "learning_rate": 5.1004568302624456e-06, "loss": 3.1012, "step": 16490 }, { "epoch": 0.9573819954161711, "grad_norm": 0.08565357327461243, "learning_rate": 4.96467630425862e-06, "loss": 3.1011, "step": 16500 }, { "epoch": 0.9579622269285445, "grad_norm": 0.08730974048376083, "learning_rate": 4.830718550175139e-06, "loss": 3.1077, "step": 16510 }, { "epoch": 0.9585424584409179, "grad_norm": 0.08632799237966537, "learning_rate": 4.698584061258559e-06, "loss": 3.0943, "step": 16520 }, { "epoch": 0.9591226899532914, "grad_norm": 0.08760344982147217, "learning_rate": 4.5682733240418605e-06, "loss": 3.0995, "step": 16530 }, { "epoch": 0.9597029214656648, "grad_norm": 0.08842134475708008, "learning_rate": 4.439786818342784e-06, "loss": 3.1084, "step": 16540 }, { "epoch": 0.9602831529780382, "grad_norm": 0.08709140866994858, "learning_rate": 4.313125017262221e-06, "loss": 3.0968, "step": 16550 }, { "epoch": 0.9608633844904116, "grad_norm": 0.08542519807815552, "learning_rate": 4.188288387182104e-06, "loss": 3.097, "step": 16560 }, { "epoch": 0.9614436160027852, "grad_norm": 0.08856651186943054, "learning_rate": 4.065277387764077e-06, "loss": 3.0984, "step": 16570 }, { "epoch": 0.9620238475151586, "grad_norm": 0.09005647897720337, "learning_rate": 3.9440924719473805e-06, "loss": 3.0914, "step": 16580 }, { "epoch": 0.962604079027532, "grad_norm": 0.08719483762979507, "learning_rate": 3.82473408594769e-06, "loss": 3.0979, "step": 16590 }, { "epoch": 0.9631843105399054, "grad_norm": 0.08669883012771606, "learning_rate": 3.7072026692550608e-06, "loss": 3.098, "step": 16600 }, { "epoch": 0.9637645420522789, "grad_norm": 0.08828626573085785, "learning_rate": 3.5914986546323747e-06, "loss": 3.0995, "step": 16610 }, { "epoch": 0.9643447735646523, "grad_norm": 0.08661678433418274, "learning_rate": 3.4776224681141167e-06, "loss": 3.0966, "step": 16620 }, { "epoch": 0.9649250050770257, "grad_norm": 0.08606945723295212, "learning_rate": 3.3655745290042117e-06, "loss": 3.1034, "step": 16630 }, { "epoch": 0.9655052365893991, "grad_norm": 0.08614910393953323, "learning_rate": 3.255355249874914e-06, "loss": 3.1112, "step": 16640 }, { "epoch": 0.9660854681017726, "grad_norm": 0.08819039165973663, "learning_rate": 3.1469650365652525e-06, "loss": 3.0944, "step": 16650 }, { "epoch": 0.9666656996141461, "grad_norm": 0.08805646747350693, "learning_rate": 3.0404042881792546e-06, "loss": 3.0881, "step": 16660 }, { "epoch": 0.9672459311265195, "grad_norm": 0.08734241127967834, "learning_rate": 2.9356733970847817e-06, "loss": 3.0993, "step": 16670 }, { "epoch": 0.9678261626388929, "grad_norm": 0.08561859279870987, "learning_rate": 2.832772748911916e-06, "loss": 3.0975, "step": 16680 }, { "epoch": 0.9684063941512664, "grad_norm": 0.08738164603710175, "learning_rate": 2.7317027225516323e-06, "loss": 3.1009, "step": 16690 }, { "epoch": 0.9689866256636398, "grad_norm": 0.08456070721149445, "learning_rate": 2.632463690154463e-06, "loss": 3.1, "step": 16700 }, { "epoch": 0.9695668571760132, "grad_norm": 0.08673311769962311, "learning_rate": 2.5350560171287783e-06, "loss": 3.1015, "step": 16710 }, { "epoch": 0.9701470886883866, "grad_norm": 0.09023085236549377, "learning_rate": 2.439480062139954e-06, "loss": 3.101, "step": 16720 }, { "epoch": 0.9707273202007601, "grad_norm": 0.08622407913208008, "learning_rate": 2.345736177108537e-06, "loss": 3.1137, "step": 16730 }, { "epoch": 0.9713075517131335, "grad_norm": 0.08697347342967987, "learning_rate": 2.2538247072094177e-06, "loss": 3.1018, "step": 16740 }, { "epoch": 0.971887783225507, "grad_norm": 0.0895911455154419, "learning_rate": 2.1637459908702695e-06, "loss": 3.1039, "step": 16750 }, { "epoch": 0.9724680147378804, "grad_norm": 0.08516402542591095, "learning_rate": 2.075500359770277e-06, "loss": 3.102, "step": 16760 }, { "epoch": 0.9730482462502539, "grad_norm": 0.08854671567678452, "learning_rate": 1.98908813883919e-06, "loss": 3.0975, "step": 16770 }, { "epoch": 0.9736284777626273, "grad_norm": 0.08487720042467117, "learning_rate": 1.9045096462558253e-06, "loss": 3.0992, "step": 16780 }, { "epoch": 0.9742087092750007, "grad_norm": 0.08441456407308578, "learning_rate": 1.8217651934470669e-06, "loss": 3.1019, "step": 16790 }, { "epoch": 0.9747889407873742, "grad_norm": 0.08551038801670074, "learning_rate": 1.74085508508659e-06, "loss": 3.098, "step": 16800 }, { "epoch": 0.9753691722997476, "grad_norm": 0.08782043308019638, "learning_rate": 1.6617796190939726e-06, "loss": 3.1039, "step": 16810 }, { "epoch": 0.975949403812121, "grad_norm": 0.08647370338439941, "learning_rate": 1.5845390866333631e-06, "loss": 3.104, "step": 16820 }, { "epoch": 0.9765296353244944, "grad_norm": 0.08616235107183456, "learning_rate": 1.5091337721124254e-06, "loss": 3.0958, "step": 16830 }, { "epoch": 0.977109866836868, "grad_norm": 0.08559519797563553, "learning_rate": 1.4355639531815067e-06, "loss": 3.1067, "step": 16840 }, { "epoch": 0.9776900983492414, "grad_norm": 0.08562770485877991, "learning_rate": 1.363829900732305e-06, "loss": 3.1058, "step": 16850 }, { "epoch": 0.9782703298616148, "grad_norm": 0.08564095199108124, "learning_rate": 1.2939318788971477e-06, "loss": 3.0996, "step": 16860 }, { "epoch": 0.9788505613739882, "grad_norm": 0.08510784804821014, "learning_rate": 1.225870145047936e-06, "loss": 3.092, "step": 16870 }, { "epoch": 0.9794307928863617, "grad_norm": 0.08742259442806244, "learning_rate": 1.1596449497949802e-06, "loss": 3.0902, "step": 16880 }, { "epoch": 0.9800110243987351, "grad_norm": 0.08536962419748306, "learning_rate": 1.0952565369864997e-06, "loss": 3.0974, "step": 16890 }, { "epoch": 0.9805912559111085, "grad_norm": 0.08702561259269714, "learning_rate": 1.0327051437073464e-06, "loss": 3.1094, "step": 16900 }, { "epoch": 0.9811714874234819, "grad_norm": 0.08652474731206894, "learning_rate": 9.719910002782829e-07, "loss": 3.1017, "step": 16910 }, { "epoch": 0.9817517189358554, "grad_norm": 0.08817258477210999, "learning_rate": 9.131143302551492e-07, "loss": 3.1042, "step": 16920 }, { "epoch": 0.9823319504482289, "grad_norm": 0.08725294470787048, "learning_rate": 8.560753504279761e-07, "loss": 3.1031, "step": 16930 }, { "epoch": 0.9829121819606023, "grad_norm": 0.0844888985157013, "learning_rate": 8.008742708203731e-07, "loss": 3.1036, "step": 16940 }, { "epoch": 0.9834924134729757, "grad_norm": 0.08515673130750656, "learning_rate": 7.475112946883633e-07, "loss": 3.0962, "step": 16950 }, { "epoch": 0.9840726449853492, "grad_norm": 0.08552297949790955, "learning_rate": 6.959866185201058e-07, "loss": 3.0973, "step": 16960 }, { "epoch": 0.9846528764977226, "grad_norm": 0.08687438070774078, "learning_rate": 6.463004320348409e-07, "loss": 3.1033, "step": 16970 }, { "epoch": 0.985233108010096, "grad_norm": 0.0858420580625534, "learning_rate": 5.984529181822795e-07, "loss": 3.1055, "step": 16980 }, { "epoch": 0.9858133395224694, "grad_norm": 0.08427808433771133, "learning_rate": 5.524442531419927e-07, "loss": 3.0978, "step": 16990 }, { "epoch": 0.9863935710348429, "grad_norm": 0.0840638130903244, "learning_rate": 5.08274606322745e-07, "loss": 3.092, "step": 17000 }, { "epoch": 0.9863935710348429, "eval_loss": 3.0300889015197754, "eval_runtime": 3.2481, "eval_samples_per_second": 1333.079, "eval_steps_per_second": 10.468, "step": 17000 }, { "epoch": 0.9869738025472163, "grad_norm": 0.08689724653959274, "learning_rate": 4.6594414036171815e-07, "loss": 3.1035, "step": 17010 }, { "epoch": 0.9875540340595897, "grad_norm": 0.08669095486402512, "learning_rate": 4.2545301112423274e-07, "loss": 3.0939, "step": 17020 }, { "epoch": 0.9881342655719633, "grad_norm": 0.08550863713026047, "learning_rate": 3.868013677028048e-07, "loss": 3.097, "step": 17030 }, { "epoch": 0.9887144970843367, "grad_norm": 0.08494267612695694, "learning_rate": 3.4998935241681295e-07, "loss": 3.0995, "step": 17040 }, { "epoch": 0.9892947285967101, "grad_norm": 0.08708130568265915, "learning_rate": 3.1501710081199843e-07, "loss": 3.1011, "step": 17050 }, { "epoch": 0.9898749601090835, "grad_norm": 0.08479303121566772, "learning_rate": 2.8188474165979915e-07, "loss": 3.1057, "step": 17060 }, { "epoch": 0.990455191621457, "grad_norm": 0.08544128388166428, "learning_rate": 2.505923969571278e-07, "loss": 3.084, "step": 17070 }, { "epoch": 0.9910354231338304, "grad_norm": 0.08571518212556839, "learning_rate": 2.2114018192553874e-07, "loss": 3.1036, "step": 17080 }, { "epoch": 0.9916156546462038, "grad_norm": 0.08791361004114151, "learning_rate": 1.9352820501133961e-07, "loss": 3.1063, "step": 17090 }, { "epoch": 0.9921958861585772, "grad_norm": 0.08581080287694931, "learning_rate": 1.6775656788459158e-07, "loss": 3.1072, "step": 17100 }, { "epoch": 0.9927761176709508, "grad_norm": 0.0846245288848877, "learning_rate": 1.4382536543922076e-07, "loss": 3.0959, "step": 17110 }, { "epoch": 0.9933563491833242, "grad_norm": 0.08586116135120392, "learning_rate": 1.217346857924073e-07, "loss": 3.114, "step": 17120 }, { "epoch": 0.9939365806956976, "grad_norm": 0.0862361267209053, "learning_rate": 1.014846102843081e-07, "loss": 3.0948, "step": 17130 }, { "epoch": 0.994516812208071, "grad_norm": 0.08657976239919662, "learning_rate": 8.307521347789005e-08, "loss": 3.1003, "step": 17140 }, { "epoch": 0.9950970437204445, "grad_norm": 0.0862279012799263, "learning_rate": 6.650656315848602e-08, "loss": 3.0992, "step": 17150 }, { "epoch": 0.9956772752328179, "grad_norm": 0.0885181874036789, "learning_rate": 5.1778720333517383e-08, "loss": 3.0967, "step": 17160 }, { "epoch": 0.9962575067451913, "grad_norm": 0.08494796603918076, "learning_rate": 3.88917392325494e-08, "loss": 3.1039, "step": 17170 }, { "epoch": 0.9968377382575647, "grad_norm": 0.08858942240476608, "learning_rate": 2.78456673066807e-08, "loss": 3.0973, "step": 17180 }, { "epoch": 0.9974179697699382, "grad_norm": 0.08530562371015549, "learning_rate": 1.8640545228820748e-08, "loss": 3.0982, "step": 17190 }, { "epoch": 0.9979982012823116, "grad_norm": 0.08376429975032806, "learning_rate": 1.1276406893079294e-08, "loss": 3.0963, "step": 17200 }, { "epoch": 0.9985784327946851, "grad_norm": 0.08597017079591751, "learning_rate": 5.7532794150994e-09, "loss": 3.1043, "step": 17210 }, { "epoch": 0.9991586643070585, "grad_norm": 0.08577102422714233, "learning_rate": 2.0711831315578524e-09, "loss": 3.0981, "step": 17220 }, { "epoch": 0.999738895819432, "grad_norm": 0.09527655690908432, "learning_rate": 2.3013160027618442e-10, "loss": 3.0959, "step": 17230 }, { "epoch": 0.9999709884243814, "step": 17234, "total_flos": 7.47219573773697e+18, "train_loss": 3.3572154520448043, "train_runtime": 15550.2697, "train_samples_per_second": 567.438, "train_steps_per_second": 1.108 } ], "logging_steps": 10, "max_steps": 17234, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.47219573773697e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }