diff --git "a/contextlm_gpt2_base/trainer_state.json" "b/contextlm_gpt2_base/trainer_state.json" new file mode 100644--- /dev/null +++ "b/contextlm_gpt2_base/trainer_state.json" @@ -0,0 +1,12240 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999709884243814, + "eval_steps": 1000, + "global_step": 17234, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000580231512373437, + "grad_norm": 4.24833869934082, + "learning_rate": 1.0440835266821346e-05, + "loss": 10.74, + "step": 10 + }, + { + "epoch": 0.001160463024746874, + "grad_norm": 2.10276198387146, + "learning_rate": 2.2041763341067284e-05, + "loss": 9.9422, + "step": 20 + }, + { + "epoch": 0.001740694537120311, + "grad_norm": 2.0392377376556396, + "learning_rate": 3.364269141531322e-05, + "loss": 9.4932, + "step": 30 + }, + { + "epoch": 0.002320926049493748, + "grad_norm": 1.8358850479125977, + "learning_rate": 4.5243619489559165e-05, + "loss": 9.1248, + "step": 40 + }, + { + "epoch": 0.002901157561867185, + "grad_norm": 1.6119049787521362, + "learning_rate": 5.68445475638051e-05, + "loss": 8.6786, + "step": 50 + }, + { + "epoch": 0.003481389074240622, + "grad_norm": 1.438369870185852, + "learning_rate": 6.844547563805105e-05, + "loss": 8.2109, + "step": 60 + }, + { + "epoch": 0.004061620586614059, + "grad_norm": 1.2206534147262573, + "learning_rate": 8.004640371229699e-05, + "loss": 7.7847, + "step": 70 + }, + { + "epoch": 0.004641852098987496, + "grad_norm": 1.797905445098877, + "learning_rate": 9.164733178654293e-05, + "loss": 7.4453, + "step": 80 + }, + { + "epoch": 0.005222083611360933, + "grad_norm": 0.9732237458229065, + "learning_rate": 0.00010324825986078886, + "loss": 7.2239, + "step": 90 + }, + { + "epoch": 0.00580231512373437, + "grad_norm": 0.704902172088623, + "learning_rate": 0.0001148491879350348, + "loss": 7.0716, + "step": 100 + }, + { + "epoch": 0.006382546636107807, + "grad_norm": 0.7608431577682495, + "learning_rate": 0.00012645011600928075, + "loss": 6.9281, + "step": 110 + }, + { + "epoch": 0.006962778148481244, + "grad_norm": 0.5090209245681763, + "learning_rate": 0.00013805104408352666, + "loss": 6.7721, + "step": 120 + }, + { + "epoch": 0.007543009660854681, + "grad_norm": 0.4300777018070221, + "learning_rate": 0.00014965197215777263, + "loss": 6.6322, + "step": 130 + }, + { + "epoch": 0.008123241173228117, + "grad_norm": 0.8929088115692139, + "learning_rate": 0.00016125290023201856, + "loss": 6.5415, + "step": 140 + }, + { + "epoch": 0.008703472685601555, + "grad_norm": 0.3701019287109375, + "learning_rate": 0.0001728538283062645, + "loss": 6.4223, + "step": 150 + }, + { + "epoch": 0.009283704197974993, + "grad_norm": 0.5044598579406738, + "learning_rate": 0.00018445475638051046, + "loss": 6.3371, + "step": 160 + }, + { + "epoch": 0.009863935710348428, + "grad_norm": 0.42209455370903015, + "learning_rate": 0.00019605568445475637, + "loss": 6.2547, + "step": 170 + }, + { + "epoch": 0.010444167222721866, + "grad_norm": 0.3204844295978546, + "learning_rate": 0.00020765661252900234, + "loss": 6.1801, + "step": 180 + }, + { + "epoch": 0.011024398735095304, + "grad_norm": 0.7174881100654602, + "learning_rate": 0.00021925754060324827, + "loss": 6.1174, + "step": 190 + }, + { + "epoch": 0.01160463024746874, + "grad_norm": 0.37421539425849915, + "learning_rate": 0.0002308584686774942, + "loss": 6.0759, + "step": 200 + }, + { + "epoch": 0.012184861759842177, + "grad_norm": 0.5762574672698975, + "learning_rate": 0.00024245939675174015, + "loss": 5.9975, + "step": 210 + }, + { + "epoch": 0.012765093272215615, + "grad_norm": 0.29894348978996277, + "learning_rate": 0.00025406032482598606, + "loss": 5.9512, + "step": 220 + }, + { + "epoch": 0.01334532478458905, + "grad_norm": 0.37387117743492126, + "learning_rate": 0.000265661252900232, + "loss": 5.9045, + "step": 230 + }, + { + "epoch": 0.013925556296962488, + "grad_norm": 0.4004443883895874, + "learning_rate": 0.000277262180974478, + "loss": 5.872, + "step": 240 + }, + { + "epoch": 0.014505787809335926, + "grad_norm": 0.43937498331069946, + "learning_rate": 0.0002888631090487239, + "loss": 5.8183, + "step": 250 + }, + { + "epoch": 0.015086019321709361, + "grad_norm": 0.3658200204372406, + "learning_rate": 0.0003004640371229698, + "loss": 5.7888, + "step": 260 + }, + { + "epoch": 0.0156662508340828, + "grad_norm": 0.3341761827468872, + "learning_rate": 0.0003120649651972158, + "loss": 5.7416, + "step": 270 + }, + { + "epoch": 0.016246482346456235, + "grad_norm": 0.46115365624427795, + "learning_rate": 0.00032366589327146174, + "loss": 5.7067, + "step": 280 + }, + { + "epoch": 0.016826713858829674, + "grad_norm": 0.3009159564971924, + "learning_rate": 0.00033526682134570767, + "loss": 5.665, + "step": 290 + }, + { + "epoch": 0.01740694537120311, + "grad_norm": 0.4389355778694153, + "learning_rate": 0.0003468677494199536, + "loss": 5.6231, + "step": 300 + }, + { + "epoch": 0.017987176883576546, + "grad_norm": 0.4746924042701721, + "learning_rate": 0.00035846867749419955, + "loss": 5.5983, + "step": 310 + }, + { + "epoch": 0.018567408395949985, + "grad_norm": 0.33507513999938965, + "learning_rate": 0.0003700696055684455, + "loss": 5.5685, + "step": 320 + }, + { + "epoch": 0.01914763990832342, + "grad_norm": 0.6776261329650879, + "learning_rate": 0.0003816705336426914, + "loss": 5.5087, + "step": 330 + }, + { + "epoch": 0.019727871420696857, + "grad_norm": 0.4664747416973114, + "learning_rate": 0.00039327146171693736, + "loss": 5.4964, + "step": 340 + }, + { + "epoch": 0.020308102933070296, + "grad_norm": 0.4787660539150238, + "learning_rate": 0.0004048723897911833, + "loss": 5.4473, + "step": 350 + }, + { + "epoch": 0.020888334445443732, + "grad_norm": 0.41453346610069275, + "learning_rate": 0.00041647331786542923, + "loss": 5.4019, + "step": 360 + }, + { + "epoch": 0.021468565957817168, + "grad_norm": 0.5684117078781128, + "learning_rate": 0.0004280742459396752, + "loss": 5.3821, + "step": 370 + }, + { + "epoch": 0.022048797470190607, + "grad_norm": 0.34503793716430664, + "learning_rate": 0.0004396751740139211, + "loss": 5.3503, + "step": 380 + }, + { + "epoch": 0.022629028982564043, + "grad_norm": 0.465599924325943, + "learning_rate": 0.00045127610208816704, + "loss": 5.3183, + "step": 390 + }, + { + "epoch": 0.02320926049493748, + "grad_norm": 0.3912290036678314, + "learning_rate": 0.000462877030162413, + "loss": 5.2937, + "step": 400 + }, + { + "epoch": 0.023789492007310918, + "grad_norm": 0.42982372641563416, + "learning_rate": 0.00047447795823665897, + "loss": 5.2526, + "step": 410 + }, + { + "epoch": 0.024369723519684354, + "grad_norm": 0.6874813437461853, + "learning_rate": 0.00048607888631090485, + "loss": 5.2364, + "step": 420 + }, + { + "epoch": 0.02494995503205779, + "grad_norm": 0.3257123529911041, + "learning_rate": 0.0004976798143851508, + "loss": 5.2107, + "step": 430 + }, + { + "epoch": 0.02553018654443123, + "grad_norm": 0.43527376651763916, + "learning_rate": 0.0005092807424593968, + "loss": 5.1765, + "step": 440 + }, + { + "epoch": 0.026110418056804665, + "grad_norm": 0.36336761713027954, + "learning_rate": 0.0005208816705336427, + "loss": 5.1535, + "step": 450 + }, + { + "epoch": 0.0266906495691781, + "grad_norm": 0.5609498023986816, + "learning_rate": 0.0005324825986078887, + "loss": 5.1106, + "step": 460 + }, + { + "epoch": 0.02727088108155154, + "grad_norm": 0.3533839285373688, + "learning_rate": 0.0005440835266821345, + "loss": 5.0758, + "step": 470 + }, + { + "epoch": 0.027851112593924976, + "grad_norm": 0.3420592248439789, + "learning_rate": 0.0005556844547563805, + "loss": 5.0461, + "step": 480 + }, + { + "epoch": 0.028431344106298412, + "grad_norm": 0.34275180101394653, + "learning_rate": 0.0005672853828306265, + "loss": 5.0096, + "step": 490 + }, + { + "epoch": 0.02901157561867185, + "grad_norm": 0.36812710762023926, + "learning_rate": 0.0005788863109048724, + "loss": 4.989, + "step": 500 + }, + { + "epoch": 0.029591807131045287, + "grad_norm": 0.30121806263923645, + "learning_rate": 0.0005904872389791184, + "loss": 4.947, + "step": 510 + }, + { + "epoch": 0.030172038643418723, + "grad_norm": 0.30486181378364563, + "learning_rate": 0.0006020881670533644, + "loss": 4.9224, + "step": 520 + }, + { + "epoch": 0.030752270155792162, + "grad_norm": 0.36893951892852783, + "learning_rate": 0.0006136890951276102, + "loss": 4.8849, + "step": 530 + }, + { + "epoch": 0.0313325016681656, + "grad_norm": 0.3596450090408325, + "learning_rate": 0.0006252900232018562, + "loss": 4.8539, + "step": 540 + }, + { + "epoch": 0.031912733180539034, + "grad_norm": 0.3460347354412079, + "learning_rate": 0.000636890951276102, + "loss": 4.8462, + "step": 550 + }, + { + "epoch": 0.03249296469291247, + "grad_norm": 0.353222519159317, + "learning_rate": 0.000648491879350348, + "loss": 4.8129, + "step": 560 + }, + { + "epoch": 0.03307319620528591, + "grad_norm": 0.23676836490631104, + "learning_rate": 0.000660092807424594, + "loss": 4.7744, + "step": 570 + }, + { + "epoch": 0.03365342771765935, + "grad_norm": 0.3343792259693146, + "learning_rate": 0.0006716937354988399, + "loss": 4.7622, + "step": 580 + }, + { + "epoch": 0.034233659230032784, + "grad_norm": 0.24879467487335205, + "learning_rate": 0.0006832946635730859, + "loss": 4.7259, + "step": 590 + }, + { + "epoch": 0.03481389074240622, + "grad_norm": 0.3369862139225006, + "learning_rate": 0.0006948955916473319, + "loss": 4.6865, + "step": 600 + }, + { + "epoch": 0.035394122254779656, + "grad_norm": 0.3400874435901642, + "learning_rate": 0.0007064965197215777, + "loss": 4.6676, + "step": 610 + }, + { + "epoch": 0.03597435376715309, + "grad_norm": 0.30212903022766113, + "learning_rate": 0.0007180974477958236, + "loss": 4.6644, + "step": 620 + }, + { + "epoch": 0.036554585279526534, + "grad_norm": 0.27961453795433044, + "learning_rate": 0.0007296983758700696, + "loss": 4.6348, + "step": 630 + }, + { + "epoch": 0.03713481679189997, + "grad_norm": 0.33748018741607666, + "learning_rate": 0.0007412993039443155, + "loss": 4.5963, + "step": 640 + }, + { + "epoch": 0.037715048304273406, + "grad_norm": 0.31729650497436523, + "learning_rate": 0.0007529002320185615, + "loss": 4.5758, + "step": 650 + }, + { + "epoch": 0.03829527981664684, + "grad_norm": 0.22230634093284607, + "learning_rate": 0.0007645011600928075, + "loss": 4.5496, + "step": 660 + }, + { + "epoch": 0.03887551132902028, + "grad_norm": 0.2796823978424072, + "learning_rate": 0.0007761020881670534, + "loss": 4.5275, + "step": 670 + }, + { + "epoch": 0.039455742841393714, + "grad_norm": 0.30923864245414734, + "learning_rate": 0.0007877030162412994, + "loss": 4.5012, + "step": 680 + }, + { + "epoch": 0.040035974353767156, + "grad_norm": 0.2574792504310608, + "learning_rate": 0.0007993039443155452, + "loss": 4.4765, + "step": 690 + }, + { + "epoch": 0.04061620586614059, + "grad_norm": 0.26686057448387146, + "learning_rate": 0.0008109048723897911, + "loss": 4.4458, + "step": 700 + }, + { + "epoch": 0.04119643737851403, + "grad_norm": 0.31044116616249084, + "learning_rate": 0.0008225058004640371, + "loss": 4.4281, + "step": 710 + }, + { + "epoch": 0.041776668890887464, + "grad_norm": 0.27075859904289246, + "learning_rate": 0.000834106728538283, + "loss": 4.4045, + "step": 720 + }, + { + "epoch": 0.0423569004032609, + "grad_norm": 0.25896894931793213, + "learning_rate": 0.000845707656612529, + "loss": 4.3825, + "step": 730 + }, + { + "epoch": 0.042937131915634336, + "grad_norm": 0.20856112241744995, + "learning_rate": 0.000857308584686775, + "loss": 4.3483, + "step": 740 + }, + { + "epoch": 0.04351736342800778, + "grad_norm": 0.3068506121635437, + "learning_rate": 0.0008689095127610209, + "loss": 4.3236, + "step": 750 + }, + { + "epoch": 0.044097594940381214, + "grad_norm": 0.2396797239780426, + "learning_rate": 0.0008805104408352669, + "loss": 4.3045, + "step": 760 + }, + { + "epoch": 0.04467782645275465, + "grad_norm": 0.32250258326530457, + "learning_rate": 0.0008921113689095129, + "loss": 4.2823, + "step": 770 + }, + { + "epoch": 0.045258057965128086, + "grad_norm": 0.28663381934165955, + "learning_rate": 0.0009037122969837586, + "loss": 4.2603, + "step": 780 + }, + { + "epoch": 0.04583828947750152, + "grad_norm": 0.26493045687675476, + "learning_rate": 0.0009153132250580046, + "loss": 4.2477, + "step": 790 + }, + { + "epoch": 0.04641852098987496, + "grad_norm": 0.2331818789243698, + "learning_rate": 0.0009269141531322506, + "loss": 4.2247, + "step": 800 + }, + { + "epoch": 0.0469987525022484, + "grad_norm": 0.31670576333999634, + "learning_rate": 0.0009385150812064965, + "loss": 4.1969, + "step": 810 + }, + { + "epoch": 0.047578984014621836, + "grad_norm": 0.2539173662662506, + "learning_rate": 0.0009501160092807425, + "loss": 4.1818, + "step": 820 + }, + { + "epoch": 0.04815921552699527, + "grad_norm": 0.24318990111351013, + "learning_rate": 0.0009617169373549885, + "loss": 4.1624, + "step": 830 + }, + { + "epoch": 0.04873944703936871, + "grad_norm": 0.2574463486671448, + "learning_rate": 0.0009733178654292344, + "loss": 4.1577, + "step": 840 + }, + { + "epoch": 0.049319678551742144, + "grad_norm": 0.28978243470191956, + "learning_rate": 0.0009849187935034804, + "loss": 4.1412, + "step": 850 + }, + { + "epoch": 0.04989991006411558, + "grad_norm": 0.20802126824855804, + "learning_rate": 0.0009965197215777261, + "loss": 4.1262, + "step": 860 + }, + { + "epoch": 0.05048014157648902, + "grad_norm": 0.1996484100818634, + "learning_rate": 0.0009999995489420968, + "loss": 4.1093, + "step": 870 + }, + { + "epoch": 0.05106037308886246, + "grad_norm": 0.2888406813144684, + "learning_rate": 0.0009999973396808558, + "loss": 4.0934, + "step": 880 + }, + { + "epoch": 0.051640604601235894, + "grad_norm": 0.23817220330238342, + "learning_rate": 0.0009999932893770317, + "loss": 4.0812, + "step": 890 + }, + { + "epoch": 0.05222083611360933, + "grad_norm": 0.21273092925548553, + "learning_rate": 0.0009999873980455383, + "loss": 4.0675, + "step": 900 + }, + { + "epoch": 0.052801067625982766, + "grad_norm": 0.2241075336933136, + "learning_rate": 0.000999979665708068, + "loss": 4.0543, + "step": 910 + }, + { + "epoch": 0.0533812991383562, + "grad_norm": 0.2210296392440796, + "learning_rate": 0.000999970092393092, + "loss": 4.0374, + "step": 920 + }, + { + "epoch": 0.053961530650729644, + "grad_norm": 0.21746701002120972, + "learning_rate": 0.0009999586781358604, + "loss": 4.0324, + "step": 930 + }, + { + "epoch": 0.05454176216310308, + "grad_norm": 0.20182138681411743, + "learning_rate": 0.0009999454229784018, + "loss": 4.0211, + "step": 940 + }, + { + "epoch": 0.055121993675476516, + "grad_norm": 0.20171615481376648, + "learning_rate": 0.0009999303269695226, + "loss": 4.0011, + "step": 950 + }, + { + "epoch": 0.05570222518784995, + "grad_norm": 0.27165260910987854, + "learning_rate": 0.0009999133901648083, + "loss": 3.9968, + "step": 960 + }, + { + "epoch": 0.05628245670022339, + "grad_norm": 0.2524718940258026, + "learning_rate": 0.0009998946126266218, + "loss": 3.9876, + "step": 970 + }, + { + "epoch": 0.056862688212596824, + "grad_norm": 0.18922555446624756, + "learning_rate": 0.0009998739944241041, + "loss": 3.9756, + "step": 980 + }, + { + "epoch": 0.057442919724970266, + "grad_norm": 0.2163887917995453, + "learning_rate": 0.0009998515356331734, + "loss": 3.953, + "step": 990 + }, + { + "epoch": 0.0580231512373437, + "grad_norm": 0.24239082634449005, + "learning_rate": 0.0009998272363365254, + "loss": 3.9616, + "step": 1000 + }, + { + "epoch": 0.0580231512373437, + "eval_loss": 3.8911185264587402, + "eval_runtime": 3.2761, + "eval_samples_per_second": 1321.706, + "eval_steps_per_second": 10.378, + "step": 1000 + }, + { + "epoch": 0.05860338274971714, + "grad_norm": 0.19143226742744446, + "learning_rate": 0.000999801096623633, + "loss": 3.9342, + "step": 1010 + }, + { + "epoch": 0.059183614262090574, + "grad_norm": 0.23061209917068481, + "learning_rate": 0.000999773116590745, + "loss": 3.932, + "step": 1020 + }, + { + "epoch": 0.05976384577446401, + "grad_norm": 0.20795617997646332, + "learning_rate": 0.0009997432963408865, + "loss": 3.9247, + "step": 1030 + }, + { + "epoch": 0.060344077286837446, + "grad_norm": 0.22474446892738342, + "learning_rate": 0.0009997116359838595, + "loss": 3.9318, + "step": 1040 + }, + { + "epoch": 0.06092430879921089, + "grad_norm": 0.1934744417667389, + "learning_rate": 0.00099967813563624, + "loss": 3.9166, + "step": 1050 + }, + { + "epoch": 0.061504540311584324, + "grad_norm": 0.22670230269432068, + "learning_rate": 0.0009996427954213807, + "loss": 3.8964, + "step": 1060 + }, + { + "epoch": 0.06208477182395776, + "grad_norm": 0.193163201212883, + "learning_rate": 0.0009996056154694072, + "loss": 3.9087, + "step": 1070 + }, + { + "epoch": 0.0626650033363312, + "grad_norm": 0.26965293288230896, + "learning_rate": 0.0009995665959172202, + "loss": 3.8943, + "step": 1080 + }, + { + "epoch": 0.06324523484870463, + "grad_norm": 0.16498436033725739, + "learning_rate": 0.0009995257369084939, + "loss": 3.8751, + "step": 1090 + }, + { + "epoch": 0.06382546636107807, + "grad_norm": 0.18623340129852295, + "learning_rate": 0.0009994830385936754, + "loss": 3.8759, + "step": 1100 + }, + { + "epoch": 0.0644056978734515, + "grad_norm": 0.18320336937904358, + "learning_rate": 0.000999438501129984, + "loss": 3.8592, + "step": 1110 + }, + { + "epoch": 0.06498592938582494, + "grad_norm": 0.16004326939582825, + "learning_rate": 0.0009993921246814119, + "loss": 3.8453, + "step": 1120 + }, + { + "epoch": 0.06556616089819838, + "grad_norm": 0.20819014310836792, + "learning_rate": 0.0009993439094187217, + "loss": 3.8493, + "step": 1130 + }, + { + "epoch": 0.06614639241057182, + "grad_norm": 0.20609912276268005, + "learning_rate": 0.0009992938555194472, + "loss": 3.8399, + "step": 1140 + }, + { + "epoch": 0.06672662392294526, + "grad_norm": 0.16247253119945526, + "learning_rate": 0.0009992419631678921, + "loss": 3.8425, + "step": 1150 + }, + { + "epoch": 0.0673068554353187, + "grad_norm": 0.2205752432346344, + "learning_rate": 0.0009991882325551295, + "loss": 3.823, + "step": 1160 + }, + { + "epoch": 0.06788708694769213, + "grad_norm": 0.18031305074691772, + "learning_rate": 0.0009991326638790008, + "loss": 3.8129, + "step": 1170 + }, + { + "epoch": 0.06846731846006557, + "grad_norm": 0.21720442175865173, + "learning_rate": 0.0009990752573441162, + "loss": 3.8177, + "step": 1180 + }, + { + "epoch": 0.069047549972439, + "grad_norm": 0.17516951262950897, + "learning_rate": 0.000999016013161852, + "loss": 3.8144, + "step": 1190 + }, + { + "epoch": 0.06962778148481244, + "grad_norm": 0.1756078600883484, + "learning_rate": 0.000998954931550352, + "loss": 3.8041, + "step": 1200 + }, + { + "epoch": 0.07020801299718588, + "grad_norm": 0.19864057004451752, + "learning_rate": 0.000998892012734525, + "loss": 3.8031, + "step": 1210 + }, + { + "epoch": 0.07078824450955931, + "grad_norm": 0.20211303234100342, + "learning_rate": 0.0009988272569460442, + "loss": 3.8009, + "step": 1220 + }, + { + "epoch": 0.07136847602193275, + "grad_norm": 0.16449439525604248, + "learning_rate": 0.0009987606644233477, + "loss": 3.7916, + "step": 1230 + }, + { + "epoch": 0.07194870753430618, + "grad_norm": 0.24472416937351227, + "learning_rate": 0.0009986922354116362, + "loss": 3.7902, + "step": 1240 + }, + { + "epoch": 0.07252893904667962, + "grad_norm": 0.15388889610767365, + "learning_rate": 0.000998621970162872, + "loss": 3.7747, + "step": 1250 + }, + { + "epoch": 0.07310917055905307, + "grad_norm": 0.18148912489414215, + "learning_rate": 0.0009985498689357797, + "loss": 3.771, + "step": 1260 + }, + { + "epoch": 0.0736894020714265, + "grad_norm": 0.20958511531352997, + "learning_rate": 0.000998475931995843, + "loss": 3.7728, + "step": 1270 + }, + { + "epoch": 0.07426963358379994, + "grad_norm": 0.20867913961410522, + "learning_rate": 0.000998400159615306, + "loss": 3.7641, + "step": 1280 + }, + { + "epoch": 0.07484986509617338, + "grad_norm": 0.2014162242412567, + "learning_rate": 0.00099832255207317, + "loss": 3.7605, + "step": 1290 + }, + { + "epoch": 0.07543009660854681, + "grad_norm": 0.19448301196098328, + "learning_rate": 0.0009982431096551947, + "loss": 3.7562, + "step": 1300 + }, + { + "epoch": 0.07601032812092025, + "grad_norm": 0.17145995795726776, + "learning_rate": 0.0009981618326538948, + "loss": 3.7583, + "step": 1310 + }, + { + "epoch": 0.07659055963329368, + "grad_norm": 0.16092616319656372, + "learning_rate": 0.000998078721368541, + "loss": 3.7439, + "step": 1320 + }, + { + "epoch": 0.07717079114566712, + "grad_norm": 0.22316782176494598, + "learning_rate": 0.000997993776105158, + "loss": 3.7373, + "step": 1330 + }, + { + "epoch": 0.07775102265804056, + "grad_norm": 0.17662325501441956, + "learning_rate": 0.0009979069971765226, + "loss": 3.7337, + "step": 1340 + }, + { + "epoch": 0.07833125417041399, + "grad_norm": 0.139273002743721, + "learning_rate": 0.0009978183849021645, + "loss": 3.7296, + "step": 1350 + }, + { + "epoch": 0.07891148568278743, + "grad_norm": 0.2038998305797577, + "learning_rate": 0.000997727939608363, + "loss": 3.7353, + "step": 1360 + }, + { + "epoch": 0.07949171719516086, + "grad_norm": 0.18561561405658722, + "learning_rate": 0.0009976356616281474, + "loss": 3.7318, + "step": 1370 + }, + { + "epoch": 0.08007194870753431, + "grad_norm": 0.20797798037528992, + "learning_rate": 0.0009975415513012946, + "loss": 3.7242, + "step": 1380 + }, + { + "epoch": 0.08065218021990775, + "grad_norm": 0.1697220355272293, + "learning_rate": 0.0009974456089743289, + "loss": 3.7066, + "step": 1390 + }, + { + "epoch": 0.08123241173228118, + "grad_norm": 0.18265104293823242, + "learning_rate": 0.0009973478350005199, + "loss": 3.7159, + "step": 1400 + }, + { + "epoch": 0.08181264324465462, + "grad_norm": 0.18804903328418732, + "learning_rate": 0.0009972482297398817, + "loss": 3.7042, + "step": 1410 + }, + { + "epoch": 0.08239287475702806, + "grad_norm": 0.1842174530029297, + "learning_rate": 0.0009971467935591713, + "loss": 3.7103, + "step": 1420 + }, + { + "epoch": 0.08297310626940149, + "grad_norm": 0.18509556353092194, + "learning_rate": 0.000997043526831887, + "loss": 3.7066, + "step": 1430 + }, + { + "epoch": 0.08355333778177493, + "grad_norm": 0.19408974051475525, + "learning_rate": 0.0009969384299382683, + "loss": 3.6867, + "step": 1440 + }, + { + "epoch": 0.08413356929414836, + "grad_norm": 0.18689195811748505, + "learning_rate": 0.0009968315032652924, + "loss": 3.6787, + "step": 1450 + }, + { + "epoch": 0.0847138008065218, + "grad_norm": 0.17087410390377045, + "learning_rate": 0.0009967227472066748, + "loss": 3.6792, + "step": 1460 + }, + { + "epoch": 0.08529403231889524, + "grad_norm": 0.1809985190629959, + "learning_rate": 0.000996612162162867, + "loss": 3.6975, + "step": 1470 + }, + { + "epoch": 0.08587426383126867, + "grad_norm": 0.2063867449760437, + "learning_rate": 0.000996499748541054, + "loss": 3.6831, + "step": 1480 + }, + { + "epoch": 0.08645449534364211, + "grad_norm": 0.191127747297287, + "learning_rate": 0.0009963855067551552, + "loss": 3.6779, + "step": 1490 + }, + { + "epoch": 0.08703472685601556, + "grad_norm": 0.17394115030765533, + "learning_rate": 0.0009962694372258206, + "loss": 3.665, + "step": 1500 + }, + { + "epoch": 0.08761495836838899, + "grad_norm": 0.17784751951694489, + "learning_rate": 0.0009961515403804303, + "loss": 3.6587, + "step": 1510 + }, + { + "epoch": 0.08819518988076243, + "grad_norm": 0.2151278853416443, + "learning_rate": 0.0009960318166530927, + "loss": 3.6526, + "step": 1520 + }, + { + "epoch": 0.08877542139313586, + "grad_norm": 0.20224444568157196, + "learning_rate": 0.0009959102664846432, + "loss": 3.664, + "step": 1530 + }, + { + "epoch": 0.0893556529055093, + "grad_norm": 0.18003828823566437, + "learning_rate": 0.0009957868903226425, + "loss": 3.6487, + "step": 1540 + }, + { + "epoch": 0.08993588441788274, + "grad_norm": 0.17379723489284515, + "learning_rate": 0.0009956616886213742, + "loss": 3.6553, + "step": 1550 + }, + { + "epoch": 0.09051611593025617, + "grad_norm": 0.19469398260116577, + "learning_rate": 0.0009955346618418443, + "loss": 3.6518, + "step": 1560 + }, + { + "epoch": 0.09109634744262961, + "grad_norm": 0.15254022181034088, + "learning_rate": 0.0009954058104517788, + "loss": 3.6517, + "step": 1570 + }, + { + "epoch": 0.09167657895500304, + "grad_norm": 0.19052338600158691, + "learning_rate": 0.0009952751349256218, + "loss": 3.6479, + "step": 1580 + }, + { + "epoch": 0.09225681046737648, + "grad_norm": 0.15514792501926422, + "learning_rate": 0.0009951426357445343, + "loss": 3.6307, + "step": 1590 + }, + { + "epoch": 0.09283704197974992, + "grad_norm": 0.1835625171661377, + "learning_rate": 0.0009950083133963923, + "loss": 3.6349, + "step": 1600 + }, + { + "epoch": 0.09341727349212335, + "grad_norm": 0.1494477540254593, + "learning_rate": 0.0009948721683757846, + "loss": 3.6373, + "step": 1610 + }, + { + "epoch": 0.0939975050044968, + "grad_norm": 0.23332346975803375, + "learning_rate": 0.0009947342011840114, + "loss": 3.63, + "step": 1620 + }, + { + "epoch": 0.09457773651687024, + "grad_norm": 0.1672951877117157, + "learning_rate": 0.0009945944123290827, + "loss": 3.6305, + "step": 1630 + }, + { + "epoch": 0.09515796802924367, + "grad_norm": 0.1747497320175171, + "learning_rate": 0.0009944528023257153, + "loss": 3.6295, + "step": 1640 + }, + { + "epoch": 0.09573819954161711, + "grad_norm": 0.15887069702148438, + "learning_rate": 0.0009943093716953321, + "loss": 3.6315, + "step": 1650 + }, + { + "epoch": 0.09631843105399054, + "grad_norm": 0.1994808316230774, + "learning_rate": 0.00099416412096606, + "loss": 3.6236, + "step": 1660 + }, + { + "epoch": 0.09689866256636398, + "grad_norm": 0.16880349814891815, + "learning_rate": 0.0009940170506727273, + "loss": 3.6111, + "step": 1670 + }, + { + "epoch": 0.09747889407873742, + "grad_norm": 0.2141963392496109, + "learning_rate": 0.000993868161356862, + "loss": 3.6147, + "step": 1680 + }, + { + "epoch": 0.09805912559111085, + "grad_norm": 0.19859230518341064, + "learning_rate": 0.0009937174535666904, + "loss": 3.6225, + "step": 1690 + }, + { + "epoch": 0.09863935710348429, + "grad_norm": 0.21606995165348053, + "learning_rate": 0.0009935649278571344, + "loss": 3.6035, + "step": 1700 + }, + { + "epoch": 0.09921958861585772, + "grad_norm": 0.1618034541606903, + "learning_rate": 0.0009934105847898094, + "loss": 3.6114, + "step": 1710 + }, + { + "epoch": 0.09979982012823116, + "grad_norm": 0.19869261980056763, + "learning_rate": 0.0009932544249330229, + "loss": 3.6085, + "step": 1720 + }, + { + "epoch": 0.1003800516406046, + "grad_norm": 0.20715004205703735, + "learning_rate": 0.0009930964488617717, + "loss": 3.6056, + "step": 1730 + }, + { + "epoch": 0.10096028315297804, + "grad_norm": 0.14460305869579315, + "learning_rate": 0.0009929366571577406, + "loss": 3.6041, + "step": 1740 + }, + { + "epoch": 0.10154051466535148, + "grad_norm": 0.1818549782037735, + "learning_rate": 0.000992775050409299, + "loss": 3.6026, + "step": 1750 + }, + { + "epoch": 0.10212074617772492, + "grad_norm": 0.16325876116752625, + "learning_rate": 0.0009926116292115, + "loss": 3.5907, + "step": 1760 + }, + { + "epoch": 0.10270097769009835, + "grad_norm": 0.1783674657344818, + "learning_rate": 0.0009924463941660777, + "loss": 3.5932, + "step": 1770 + }, + { + "epoch": 0.10328120920247179, + "grad_norm": 0.23217235505580902, + "learning_rate": 0.0009922793458814448, + "loss": 3.5905, + "step": 1780 + }, + { + "epoch": 0.10386144071484522, + "grad_norm": 0.18973788619041443, + "learning_rate": 0.0009921104849726903, + "loss": 3.6022, + "step": 1790 + }, + { + "epoch": 0.10444167222721866, + "grad_norm": 0.17927169799804688, + "learning_rate": 0.0009919398120615784, + "loss": 3.5857, + "step": 1800 + }, + { + "epoch": 0.1050219037395921, + "grad_norm": 0.1780613660812378, + "learning_rate": 0.000991767327776544, + "loss": 3.5834, + "step": 1810 + }, + { + "epoch": 0.10560213525196553, + "grad_norm": 0.16915231943130493, + "learning_rate": 0.0009915930327526925, + "loss": 3.5848, + "step": 1820 + }, + { + "epoch": 0.10618236676433897, + "grad_norm": 0.18921369314193726, + "learning_rate": 0.0009914169276317966, + "loss": 3.5855, + "step": 1830 + }, + { + "epoch": 0.1067625982767124, + "grad_norm": 0.17203205823898315, + "learning_rate": 0.0009912390130622935, + "loss": 3.577, + "step": 1840 + }, + { + "epoch": 0.10734282978908584, + "grad_norm": 0.23704229295253754, + "learning_rate": 0.0009910592896992835, + "loss": 3.5721, + "step": 1850 + }, + { + "epoch": 0.10792306130145929, + "grad_norm": 0.15052714943885803, + "learning_rate": 0.000990877758204527, + "loss": 3.5778, + "step": 1860 + }, + { + "epoch": 0.10850329281383272, + "grad_norm": 0.1933393031358719, + "learning_rate": 0.0009906944192464417, + "loss": 3.5789, + "step": 1870 + }, + { + "epoch": 0.10908352432620616, + "grad_norm": 0.17307838797569275, + "learning_rate": 0.000990509273500101, + "loss": 3.5701, + "step": 1880 + }, + { + "epoch": 0.1096637558385796, + "grad_norm": 0.20114563405513763, + "learning_rate": 0.0009903223216472306, + "loss": 3.5716, + "step": 1890 + }, + { + "epoch": 0.11024398735095303, + "grad_norm": 0.20986546576023102, + "learning_rate": 0.0009901335643762075, + "loss": 3.5644, + "step": 1900 + }, + { + "epoch": 0.11082421886332647, + "grad_norm": 0.1716219186782837, + "learning_rate": 0.0009899430023820551, + "loss": 3.5617, + "step": 1910 + }, + { + "epoch": 0.1114044503756999, + "grad_norm": 0.1951904147863388, + "learning_rate": 0.0009897506363664428, + "loss": 3.5535, + "step": 1920 + }, + { + "epoch": 0.11198468188807334, + "grad_norm": 0.1623149961233139, + "learning_rate": 0.0009895564670376823, + "loss": 3.5586, + "step": 1930 + }, + { + "epoch": 0.11256491340044678, + "grad_norm": 0.1657789647579193, + "learning_rate": 0.000989360495110726, + "loss": 3.5623, + "step": 1940 + }, + { + "epoch": 0.11314514491282021, + "grad_norm": 0.2033306509256363, + "learning_rate": 0.0009891627213071625, + "loss": 3.5404, + "step": 1950 + }, + { + "epoch": 0.11372537642519365, + "grad_norm": 0.17595010995864868, + "learning_rate": 0.0009889631463552157, + "loss": 3.5493, + "step": 1960 + }, + { + "epoch": 0.11430560793756708, + "grad_norm": 0.1695556640625, + "learning_rate": 0.0009887617709897416, + "loss": 3.5537, + "step": 1970 + }, + { + "epoch": 0.11488583944994053, + "grad_norm": 0.14913249015808105, + "learning_rate": 0.0009885585959522256, + "loss": 3.5531, + "step": 1980 + }, + { + "epoch": 0.11546607096231397, + "grad_norm": 0.1780378520488739, + "learning_rate": 0.000988353621990779, + "loss": 3.5458, + "step": 1990 + }, + { + "epoch": 0.1160463024746874, + "grad_norm": 0.1777425855398178, + "learning_rate": 0.0009881468498601379, + "loss": 3.5512, + "step": 2000 + }, + { + "epoch": 0.1160463024746874, + "eval_loss": 3.486086845397949, + "eval_runtime": 3.248, + "eval_samples_per_second": 1333.109, + "eval_steps_per_second": 10.468, + "step": 2000 + }, + { + "epoch": 0.11662653398706084, + "grad_norm": 0.1840885430574417, + "learning_rate": 0.0009879382803216585, + "loss": 3.5499, + "step": 2010 + }, + { + "epoch": 0.11720676549943428, + "grad_norm": 0.15481173992156982, + "learning_rate": 0.000987727914143316, + "loss": 3.5435, + "step": 2020 + }, + { + "epoch": 0.11778699701180771, + "grad_norm": 0.17583511769771576, + "learning_rate": 0.0009875157520997005, + "loss": 3.5421, + "step": 2030 + }, + { + "epoch": 0.11836722852418115, + "grad_norm": 0.17113088071346283, + "learning_rate": 0.000987301794972015, + "loss": 3.5256, + "step": 2040 + }, + { + "epoch": 0.11894746003655458, + "grad_norm": 0.19127151370048523, + "learning_rate": 0.000987086043548072, + "loss": 3.5307, + "step": 2050 + }, + { + "epoch": 0.11952769154892802, + "grad_norm": 0.1801798790693283, + "learning_rate": 0.000986868498622291, + "loss": 3.5428, + "step": 2060 + }, + { + "epoch": 0.12010792306130146, + "grad_norm": 0.17117474973201752, + "learning_rate": 0.0009866491609956949, + "loss": 3.5429, + "step": 2070 + }, + { + "epoch": 0.12068815457367489, + "grad_norm": 0.18343955278396606, + "learning_rate": 0.000986428031475908, + "loss": 3.5305, + "step": 2080 + }, + { + "epoch": 0.12126838608604833, + "grad_norm": 0.18340405821800232, + "learning_rate": 0.0009862051108771523, + "loss": 3.5239, + "step": 2090 + }, + { + "epoch": 0.12184861759842178, + "grad_norm": 0.17750664055347443, + "learning_rate": 0.000985980400020245, + "loss": 3.5233, + "step": 2100 + }, + { + "epoch": 0.12242884911079521, + "grad_norm": 0.18838383257389069, + "learning_rate": 0.000985753899732595, + "loss": 3.5299, + "step": 2110 + }, + { + "epoch": 0.12300908062316865, + "grad_norm": 0.18249675631523132, + "learning_rate": 0.0009855256108481996, + "loss": 3.5363, + "step": 2120 + }, + { + "epoch": 0.12358931213554208, + "grad_norm": 0.17180895805358887, + "learning_rate": 0.0009852955342076431, + "loss": 3.5211, + "step": 2130 + }, + { + "epoch": 0.12416954364791552, + "grad_norm": 0.2116585075855255, + "learning_rate": 0.0009850636706580911, + "loss": 3.5278, + "step": 2140 + }, + { + "epoch": 0.12474977516028896, + "grad_norm": 0.1602133959531784, + "learning_rate": 0.0009848300210532899, + "loss": 3.5184, + "step": 2150 + }, + { + "epoch": 0.1253300066726624, + "grad_norm": 0.18009105324745178, + "learning_rate": 0.0009845945862535618, + "loss": 3.5151, + "step": 2160 + }, + { + "epoch": 0.12591023818503583, + "grad_norm": 0.19407133758068085, + "learning_rate": 0.0009843573671258024, + "loss": 3.5237, + "step": 2170 + }, + { + "epoch": 0.12649046969740926, + "grad_norm": 0.1537596881389618, + "learning_rate": 0.000984118364543477, + "loss": 3.5151, + "step": 2180 + }, + { + "epoch": 0.1270707012097827, + "grad_norm": 0.20341388881206512, + "learning_rate": 0.0009838775793866187, + "loss": 3.5146, + "step": 2190 + }, + { + "epoch": 0.12765093272215614, + "grad_norm": 0.19128254055976868, + "learning_rate": 0.0009836350125418233, + "loss": 3.5243, + "step": 2200 + }, + { + "epoch": 0.12823116423452957, + "grad_norm": 0.18839573860168457, + "learning_rate": 0.0009833906649022476, + "loss": 3.5006, + "step": 2210 + }, + { + "epoch": 0.128811395746903, + "grad_norm": 0.1921667903661728, + "learning_rate": 0.0009831445373676049, + "loss": 3.5139, + "step": 2220 + }, + { + "epoch": 0.12939162725927644, + "grad_norm": 0.19188368320465088, + "learning_rate": 0.000982896630844163, + "loss": 3.5169, + "step": 2230 + }, + { + "epoch": 0.12997185877164988, + "grad_norm": 0.17427149415016174, + "learning_rate": 0.000982646946244739, + "loss": 3.5046, + "step": 2240 + }, + { + "epoch": 0.13055209028402331, + "grad_norm": 0.16430804133415222, + "learning_rate": 0.0009823954844886983, + "loss": 3.5112, + "step": 2250 + }, + { + "epoch": 0.13113232179639675, + "grad_norm": 0.15311101078987122, + "learning_rate": 0.0009821422465019496, + "loss": 3.5017, + "step": 2260 + }, + { + "epoch": 0.13171255330877019, + "grad_norm": 0.17969970405101776, + "learning_rate": 0.000981887233216941, + "loss": 3.4945, + "step": 2270 + }, + { + "epoch": 0.13229278482114365, + "grad_norm": 0.16781874001026154, + "learning_rate": 0.000981630445572659, + "loss": 3.5051, + "step": 2280 + }, + { + "epoch": 0.13287301633351709, + "grad_norm": 0.1790471076965332, + "learning_rate": 0.0009813718845146215, + "loss": 3.4946, + "step": 2290 + }, + { + "epoch": 0.13345324784589052, + "grad_norm": 0.14774377644062042, + "learning_rate": 0.0009811115509948784, + "loss": 3.495, + "step": 2300 + }, + { + "epoch": 0.13403347935826396, + "grad_norm": 0.18693110346794128, + "learning_rate": 0.0009808494459720046, + "loss": 3.5018, + "step": 2310 + }, + { + "epoch": 0.1346137108706374, + "grad_norm": 0.163302943110466, + "learning_rate": 0.000980585570411098, + "loss": 3.4864, + "step": 2320 + }, + { + "epoch": 0.13519394238301083, + "grad_norm": 0.17489096522331238, + "learning_rate": 0.0009803199252837766, + "loss": 3.4932, + "step": 2330 + }, + { + "epoch": 0.13577417389538426, + "grad_norm": 0.1895146518945694, + "learning_rate": 0.0009800525115681734, + "loss": 3.4937, + "step": 2340 + }, + { + "epoch": 0.1363544054077577, + "grad_norm": 0.19778315722942352, + "learning_rate": 0.0009797833302489334, + "loss": 3.4819, + "step": 2350 + }, + { + "epoch": 0.13693463692013114, + "grad_norm": 0.17575684189796448, + "learning_rate": 0.0009795123823172107, + "loss": 3.4853, + "step": 2360 + }, + { + "epoch": 0.13751486843250457, + "grad_norm": 0.1611810177564621, + "learning_rate": 0.000979239668770664, + "loss": 3.4912, + "step": 2370 + }, + { + "epoch": 0.138095099944878, + "grad_norm": 0.19706352055072784, + "learning_rate": 0.0009789651906134532, + "loss": 3.4814, + "step": 2380 + }, + { + "epoch": 0.13867533145725144, + "grad_norm": 0.15343667566776276, + "learning_rate": 0.0009786889488562352, + "loss": 3.4757, + "step": 2390 + }, + { + "epoch": 0.13925556296962488, + "grad_norm": 0.1835697740316391, + "learning_rate": 0.0009784109445161616, + "loss": 3.48, + "step": 2400 + }, + { + "epoch": 0.13983579448199832, + "grad_norm": 0.19989457726478577, + "learning_rate": 0.0009781311786168732, + "loss": 3.471, + "step": 2410 + }, + { + "epoch": 0.14041602599437175, + "grad_norm": 0.1824110597372055, + "learning_rate": 0.0009778496521884973, + "loss": 3.4795, + "step": 2420 + }, + { + "epoch": 0.1409962575067452, + "grad_norm": 0.17803703248500824, + "learning_rate": 0.0009775663662676438, + "loss": 3.4895, + "step": 2430 + }, + { + "epoch": 0.14157648901911862, + "grad_norm": 0.17566360533237457, + "learning_rate": 0.0009772813218974013, + "loss": 3.4771, + "step": 2440 + }, + { + "epoch": 0.14215672053149206, + "grad_norm": 0.17459633946418762, + "learning_rate": 0.0009769945201273328, + "loss": 3.4748, + "step": 2450 + }, + { + "epoch": 0.1427369520438655, + "grad_norm": 0.17547911405563354, + "learning_rate": 0.0009767059620134728, + "loss": 3.4851, + "step": 2460 + }, + { + "epoch": 0.14331718355623893, + "grad_norm": 0.1611047089099884, + "learning_rate": 0.0009764156486183223, + "loss": 3.4859, + "step": 2470 + }, + { + "epoch": 0.14389741506861237, + "grad_norm": 0.22488833963871002, + "learning_rate": 0.0009761235810108453, + "loss": 3.4704, + "step": 2480 + }, + { + "epoch": 0.1444776465809858, + "grad_norm": 0.1857168972492218, + "learning_rate": 0.0009758297602664658, + "loss": 3.4636, + "step": 2490 + }, + { + "epoch": 0.14505787809335924, + "grad_norm": 0.18012335896492004, + "learning_rate": 0.0009755341874670624, + "loss": 3.4675, + "step": 2500 + }, + { + "epoch": 0.14563810960573267, + "grad_norm": 0.14618253707885742, + "learning_rate": 0.000975236863700965, + "loss": 3.472, + "step": 2510 + }, + { + "epoch": 0.14621834111810614, + "grad_norm": 0.20513653755187988, + "learning_rate": 0.000974937790062951, + "loss": 3.4772, + "step": 2520 + }, + { + "epoch": 0.14679857263047957, + "grad_norm": 0.17752069234848022, + "learning_rate": 0.0009746369676542408, + "loss": 3.4674, + "step": 2530 + }, + { + "epoch": 0.147378804142853, + "grad_norm": 0.1703299880027771, + "learning_rate": 0.000974334397582494, + "loss": 3.4631, + "step": 2540 + }, + { + "epoch": 0.14795903565522645, + "grad_norm": 0.18766288459300995, + "learning_rate": 0.0009740300809618055, + "loss": 3.4696, + "step": 2550 + }, + { + "epoch": 0.14853926716759988, + "grad_norm": 0.19645459949970245, + "learning_rate": 0.0009737240189127005, + "loss": 3.4686, + "step": 2560 + }, + { + "epoch": 0.14911949867997332, + "grad_norm": 0.19703318178653717, + "learning_rate": 0.0009734162125621322, + "loss": 3.4645, + "step": 2570 + }, + { + "epoch": 0.14969973019234675, + "grad_norm": 0.1729518324136734, + "learning_rate": 0.0009731066630434753, + "loss": 3.4623, + "step": 2580 + }, + { + "epoch": 0.1502799617047202, + "grad_norm": 0.1629866361618042, + "learning_rate": 0.0009727953714965238, + "loss": 3.4587, + "step": 2590 + }, + { + "epoch": 0.15086019321709362, + "grad_norm": 0.1935587227344513, + "learning_rate": 0.0009724823390674857, + "loss": 3.452, + "step": 2600 + }, + { + "epoch": 0.15144042472946706, + "grad_norm": 0.1905779391527176, + "learning_rate": 0.0009721675669089791, + "loss": 3.4492, + "step": 2610 + }, + { + "epoch": 0.1520206562418405, + "grad_norm": 0.1649785190820694, + "learning_rate": 0.0009718510561800282, + "loss": 3.4553, + "step": 2620 + }, + { + "epoch": 0.15260088775421393, + "grad_norm": 0.18956676125526428, + "learning_rate": 0.0009715328080460587, + "loss": 3.4565, + "step": 2630 + }, + { + "epoch": 0.15318111926658737, + "grad_norm": 0.1951354593038559, + "learning_rate": 0.0009712128236788935, + "loss": 3.4588, + "step": 2640 + }, + { + "epoch": 0.1537613507789608, + "grad_norm": 0.19937202334403992, + "learning_rate": 0.0009708911042567485, + "loss": 3.4464, + "step": 2650 + }, + { + "epoch": 0.15434158229133424, + "grad_norm": 0.1778160184621811, + "learning_rate": 0.0009705676509642285, + "loss": 3.4619, + "step": 2660 + }, + { + "epoch": 0.15492181380370768, + "grad_norm": 0.19197410345077515, + "learning_rate": 0.0009702424649923221, + "loss": 3.4545, + "step": 2670 + }, + { + "epoch": 0.1555020453160811, + "grad_norm": 0.20156262814998627, + "learning_rate": 0.0009699155475383984, + "loss": 3.4407, + "step": 2680 + }, + { + "epoch": 0.15608227682845455, + "grad_norm": 0.1888488084077835, + "learning_rate": 0.0009695868998062016, + "loss": 3.4522, + "step": 2690 + }, + { + "epoch": 0.15666250834082798, + "grad_norm": 0.15710744261741638, + "learning_rate": 0.0009692565230058471, + "loss": 3.4385, + "step": 2700 + }, + { + "epoch": 0.15724273985320142, + "grad_norm": 0.17102789878845215, + "learning_rate": 0.0009689244183538169, + "loss": 3.4495, + "step": 2710 + }, + { + "epoch": 0.15782297136557485, + "grad_norm": 0.19345730543136597, + "learning_rate": 0.000968590587072955, + "loss": 3.4449, + "step": 2720 + }, + { + "epoch": 0.1584032028779483, + "grad_norm": 0.17778056859970093, + "learning_rate": 0.0009682550303924633, + "loss": 3.4424, + "step": 2730 + }, + { + "epoch": 0.15898343439032173, + "grad_norm": 0.17961904406547546, + "learning_rate": 0.0009679177495478966, + "loss": 3.4457, + "step": 2740 + }, + { + "epoch": 0.15956366590269516, + "grad_norm": 0.16124430298805237, + "learning_rate": 0.0009675787457811583, + "loss": 3.4388, + "step": 2750 + }, + { + "epoch": 0.16014389741506863, + "grad_norm": 0.24593627452850342, + "learning_rate": 0.0009672380203404957, + "loss": 3.4491, + "step": 2760 + }, + { + "epoch": 0.16072412892744206, + "grad_norm": 0.22059789299964905, + "learning_rate": 0.0009668955744804957, + "loss": 3.4452, + "step": 2770 + }, + { + "epoch": 0.1613043604398155, + "grad_norm": 0.17355699837207794, + "learning_rate": 0.0009665514094620798, + "loss": 3.4334, + "step": 2780 + }, + { + "epoch": 0.16188459195218893, + "grad_norm": 0.18848399817943573, + "learning_rate": 0.0009662055265524996, + "loss": 3.4445, + "step": 2790 + }, + { + "epoch": 0.16246482346456237, + "grad_norm": 0.16156227886676788, + "learning_rate": 0.0009658579270253321, + "loss": 3.432, + "step": 2800 + }, + { + "epoch": 0.1630450549769358, + "grad_norm": 0.17677579820156097, + "learning_rate": 0.0009655086121604754, + "loss": 3.4387, + "step": 2810 + }, + { + "epoch": 0.16362528648930924, + "grad_norm": 0.1860220730304718, + "learning_rate": 0.0009651575832441435, + "loss": 3.4352, + "step": 2820 + }, + { + "epoch": 0.16420551800168268, + "grad_norm": 0.1875036209821701, + "learning_rate": 0.0009648048415688612, + "loss": 3.4361, + "step": 2830 + }, + { + "epoch": 0.1647857495140561, + "grad_norm": 0.18773604929447174, + "learning_rate": 0.0009644503884334608, + "loss": 3.4293, + "step": 2840 + }, + { + "epoch": 0.16536598102642955, + "grad_norm": 0.17164553701877594, + "learning_rate": 0.0009640942251430755, + "loss": 3.4329, + "step": 2850 + }, + { + "epoch": 0.16594621253880298, + "grad_norm": 0.22501535713672638, + "learning_rate": 0.0009637363530091361, + "loss": 3.4354, + "step": 2860 + }, + { + "epoch": 0.16652644405117642, + "grad_norm": 0.16013330221176147, + "learning_rate": 0.0009633767733493651, + "loss": 3.4266, + "step": 2870 + }, + { + "epoch": 0.16710667556354986, + "grad_norm": 0.17605000734329224, + "learning_rate": 0.0009630154874877726, + "loss": 3.4202, + "step": 2880 + }, + { + "epoch": 0.1676869070759233, + "grad_norm": 0.19912280142307281, + "learning_rate": 0.0009626524967546508, + "loss": 3.4251, + "step": 2890 + }, + { + "epoch": 0.16826713858829673, + "grad_norm": 0.19407010078430176, + "learning_rate": 0.00096228780248657, + "loss": 3.4242, + "step": 2900 + }, + { + "epoch": 0.16884737010067016, + "grad_norm": 0.19874045252799988, + "learning_rate": 0.0009619214060263723, + "loss": 3.4326, + "step": 2910 + }, + { + "epoch": 0.1694276016130436, + "grad_norm": 0.1755009889602661, + "learning_rate": 0.000961553308723168, + "loss": 3.4201, + "step": 2920 + }, + { + "epoch": 0.17000783312541703, + "grad_norm": 0.16836123168468475, + "learning_rate": 0.00096118351193233, + "loss": 3.4244, + "step": 2930 + }, + { + "epoch": 0.17058806463779047, + "grad_norm": 0.17263484001159668, + "learning_rate": 0.0009608120170154886, + "loss": 3.4245, + "step": 2940 + }, + { + "epoch": 0.1711682961501639, + "grad_norm": 0.16616012156009674, + "learning_rate": 0.0009604388253405272, + "loss": 3.4149, + "step": 2950 + }, + { + "epoch": 0.17174852766253734, + "grad_norm": 0.18633881211280823, + "learning_rate": 0.0009600639382815768, + "loss": 3.4247, + "step": 2960 + }, + { + "epoch": 0.17232875917491078, + "grad_norm": 0.1888113021850586, + "learning_rate": 0.0009596873572190104, + "loss": 3.4185, + "step": 2970 + }, + { + "epoch": 0.17290899068728421, + "grad_norm": 0.1729818731546402, + "learning_rate": 0.0009593090835394392, + "loss": 3.4188, + "step": 2980 + }, + { + "epoch": 0.17348922219965765, + "grad_norm": 0.19416862726211548, + "learning_rate": 0.0009589291186357066, + "loss": 3.417, + "step": 2990 + }, + { + "epoch": 0.1740694537120311, + "grad_norm": 0.16473758220672607, + "learning_rate": 0.0009585474639068829, + "loss": 3.4279, + "step": 3000 + }, + { + "epoch": 0.1740694537120311, + "eval_loss": 3.3559625148773193, + "eval_runtime": 3.2502, + "eval_samples_per_second": 1332.237, + "eval_steps_per_second": 10.461, + "step": 3000 + }, + { + "epoch": 0.17464968522440455, + "grad_norm": 0.148405060172081, + "learning_rate": 0.0009581641207582609, + "loss": 3.4132, + "step": 3010 + }, + { + "epoch": 0.17522991673677799, + "grad_norm": 0.17240671813488007, + "learning_rate": 0.0009577790906013503, + "loss": 3.4145, + "step": 3020 + }, + { + "epoch": 0.17581014824915142, + "grad_norm": 0.1781778484582901, + "learning_rate": 0.0009573923748538724, + "loss": 3.4146, + "step": 3030 + }, + { + "epoch": 0.17639037976152486, + "grad_norm": 0.1692868173122406, + "learning_rate": 0.0009570039749397552, + "loss": 3.4154, + "step": 3040 + }, + { + "epoch": 0.1769706112738983, + "grad_norm": 0.18871796131134033, + "learning_rate": 0.0009566138922891277, + "loss": 3.4233, + "step": 3050 + }, + { + "epoch": 0.17755084278627173, + "grad_norm": 0.16771915555000305, + "learning_rate": 0.0009562221283383152, + "loss": 3.4144, + "step": 3060 + }, + { + "epoch": 0.17813107429864516, + "grad_norm": 0.17178234457969666, + "learning_rate": 0.0009558286845298337, + "loss": 3.4066, + "step": 3070 + }, + { + "epoch": 0.1787113058110186, + "grad_norm": 0.17993003129959106, + "learning_rate": 0.0009554335623123845, + "loss": 3.4125, + "step": 3080 + }, + { + "epoch": 0.17929153732339204, + "grad_norm": 0.1944742351770401, + "learning_rate": 0.0009550367631408485, + "loss": 3.4095, + "step": 3090 + }, + { + "epoch": 0.17987176883576547, + "grad_norm": 0.21978144347667694, + "learning_rate": 0.0009546382884762825, + "loss": 3.4204, + "step": 3100 + }, + { + "epoch": 0.1804520003481389, + "grad_norm": 0.19678272306919098, + "learning_rate": 0.0009542381397859116, + "loss": 3.3991, + "step": 3110 + }, + { + "epoch": 0.18103223186051234, + "grad_norm": 0.16551128029823303, + "learning_rate": 0.0009538363185431254, + "loss": 3.4055, + "step": 3120 + }, + { + "epoch": 0.18161246337288578, + "grad_norm": 0.18304401636123657, + "learning_rate": 0.0009534328262274717, + "loss": 3.4038, + "step": 3130 + }, + { + "epoch": 0.18219269488525922, + "grad_norm": 0.1903512328863144, + "learning_rate": 0.0009530276643246512, + "loss": 3.4081, + "step": 3140 + }, + { + "epoch": 0.18277292639763265, + "grad_norm": 0.19788897037506104, + "learning_rate": 0.0009526208343265129, + "loss": 3.3991, + "step": 3150 + }, + { + "epoch": 0.1833531579100061, + "grad_norm": 0.17483383417129517, + "learning_rate": 0.0009522123377310474, + "loss": 3.4105, + "step": 3160 + }, + { + "epoch": 0.18393338942237952, + "grad_norm": 0.18417778611183167, + "learning_rate": 0.0009518021760423816, + "loss": 3.3973, + "step": 3170 + }, + { + "epoch": 0.18451362093475296, + "grad_norm": 0.17036600410938263, + "learning_rate": 0.0009513903507707743, + "loss": 3.403, + "step": 3180 + }, + { + "epoch": 0.1850938524471264, + "grad_norm": 0.17953407764434814, + "learning_rate": 0.0009509768634326089, + "loss": 3.401, + "step": 3190 + }, + { + "epoch": 0.18567408395949983, + "grad_norm": 0.18770861625671387, + "learning_rate": 0.0009505617155503894, + "loss": 3.4006, + "step": 3200 + }, + { + "epoch": 0.18625431547187327, + "grad_norm": 0.20032437145709991, + "learning_rate": 0.0009501449086527336, + "loss": 3.4012, + "step": 3210 + }, + { + "epoch": 0.1868345469842467, + "grad_norm": 0.19611109793186188, + "learning_rate": 0.0009497264442743681, + "loss": 3.3974, + "step": 3220 + }, + { + "epoch": 0.18741477849662014, + "grad_norm": 0.17835482954978943, + "learning_rate": 0.0009493063239561227, + "loss": 3.3966, + "step": 3230 + }, + { + "epoch": 0.1879950100089936, + "grad_norm": 0.17207197844982147, + "learning_rate": 0.0009488845492449245, + "loss": 3.3957, + "step": 3240 + }, + { + "epoch": 0.18857524152136704, + "grad_norm": 0.15979701280593872, + "learning_rate": 0.0009484611216937919, + "loss": 3.3969, + "step": 3250 + }, + { + "epoch": 0.18915547303374047, + "grad_norm": 0.19770529866218567, + "learning_rate": 0.0009480360428618298, + "loss": 3.3972, + "step": 3260 + }, + { + "epoch": 0.1897357045461139, + "grad_norm": 0.161921888589859, + "learning_rate": 0.0009476093143142231, + "loss": 3.3782, + "step": 3270 + }, + { + "epoch": 0.19031593605848734, + "grad_norm": 0.17377763986587524, + "learning_rate": 0.0009471809376222304, + "loss": 3.3959, + "step": 3280 + }, + { + "epoch": 0.19089616757086078, + "grad_norm": 0.17865316569805145, + "learning_rate": 0.00094675091436318, + "loss": 3.3945, + "step": 3290 + }, + { + "epoch": 0.19147639908323422, + "grad_norm": 0.17098024487495422, + "learning_rate": 0.0009463192461204626, + "loss": 3.3915, + "step": 3300 + }, + { + "epoch": 0.19205663059560765, + "grad_norm": 0.18630030751228333, + "learning_rate": 0.0009458859344835259, + "loss": 3.3891, + "step": 3310 + }, + { + "epoch": 0.1926368621079811, + "grad_norm": 0.17725063860416412, + "learning_rate": 0.0009454509810478685, + "loss": 3.3856, + "step": 3320 + }, + { + "epoch": 0.19321709362035452, + "grad_norm": 0.15566089749336243, + "learning_rate": 0.0009450143874150347, + "loss": 3.3964, + "step": 3330 + }, + { + "epoch": 0.19379732513272796, + "grad_norm": 0.16617019474506378, + "learning_rate": 0.0009445761551926079, + "loss": 3.3854, + "step": 3340 + }, + { + "epoch": 0.1943775566451014, + "grad_norm": 0.1689499467611313, + "learning_rate": 0.0009441362859942054, + "loss": 3.3933, + "step": 3350 + }, + { + "epoch": 0.19495778815747483, + "grad_norm": 0.16878637671470642, + "learning_rate": 0.0009436947814394712, + "loss": 3.3819, + "step": 3360 + }, + { + "epoch": 0.19553801966984827, + "grad_norm": 0.20233598351478577, + "learning_rate": 0.0009432516431540714, + "loss": 3.3932, + "step": 3370 + }, + { + "epoch": 0.1961182511822217, + "grad_norm": 0.14608658850193024, + "learning_rate": 0.0009428068727696878, + "loss": 3.3878, + "step": 3380 + }, + { + "epoch": 0.19669848269459514, + "grad_norm": 0.16677936911582947, + "learning_rate": 0.0009423604719240114, + "loss": 3.3898, + "step": 3390 + }, + { + "epoch": 0.19727871420696858, + "grad_norm": 0.17749983072280884, + "learning_rate": 0.0009419124422607369, + "loss": 3.3835, + "step": 3400 + }, + { + "epoch": 0.197858945719342, + "grad_norm": 0.17364837229251862, + "learning_rate": 0.0009414627854295566, + "loss": 3.3873, + "step": 3410 + }, + { + "epoch": 0.19843917723171545, + "grad_norm": 0.1968606561422348, + "learning_rate": 0.0009410115030861536, + "loss": 3.3834, + "step": 3420 + }, + { + "epoch": 0.19901940874408888, + "grad_norm": 0.17494814097881317, + "learning_rate": 0.0009405585968921974, + "loss": 3.3768, + "step": 3430 + }, + { + "epoch": 0.19959964025646232, + "grad_norm": 0.17461541295051575, + "learning_rate": 0.0009401040685153357, + "loss": 3.3673, + "step": 3440 + }, + { + "epoch": 0.20017987176883575, + "grad_norm": 0.16065211594104767, + "learning_rate": 0.0009396479196291896, + "loss": 3.3831, + "step": 3450 + }, + { + "epoch": 0.2007601032812092, + "grad_norm": 0.18772707879543304, + "learning_rate": 0.000939190151913347, + "loss": 3.381, + "step": 3460 + }, + { + "epoch": 0.20134033479358263, + "grad_norm": 0.17115509510040283, + "learning_rate": 0.000938730767053357, + "loss": 3.3786, + "step": 3470 + }, + { + "epoch": 0.2019205663059561, + "grad_norm": 0.16643331944942474, + "learning_rate": 0.0009382697667407222, + "loss": 3.381, + "step": 3480 + }, + { + "epoch": 0.20250079781832953, + "grad_norm": 0.16961540281772614, + "learning_rate": 0.0009378071526728944, + "loss": 3.3798, + "step": 3490 + }, + { + "epoch": 0.20308102933070296, + "grad_norm": 0.18497344851493835, + "learning_rate": 0.000937342926553267, + "loss": 3.3796, + "step": 3500 + }, + { + "epoch": 0.2036612608430764, + "grad_norm": 0.17968598008155823, + "learning_rate": 0.0009368770900911691, + "loss": 3.3699, + "step": 3510 + }, + { + "epoch": 0.20424149235544983, + "grad_norm": 0.19794964790344238, + "learning_rate": 0.0009364096450018598, + "loss": 3.3711, + "step": 3520 + }, + { + "epoch": 0.20482172386782327, + "grad_norm": 0.16060756146907806, + "learning_rate": 0.0009359405930065202, + "loss": 3.3831, + "step": 3530 + }, + { + "epoch": 0.2054019553801967, + "grad_norm": 0.16221892833709717, + "learning_rate": 0.0009354699358322493, + "loss": 3.3673, + "step": 3540 + }, + { + "epoch": 0.20598218689257014, + "grad_norm": 0.1834096759557724, + "learning_rate": 0.0009349976752120561, + "loss": 3.3696, + "step": 3550 + }, + { + "epoch": 0.20656241840494358, + "grad_norm": 0.1541026383638382, + "learning_rate": 0.0009345238128848535, + "loss": 3.3659, + "step": 3560 + }, + { + "epoch": 0.207142649917317, + "grad_norm": 0.16199982166290283, + "learning_rate": 0.0009340483505954524, + "loss": 3.3728, + "step": 3570 + }, + { + "epoch": 0.20772288142969045, + "grad_norm": 0.16066418588161469, + "learning_rate": 0.0009335712900945547, + "loss": 3.3695, + "step": 3580 + }, + { + "epoch": 0.20830311294206388, + "grad_norm": 0.19340233504772186, + "learning_rate": 0.0009330926331387472, + "loss": 3.3751, + "step": 3590 + }, + { + "epoch": 0.20888334445443732, + "grad_norm": 0.1752013862133026, + "learning_rate": 0.0009326123814904949, + "loss": 3.3665, + "step": 3600 + }, + { + "epoch": 0.20946357596681076, + "grad_norm": 0.18730874359607697, + "learning_rate": 0.0009321305369181345, + "loss": 3.3656, + "step": 3610 + }, + { + "epoch": 0.2100438074791842, + "grad_norm": 0.18153837323188782, + "learning_rate": 0.0009316471011958685, + "loss": 3.3761, + "step": 3620 + }, + { + "epoch": 0.21062403899155763, + "grad_norm": 0.15640245378017426, + "learning_rate": 0.0009311620761037578, + "loss": 3.366, + "step": 3630 + }, + { + "epoch": 0.21120427050393106, + "grad_norm": 0.14558587968349457, + "learning_rate": 0.0009306754634277154, + "loss": 3.3667, + "step": 3640 + }, + { + "epoch": 0.2117845020163045, + "grad_norm": 0.17782875895500183, + "learning_rate": 0.0009301872649595005, + "loss": 3.3683, + "step": 3650 + }, + { + "epoch": 0.21236473352867793, + "grad_norm": 0.19611169397830963, + "learning_rate": 0.0009296974824967106, + "loss": 3.3705, + "step": 3660 + }, + { + "epoch": 0.21294496504105137, + "grad_norm": 0.17394675314426422, + "learning_rate": 0.0009292061178427762, + "loss": 3.3649, + "step": 3670 + }, + { + "epoch": 0.2135251965534248, + "grad_norm": 0.1756591647863388, + "learning_rate": 0.0009287131728069536, + "loss": 3.3661, + "step": 3680 + }, + { + "epoch": 0.21410542806579824, + "grad_norm": 0.19795431196689606, + "learning_rate": 0.0009282186492043178, + "loss": 3.3648, + "step": 3690 + }, + { + "epoch": 0.21468565957817168, + "grad_norm": 0.19772683084011078, + "learning_rate": 0.0009277225488557566, + "loss": 3.3584, + "step": 3700 + }, + { + "epoch": 0.21526589109054511, + "grad_norm": 0.18163970112800598, + "learning_rate": 0.0009272248735879636, + "loss": 3.3643, + "step": 3710 + }, + { + "epoch": 0.21584612260291858, + "grad_norm": 0.1849670708179474, + "learning_rate": 0.0009267256252334311, + "loss": 3.3672, + "step": 3720 + }, + { + "epoch": 0.216426354115292, + "grad_norm": 0.1637067347764969, + "learning_rate": 0.0009262248056304439, + "loss": 3.3708, + "step": 3730 + }, + { + "epoch": 0.21700658562766545, + "grad_norm": 0.1776672899723053, + "learning_rate": 0.0009257224166230722, + "loss": 3.3561, + "step": 3740 + }, + { + "epoch": 0.21758681714003889, + "grad_norm": 0.15775948762893677, + "learning_rate": 0.0009252184600611651, + "loss": 3.3573, + "step": 3750 + }, + { + "epoch": 0.21816704865241232, + "grad_norm": 0.1565089225769043, + "learning_rate": 0.0009247129378003432, + "loss": 3.3654, + "step": 3760 + }, + { + "epoch": 0.21874728016478576, + "grad_norm": 0.16912080347537994, + "learning_rate": 0.0009242058517019926, + "loss": 3.3494, + "step": 3770 + }, + { + "epoch": 0.2193275116771592, + "grad_norm": 0.1838664710521698, + "learning_rate": 0.0009236972036332574, + "loss": 3.3694, + "step": 3780 + }, + { + "epoch": 0.21990774318953263, + "grad_norm": 0.17254334688186646, + "learning_rate": 0.0009231869954670331, + "loss": 3.3601, + "step": 3790 + }, + { + "epoch": 0.22048797470190606, + "grad_norm": 0.17754711210727692, + "learning_rate": 0.0009226752290819595, + "loss": 3.3586, + "step": 3800 + }, + { + "epoch": 0.2210682062142795, + "grad_norm": 0.19803158938884735, + "learning_rate": 0.0009221619063624143, + "loss": 3.3603, + "step": 3810 + }, + { + "epoch": 0.22164843772665294, + "grad_norm": 0.16345560550689697, + "learning_rate": 0.0009216470291985053, + "loss": 3.3511, + "step": 3820 + }, + { + "epoch": 0.22222866923902637, + "grad_norm": 0.1706439107656479, + "learning_rate": 0.0009211305994860641, + "loss": 3.3578, + "step": 3830 + }, + { + "epoch": 0.2228089007513998, + "grad_norm": 0.18484774231910706, + "learning_rate": 0.0009206126191266393, + "loss": 3.3567, + "step": 3840 + }, + { + "epoch": 0.22338913226377324, + "grad_norm": 0.18500268459320068, + "learning_rate": 0.0009200930900274884, + "loss": 3.359, + "step": 3850 + }, + { + "epoch": 0.22396936377614668, + "grad_norm": 0.18986065685749054, + "learning_rate": 0.0009195720141015725, + "loss": 3.3497, + "step": 3860 + }, + { + "epoch": 0.22454959528852012, + "grad_norm": 0.17551766335964203, + "learning_rate": 0.0009190493932675473, + "loss": 3.3474, + "step": 3870 + }, + { + "epoch": 0.22512982680089355, + "grad_norm": 0.19979430735111237, + "learning_rate": 0.0009185252294497577, + "loss": 3.3474, + "step": 3880 + }, + { + "epoch": 0.225710058313267, + "grad_norm": 0.1754840612411499, + "learning_rate": 0.0009179995245782297, + "loss": 3.3426, + "step": 3890 + }, + { + "epoch": 0.22629028982564042, + "grad_norm": 0.16811451315879822, + "learning_rate": 0.0009174722805886638, + "loss": 3.3523, + "step": 3900 + }, + { + "epoch": 0.22687052133801386, + "grad_norm": 0.208675354719162, + "learning_rate": 0.0009169434994224274, + "loss": 3.3479, + "step": 3910 + }, + { + "epoch": 0.2274507528503873, + "grad_norm": 0.1587597280740738, + "learning_rate": 0.0009164131830265483, + "loss": 3.3451, + "step": 3920 + }, + { + "epoch": 0.22803098436276073, + "grad_norm": 0.1999424546957016, + "learning_rate": 0.0009158813333537071, + "loss": 3.3447, + "step": 3930 + }, + { + "epoch": 0.22861121587513417, + "grad_norm": 0.1611049622297287, + "learning_rate": 0.0009153479523622298, + "loss": 3.3534, + "step": 3940 + }, + { + "epoch": 0.2291914473875076, + "grad_norm": 0.18397875130176544, + "learning_rate": 0.0009148130420160813, + "loss": 3.346, + "step": 3950 + }, + { + "epoch": 0.22977167889988107, + "grad_norm": 0.17322111129760742, + "learning_rate": 0.0009142766042848574, + "loss": 3.3534, + "step": 3960 + }, + { + "epoch": 0.2303519104122545, + "grad_norm": 0.1563626080751419, + "learning_rate": 0.000913738641143778, + "loss": 3.3498, + "step": 3970 + }, + { + "epoch": 0.23093214192462794, + "grad_norm": 0.18060965836048126, + "learning_rate": 0.0009131991545736798, + "loss": 3.3402, + "step": 3980 + }, + { + "epoch": 0.23151237343700137, + "grad_norm": 0.18193970620632172, + "learning_rate": 0.0009126581465610089, + "loss": 3.3477, + "step": 3990 + }, + { + "epoch": 0.2320926049493748, + "grad_norm": 0.17089489102363586, + "learning_rate": 0.0009121156190978134, + "loss": 3.3471, + "step": 4000 + }, + { + "epoch": 0.2320926049493748, + "eval_loss": 3.2811107635498047, + "eval_runtime": 3.2679, + "eval_samples_per_second": 1325.027, + "eval_steps_per_second": 10.404, + "step": 4000 + }, + { + "epoch": 0.23267283646174824, + "grad_norm": 0.1482742726802826, + "learning_rate": 0.0009115715741817364, + "loss": 3.3448, + "step": 4010 + }, + { + "epoch": 0.23325306797412168, + "grad_norm": 0.1840105950832367, + "learning_rate": 0.000911026013816008, + "loss": 3.3528, + "step": 4020 + }, + { + "epoch": 0.23383329948649512, + "grad_norm": 0.159522145986557, + "learning_rate": 0.0009104789400094387, + "loss": 3.3452, + "step": 4030 + }, + { + "epoch": 0.23441353099886855, + "grad_norm": 0.16177432239055634, + "learning_rate": 0.0009099303547764118, + "loss": 3.3407, + "step": 4040 + }, + { + "epoch": 0.234993762511242, + "grad_norm": 0.1564028412103653, + "learning_rate": 0.0009093802601368755, + "loss": 3.3393, + "step": 4050 + }, + { + "epoch": 0.23557399402361542, + "grad_norm": 0.18961066007614136, + "learning_rate": 0.0009088286581163357, + "loss": 3.3461, + "step": 4060 + }, + { + "epoch": 0.23615422553598886, + "grad_norm": 0.15274052321910858, + "learning_rate": 0.0009082755507458492, + "loss": 3.339, + "step": 4070 + }, + { + "epoch": 0.2367344570483623, + "grad_norm": 0.18588609993457794, + "learning_rate": 0.0009077209400620148, + "loss": 3.3366, + "step": 4080 + }, + { + "epoch": 0.23731468856073573, + "grad_norm": 0.17476515471935272, + "learning_rate": 0.0009071648281069673, + "loss": 3.3353, + "step": 4090 + }, + { + "epoch": 0.23789492007310917, + "grad_norm": 0.17199723422527313, + "learning_rate": 0.0009066072169283695, + "loss": 3.3329, + "step": 4100 + }, + { + "epoch": 0.2384751515854826, + "grad_norm": 0.19376391172409058, + "learning_rate": 0.0009060481085794037, + "loss": 3.3347, + "step": 4110 + }, + { + "epoch": 0.23905538309785604, + "grad_norm": 0.15385298430919647, + "learning_rate": 0.0009054875051187657, + "loss": 3.3387, + "step": 4120 + }, + { + "epoch": 0.23963561461022947, + "grad_norm": 0.15670862793922424, + "learning_rate": 0.000904925408610656, + "loss": 3.3386, + "step": 4130 + }, + { + "epoch": 0.2402158461226029, + "grad_norm": 0.16597050428390503, + "learning_rate": 0.0009043618211247731, + "loss": 3.3409, + "step": 4140 + }, + { + "epoch": 0.24079607763497635, + "grad_norm": 0.15981680154800415, + "learning_rate": 0.0009037967447363049, + "loss": 3.338, + "step": 4150 + }, + { + "epoch": 0.24137630914734978, + "grad_norm": 0.18593856692314148, + "learning_rate": 0.0009032301815259221, + "loss": 3.3384, + "step": 4160 + }, + { + "epoch": 0.24195654065972322, + "grad_norm": 0.16867688298225403, + "learning_rate": 0.0009026621335797696, + "loss": 3.3342, + "step": 4170 + }, + { + "epoch": 0.24253677217209665, + "grad_norm": 0.16456478834152222, + "learning_rate": 0.0009020926029894594, + "loss": 3.3346, + "step": 4180 + }, + { + "epoch": 0.2431170036844701, + "grad_norm": 0.1605817675590515, + "learning_rate": 0.0009015215918520629, + "loss": 3.3316, + "step": 4190 + }, + { + "epoch": 0.24369723519684355, + "grad_norm": 0.18208837509155273, + "learning_rate": 0.0009009491022701028, + "loss": 3.3329, + "step": 4200 + }, + { + "epoch": 0.244277466709217, + "grad_norm": 0.1611657440662384, + "learning_rate": 0.000900375136351546, + "loss": 3.3293, + "step": 4210 + }, + { + "epoch": 0.24485769822159043, + "grad_norm": 0.17636191844940186, + "learning_rate": 0.0008997996962097947, + "loss": 3.3449, + "step": 4220 + }, + { + "epoch": 0.24543792973396386, + "grad_norm": 0.19152578711509705, + "learning_rate": 0.0008992227839636804, + "loss": 3.3272, + "step": 4230 + }, + { + "epoch": 0.2460181612463373, + "grad_norm": 0.16381146013736725, + "learning_rate": 0.0008986444017374538, + "loss": 3.3223, + "step": 4240 + }, + { + "epoch": 0.24659839275871073, + "grad_norm": 0.16677068173885345, + "learning_rate": 0.0008980645516607793, + "loss": 3.3294, + "step": 4250 + }, + { + "epoch": 0.24717862427108417, + "grad_norm": 0.16276830434799194, + "learning_rate": 0.0008974832358687253, + "loss": 3.3337, + "step": 4260 + }, + { + "epoch": 0.2477588557834576, + "grad_norm": 0.17291460931301117, + "learning_rate": 0.0008969004565017577, + "loss": 3.3255, + "step": 4270 + }, + { + "epoch": 0.24833908729583104, + "grad_norm": 0.17407308518886566, + "learning_rate": 0.0008963162157057309, + "loss": 3.3329, + "step": 4280 + }, + { + "epoch": 0.24891931880820448, + "grad_norm": 0.15487220883369446, + "learning_rate": 0.0008957305156318811, + "loss": 3.3245, + "step": 4290 + }, + { + "epoch": 0.2494995503205779, + "grad_norm": 0.14908728003501892, + "learning_rate": 0.000895143358436817, + "loss": 3.3281, + "step": 4300 + }, + { + "epoch": 0.2500797818329513, + "grad_norm": 0.20198597013950348, + "learning_rate": 0.000894554746282513, + "loss": 3.325, + "step": 4310 + }, + { + "epoch": 0.2506600133453248, + "grad_norm": 0.19431117177009583, + "learning_rate": 0.0008939646813363007, + "loss": 3.322, + "step": 4320 + }, + { + "epoch": 0.25124024485769825, + "grad_norm": 0.1598384529352188, + "learning_rate": 0.000893373165770861, + "loss": 3.3353, + "step": 4330 + }, + { + "epoch": 0.25182047637007166, + "grad_norm": 0.1565127670764923, + "learning_rate": 0.0008927802017642164, + "loss": 3.3201, + "step": 4340 + }, + { + "epoch": 0.2524007078824451, + "grad_norm": 0.1654927283525467, + "learning_rate": 0.0008921857914997222, + "loss": 3.3326, + "step": 4350 + }, + { + "epoch": 0.2529809393948185, + "grad_norm": 0.15852831304073334, + "learning_rate": 0.0008915899371660595, + "loss": 3.328, + "step": 4360 + }, + { + "epoch": 0.253561170907192, + "grad_norm": 0.16426099836826324, + "learning_rate": 0.0008909926409572263, + "loss": 3.3326, + "step": 4370 + }, + { + "epoch": 0.2541414024195654, + "grad_norm": 0.16855676472187042, + "learning_rate": 0.0008903939050725297, + "loss": 3.3289, + "step": 4380 + }, + { + "epoch": 0.25472163393193886, + "grad_norm": 0.1595139056444168, + "learning_rate": 0.0008897937317165781, + "loss": 3.3324, + "step": 4390 + }, + { + "epoch": 0.25530186544431227, + "grad_norm": 0.15312696993350983, + "learning_rate": 0.0008891921230992725, + "loss": 3.3294, + "step": 4400 + }, + { + "epoch": 0.25588209695668573, + "grad_norm": 0.16672903299331665, + "learning_rate": 0.000888589081435799, + "loss": 3.3217, + "step": 4410 + }, + { + "epoch": 0.25646232846905914, + "grad_norm": 0.1601061373949051, + "learning_rate": 0.0008879846089466202, + "loss": 3.3153, + "step": 4420 + }, + { + "epoch": 0.2570425599814326, + "grad_norm": 0.16103419661521912, + "learning_rate": 0.0008873787078574671, + "loss": 3.3176, + "step": 4430 + }, + { + "epoch": 0.257622791493806, + "grad_norm": 0.15978160500526428, + "learning_rate": 0.0008867713803993309, + "loss": 3.3316, + "step": 4440 + }, + { + "epoch": 0.2582030230061795, + "grad_norm": 0.15198193490505219, + "learning_rate": 0.0008861626288084549, + "loss": 3.3205, + "step": 4450 + }, + { + "epoch": 0.2587832545185529, + "grad_norm": 0.17815245687961578, + "learning_rate": 0.0008855524553263263, + "loss": 3.3159, + "step": 4460 + }, + { + "epoch": 0.25936348603092635, + "grad_norm": 0.17297674715518951, + "learning_rate": 0.0008849408621996679, + "loss": 3.3131, + "step": 4470 + }, + { + "epoch": 0.25994371754329976, + "grad_norm": 0.17088572680950165, + "learning_rate": 0.0008843278516804294, + "loss": 3.3178, + "step": 4480 + }, + { + "epoch": 0.2605239490556732, + "grad_norm": 0.14876043796539307, + "learning_rate": 0.00088371342602578, + "loss": 3.3113, + "step": 4490 + }, + { + "epoch": 0.26110418056804663, + "grad_norm": 0.16213169693946838, + "learning_rate": 0.0008830975874980991, + "loss": 3.3168, + "step": 4500 + }, + { + "epoch": 0.2616844120804201, + "grad_norm": 0.1761493980884552, + "learning_rate": 0.0008824803383649688, + "loss": 3.319, + "step": 4510 + }, + { + "epoch": 0.2622646435927935, + "grad_norm": 0.15832360088825226, + "learning_rate": 0.0008818616808991651, + "loss": 3.3202, + "step": 4520 + }, + { + "epoch": 0.26284487510516696, + "grad_norm": 0.1700046807527542, + "learning_rate": 0.0008812416173786495, + "loss": 3.3119, + "step": 4530 + }, + { + "epoch": 0.26342510661754037, + "grad_norm": 0.17616261541843414, + "learning_rate": 0.0008806201500865609, + "loss": 3.3133, + "step": 4540 + }, + { + "epoch": 0.26400533812991384, + "grad_norm": 0.17251409590244293, + "learning_rate": 0.0008799972813112072, + "loss": 3.3148, + "step": 4550 + }, + { + "epoch": 0.2645855696422873, + "grad_norm": 0.16835258901119232, + "learning_rate": 0.0008793730133460561, + "loss": 3.3188, + "step": 4560 + }, + { + "epoch": 0.2651658011546607, + "grad_norm": 0.16114716231822968, + "learning_rate": 0.0008787473484897276, + "loss": 3.3227, + "step": 4570 + }, + { + "epoch": 0.26574603266703417, + "grad_norm": 0.16803395748138428, + "learning_rate": 0.0008781202890459856, + "loss": 3.322, + "step": 4580 + }, + { + "epoch": 0.2663262641794076, + "grad_norm": 0.1682870239019394, + "learning_rate": 0.0008774918373237284, + "loss": 3.3142, + "step": 4590 + }, + { + "epoch": 0.26690649569178104, + "grad_norm": 0.15376168489456177, + "learning_rate": 0.0008768619956369813, + "loss": 3.3131, + "step": 4600 + }, + { + "epoch": 0.26748672720415445, + "grad_norm": 0.14724121987819672, + "learning_rate": 0.0008762307663048871, + "loss": 3.3105, + "step": 4610 + }, + { + "epoch": 0.2680669587165279, + "grad_norm": 0.1947721391916275, + "learning_rate": 0.0008755981516516987, + "loss": 3.3177, + "step": 4620 + }, + { + "epoch": 0.2686471902289013, + "grad_norm": 0.15725037455558777, + "learning_rate": 0.0008749641540067691, + "loss": 3.308, + "step": 4630 + }, + { + "epoch": 0.2692274217412748, + "grad_norm": 0.1616797149181366, + "learning_rate": 0.0008743287757045443, + "loss": 3.3158, + "step": 4640 + }, + { + "epoch": 0.2698076532536482, + "grad_norm": 0.16861191391944885, + "learning_rate": 0.0008736920190845536, + "loss": 3.3113, + "step": 4650 + }, + { + "epoch": 0.27038788476602166, + "grad_norm": 0.16433420777320862, + "learning_rate": 0.0008730538864914019, + "loss": 3.3168, + "step": 4660 + }, + { + "epoch": 0.27096811627839507, + "grad_norm": 0.1651991754770279, + "learning_rate": 0.00087241438027476, + "loss": 3.3016, + "step": 4670 + }, + { + "epoch": 0.27154834779076853, + "grad_norm": 0.18741054832935333, + "learning_rate": 0.0008717735027893568, + "loss": 3.3121, + "step": 4680 + }, + { + "epoch": 0.27212857930314194, + "grad_norm": 0.15020763874053955, + "learning_rate": 0.0008711312563949703, + "loss": 3.309, + "step": 4690 + }, + { + "epoch": 0.2727088108155154, + "grad_norm": 0.17015768587589264, + "learning_rate": 0.000870487643456419, + "loss": 3.3225, + "step": 4700 + }, + { + "epoch": 0.2732890423278888, + "grad_norm": 0.19843073189258575, + "learning_rate": 0.0008698426663435533, + "loss": 3.3058, + "step": 4710 + }, + { + "epoch": 0.2738692738402623, + "grad_norm": 0.17581596970558167, + "learning_rate": 0.0008691963274312464, + "loss": 3.3086, + "step": 4720 + }, + { + "epoch": 0.2744495053526357, + "grad_norm": 0.17392052710056305, + "learning_rate": 0.000868548629099386, + "loss": 3.311, + "step": 4730 + }, + { + "epoch": 0.27502973686500914, + "grad_norm": 0.16661454737186432, + "learning_rate": 0.0008678995737328651, + "loss": 3.3108, + "step": 4740 + }, + { + "epoch": 0.27560996837738255, + "grad_norm": 0.1836322695016861, + "learning_rate": 0.0008672491637215735, + "loss": 3.3042, + "step": 4750 + }, + { + "epoch": 0.276190199889756, + "grad_norm": 0.14587625861167908, + "learning_rate": 0.0008665974014603891, + "loss": 3.3202, + "step": 4760 + }, + { + "epoch": 0.2767704314021294, + "grad_norm": 0.1731816828250885, + "learning_rate": 0.0008659442893491689, + "loss": 3.295, + "step": 4770 + }, + { + "epoch": 0.2773506629145029, + "grad_norm": 0.1422199159860611, + "learning_rate": 0.0008652898297927398, + "loss": 3.3102, + "step": 4780 + }, + { + "epoch": 0.2779308944268763, + "grad_norm": 0.1520106941461563, + "learning_rate": 0.0008646340252008908, + "loss": 3.3186, + "step": 4790 + }, + { + "epoch": 0.27851112593924976, + "grad_norm": 0.16036173701286316, + "learning_rate": 0.000863976877988363, + "loss": 3.3145, + "step": 4800 + }, + { + "epoch": 0.2790913574516232, + "grad_norm": 0.15433992445468903, + "learning_rate": 0.0008633183905748411, + "loss": 3.3073, + "step": 4810 + }, + { + "epoch": 0.27967158896399663, + "grad_norm": 0.15274393558502197, + "learning_rate": 0.0008626585653849449, + "loss": 3.3005, + "step": 4820 + }, + { + "epoch": 0.2802518204763701, + "grad_norm": 0.1466848999261856, + "learning_rate": 0.0008619974048482198, + "loss": 3.3096, + "step": 4830 + }, + { + "epoch": 0.2808320519887435, + "grad_norm": 0.17398701608181, + "learning_rate": 0.0008613349113991283, + "loss": 3.2977, + "step": 4840 + }, + { + "epoch": 0.28141228350111697, + "grad_norm": 0.1633315235376358, + "learning_rate": 0.0008606710874770405, + "loss": 3.3048, + "step": 4850 + }, + { + "epoch": 0.2819925150134904, + "grad_norm": 0.15874172747135162, + "learning_rate": 0.0008600059355262259, + "loss": 3.3, + "step": 4860 + }, + { + "epoch": 0.28257274652586384, + "grad_norm": 0.16768650710582733, + "learning_rate": 0.0008593394579958433, + "loss": 3.2971, + "step": 4870 + }, + { + "epoch": 0.28315297803823725, + "grad_norm": 0.17810046672821045, + "learning_rate": 0.0008586716573399329, + "loss": 3.3043, + "step": 4880 + }, + { + "epoch": 0.2837332095506107, + "grad_norm": 0.16520732641220093, + "learning_rate": 0.0008580025360174069, + "loss": 3.3097, + "step": 4890 + }, + { + "epoch": 0.2843134410629841, + "grad_norm": 0.15736092627048492, + "learning_rate": 0.0008573320964920397, + "loss": 3.2936, + "step": 4900 + }, + { + "epoch": 0.2848936725753576, + "grad_norm": 0.16211272776126862, + "learning_rate": 0.0008566603412324602, + "loss": 3.3037, + "step": 4910 + }, + { + "epoch": 0.285473904087731, + "grad_norm": 0.1499512791633606, + "learning_rate": 0.0008559872727121416, + "loss": 3.2995, + "step": 4920 + }, + { + "epoch": 0.28605413560010445, + "grad_norm": 0.17990955710411072, + "learning_rate": 0.0008553128934093926, + "loss": 3.3008, + "step": 4930 + }, + { + "epoch": 0.28663436711247786, + "grad_norm": 0.16948296129703522, + "learning_rate": 0.0008546372058073484, + "loss": 3.2988, + "step": 4940 + }, + { + "epoch": 0.2872145986248513, + "grad_norm": 0.16478213667869568, + "learning_rate": 0.0008539602123939616, + "loss": 3.2981, + "step": 4950 + }, + { + "epoch": 0.28779483013722473, + "grad_norm": 0.17590472102165222, + "learning_rate": 0.0008532819156619928, + "loss": 3.2979, + "step": 4960 + }, + { + "epoch": 0.2883750616495982, + "grad_norm": 0.16984987258911133, + "learning_rate": 0.0008526023181090019, + "loss": 3.3093, + "step": 4970 + }, + { + "epoch": 0.2889552931619716, + "grad_norm": 0.18633520603179932, + "learning_rate": 0.0008519214222373379, + "loss": 3.3027, + "step": 4980 + }, + { + "epoch": 0.28953552467434507, + "grad_norm": 0.1713830530643463, + "learning_rate": 0.000851239230554131, + "loss": 3.3019, + "step": 4990 + }, + { + "epoch": 0.2901157561867185, + "grad_norm": 0.16138288378715515, + "learning_rate": 0.0008505557455712825, + "loss": 3.2957, + "step": 5000 + }, + { + "epoch": 0.2901157561867185, + "eval_loss": 3.232105016708374, + "eval_runtime": 3.247, + "eval_samples_per_second": 1333.559, + "eval_steps_per_second": 10.471, + "step": 5000 + }, + { + "epoch": 0.29069598769909194, + "grad_norm": 0.14849427342414856, + "learning_rate": 0.0008498709698054553, + "loss": 3.297, + "step": 5010 + }, + { + "epoch": 0.29127621921146535, + "grad_norm": 0.15944212675094604, + "learning_rate": 0.0008491849057780658, + "loss": 3.2875, + "step": 5020 + }, + { + "epoch": 0.2918564507238388, + "grad_norm": 0.14453737437725067, + "learning_rate": 0.0008484975560152737, + "loss": 3.2919, + "step": 5030 + }, + { + "epoch": 0.2924366822362123, + "grad_norm": 0.18005253374576569, + "learning_rate": 0.0008478089230479726, + "loss": 3.2981, + "step": 5040 + }, + { + "epoch": 0.2930169137485857, + "grad_norm": 0.16119055449962616, + "learning_rate": 0.0008471190094117814, + "loss": 3.2942, + "step": 5050 + }, + { + "epoch": 0.29359714526095915, + "grad_norm": 0.15280525386333466, + "learning_rate": 0.0008464278176470342, + "loss": 3.2958, + "step": 5060 + }, + { + "epoch": 0.29417737677333256, + "grad_norm": 0.15936006605625153, + "learning_rate": 0.0008457353502987718, + "loss": 3.294, + "step": 5070 + }, + { + "epoch": 0.294757608285706, + "grad_norm": 0.15230213105678558, + "learning_rate": 0.0008450416099167313, + "loss": 3.3008, + "step": 5080 + }, + { + "epoch": 0.2953378397980794, + "grad_norm": 0.15988774597644806, + "learning_rate": 0.0008443465990553374, + "loss": 3.2902, + "step": 5090 + }, + { + "epoch": 0.2959180713104529, + "grad_norm": 0.17253676056861877, + "learning_rate": 0.0008436503202736928, + "loss": 3.2986, + "step": 5100 + }, + { + "epoch": 0.2964983028228263, + "grad_norm": 0.15170855820178986, + "learning_rate": 0.0008429527761355693, + "loss": 3.2877, + "step": 5110 + }, + { + "epoch": 0.29707853433519976, + "grad_norm": 0.16487392783164978, + "learning_rate": 0.0008422539692093974, + "loss": 3.2846, + "step": 5120 + }, + { + "epoch": 0.29765876584757317, + "grad_norm": 0.14367428421974182, + "learning_rate": 0.000841553902068257, + "loss": 3.2906, + "step": 5130 + }, + { + "epoch": 0.29823899735994663, + "grad_norm": 0.17368683218955994, + "learning_rate": 0.0008408525772898692, + "loss": 3.3027, + "step": 5140 + }, + { + "epoch": 0.29881922887232004, + "grad_norm": 0.1599467247724533, + "learning_rate": 0.000840149997456585, + "loss": 3.2852, + "step": 5150 + }, + { + "epoch": 0.2993994603846935, + "grad_norm": 0.15667842328548431, + "learning_rate": 0.0008394461651553768, + "loss": 3.2898, + "step": 5160 + }, + { + "epoch": 0.2999796918970669, + "grad_norm": 0.15665532648563385, + "learning_rate": 0.000838741082977829, + "loss": 3.2994, + "step": 5170 + }, + { + "epoch": 0.3005599234094404, + "grad_norm": 0.17890366911888123, + "learning_rate": 0.0008380347535201283, + "loss": 3.2879, + "step": 5180 + }, + { + "epoch": 0.3011401549218138, + "grad_norm": 0.15844030678272247, + "learning_rate": 0.0008373271793830536, + "loss": 3.2948, + "step": 5190 + }, + { + "epoch": 0.30172038643418725, + "grad_norm": 0.14670881628990173, + "learning_rate": 0.0008366183631719668, + "loss": 3.2901, + "step": 5200 + }, + { + "epoch": 0.30230061794656066, + "grad_norm": 0.15351887047290802, + "learning_rate": 0.0008359083074968039, + "loss": 3.2899, + "step": 5210 + }, + { + "epoch": 0.3028808494589341, + "grad_norm": 0.14965134859085083, + "learning_rate": 0.0008351970149720636, + "loss": 3.2885, + "step": 5220 + }, + { + "epoch": 0.30346108097130753, + "grad_norm": 0.16267924010753632, + "learning_rate": 0.0008344844882167999, + "loss": 3.2937, + "step": 5230 + }, + { + "epoch": 0.304041312483681, + "grad_norm": 0.14958548545837402, + "learning_rate": 0.0008337707298546112, + "loss": 3.2887, + "step": 5240 + }, + { + "epoch": 0.3046215439960544, + "grad_norm": 0.1530071198940277, + "learning_rate": 0.0008330557425136299, + "loss": 3.2865, + "step": 5250 + }, + { + "epoch": 0.30520177550842786, + "grad_norm": 0.17143143713474274, + "learning_rate": 0.0008323395288265149, + "loss": 3.2861, + "step": 5260 + }, + { + "epoch": 0.3057820070208013, + "grad_norm": 0.15875962376594543, + "learning_rate": 0.0008316220914304398, + "loss": 3.2919, + "step": 5270 + }, + { + "epoch": 0.30636223853317474, + "grad_norm": 0.14534059166908264, + "learning_rate": 0.0008309034329670841, + "loss": 3.2813, + "step": 5280 + }, + { + "epoch": 0.3069424700455482, + "grad_norm": 0.1608089655637741, + "learning_rate": 0.0008301835560826236, + "loss": 3.2866, + "step": 5290 + }, + { + "epoch": 0.3075227015579216, + "grad_norm": 0.15711037814617157, + "learning_rate": 0.0008294624634277208, + "loss": 3.2924, + "step": 5300 + }, + { + "epoch": 0.30810293307029507, + "grad_norm": 0.1359197199344635, + "learning_rate": 0.0008287401576575139, + "loss": 3.2906, + "step": 5310 + }, + { + "epoch": 0.3086831645826685, + "grad_norm": 0.1708402931690216, + "learning_rate": 0.0008280166414316086, + "loss": 3.2919, + "step": 5320 + }, + { + "epoch": 0.30926339609504194, + "grad_norm": 0.16216853260993958, + "learning_rate": 0.0008272919174140674, + "loss": 3.278, + "step": 5330 + }, + { + "epoch": 0.30984362760741535, + "grad_norm": 0.16680875420570374, + "learning_rate": 0.0008265659882734002, + "loss": 3.2745, + "step": 5340 + }, + { + "epoch": 0.3104238591197888, + "grad_norm": 0.1761893630027771, + "learning_rate": 0.0008258388566825539, + "loss": 3.2768, + "step": 5350 + }, + { + "epoch": 0.3110040906321622, + "grad_norm": 0.17635513842105865, + "learning_rate": 0.0008251105253189034, + "loss": 3.2908, + "step": 5360 + }, + { + "epoch": 0.3115843221445357, + "grad_norm": 0.1403694897890091, + "learning_rate": 0.0008243809968642411, + "loss": 3.2896, + "step": 5370 + }, + { + "epoch": 0.3121645536569091, + "grad_norm": 0.15853242576122284, + "learning_rate": 0.0008236502740047669, + "loss": 3.2876, + "step": 5380 + }, + { + "epoch": 0.31274478516928256, + "grad_norm": 0.14447931945323944, + "learning_rate": 0.0008229183594310791, + "loss": 3.2749, + "step": 5390 + }, + { + "epoch": 0.31332501668165597, + "grad_norm": 0.14274190366268158, + "learning_rate": 0.0008221852558381639, + "loss": 3.2826, + "step": 5400 + }, + { + "epoch": 0.31390524819402943, + "grad_norm": 0.15020518004894257, + "learning_rate": 0.0008214509659253855, + "loss": 3.2768, + "step": 5410 + }, + { + "epoch": 0.31448547970640284, + "grad_norm": 0.16364452242851257, + "learning_rate": 0.0008207154923964761, + "loss": 3.2796, + "step": 5420 + }, + { + "epoch": 0.3150657112187763, + "grad_norm": 0.15643912553787231, + "learning_rate": 0.0008199788379595266, + "loss": 3.2897, + "step": 5430 + }, + { + "epoch": 0.3156459427311497, + "grad_norm": 0.14374196529388428, + "learning_rate": 0.0008192410053269757, + "loss": 3.2829, + "step": 5440 + }, + { + "epoch": 0.3162261742435232, + "grad_norm": 0.1532783806324005, + "learning_rate": 0.0008185019972156003, + "loss": 3.2775, + "step": 5450 + }, + { + "epoch": 0.3168064057558966, + "grad_norm": 0.15971648693084717, + "learning_rate": 0.0008177618163465054, + "loss": 3.2815, + "step": 5460 + }, + { + "epoch": 0.31738663726827004, + "grad_norm": 0.18425996601581573, + "learning_rate": 0.0008170204654451154, + "loss": 3.2777, + "step": 5470 + }, + { + "epoch": 0.31796686878064345, + "grad_norm": 0.13549089431762695, + "learning_rate": 0.0008162779472411612, + "loss": 3.2782, + "step": 5480 + }, + { + "epoch": 0.3185471002930169, + "grad_norm": 0.14235620200634003, + "learning_rate": 0.0008155342644686729, + "loss": 3.2755, + "step": 5490 + }, + { + "epoch": 0.3191273318053903, + "grad_norm": 0.1648331880569458, + "learning_rate": 0.0008147894198659683, + "loss": 3.2767, + "step": 5500 + }, + { + "epoch": 0.3197075633177638, + "grad_norm": 0.15751086175441742, + "learning_rate": 0.0008140434161756433, + "loss": 3.2789, + "step": 5510 + }, + { + "epoch": 0.32028779483013725, + "grad_norm": 0.16342034935951233, + "learning_rate": 0.0008132962561445616, + "loss": 3.2693, + "step": 5520 + }, + { + "epoch": 0.32086802634251066, + "grad_norm": 0.1530640870332718, + "learning_rate": 0.0008125479425238447, + "loss": 3.2773, + "step": 5530 + }, + { + "epoch": 0.3214482578548841, + "grad_norm": 0.1614234745502472, + "learning_rate": 0.0008117984780688619, + "loss": 3.276, + "step": 5540 + }, + { + "epoch": 0.32202848936725753, + "grad_norm": 0.16489213705062866, + "learning_rate": 0.0008110478655392195, + "loss": 3.2802, + "step": 5550 + }, + { + "epoch": 0.322608720879631, + "grad_norm": 0.16258342564105988, + "learning_rate": 0.0008102961076987519, + "loss": 3.2755, + "step": 5560 + }, + { + "epoch": 0.3231889523920044, + "grad_norm": 0.14555875957012177, + "learning_rate": 0.0008095432073155098, + "loss": 3.2775, + "step": 5570 + }, + { + "epoch": 0.32376918390437787, + "grad_norm": 0.1406964659690857, + "learning_rate": 0.0008087891671617515, + "loss": 3.2611, + "step": 5580 + }, + { + "epoch": 0.3243494154167513, + "grad_norm": 0.162723109126091, + "learning_rate": 0.0008080339900139317, + "loss": 3.2648, + "step": 5590 + }, + { + "epoch": 0.32492964692912474, + "grad_norm": 0.1660017967224121, + "learning_rate": 0.0008072776786526921, + "loss": 3.2704, + "step": 5600 + }, + { + "epoch": 0.32550987844149815, + "grad_norm": 0.13984504342079163, + "learning_rate": 0.0008065202358628501, + "loss": 3.2757, + "step": 5610 + }, + { + "epoch": 0.3260901099538716, + "grad_norm": 0.17101338505744934, + "learning_rate": 0.0008057616644333894, + "loss": 3.2742, + "step": 5620 + }, + { + "epoch": 0.326670341466245, + "grad_norm": 0.15518838167190552, + "learning_rate": 0.0008050019671574496, + "loss": 3.2676, + "step": 5630 + }, + { + "epoch": 0.3272505729786185, + "grad_norm": 0.17470310628414154, + "learning_rate": 0.0008042411468323154, + "loss": 3.2731, + "step": 5640 + }, + { + "epoch": 0.3278308044909919, + "grad_norm": 0.14603078365325928, + "learning_rate": 0.0008034792062594072, + "loss": 3.2727, + "step": 5650 + }, + { + "epoch": 0.32841103600336535, + "grad_norm": 0.14128392934799194, + "learning_rate": 0.00080271614824427, + "loss": 3.2689, + "step": 5660 + }, + { + "epoch": 0.32899126751573876, + "grad_norm": 0.16043803095817566, + "learning_rate": 0.0008019519755965629, + "loss": 3.2574, + "step": 5670 + }, + { + "epoch": 0.3295714990281122, + "grad_norm": 0.1634809821844101, + "learning_rate": 0.0008011866911300504, + "loss": 3.2706, + "step": 5680 + }, + { + "epoch": 0.33015173054048563, + "grad_norm": 0.1542406529188156, + "learning_rate": 0.0008004202976625895, + "loss": 3.2894, + "step": 5690 + }, + { + "epoch": 0.3307319620528591, + "grad_norm": 0.15284700691699982, + "learning_rate": 0.0007996527980161214, + "loss": 3.2814, + "step": 5700 + }, + { + "epoch": 0.3313121935652325, + "grad_norm": 0.1456363946199417, + "learning_rate": 0.0007988841950166602, + "loss": 3.2727, + "step": 5710 + }, + { + "epoch": 0.33189242507760597, + "grad_norm": 0.173149973154068, + "learning_rate": 0.0007981144914942827, + "loss": 3.2607, + "step": 5720 + }, + { + "epoch": 0.3324726565899794, + "grad_norm": 0.1568511724472046, + "learning_rate": 0.0007973436902831179, + "loss": 3.2638, + "step": 5730 + }, + { + "epoch": 0.33305288810235284, + "grad_norm": 0.16106663644313812, + "learning_rate": 0.0007965717942213365, + "loss": 3.2652, + "step": 5740 + }, + { + "epoch": 0.3336331196147263, + "grad_norm": 0.14553207159042358, + "learning_rate": 0.0007957988061511408, + "loss": 3.2771, + "step": 5750 + }, + { + "epoch": 0.3342133511270997, + "grad_norm": 0.15205144882202148, + "learning_rate": 0.0007950247289187538, + "loss": 3.2729, + "step": 5760 + }, + { + "epoch": 0.3347935826394732, + "grad_norm": 0.15027263760566711, + "learning_rate": 0.0007942495653744089, + "loss": 3.2727, + "step": 5770 + }, + { + "epoch": 0.3353738141518466, + "grad_norm": 0.1563851535320282, + "learning_rate": 0.0007934733183723395, + "loss": 3.2653, + "step": 5780 + }, + { + "epoch": 0.33595404566422005, + "grad_norm": 0.14442172646522522, + "learning_rate": 0.0007926959907707683, + "loss": 3.2754, + "step": 5790 + }, + { + "epoch": 0.33653427717659345, + "grad_norm": 0.13226284086704254, + "learning_rate": 0.0007919175854318971, + "loss": 3.2605, + "step": 5800 + }, + { + "epoch": 0.3371145086889669, + "grad_norm": 0.1466459184885025, + "learning_rate": 0.0007911381052218961, + "loss": 3.2638, + "step": 5810 + }, + { + "epoch": 0.3376947402013403, + "grad_norm": 0.15801192820072174, + "learning_rate": 0.0007903575530108926, + "loss": 3.2604, + "step": 5820 + }, + { + "epoch": 0.3382749717137138, + "grad_norm": 0.14419734477996826, + "learning_rate": 0.000789575931672962, + "loss": 3.2674, + "step": 5830 + }, + { + "epoch": 0.3388552032260872, + "grad_norm": 0.16942447423934937, + "learning_rate": 0.0007887932440861158, + "loss": 3.2634, + "step": 5840 + }, + { + "epoch": 0.33943543473846066, + "grad_norm": 0.15958154201507568, + "learning_rate": 0.0007880094931322916, + "loss": 3.2687, + "step": 5850 + }, + { + "epoch": 0.34001566625083407, + "grad_norm": 0.14885276556015015, + "learning_rate": 0.0007872246816973428, + "loss": 3.2665, + "step": 5860 + }, + { + "epoch": 0.34059589776320753, + "grad_norm": 0.15011096000671387, + "learning_rate": 0.0007864388126710268, + "loss": 3.2697, + "step": 5870 + }, + { + "epoch": 0.34117612927558094, + "grad_norm": 0.1571209579706192, + "learning_rate": 0.0007856518889469961, + "loss": 3.2688, + "step": 5880 + }, + { + "epoch": 0.3417563607879544, + "grad_norm": 0.1479196548461914, + "learning_rate": 0.0007848639134227864, + "loss": 3.2688, + "step": 5890 + }, + { + "epoch": 0.3423365923003278, + "grad_norm": 0.1649380475282669, + "learning_rate": 0.0007840748889998057, + "loss": 3.2629, + "step": 5900 + }, + { + "epoch": 0.3429168238127013, + "grad_norm": 0.1554240733385086, + "learning_rate": 0.000783284818583325, + "loss": 3.2653, + "step": 5910 + }, + { + "epoch": 0.3434970553250747, + "grad_norm": 0.14077123999595642, + "learning_rate": 0.000782493705082466, + "loss": 3.2631, + "step": 5920 + }, + { + "epoch": 0.34407728683744815, + "grad_norm": 0.16318172216415405, + "learning_rate": 0.0007817015514101917, + "loss": 3.2486, + "step": 5930 + }, + { + "epoch": 0.34465751834982156, + "grad_norm": 0.16839627921581268, + "learning_rate": 0.0007809083604832948, + "loss": 3.265, + "step": 5940 + }, + { + "epoch": 0.345237749862195, + "grad_norm": 0.14004649221897125, + "learning_rate": 0.0007801141352223873, + "loss": 3.261, + "step": 5950 + }, + { + "epoch": 0.34581798137456843, + "grad_norm": 0.1588735729455948, + "learning_rate": 0.0007793188785518901, + "loss": 3.2614, + "step": 5960 + }, + { + "epoch": 0.3463982128869419, + "grad_norm": 0.15165585279464722, + "learning_rate": 0.0007785225934000213, + "loss": 3.2654, + "step": 5970 + }, + { + "epoch": 0.3469784443993153, + "grad_norm": 0.14551375806331635, + "learning_rate": 0.0007777252826987864, + "loss": 3.2593, + "step": 5980 + }, + { + "epoch": 0.34755867591168876, + "grad_norm": 0.13176442682743073, + "learning_rate": 0.0007769269493839669, + "loss": 3.2519, + "step": 5990 + }, + { + "epoch": 0.3481389074240622, + "grad_norm": 0.1538584679365158, + "learning_rate": 0.0007761275963951096, + "loss": 3.2677, + "step": 6000 + }, + { + "epoch": 0.3481389074240622, + "eval_loss": 3.1944527626037598, + "eval_runtime": 3.2607, + "eval_samples_per_second": 1327.935, + "eval_steps_per_second": 10.427, + "step": 6000 + }, + { + "epoch": 0.34871913893643564, + "grad_norm": 0.1548028141260147, + "learning_rate": 0.0007753272266755161, + "loss": 3.2613, + "step": 6010 + }, + { + "epoch": 0.3492993704488091, + "grad_norm": 0.15565787255764008, + "learning_rate": 0.0007745258431722313, + "loss": 3.2622, + "step": 6020 + }, + { + "epoch": 0.3498796019611825, + "grad_norm": 0.15149720013141632, + "learning_rate": 0.0007737234488360334, + "loss": 3.2608, + "step": 6030 + }, + { + "epoch": 0.35045983347355597, + "grad_norm": 0.14522841572761536, + "learning_rate": 0.0007729200466214225, + "loss": 3.2509, + "step": 6040 + }, + { + "epoch": 0.3510400649859294, + "grad_norm": 0.1358969360589981, + "learning_rate": 0.0007721156394866096, + "loss": 3.2631, + "step": 6050 + }, + { + "epoch": 0.35162029649830284, + "grad_norm": 0.13670052587985992, + "learning_rate": 0.0007713102303935058, + "loss": 3.2643, + "step": 6060 + }, + { + "epoch": 0.35220052801067625, + "grad_norm": 0.16404469311237335, + "learning_rate": 0.0007705038223077121, + "loss": 3.2435, + "step": 6070 + }, + { + "epoch": 0.3527807595230497, + "grad_norm": 0.14754830300807953, + "learning_rate": 0.0007696964181985076, + "loss": 3.264, + "step": 6080 + }, + { + "epoch": 0.3533609910354231, + "grad_norm": 0.16356825828552246, + "learning_rate": 0.0007688880210388384, + "loss": 3.2629, + "step": 6090 + }, + { + "epoch": 0.3539412225477966, + "grad_norm": 0.1487089991569519, + "learning_rate": 0.0007680786338053079, + "loss": 3.255, + "step": 6100 + }, + { + "epoch": 0.35452145406017, + "grad_norm": 0.16473692655563354, + "learning_rate": 0.0007672682594781645, + "loss": 3.2539, + "step": 6110 + }, + { + "epoch": 0.35510168557254346, + "grad_norm": 0.14944276213645935, + "learning_rate": 0.0007664569010412914, + "loss": 3.2526, + "step": 6120 + }, + { + "epoch": 0.35568191708491687, + "grad_norm": 0.15919911861419678, + "learning_rate": 0.0007656445614821954, + "loss": 3.2613, + "step": 6130 + }, + { + "epoch": 0.35626214859729033, + "grad_norm": 0.15756233036518097, + "learning_rate": 0.000764831243791996, + "loss": 3.2484, + "step": 6140 + }, + { + "epoch": 0.35684238010966374, + "grad_norm": 0.14643166959285736, + "learning_rate": 0.0007640169509654136, + "loss": 3.2552, + "step": 6150 + }, + { + "epoch": 0.3574226116220372, + "grad_norm": 0.15033231675624847, + "learning_rate": 0.0007632016860007603, + "loss": 3.2531, + "step": 6160 + }, + { + "epoch": 0.3580028431344106, + "grad_norm": 0.14909091591835022, + "learning_rate": 0.000762385451899927, + "loss": 3.2651, + "step": 6170 + }, + { + "epoch": 0.3585830746467841, + "grad_norm": 0.14181995391845703, + "learning_rate": 0.0007615682516683728, + "loss": 3.2596, + "step": 6180 + }, + { + "epoch": 0.3591633061591575, + "grad_norm": 0.1447875052690506, + "learning_rate": 0.0007607500883151148, + "loss": 3.2588, + "step": 6190 + }, + { + "epoch": 0.35974353767153094, + "grad_norm": 0.15402406454086304, + "learning_rate": 0.0007599309648527162, + "loss": 3.2478, + "step": 6200 + }, + { + "epoch": 0.36032376918390435, + "grad_norm": 0.16491296887397766, + "learning_rate": 0.0007591108842972754, + "loss": 3.2442, + "step": 6210 + }, + { + "epoch": 0.3609040006962778, + "grad_norm": 0.14670206606388092, + "learning_rate": 0.0007582898496684148, + "loss": 3.2601, + "step": 6220 + }, + { + "epoch": 0.3614842322086513, + "grad_norm": 0.12047087401151657, + "learning_rate": 0.0007574678639892702, + "loss": 3.2531, + "step": 6230 + }, + { + "epoch": 0.3620644637210247, + "grad_norm": 0.1422395259141922, + "learning_rate": 0.0007566449302864784, + "loss": 3.2565, + "step": 6240 + }, + { + "epoch": 0.36264469523339815, + "grad_norm": 0.16634182631969452, + "learning_rate": 0.0007558210515901683, + "loss": 3.2521, + "step": 6250 + }, + { + "epoch": 0.36322492674577156, + "grad_norm": 0.14773225784301758, + "learning_rate": 0.0007549962309339467, + "loss": 3.2571, + "step": 6260 + }, + { + "epoch": 0.363805158258145, + "grad_norm": 0.1608228087425232, + "learning_rate": 0.0007541704713548905, + "loss": 3.2466, + "step": 6270 + }, + { + "epoch": 0.36438538977051843, + "grad_norm": 0.1616470217704773, + "learning_rate": 0.0007533437758935324, + "loss": 3.2559, + "step": 6280 + }, + { + "epoch": 0.3649656212828919, + "grad_norm": 0.14817708730697632, + "learning_rate": 0.0007525161475938518, + "loss": 3.2579, + "step": 6290 + }, + { + "epoch": 0.3655458527952653, + "grad_norm": 0.1556018441915512, + "learning_rate": 0.0007516875895032628, + "loss": 3.2521, + "step": 6300 + }, + { + "epoch": 0.36612608430763877, + "grad_norm": 0.13821960985660553, + "learning_rate": 0.0007508581046726032, + "loss": 3.256, + "step": 6310 + }, + { + "epoch": 0.3667063158200122, + "grad_norm": 0.13531796634197235, + "learning_rate": 0.0007500276961561232, + "loss": 3.2476, + "step": 6320 + }, + { + "epoch": 0.36728654733238564, + "grad_norm": 0.13882015645503998, + "learning_rate": 0.0007491963670114737, + "loss": 3.2507, + "step": 6330 + }, + { + "epoch": 0.36786677884475905, + "grad_norm": 0.13630333542823792, + "learning_rate": 0.0007483641202996957, + "loss": 3.2536, + "step": 6340 + }, + { + "epoch": 0.3684470103571325, + "grad_norm": 0.12747836112976074, + "learning_rate": 0.0007475309590852089, + "loss": 3.2559, + "step": 6350 + }, + { + "epoch": 0.3690272418695059, + "grad_norm": 0.15810616314411163, + "learning_rate": 0.0007466968864357998, + "loss": 3.2431, + "step": 6360 + }, + { + "epoch": 0.3696074733818794, + "grad_norm": 0.1615232676267624, + "learning_rate": 0.0007458619054226117, + "loss": 3.2513, + "step": 6370 + }, + { + "epoch": 0.3701877048942528, + "grad_norm": 0.12830163538455963, + "learning_rate": 0.000745026019120132, + "loss": 3.2539, + "step": 6380 + }, + { + "epoch": 0.37076793640662625, + "grad_norm": 0.16822132468223572, + "learning_rate": 0.0007441892306061817, + "loss": 3.2442, + "step": 6390 + }, + { + "epoch": 0.37134816791899966, + "grad_norm": 0.14407211542129517, + "learning_rate": 0.0007433515429619038, + "loss": 3.2533, + "step": 6400 + }, + { + "epoch": 0.3719283994313731, + "grad_norm": 0.13332654535770416, + "learning_rate": 0.0007425129592717516, + "loss": 3.247, + "step": 6410 + }, + { + "epoch": 0.37250863094374653, + "grad_norm": 0.15194551646709442, + "learning_rate": 0.0007416734826234786, + "loss": 3.2469, + "step": 6420 + }, + { + "epoch": 0.37308886245612, + "grad_norm": 0.13437363505363464, + "learning_rate": 0.0007408331161081255, + "loss": 3.246, + "step": 6430 + }, + { + "epoch": 0.3736690939684934, + "grad_norm": 0.1475239098072052, + "learning_rate": 0.00073999186282001, + "loss": 3.2452, + "step": 6440 + }, + { + "epoch": 0.37424932548086687, + "grad_norm": 0.1388455033302307, + "learning_rate": 0.0007391497258567146, + "loss": 3.2484, + "step": 6450 + }, + { + "epoch": 0.3748295569932403, + "grad_norm": 0.14330509305000305, + "learning_rate": 0.000738306708319076, + "loss": 3.2499, + "step": 6460 + }, + { + "epoch": 0.37540978850561374, + "grad_norm": 0.13358131051063538, + "learning_rate": 0.0007374628133111728, + "loss": 3.2416, + "step": 6470 + }, + { + "epoch": 0.3759900200179872, + "grad_norm": 0.15574291348457336, + "learning_rate": 0.0007366180439403152, + "loss": 3.2499, + "step": 6480 + }, + { + "epoch": 0.3765702515303606, + "grad_norm": 0.15618012845516205, + "learning_rate": 0.0007357724033170323, + "loss": 3.2408, + "step": 6490 + }, + { + "epoch": 0.3771504830427341, + "grad_norm": 0.12743791937828064, + "learning_rate": 0.0007349258945550615, + "loss": 3.2478, + "step": 6500 + }, + { + "epoch": 0.3777307145551075, + "grad_norm": 0.1619246006011963, + "learning_rate": 0.000734078520771337, + "loss": 3.2358, + "step": 6510 + }, + { + "epoch": 0.37831094606748095, + "grad_norm": 0.1590278297662735, + "learning_rate": 0.0007332302850859773, + "loss": 3.2425, + "step": 6520 + }, + { + "epoch": 0.37889117757985435, + "grad_norm": 0.16503369808197021, + "learning_rate": 0.0007323811906222755, + "loss": 3.2411, + "step": 6530 + }, + { + "epoch": 0.3794714090922278, + "grad_norm": 0.1441235989332199, + "learning_rate": 0.0007315312405066861, + "loss": 3.245, + "step": 6540 + }, + { + "epoch": 0.3800516406046012, + "grad_norm": 0.16268372535705566, + "learning_rate": 0.0007306804378688147, + "loss": 3.2475, + "step": 6550 + }, + { + "epoch": 0.3806318721169747, + "grad_norm": 0.17126062512397766, + "learning_rate": 0.0007298287858414057, + "loss": 3.2395, + "step": 6560 + }, + { + "epoch": 0.3812121036293481, + "grad_norm": 0.14614002406597137, + "learning_rate": 0.0007289762875603308, + "loss": 3.2465, + "step": 6570 + }, + { + "epoch": 0.38179233514172156, + "grad_norm": 0.1300090104341507, + "learning_rate": 0.0007281229461645782, + "loss": 3.2534, + "step": 6580 + }, + { + "epoch": 0.38237256665409497, + "grad_norm": 0.16573797166347504, + "learning_rate": 0.0007272687647962403, + "loss": 3.2395, + "step": 6590 + }, + { + "epoch": 0.38295279816646843, + "grad_norm": 0.17565912008285522, + "learning_rate": 0.0007264137466005025, + "loss": 3.2412, + "step": 6600 + }, + { + "epoch": 0.38353302967884184, + "grad_norm": 0.14961925148963928, + "learning_rate": 0.0007255578947256312, + "loss": 3.2339, + "step": 6610 + }, + { + "epoch": 0.3841132611912153, + "grad_norm": 0.1480415016412735, + "learning_rate": 0.0007247012123229627, + "loss": 3.2358, + "step": 6620 + }, + { + "epoch": 0.3846934927035887, + "grad_norm": 0.14414618909358978, + "learning_rate": 0.0007238437025468913, + "loss": 3.2367, + "step": 6630 + }, + { + "epoch": 0.3852737242159622, + "grad_norm": 0.14013369381427765, + "learning_rate": 0.0007229853685548578, + "loss": 3.2453, + "step": 6640 + }, + { + "epoch": 0.3858539557283356, + "grad_norm": 0.13546213507652283, + "learning_rate": 0.0007221262135073381, + "loss": 3.2346, + "step": 6650 + }, + { + "epoch": 0.38643418724070905, + "grad_norm": 0.16352064907550812, + "learning_rate": 0.0007212662405678309, + "loss": 3.24, + "step": 6660 + }, + { + "epoch": 0.38701441875308246, + "grad_norm": 0.14588786661624908, + "learning_rate": 0.0007204054529028467, + "loss": 3.2478, + "step": 6670 + }, + { + "epoch": 0.3875946502654559, + "grad_norm": 0.151209756731987, + "learning_rate": 0.0007195438536818957, + "loss": 3.2306, + "step": 6680 + }, + { + "epoch": 0.38817488177782933, + "grad_norm": 0.14419269561767578, + "learning_rate": 0.0007186814460774769, + "loss": 3.2372, + "step": 6690 + }, + { + "epoch": 0.3887551132902028, + "grad_norm": 0.14094632863998413, + "learning_rate": 0.0007178182332650649, + "loss": 3.2323, + "step": 6700 + }, + { + "epoch": 0.38933534480257626, + "grad_norm": 0.1500055193901062, + "learning_rate": 0.0007169542184231001, + "loss": 3.2421, + "step": 6710 + }, + { + "epoch": 0.38991557631494966, + "grad_norm": 0.14962860941886902, + "learning_rate": 0.0007160894047329756, + "loss": 3.2392, + "step": 6720 + }, + { + "epoch": 0.3904958078273231, + "grad_norm": 0.14648567140102386, + "learning_rate": 0.0007152237953790258, + "loss": 3.2358, + "step": 6730 + }, + { + "epoch": 0.39107603933969654, + "grad_norm": 0.14237669110298157, + "learning_rate": 0.0007143573935485153, + "loss": 3.2479, + "step": 6740 + }, + { + "epoch": 0.39165627085207, + "grad_norm": 0.12649035453796387, + "learning_rate": 0.0007134902024316264, + "loss": 3.2412, + "step": 6750 + }, + { + "epoch": 0.3922365023644434, + "grad_norm": 0.13935695588588715, + "learning_rate": 0.0007126222252214473, + "loss": 3.2341, + "step": 6760 + }, + { + "epoch": 0.39281673387681687, + "grad_norm": 0.15621213614940643, + "learning_rate": 0.0007117534651139612, + "loss": 3.2332, + "step": 6770 + }, + { + "epoch": 0.3933969653891903, + "grad_norm": 0.1531130075454712, + "learning_rate": 0.0007108839253080338, + "loss": 3.2251, + "step": 6780 + }, + { + "epoch": 0.39397719690156374, + "grad_norm": 0.14018535614013672, + "learning_rate": 0.0007100136090054019, + "loss": 3.2377, + "step": 6790 + }, + { + "epoch": 0.39455742841393715, + "grad_norm": 0.1587972193956375, + "learning_rate": 0.0007091425194106611, + "loss": 3.2356, + "step": 6800 + }, + { + "epoch": 0.3951376599263106, + "grad_norm": 0.13827280700206757, + "learning_rate": 0.0007082706597312549, + "loss": 3.2345, + "step": 6810 + }, + { + "epoch": 0.395717891438684, + "grad_norm": 0.13535282015800476, + "learning_rate": 0.0007073980331774615, + "loss": 3.2347, + "step": 6820 + }, + { + "epoch": 0.3962981229510575, + "grad_norm": 0.15061281621456146, + "learning_rate": 0.0007065246429623835, + "loss": 3.2345, + "step": 6830 + }, + { + "epoch": 0.3968783544634309, + "grad_norm": 0.1398342251777649, + "learning_rate": 0.0007056504923019352, + "loss": 3.231, + "step": 6840 + }, + { + "epoch": 0.39745858597580436, + "grad_norm": 0.14031299948692322, + "learning_rate": 0.0007047755844148307, + "loss": 3.2212, + "step": 6850 + }, + { + "epoch": 0.39803881748817777, + "grad_norm": 0.1403796672821045, + "learning_rate": 0.0007038999225225729, + "loss": 3.2346, + "step": 6860 + }, + { + "epoch": 0.39861904900055123, + "grad_norm": 0.13849115371704102, + "learning_rate": 0.0007030235098494403, + "loss": 3.2424, + "step": 6870 + }, + { + "epoch": 0.39919928051292464, + "grad_norm": 0.14095619320869446, + "learning_rate": 0.0007021463496224762, + "loss": 3.2299, + "step": 6880 + }, + { + "epoch": 0.3997795120252981, + "grad_norm": 0.14068861305713654, + "learning_rate": 0.0007012684450714765, + "loss": 3.2338, + "step": 6890 + }, + { + "epoch": 0.4003597435376715, + "grad_norm": 0.14077772200107574, + "learning_rate": 0.0007003897994289777, + "loss": 3.2323, + "step": 6900 + }, + { + "epoch": 0.40093997505004497, + "grad_norm": 0.14695732295513153, + "learning_rate": 0.0006995104159302452, + "loss": 3.2343, + "step": 6910 + }, + { + "epoch": 0.4015202065624184, + "grad_norm": 0.14510050415992737, + "learning_rate": 0.0006986302978132611, + "loss": 3.2269, + "step": 6920 + }, + { + "epoch": 0.40210043807479184, + "grad_norm": 0.14484266936779022, + "learning_rate": 0.0006977494483187126, + "loss": 3.2255, + "step": 6930 + }, + { + "epoch": 0.40268066958716525, + "grad_norm": 0.14667174220085144, + "learning_rate": 0.0006968678706899795, + "loss": 3.2269, + "step": 6940 + }, + { + "epoch": 0.4032609010995387, + "grad_norm": 0.15151144564151764, + "learning_rate": 0.0006959855681731233, + "loss": 3.2294, + "step": 6950 + }, + { + "epoch": 0.4038411326119122, + "grad_norm": 0.1448170691728592, + "learning_rate": 0.000695102544016874, + "loss": 3.2299, + "step": 6960 + }, + { + "epoch": 0.4044213641242856, + "grad_norm": 0.12772366404533386, + "learning_rate": 0.0006942188014726194, + "loss": 3.2285, + "step": 6970 + }, + { + "epoch": 0.40500159563665905, + "grad_norm": 0.15471121668815613, + "learning_rate": 0.000693334343794392, + "loss": 3.227, + "step": 6980 + }, + { + "epoch": 0.40558182714903246, + "grad_norm": 0.15615463256835938, + "learning_rate": 0.0006924491742388573, + "loss": 3.228, + "step": 6990 + }, + { + "epoch": 0.4061620586614059, + "grad_norm": 0.12857802212238312, + "learning_rate": 0.0006915632960653029, + "loss": 3.225, + "step": 7000 + }, + { + "epoch": 0.4061620586614059, + "eval_loss": 3.165278911590576, + "eval_runtime": 3.2586, + "eval_samples_per_second": 1328.797, + "eval_steps_per_second": 10.434, + "step": 7000 + }, + { + "epoch": 0.40674229017377933, + "grad_norm": 0.14582431316375732, + "learning_rate": 0.0006906767125356246, + "loss": 3.235, + "step": 7010 + }, + { + "epoch": 0.4073225216861528, + "grad_norm": 0.1576128900051117, + "learning_rate": 0.000689789426914316, + "loss": 3.2256, + "step": 7020 + }, + { + "epoch": 0.4079027531985262, + "grad_norm": 0.16213494539260864, + "learning_rate": 0.0006889014424684557, + "loss": 3.2409, + "step": 7030 + }, + { + "epoch": 0.40848298471089967, + "grad_norm": 0.13359089195728302, + "learning_rate": 0.0006880127624676955, + "loss": 3.2328, + "step": 7040 + }, + { + "epoch": 0.4090632162232731, + "grad_norm": 0.1388418972492218, + "learning_rate": 0.0006871233901842481, + "loss": 3.2191, + "step": 7050 + }, + { + "epoch": 0.40964344773564654, + "grad_norm": 0.1342374086380005, + "learning_rate": 0.0006862333288928755, + "loss": 3.2348, + "step": 7060 + }, + { + "epoch": 0.41022367924801995, + "grad_norm": 0.15014256536960602, + "learning_rate": 0.0006853425818708767, + "loss": 3.2239, + "step": 7070 + }, + { + "epoch": 0.4108039107603934, + "grad_norm": 0.1368698626756668, + "learning_rate": 0.0006844511523980755, + "loss": 3.2385, + "step": 7080 + }, + { + "epoch": 0.4113841422727668, + "grad_norm": 0.15549789369106293, + "learning_rate": 0.0006835590437568084, + "loss": 3.2344, + "step": 7090 + }, + { + "epoch": 0.4119643737851403, + "grad_norm": 0.13888388872146606, + "learning_rate": 0.0006826662592319131, + "loss": 3.2258, + "step": 7100 + }, + { + "epoch": 0.4125446052975137, + "grad_norm": 0.12590526044368744, + "learning_rate": 0.0006817728021107159, + "loss": 3.221, + "step": 7110 + }, + { + "epoch": 0.41312483680988715, + "grad_norm": 0.12910686433315277, + "learning_rate": 0.0006808786756830192, + "loss": 3.2283, + "step": 7120 + }, + { + "epoch": 0.41370506832226056, + "grad_norm": 0.13956746459007263, + "learning_rate": 0.0006799838832410903, + "loss": 3.2201, + "step": 7130 + }, + { + "epoch": 0.414285299834634, + "grad_norm": 0.1646030694246292, + "learning_rate": 0.0006790884280796486, + "loss": 3.2191, + "step": 7140 + }, + { + "epoch": 0.41486553134700743, + "grad_norm": 0.13890932500362396, + "learning_rate": 0.0006781923134958539, + "loss": 3.2257, + "step": 7150 + }, + { + "epoch": 0.4154457628593809, + "grad_norm": 0.13294340670108795, + "learning_rate": 0.0006772955427892939, + "loss": 3.2317, + "step": 7160 + }, + { + "epoch": 0.4160259943717543, + "grad_norm": 0.12860235571861267, + "learning_rate": 0.0006763981192619726, + "loss": 3.2154, + "step": 7170 + }, + { + "epoch": 0.41660622588412777, + "grad_norm": 0.14738686382770538, + "learning_rate": 0.0006755000462182972, + "loss": 3.2332, + "step": 7180 + }, + { + "epoch": 0.41718645739650123, + "grad_norm": 0.13093027472496033, + "learning_rate": 0.0006746013269650666, + "loss": 3.2351, + "step": 7190 + }, + { + "epoch": 0.41776668890887464, + "grad_norm": 0.13175268471240997, + "learning_rate": 0.0006737019648114593, + "loss": 3.2294, + "step": 7200 + }, + { + "epoch": 0.4183469204212481, + "grad_norm": 0.13433928787708282, + "learning_rate": 0.000672801963069021, + "loss": 3.2273, + "step": 7210 + }, + { + "epoch": 0.4189271519336215, + "grad_norm": 0.14208847284317017, + "learning_rate": 0.0006719013250516526, + "loss": 3.2272, + "step": 7220 + }, + { + "epoch": 0.419507383445995, + "grad_norm": 0.13174398243427277, + "learning_rate": 0.0006710000540755973, + "loss": 3.2153, + "step": 7230 + }, + { + "epoch": 0.4200876149583684, + "grad_norm": 0.14360399544239044, + "learning_rate": 0.0006700981534594296, + "loss": 3.223, + "step": 7240 + }, + { + "epoch": 0.42066784647074185, + "grad_norm": 0.1482868790626526, + "learning_rate": 0.0006691956265240417, + "loss": 3.218, + "step": 7250 + }, + { + "epoch": 0.42124807798311525, + "grad_norm": 0.13119544088840485, + "learning_rate": 0.0006682924765926323, + "loss": 3.2294, + "step": 7260 + }, + { + "epoch": 0.4218283094954887, + "grad_norm": 0.13039755821228027, + "learning_rate": 0.0006673887069906945, + "loss": 3.227, + "step": 7270 + }, + { + "epoch": 0.4224085410078621, + "grad_norm": 0.12415551394224167, + "learning_rate": 0.0006664843210460025, + "loss": 3.2142, + "step": 7280 + }, + { + "epoch": 0.4229887725202356, + "grad_norm": 0.13810203969478607, + "learning_rate": 0.0006655793220885997, + "loss": 3.2275, + "step": 7290 + }, + { + "epoch": 0.423569004032609, + "grad_norm": 0.13545836508274078, + "learning_rate": 0.0006646737134507874, + "loss": 3.2113, + "step": 7300 + }, + { + "epoch": 0.42414923554498246, + "grad_norm": 0.12676437199115753, + "learning_rate": 0.0006637674984671113, + "loss": 3.2183, + "step": 7310 + }, + { + "epoch": 0.42472946705735587, + "grad_norm": 0.12899167835712433, + "learning_rate": 0.0006628606804743502, + "loss": 3.2237, + "step": 7320 + }, + { + "epoch": 0.42530969856972933, + "grad_norm": 0.13533097505569458, + "learning_rate": 0.0006619532628115027, + "loss": 3.2025, + "step": 7330 + }, + { + "epoch": 0.42588993008210274, + "grad_norm": 0.12174040824174881, + "learning_rate": 0.0006610452488197758, + "loss": 3.2141, + "step": 7340 + }, + { + "epoch": 0.4264701615944762, + "grad_norm": 0.14033706486225128, + "learning_rate": 0.000660136641842572, + "loss": 3.2309, + "step": 7350 + }, + { + "epoch": 0.4270503931068496, + "grad_norm": 0.1348879039287567, + "learning_rate": 0.0006592274452254775, + "loss": 3.2207, + "step": 7360 + }, + { + "epoch": 0.4276306246192231, + "grad_norm": 0.13258253037929535, + "learning_rate": 0.0006583176623162494, + "loss": 3.2273, + "step": 7370 + }, + { + "epoch": 0.4282108561315965, + "grad_norm": 0.14150184392929077, + "learning_rate": 0.0006574072964648038, + "loss": 3.2205, + "step": 7380 + }, + { + "epoch": 0.42879108764396995, + "grad_norm": 0.1396942138671875, + "learning_rate": 0.0006564963510232031, + "loss": 3.2179, + "step": 7390 + }, + { + "epoch": 0.42937131915634336, + "grad_norm": 0.13543544709682465, + "learning_rate": 0.0006555848293456438, + "loss": 3.217, + "step": 7400 + }, + { + "epoch": 0.4299515506687168, + "grad_norm": 0.1295756995677948, + "learning_rate": 0.0006546727347884441, + "loss": 3.2206, + "step": 7410 + }, + { + "epoch": 0.43053178218109023, + "grad_norm": 0.15362213551998138, + "learning_rate": 0.000653760070710032, + "loss": 3.2212, + "step": 7420 + }, + { + "epoch": 0.4311120136934637, + "grad_norm": 0.1392851173877716, + "learning_rate": 0.0006528468404709319, + "loss": 3.2175, + "step": 7430 + }, + { + "epoch": 0.43169224520583716, + "grad_norm": 0.12892089784145355, + "learning_rate": 0.0006519330474337534, + "loss": 3.218, + "step": 7440 + }, + { + "epoch": 0.43227247671821056, + "grad_norm": 0.1390940397977829, + "learning_rate": 0.0006510186949631782, + "loss": 3.2171, + "step": 7450 + }, + { + "epoch": 0.432852708230584, + "grad_norm": 0.13330115377902985, + "learning_rate": 0.0006501037864259478, + "loss": 3.2293, + "step": 7460 + }, + { + "epoch": 0.43343293974295743, + "grad_norm": 0.1433860808610916, + "learning_rate": 0.0006491883251908513, + "loss": 3.2099, + "step": 7470 + }, + { + "epoch": 0.4340131712553309, + "grad_norm": 0.13837961852550507, + "learning_rate": 0.000648272314628713, + "loss": 3.2139, + "step": 7480 + }, + { + "epoch": 0.4345934027677043, + "grad_norm": 0.13204647600650787, + "learning_rate": 0.0006473557581123797, + "loss": 3.2267, + "step": 7490 + }, + { + "epoch": 0.43517363428007777, + "grad_norm": 0.13703928887844086, + "learning_rate": 0.0006464386590167082, + "loss": 3.2131, + "step": 7500 + }, + { + "epoch": 0.4357538657924512, + "grad_norm": 0.13702446222305298, + "learning_rate": 0.0006455210207185539, + "loss": 3.2238, + "step": 7510 + }, + { + "epoch": 0.43633409730482464, + "grad_norm": 0.13354191184043884, + "learning_rate": 0.0006446028465967568, + "loss": 3.2131, + "step": 7520 + }, + { + "epoch": 0.43691432881719805, + "grad_norm": 0.14659354090690613, + "learning_rate": 0.0006436841400321304, + "loss": 3.2243, + "step": 7530 + }, + { + "epoch": 0.4374945603295715, + "grad_norm": 0.13794207572937012, + "learning_rate": 0.0006427649044074484, + "loss": 3.2229, + "step": 7540 + }, + { + "epoch": 0.4380747918419449, + "grad_norm": 0.146932452917099, + "learning_rate": 0.0006418451431074329, + "loss": 3.2117, + "step": 7550 + }, + { + "epoch": 0.4386550233543184, + "grad_norm": 0.13247311115264893, + "learning_rate": 0.0006409248595187409, + "loss": 3.223, + "step": 7560 + }, + { + "epoch": 0.4392352548666918, + "grad_norm": 0.13509587943553925, + "learning_rate": 0.0006400040570299535, + "loss": 3.2165, + "step": 7570 + }, + { + "epoch": 0.43981548637906526, + "grad_norm": 0.1419142633676529, + "learning_rate": 0.0006390827390315614, + "loss": 3.2125, + "step": 7580 + }, + { + "epoch": 0.44039571789143866, + "grad_norm": 0.14198483526706696, + "learning_rate": 0.0006381609089159545, + "loss": 3.2188, + "step": 7590 + }, + { + "epoch": 0.44097594940381213, + "grad_norm": 0.12889783084392548, + "learning_rate": 0.0006372385700774075, + "loss": 3.2026, + "step": 7600 + }, + { + "epoch": 0.44155618091618554, + "grad_norm": 0.13903778791427612, + "learning_rate": 0.0006363157259120689, + "loss": 3.2068, + "step": 7610 + }, + { + "epoch": 0.442136412428559, + "grad_norm": 0.13285143673419952, + "learning_rate": 0.0006353923798179472, + "loss": 3.2067, + "step": 7620 + }, + { + "epoch": 0.4427166439409324, + "grad_norm": 0.13808636367321014, + "learning_rate": 0.0006344685351948998, + "loss": 3.2131, + "step": 7630 + }, + { + "epoch": 0.44329687545330587, + "grad_norm": 0.12950055301189423, + "learning_rate": 0.0006335441954446191, + "loss": 3.2128, + "step": 7640 + }, + { + "epoch": 0.4438771069656793, + "grad_norm": 0.1326564997434616, + "learning_rate": 0.0006326193639706214, + "loss": 3.2228, + "step": 7650 + }, + { + "epoch": 0.44445733847805274, + "grad_norm": 0.12820059061050415, + "learning_rate": 0.0006316940441782325, + "loss": 3.2105, + "step": 7660 + }, + { + "epoch": 0.4450375699904262, + "grad_norm": 0.14834731817245483, + "learning_rate": 0.000630768239474577, + "loss": 3.2162, + "step": 7670 + }, + { + "epoch": 0.4456178015027996, + "grad_norm": 0.1407567858695984, + "learning_rate": 0.0006298419532685649, + "loss": 3.2075, + "step": 7680 + }, + { + "epoch": 0.4461980330151731, + "grad_norm": 0.18233934044837952, + "learning_rate": 0.0006289151889708788, + "loss": 3.2209, + "step": 7690 + }, + { + "epoch": 0.4467782645275465, + "grad_norm": 0.13413317501544952, + "learning_rate": 0.0006279879499939625, + "loss": 3.2062, + "step": 7700 + }, + { + "epoch": 0.44735849603991995, + "grad_norm": 0.14402654767036438, + "learning_rate": 0.0006270602397520065, + "loss": 3.2056, + "step": 7710 + }, + { + "epoch": 0.44793872755229336, + "grad_norm": 0.14101460576057434, + "learning_rate": 0.0006261320616609372, + "loss": 3.2135, + "step": 7720 + }, + { + "epoch": 0.4485189590646668, + "grad_norm": 0.15453755855560303, + "learning_rate": 0.0006252034191384035, + "loss": 3.2165, + "step": 7730 + }, + { + "epoch": 0.44909919057704023, + "grad_norm": 0.13480693101882935, + "learning_rate": 0.0006242743156037646, + "loss": 3.2139, + "step": 7740 + }, + { + "epoch": 0.4496794220894137, + "grad_norm": 0.13115455210208893, + "learning_rate": 0.0006233447544780772, + "loss": 3.2135, + "step": 7750 + }, + { + "epoch": 0.4502596536017871, + "grad_norm": 0.14997157454490662, + "learning_rate": 0.0006224147391840824, + "loss": 3.1969, + "step": 7760 + }, + { + "epoch": 0.45083988511416057, + "grad_norm": 0.13748539984226227, + "learning_rate": 0.0006214842731461942, + "loss": 3.2268, + "step": 7770 + }, + { + "epoch": 0.451420116626534, + "grad_norm": 0.12151113897562027, + "learning_rate": 0.0006205533597904857, + "loss": 3.2063, + "step": 7780 + }, + { + "epoch": 0.45200034813890744, + "grad_norm": 0.13322634994983673, + "learning_rate": 0.0006196220025446778, + "loss": 3.2066, + "step": 7790 + }, + { + "epoch": 0.45258057965128085, + "grad_norm": 0.1378646045923233, + "learning_rate": 0.0006186902048381252, + "loss": 3.1976, + "step": 7800 + }, + { + "epoch": 0.4531608111636543, + "grad_norm": 0.14197058975696564, + "learning_rate": 0.0006177579701018048, + "loss": 3.2056, + "step": 7810 + }, + { + "epoch": 0.4537410426760277, + "grad_norm": 0.13627830147743225, + "learning_rate": 0.0006168253017683025, + "loss": 3.2052, + "step": 7820 + }, + { + "epoch": 0.4543212741884012, + "grad_norm": 0.15169207751750946, + "learning_rate": 0.0006158922032718006, + "loss": 3.211, + "step": 7830 + }, + { + "epoch": 0.4549015057007746, + "grad_norm": 0.1337585300207138, + "learning_rate": 0.0006149586780480659, + "loss": 3.2157, + "step": 7840 + }, + { + "epoch": 0.45548173721314805, + "grad_norm": 0.1394774168729782, + "learning_rate": 0.0006140247295344359, + "loss": 3.2174, + "step": 7850 + }, + { + "epoch": 0.45606196872552146, + "grad_norm": 0.12764208018779755, + "learning_rate": 0.0006130903611698067, + "loss": 3.2102, + "step": 7860 + }, + { + "epoch": 0.4566422002378949, + "grad_norm": 0.13290008902549744, + "learning_rate": 0.0006121555763946207, + "loss": 3.2041, + "step": 7870 + }, + { + "epoch": 0.45722243175026833, + "grad_norm": 0.14185406267642975, + "learning_rate": 0.0006112203786508533, + "loss": 3.2152, + "step": 7880 + }, + { + "epoch": 0.4578026632626418, + "grad_norm": 0.12418293952941895, + "learning_rate": 0.0006102847713820006, + "loss": 3.2028, + "step": 7890 + }, + { + "epoch": 0.4583828947750152, + "grad_norm": 0.138755664229393, + "learning_rate": 0.0006093487580330666, + "loss": 3.2043, + "step": 7900 + }, + { + "epoch": 0.45896312628738867, + "grad_norm": 0.13823552429676056, + "learning_rate": 0.0006084123420505503, + "loss": 3.2043, + "step": 7910 + }, + { + "epoch": 0.45954335779976213, + "grad_norm": 0.1277630627155304, + "learning_rate": 0.0006074755268824335, + "loss": 3.2068, + "step": 7920 + }, + { + "epoch": 0.46012358931213554, + "grad_norm": 0.14666809141635895, + "learning_rate": 0.0006065383159781682, + "loss": 3.2156, + "step": 7930 + }, + { + "epoch": 0.460703820824509, + "grad_norm": 0.12684592604637146, + "learning_rate": 0.0006056007127886626, + "loss": 3.2059, + "step": 7940 + }, + { + "epoch": 0.4612840523368824, + "grad_norm": 0.12497347593307495, + "learning_rate": 0.0006046627207662702, + "loss": 3.2043, + "step": 7950 + }, + { + "epoch": 0.4618642838492559, + "grad_norm": 0.12166955322027206, + "learning_rate": 0.0006037243433647757, + "loss": 3.2039, + "step": 7960 + }, + { + "epoch": 0.4624445153616293, + "grad_norm": 0.12836964428424835, + "learning_rate": 0.000602785584039383, + "loss": 3.1986, + "step": 7970 + }, + { + "epoch": 0.46302474687400275, + "grad_norm": 0.1306101679801941, + "learning_rate": 0.0006018464462467023, + "loss": 3.2028, + "step": 7980 + }, + { + "epoch": 0.46360497838637615, + "grad_norm": 0.13166449964046478, + "learning_rate": 0.0006009069334447374, + "loss": 3.2017, + "step": 7990 + }, + { + "epoch": 0.4641852098987496, + "grad_norm": 0.1289730966091156, + "learning_rate": 0.0005999670490928729, + "loss": 3.2051, + "step": 8000 + }, + { + "epoch": 0.4641852098987496, + "eval_loss": 3.13897442817688, + "eval_runtime": 3.2533, + "eval_samples_per_second": 1330.958, + "eval_steps_per_second": 10.451, + "step": 8000 + }, + { + "epoch": 0.464765441411123, + "grad_norm": 0.12794232368469238, + "learning_rate": 0.0005990267966518613, + "loss": 3.2052, + "step": 8010 + }, + { + "epoch": 0.4653456729234965, + "grad_norm": 0.13217690587043762, + "learning_rate": 0.0005980861795838108, + "loss": 3.2057, + "step": 8020 + }, + { + "epoch": 0.4659259044358699, + "grad_norm": 0.14063167572021484, + "learning_rate": 0.0005971452013521717, + "loss": 3.202, + "step": 8030 + }, + { + "epoch": 0.46650613594824336, + "grad_norm": 0.1315622478723526, + "learning_rate": 0.0005962038654217244, + "loss": 3.202, + "step": 8040 + }, + { + "epoch": 0.46708636746061677, + "grad_norm": 0.14890199899673462, + "learning_rate": 0.0005952621752585667, + "loss": 3.2069, + "step": 8050 + }, + { + "epoch": 0.46766659897299023, + "grad_norm": 0.13835932314395905, + "learning_rate": 0.0005943201343301005, + "loss": 3.2079, + "step": 8060 + }, + { + "epoch": 0.46824683048536364, + "grad_norm": 0.13147889077663422, + "learning_rate": 0.0005933777461050187, + "loss": 3.2082, + "step": 8070 + }, + { + "epoch": 0.4688270619977371, + "grad_norm": 0.12799794971942902, + "learning_rate": 0.0005924350140532939, + "loss": 3.1974, + "step": 8080 + }, + { + "epoch": 0.4694072935101105, + "grad_norm": 0.12934145331382751, + "learning_rate": 0.000591491941646164, + "loss": 3.2048, + "step": 8090 + }, + { + "epoch": 0.469987525022484, + "grad_norm": 0.13451933860778809, + "learning_rate": 0.0005905485323561207, + "loss": 3.1955, + "step": 8100 + }, + { + "epoch": 0.4705677565348574, + "grad_norm": 0.15659664571285248, + "learning_rate": 0.0005896047896568955, + "loss": 3.1993, + "step": 8110 + }, + { + "epoch": 0.47114798804723085, + "grad_norm": 0.14385788142681122, + "learning_rate": 0.0005886607170234482, + "loss": 3.2043, + "step": 8120 + }, + { + "epoch": 0.47172821955960426, + "grad_norm": 0.13023056089878082, + "learning_rate": 0.0005877163179319527, + "loss": 3.2048, + "step": 8130 + }, + { + "epoch": 0.4723084510719777, + "grad_norm": 0.1275002360343933, + "learning_rate": 0.0005867715958597859, + "loss": 3.2101, + "step": 8140 + }, + { + "epoch": 0.4728886825843512, + "grad_norm": 0.13934627175331116, + "learning_rate": 0.000585826554285513, + "loss": 3.204, + "step": 8150 + }, + { + "epoch": 0.4734689140967246, + "grad_norm": 0.1253582239151001, + "learning_rate": 0.0005848811966888763, + "loss": 3.2038, + "step": 8160 + }, + { + "epoch": 0.47404914560909805, + "grad_norm": 0.13219626247882843, + "learning_rate": 0.0005839355265507817, + "loss": 3.2011, + "step": 8170 + }, + { + "epoch": 0.47462937712147146, + "grad_norm": 0.13276910781860352, + "learning_rate": 0.0005829895473532852, + "loss": 3.2011, + "step": 8180 + }, + { + "epoch": 0.4752096086338449, + "grad_norm": 0.146236851811409, + "learning_rate": 0.0005820432625795819, + "loss": 3.1997, + "step": 8190 + }, + { + "epoch": 0.47578984014621833, + "grad_norm": 0.13150210678577423, + "learning_rate": 0.0005810966757139909, + "loss": 3.1945, + "step": 8200 + }, + { + "epoch": 0.4763700716585918, + "grad_norm": 0.14235766232013702, + "learning_rate": 0.0005801497902419444, + "loss": 3.2039, + "step": 8210 + }, + { + "epoch": 0.4769503031709652, + "grad_norm": 0.13625676929950714, + "learning_rate": 0.0005792026096499741, + "loss": 3.1921, + "step": 8220 + }, + { + "epoch": 0.47753053468333867, + "grad_norm": 0.12872271239757538, + "learning_rate": 0.0005782551374256981, + "loss": 3.1912, + "step": 8230 + }, + { + "epoch": 0.4781107661957121, + "grad_norm": 0.14330317080020905, + "learning_rate": 0.0005773073770578081, + "loss": 3.1958, + "step": 8240 + }, + { + "epoch": 0.47869099770808554, + "grad_norm": 0.128121480345726, + "learning_rate": 0.0005763593320360575, + "loss": 3.1934, + "step": 8250 + }, + { + "epoch": 0.47927122922045895, + "grad_norm": 0.13301797211170197, + "learning_rate": 0.000575411005851247, + "loss": 3.1976, + "step": 8260 + }, + { + "epoch": 0.4798514607328324, + "grad_norm": 0.12738023698329926, + "learning_rate": 0.0005744624019952131, + "loss": 3.1995, + "step": 8270 + }, + { + "epoch": 0.4804316922452058, + "grad_norm": 0.13468343019485474, + "learning_rate": 0.0005735135239608146, + "loss": 3.2016, + "step": 8280 + }, + { + "epoch": 0.4810119237575793, + "grad_norm": 0.14049942791461945, + "learning_rate": 0.0005725643752419198, + "loss": 3.2005, + "step": 8290 + }, + { + "epoch": 0.4815921552699527, + "grad_norm": 0.12929829955101013, + "learning_rate": 0.0005716149593333938, + "loss": 3.2072, + "step": 8300 + }, + { + "epoch": 0.48217238678232616, + "grad_norm": 0.1476507931947708, + "learning_rate": 0.0005706652797310851, + "loss": 3.2013, + "step": 8310 + }, + { + "epoch": 0.48275261829469956, + "grad_norm": 0.15121173858642578, + "learning_rate": 0.000569715339931814, + "loss": 3.1976, + "step": 8320 + }, + { + "epoch": 0.48333284980707303, + "grad_norm": 0.12322133034467697, + "learning_rate": 0.000568765143433358, + "loss": 3.1905, + "step": 8330 + }, + { + "epoch": 0.48391308131944644, + "grad_norm": 0.1308000385761261, + "learning_rate": 0.0005678146937344402, + "loss": 3.1951, + "step": 8340 + }, + { + "epoch": 0.4844933128318199, + "grad_norm": 0.14018505811691284, + "learning_rate": 0.000566863994334716, + "loss": 3.1983, + "step": 8350 + }, + { + "epoch": 0.4850735443441933, + "grad_norm": 0.12124442309141159, + "learning_rate": 0.0005659130487347602, + "loss": 3.1969, + "step": 8360 + }, + { + "epoch": 0.48565377585656677, + "grad_norm": 0.13091051578521729, + "learning_rate": 0.000564961860436054, + "loss": 3.1932, + "step": 8370 + }, + { + "epoch": 0.4862340073689402, + "grad_norm": 0.12319710105657578, + "learning_rate": 0.0005640104329409727, + "loss": 3.1944, + "step": 8380 + }, + { + "epoch": 0.48681423888131364, + "grad_norm": 0.12845876812934875, + "learning_rate": 0.0005630587697527716, + "loss": 3.1929, + "step": 8390 + }, + { + "epoch": 0.4873944703936871, + "grad_norm": 0.14527294039726257, + "learning_rate": 0.0005621068743755743, + "loss": 3.1932, + "step": 8400 + }, + { + "epoch": 0.4879747019060605, + "grad_norm": 0.1430954933166504, + "learning_rate": 0.0005611547503143595, + "loss": 3.1963, + "step": 8410 + }, + { + "epoch": 0.488554933418434, + "grad_norm": 0.12142278254032135, + "learning_rate": 0.0005602024010749475, + "loss": 3.1912, + "step": 8420 + }, + { + "epoch": 0.4891351649308074, + "grad_norm": 0.12531523406505585, + "learning_rate": 0.0005592498301639884, + "loss": 3.1936, + "step": 8430 + }, + { + "epoch": 0.48971539644318085, + "grad_norm": 0.11823923885822296, + "learning_rate": 0.0005582970410889476, + "loss": 3.2031, + "step": 8440 + }, + { + "epoch": 0.49029562795555426, + "grad_norm": 0.1265026479959488, + "learning_rate": 0.0005573440373580946, + "loss": 3.1863, + "step": 8450 + }, + { + "epoch": 0.4908758594679277, + "grad_norm": 0.1225002259016037, + "learning_rate": 0.0005563908224804887, + "loss": 3.1978, + "step": 8460 + }, + { + "epoch": 0.49145609098030113, + "grad_norm": 0.13115541636943817, + "learning_rate": 0.000555437399965967, + "loss": 3.1945, + "step": 8470 + }, + { + "epoch": 0.4920363224926746, + "grad_norm": 0.11067093908786774, + "learning_rate": 0.0005544837733251313, + "loss": 3.195, + "step": 8480 + }, + { + "epoch": 0.492616554005048, + "grad_norm": 0.13542482256889343, + "learning_rate": 0.0005535299460693346, + "loss": 3.1976, + "step": 8490 + }, + { + "epoch": 0.49319678551742147, + "grad_norm": 0.12914744019508362, + "learning_rate": 0.000552575921710669, + "loss": 3.1817, + "step": 8500 + }, + { + "epoch": 0.4937770170297949, + "grad_norm": 0.14814750850200653, + "learning_rate": 0.0005516217037619517, + "loss": 3.1952, + "step": 8510 + }, + { + "epoch": 0.49435724854216834, + "grad_norm": 0.13739536702632904, + "learning_rate": 0.0005506672957367135, + "loss": 3.1946, + "step": 8520 + }, + { + "epoch": 0.49493748005454175, + "grad_norm": 0.12713497877120972, + "learning_rate": 0.0005497127011491846, + "loss": 3.193, + "step": 8530 + }, + { + "epoch": 0.4955177115669152, + "grad_norm": 0.13317294418811798, + "learning_rate": 0.0005487579235142823, + "loss": 3.1951, + "step": 8540 + }, + { + "epoch": 0.4960979430792886, + "grad_norm": 0.13274219632148743, + "learning_rate": 0.000547802966347598, + "loss": 3.1799, + "step": 8550 + }, + { + "epoch": 0.4966781745916621, + "grad_norm": 0.137456014752388, + "learning_rate": 0.0005468478331653838, + "loss": 3.1907, + "step": 8560 + }, + { + "epoch": 0.4972584061040355, + "grad_norm": 0.12658333778381348, + "learning_rate": 0.0005458925274845402, + "loss": 3.1906, + "step": 8570 + }, + { + "epoch": 0.49783863761640895, + "grad_norm": 0.15250617265701294, + "learning_rate": 0.000544937052822603, + "loss": 3.1905, + "step": 8580 + }, + { + "epoch": 0.49841886912878236, + "grad_norm": 0.12137165665626526, + "learning_rate": 0.0005439814126977296, + "loss": 3.195, + "step": 8590 + }, + { + "epoch": 0.4989991006411558, + "grad_norm": 0.12580706179141998, + "learning_rate": 0.0005430256106286874, + "loss": 3.1851, + "step": 8600 + }, + { + "epoch": 0.49957933215352923, + "grad_norm": 0.1316945105791092, + "learning_rate": 0.0005420696501348397, + "loss": 3.1827, + "step": 8610 + }, + { + "epoch": 0.5001595636659026, + "grad_norm": 0.12646295130252838, + "learning_rate": 0.0005411135347361329, + "loss": 3.1911, + "step": 8620 + }, + { + "epoch": 0.5007397951782762, + "grad_norm": 0.1217370554804802, + "learning_rate": 0.0005401572679530844, + "loss": 3.1963, + "step": 8630 + }, + { + "epoch": 0.5013200266906496, + "grad_norm": 0.13218845427036285, + "learning_rate": 0.0005392008533067684, + "loss": 3.1959, + "step": 8640 + }, + { + "epoch": 0.501900258203023, + "grad_norm": 0.13100461661815643, + "learning_rate": 0.000538244294318804, + "loss": 3.1984, + "step": 8650 + }, + { + "epoch": 0.5024804897153965, + "grad_norm": 0.12846529483795166, + "learning_rate": 0.0005372875945113417, + "loss": 3.1873, + "step": 8660 + }, + { + "epoch": 0.5030607212277699, + "grad_norm": 0.14111390709877014, + "learning_rate": 0.0005363307574070503, + "loss": 3.1974, + "step": 8670 + }, + { + "epoch": 0.5036409527401433, + "grad_norm": 0.1287715584039688, + "learning_rate": 0.0005353737865291039, + "loss": 3.1913, + "step": 8680 + }, + { + "epoch": 0.5042211842525167, + "grad_norm": 0.1581788808107376, + "learning_rate": 0.0005344166854011702, + "loss": 3.1833, + "step": 8690 + }, + { + "epoch": 0.5048014157648902, + "grad_norm": 0.1236489862203598, + "learning_rate": 0.0005334594575473952, + "loss": 3.1933, + "step": 8700 + }, + { + "epoch": 0.5053816472772636, + "grad_norm": 0.12864017486572266, + "learning_rate": 0.0005325021064923924, + "loss": 3.1913, + "step": 8710 + }, + { + "epoch": 0.505961878789637, + "grad_norm": 0.12411046773195267, + "learning_rate": 0.0005315446357612288, + "loss": 3.1871, + "step": 8720 + }, + { + "epoch": 0.5065421103020105, + "grad_norm": 0.12000168114900589, + "learning_rate": 0.0005305870488794117, + "loss": 3.1815, + "step": 8730 + }, + { + "epoch": 0.507122341814384, + "grad_norm": 0.1342875063419342, + "learning_rate": 0.0005296293493728764, + "loss": 3.187, + "step": 8740 + }, + { + "epoch": 0.5077025733267574, + "grad_norm": 0.12101846933364868, + "learning_rate": 0.0005286715407679729, + "loss": 3.1871, + "step": 8750 + }, + { + "epoch": 0.5082828048391308, + "grad_norm": 0.13273586332798004, + "learning_rate": 0.0005277136265914528, + "loss": 3.193, + "step": 8760 + }, + { + "epoch": 0.5088630363515042, + "grad_norm": 0.12270906567573547, + "learning_rate": 0.0005267556103704562, + "loss": 3.178, + "step": 8770 + }, + { + "epoch": 0.5094432678638777, + "grad_norm": 0.1259816586971283, + "learning_rate": 0.0005257974956324994, + "loss": 3.187, + "step": 8780 + }, + { + "epoch": 0.5100234993762511, + "grad_norm": 0.12720678746700287, + "learning_rate": 0.0005248392859054612, + "loss": 3.1837, + "step": 8790 + }, + { + "epoch": 0.5106037308886245, + "grad_norm": 0.13317066431045532, + "learning_rate": 0.0005238809847175704, + "loss": 3.1873, + "step": 8800 + }, + { + "epoch": 0.511183962400998, + "grad_norm": 0.12688656151294708, + "learning_rate": 0.000522922595597392, + "loss": 3.1829, + "step": 8810 + }, + { + "epoch": 0.5117641939133715, + "grad_norm": 0.1185089647769928, + "learning_rate": 0.0005219641220738154, + "loss": 3.1864, + "step": 8820 + }, + { + "epoch": 0.5123444254257449, + "grad_norm": 0.12334717810153961, + "learning_rate": 0.0005210055676760403, + "loss": 3.1924, + "step": 8830 + }, + { + "epoch": 0.5129246569381183, + "grad_norm": 0.130776509642601, + "learning_rate": 0.0005200469359335645, + "loss": 3.1864, + "step": 8840 + }, + { + "epoch": 0.5135048884504917, + "grad_norm": 0.1395205855369568, + "learning_rate": 0.0005190882303761707, + "loss": 3.1894, + "step": 8850 + }, + { + "epoch": 0.5140851199628652, + "grad_norm": 0.13597504794597626, + "learning_rate": 0.000518129454533913, + "loss": 3.197, + "step": 8860 + }, + { + "epoch": 0.5146653514752386, + "grad_norm": 0.1248147115111351, + "learning_rate": 0.0005171706119371045, + "loss": 3.1865, + "step": 8870 + }, + { + "epoch": 0.515245582987612, + "grad_norm": 0.12766875326633453, + "learning_rate": 0.0005162117061163039, + "loss": 3.1843, + "step": 8880 + }, + { + "epoch": 0.5158258144999855, + "grad_norm": 0.125563845038414, + "learning_rate": 0.0005152527406023033, + "loss": 3.1827, + "step": 8890 + }, + { + "epoch": 0.516406046012359, + "grad_norm": 0.12935802340507507, + "learning_rate": 0.0005142937189261138, + "loss": 3.1825, + "step": 8900 + }, + { + "epoch": 0.5169862775247324, + "grad_norm": 0.12293805927038193, + "learning_rate": 0.0005133346446189541, + "loss": 3.1909, + "step": 8910 + }, + { + "epoch": 0.5175665090371058, + "grad_norm": 0.12657864391803741, + "learning_rate": 0.0005123755212122359, + "loss": 3.172, + "step": 8920 + }, + { + "epoch": 0.5181467405494793, + "grad_norm": 0.12287136912345886, + "learning_rate": 0.0005114163522375522, + "loss": 3.1968, + "step": 8930 + }, + { + "epoch": 0.5187269720618527, + "grad_norm": 0.13364015519618988, + "learning_rate": 0.0005104571412266636, + "loss": 3.1799, + "step": 8940 + }, + { + "epoch": 0.5193072035742261, + "grad_norm": 0.14035052061080933, + "learning_rate": 0.0005094978917114853, + "loss": 3.1776, + "step": 8950 + }, + { + "epoch": 0.5198874350865995, + "grad_norm": 0.12295843660831451, + "learning_rate": 0.000508538607224075, + "loss": 3.1805, + "step": 8960 + }, + { + "epoch": 0.520467666598973, + "grad_norm": 0.11810554563999176, + "learning_rate": 0.0005075792912966184, + "loss": 3.1785, + "step": 8970 + }, + { + "epoch": 0.5210478981113464, + "grad_norm": 0.14744389057159424, + "learning_rate": 0.0005066199474614173, + "loss": 3.1906, + "step": 8980 + }, + { + "epoch": 0.5216281296237198, + "grad_norm": 0.13044193387031555, + "learning_rate": 0.000505660579250876, + "loss": 3.1766, + "step": 8990 + }, + { + "epoch": 0.5222083611360933, + "grad_norm": 0.1182679831981659, + "learning_rate": 0.000504701190197489, + "loss": 3.1816, + "step": 9000 + }, + { + "epoch": 0.5222083611360933, + "eval_loss": 3.1160783767700195, + "eval_runtime": 3.2455, + "eval_samples_per_second": 1334.151, + "eval_steps_per_second": 10.476, + "step": 9000 + }, + { + "epoch": 0.5227885926484668, + "grad_norm": 0.13336721062660217, + "learning_rate": 0.0005037417838338272, + "loss": 3.1825, + "step": 9010 + }, + { + "epoch": 0.5233688241608402, + "grad_norm": 0.11967909336090088, + "learning_rate": 0.0005027823636925254, + "loss": 3.1839, + "step": 9020 + }, + { + "epoch": 0.5239490556732136, + "grad_norm": 0.1373438537120819, + "learning_rate": 0.0005018229333062689, + "loss": 3.1859, + "step": 9030 + }, + { + "epoch": 0.524529287185587, + "grad_norm": 0.12351592630147934, + "learning_rate": 0.0005008634962077811, + "loss": 3.1889, + "step": 9040 + }, + { + "epoch": 0.5251095186979605, + "grad_norm": 0.13033995032310486, + "learning_rate": 0.0004999040559298097, + "loss": 3.1879, + "step": 9050 + }, + { + "epoch": 0.5256897502103339, + "grad_norm": 0.11891571432352066, + "learning_rate": 0.0004989446160051145, + "loss": 3.1905, + "step": 9060 + }, + { + "epoch": 0.5262699817227073, + "grad_norm": 0.12579171359539032, + "learning_rate": 0.0004979851799664539, + "loss": 3.1708, + "step": 9070 + }, + { + "epoch": 0.5268502132350807, + "grad_norm": 0.12359123677015305, + "learning_rate": 0.0004970257513465714, + "loss": 3.1824, + "step": 9080 + }, + { + "epoch": 0.5274304447474543, + "grad_norm": 0.1159052848815918, + "learning_rate": 0.0004960663336781842, + "loss": 3.18, + "step": 9090 + }, + { + "epoch": 0.5280106762598277, + "grad_norm": 0.11939360946416855, + "learning_rate": 0.0004951069304939684, + "loss": 3.1806, + "step": 9100 + }, + { + "epoch": 0.5285909077722011, + "grad_norm": 0.12974359095096588, + "learning_rate": 0.0004941475453265471, + "loss": 3.1774, + "step": 9110 + }, + { + "epoch": 0.5291711392845746, + "grad_norm": 0.13163182139396667, + "learning_rate": 0.0004931881817084771, + "loss": 3.1888, + "step": 9120 + }, + { + "epoch": 0.529751370796948, + "grad_norm": 0.1382340043783188, + "learning_rate": 0.0004922288431722355, + "loss": 3.1814, + "step": 9130 + }, + { + "epoch": 0.5303316023093214, + "grad_norm": 0.12294802814722061, + "learning_rate": 0.0004912695332502076, + "loss": 3.1793, + "step": 9140 + }, + { + "epoch": 0.5309118338216948, + "grad_norm": 0.13962669670581818, + "learning_rate": 0.0004903102554746727, + "loss": 3.1819, + "step": 9150 + }, + { + "epoch": 0.5314920653340683, + "grad_norm": 0.12373016029596329, + "learning_rate": 0.0004893510133777922, + "loss": 3.1747, + "step": 9160 + }, + { + "epoch": 0.5320722968464418, + "grad_norm": 0.12787871062755585, + "learning_rate": 0.0004883918104915962, + "loss": 3.1756, + "step": 9170 + }, + { + "epoch": 0.5326525283588152, + "grad_norm": 0.12098101526498795, + "learning_rate": 0.0004874326503479698, + "loss": 3.1826, + "step": 9180 + }, + { + "epoch": 0.5332327598711886, + "grad_norm": 0.13320770859718323, + "learning_rate": 0.0004864735364786415, + "loss": 3.1798, + "step": 9190 + }, + { + "epoch": 0.5338129913835621, + "grad_norm": 0.13723470270633698, + "learning_rate": 0.00048551447241516866, + "loss": 3.1811, + "step": 9200 + }, + { + "epoch": 0.5343932228959355, + "grad_norm": 0.12632976472377777, + "learning_rate": 0.00048455546168892614, + "loss": 3.1935, + "step": 9210 + }, + { + "epoch": 0.5349734544083089, + "grad_norm": 0.12484107166528702, + "learning_rate": 0.00048359650783109145, + "loss": 3.1719, + "step": 9220 + }, + { + "epoch": 0.5355536859206823, + "grad_norm": 0.13436180353164673, + "learning_rate": 0.0004826376143726332, + "loss": 3.1862, + "step": 9230 + }, + { + "epoch": 0.5361339174330558, + "grad_norm": 0.13016556203365326, + "learning_rate": 0.00048167878484429793, + "loss": 3.1812, + "step": 9240 + }, + { + "epoch": 0.5367141489454292, + "grad_norm": 0.12285098433494568, + "learning_rate": 0.00048072002277659595, + "loss": 3.1799, + "step": 9250 + }, + { + "epoch": 0.5372943804578026, + "grad_norm": 0.12092869728803635, + "learning_rate": 0.0004797613316997899, + "loss": 3.178, + "step": 9260 + }, + { + "epoch": 0.537874611970176, + "grad_norm": 0.11524718254804611, + "learning_rate": 0.0004788027151438806, + "loss": 3.1737, + "step": 9270 + }, + { + "epoch": 0.5384548434825496, + "grad_norm": 0.12745259702205658, + "learning_rate": 0.0004778441766385947, + "loss": 3.1746, + "step": 9280 + }, + { + "epoch": 0.539035074994923, + "grad_norm": 0.12326768040657043, + "learning_rate": 0.00047688571971337155, + "loss": 3.1752, + "step": 9290 + }, + { + "epoch": 0.5396153065072964, + "grad_norm": 0.11677366495132446, + "learning_rate": 0.00047592734789734967, + "loss": 3.1702, + "step": 9300 + }, + { + "epoch": 0.5401955380196698, + "grad_norm": 0.1259879618883133, + "learning_rate": 0.0004749690647193547, + "loss": 3.174, + "step": 9310 + }, + { + "epoch": 0.5407757695320433, + "grad_norm": 0.1180575042963028, + "learning_rate": 0.00047401087370788547, + "loss": 3.1738, + "step": 9320 + }, + { + "epoch": 0.5413560010444167, + "grad_norm": 0.12572011351585388, + "learning_rate": 0.00047305277839110207, + "loss": 3.1795, + "step": 9330 + }, + { + "epoch": 0.5419362325567901, + "grad_norm": 0.12240619957447052, + "learning_rate": 0.0004720947822968113, + "loss": 3.1814, + "step": 9340 + }, + { + "epoch": 0.5425164640691637, + "grad_norm": 0.12223079055547714, + "learning_rate": 0.00047113688895245536, + "loss": 3.1693, + "step": 9350 + }, + { + "epoch": 0.5430966955815371, + "grad_norm": 0.11417195945978165, + "learning_rate": 0.00047017910188509805, + "loss": 3.1765, + "step": 9360 + }, + { + "epoch": 0.5436769270939105, + "grad_norm": 0.1286236047744751, + "learning_rate": 0.00046922142462141146, + "loss": 3.1799, + "step": 9370 + }, + { + "epoch": 0.5442571586062839, + "grad_norm": 0.12266254425048828, + "learning_rate": 0.0004682638606876639, + "loss": 3.1679, + "step": 9380 + }, + { + "epoch": 0.5448373901186574, + "grad_norm": 0.12598057091236115, + "learning_rate": 0.00046730641360970564, + "loss": 3.1589, + "step": 9390 + }, + { + "epoch": 0.5454176216310308, + "grad_norm": 0.11109986901283264, + "learning_rate": 0.0004663490869129574, + "loss": 3.1772, + "step": 9400 + }, + { + "epoch": 0.5459978531434042, + "grad_norm": 0.12376119196414948, + "learning_rate": 0.0004653918841223964, + "loss": 3.1748, + "step": 9410 + }, + { + "epoch": 0.5465780846557776, + "grad_norm": 0.14834947884082794, + "learning_rate": 0.0004644348087625434, + "loss": 3.1799, + "step": 9420 + }, + { + "epoch": 0.5471583161681511, + "grad_norm": 0.12348010390996933, + "learning_rate": 0.00046347786435745053, + "loss": 3.1679, + "step": 9430 + }, + { + "epoch": 0.5477385476805245, + "grad_norm": 0.12390238046646118, + "learning_rate": 0.00046252105443068676, + "loss": 3.1809, + "step": 9440 + }, + { + "epoch": 0.548318779192898, + "grad_norm": 0.12994909286499023, + "learning_rate": 0.0004615643825053269, + "loss": 3.1774, + "step": 9450 + }, + { + "epoch": 0.5488990107052714, + "grad_norm": 0.13186348974704742, + "learning_rate": 0.000460607852103937, + "loss": 3.1627, + "step": 9460 + }, + { + "epoch": 0.5494792422176449, + "grad_norm": 0.11778120696544647, + "learning_rate": 0.00045965146674856216, + "loss": 3.1642, + "step": 9470 + }, + { + "epoch": 0.5500594737300183, + "grad_norm": 0.13027390837669373, + "learning_rate": 0.0004586952299607139, + "loss": 3.1745, + "step": 9480 + }, + { + "epoch": 0.5506397052423917, + "grad_norm": 0.13938818871974945, + "learning_rate": 0.00045773914526135555, + "loss": 3.177, + "step": 9490 + }, + { + "epoch": 0.5512199367547651, + "grad_norm": 0.13590595126152039, + "learning_rate": 0.0004567832161708918, + "loss": 3.1794, + "step": 9500 + }, + { + "epoch": 0.5518001682671386, + "grad_norm": 0.13180691003799438, + "learning_rate": 0.00045582744620915313, + "loss": 3.1752, + "step": 9510 + }, + { + "epoch": 0.552380399779512, + "grad_norm": 0.13000676035881042, + "learning_rate": 0.0004548718388953849, + "loss": 3.1737, + "step": 9520 + }, + { + "epoch": 0.5529606312918854, + "grad_norm": 0.1294243186712265, + "learning_rate": 0.00045391639774823345, + "loss": 3.1729, + "step": 9530 + }, + { + "epoch": 0.5535408628042588, + "grad_norm": 0.12174411118030548, + "learning_rate": 0.000452961126285733, + "loss": 3.173, + "step": 9540 + }, + { + "epoch": 0.5541210943166324, + "grad_norm": 0.11989938467741013, + "learning_rate": 0.0004520060280252934, + "loss": 3.172, + "step": 9550 + }, + { + "epoch": 0.5547013258290058, + "grad_norm": 0.12082493305206299, + "learning_rate": 0.0004510511064836862, + "loss": 3.1676, + "step": 9560 + }, + { + "epoch": 0.5552815573413792, + "grad_norm": 0.12731657922267914, + "learning_rate": 0.00045009636517703275, + "loss": 3.1816, + "step": 9570 + }, + { + "epoch": 0.5558617888537526, + "grad_norm": 0.11837522685527802, + "learning_rate": 0.0004491418076207903, + "loss": 3.1749, + "step": 9580 + }, + { + "epoch": 0.5564420203661261, + "grad_norm": 0.11386506259441376, + "learning_rate": 0.00044818743732974003, + "loss": 3.1577, + "step": 9590 + }, + { + "epoch": 0.5570222518784995, + "grad_norm": 0.12620550394058228, + "learning_rate": 0.00044723325781797346, + "loss": 3.1755, + "step": 9600 + }, + { + "epoch": 0.5576024833908729, + "grad_norm": 0.11217296868562698, + "learning_rate": 0.0004462792725988791, + "loss": 3.1599, + "step": 9610 + }, + { + "epoch": 0.5581827149032464, + "grad_norm": 0.12458086013793945, + "learning_rate": 0.0004453254851851308, + "loss": 3.1749, + "step": 9620 + }, + { + "epoch": 0.5587629464156199, + "grad_norm": 0.1312059760093689, + "learning_rate": 0.0004443718990886734, + "loss": 3.1693, + "step": 9630 + }, + { + "epoch": 0.5593431779279933, + "grad_norm": 0.128337562084198, + "learning_rate": 0.00044341851782071106, + "loss": 3.1755, + "step": 9640 + }, + { + "epoch": 0.5599234094403667, + "grad_norm": 0.11443614959716797, + "learning_rate": 0.00044246534489169367, + "loss": 3.1716, + "step": 9650 + }, + { + "epoch": 0.5605036409527402, + "grad_norm": 0.12734359502792358, + "learning_rate": 0.00044151238381130324, + "loss": 3.1717, + "step": 9660 + }, + { + "epoch": 0.5610838724651136, + "grad_norm": 0.1212453842163086, + "learning_rate": 0.0004405596380884428, + "loss": 3.1642, + "step": 9670 + }, + { + "epoch": 0.561664103977487, + "grad_norm": 0.1245492622256279, + "learning_rate": 0.0004396071112312216, + "loss": 3.175, + "step": 9680 + }, + { + "epoch": 0.5622443354898604, + "grad_norm": 0.11448545008897781, + "learning_rate": 0.0004386548067469437, + "loss": 3.1716, + "step": 9690 + }, + { + "epoch": 0.5628245670022339, + "grad_norm": 0.12811312079429626, + "learning_rate": 0.00043770272814209343, + "loss": 3.1614, + "step": 9700 + }, + { + "epoch": 0.5634047985146073, + "grad_norm": 0.11437219381332397, + "learning_rate": 0.0004367508789223243, + "loss": 3.1724, + "step": 9710 + }, + { + "epoch": 0.5639850300269807, + "grad_norm": 0.11365852504968643, + "learning_rate": 0.00043579926259244487, + "loss": 3.1707, + "step": 9720 + }, + { + "epoch": 0.5645652615393542, + "grad_norm": 0.12131789326667786, + "learning_rate": 0.0004348478826564059, + "loss": 3.1694, + "step": 9730 + }, + { + "epoch": 0.5651454930517277, + "grad_norm": 0.11996260285377502, + "learning_rate": 0.0004338967426172884, + "loss": 3.1579, + "step": 9740 + }, + { + "epoch": 0.5657257245641011, + "grad_norm": 0.12249016016721725, + "learning_rate": 0.00043294584597728915, + "loss": 3.1685, + "step": 9750 + }, + { + "epoch": 0.5663059560764745, + "grad_norm": 0.12243705987930298, + "learning_rate": 0.0004319951962377094, + "loss": 3.1719, + "step": 9760 + }, + { + "epoch": 0.5668861875888479, + "grad_norm": 0.12344249337911606, + "learning_rate": 0.00043104479689894137, + "loss": 3.1779, + "step": 9770 + }, + { + "epoch": 0.5674664191012214, + "grad_norm": 0.11184128373861313, + "learning_rate": 0.00043009465146045444, + "loss": 3.1705, + "step": 9780 + }, + { + "epoch": 0.5680466506135948, + "grad_norm": 0.12422725558280945, + "learning_rate": 0.0004291447634207841, + "loss": 3.1702, + "step": 9790 + }, + { + "epoch": 0.5686268821259682, + "grad_norm": 0.13139618933200836, + "learning_rate": 0.0004281951362775173, + "loss": 3.1658, + "step": 9800 + }, + { + "epoch": 0.5692071136383416, + "grad_norm": 0.14261464774608612, + "learning_rate": 0.000427245773527281, + "loss": 3.165, + "step": 9810 + }, + { + "epoch": 0.5697873451507152, + "grad_norm": 0.11359596252441406, + "learning_rate": 0.0004262966786657279, + "loss": 3.1698, + "step": 9820 + }, + { + "epoch": 0.5703675766630886, + "grad_norm": 0.13556736707687378, + "learning_rate": 0.0004253478551875249, + "loss": 3.168, + "step": 9830 + }, + { + "epoch": 0.570947808175462, + "grad_norm": 0.12147964537143707, + "learning_rate": 0.00042439930658633965, + "loss": 3.1672, + "step": 9840 + }, + { + "epoch": 0.5715280396878355, + "grad_norm": 0.12102476507425308, + "learning_rate": 0.00042345103635482706, + "loss": 3.1628, + "step": 9850 + }, + { + "epoch": 0.5721082712002089, + "grad_norm": 0.11904580146074295, + "learning_rate": 0.0004225030479846179, + "loss": 3.1644, + "step": 9860 + }, + { + "epoch": 0.5726885027125823, + "grad_norm": 0.1157933697104454, + "learning_rate": 0.00042155534496630427, + "loss": 3.1663, + "step": 9870 + }, + { + "epoch": 0.5732687342249557, + "grad_norm": 0.12185543030500412, + "learning_rate": 0.00042060793078942804, + "loss": 3.1785, + "step": 9880 + }, + { + "epoch": 0.5738489657373292, + "grad_norm": 0.1252318024635315, + "learning_rate": 0.00041966080894246773, + "loss": 3.159, + "step": 9890 + }, + { + "epoch": 0.5744291972497026, + "grad_norm": 0.12326642870903015, + "learning_rate": 0.00041871398291282484, + "loss": 3.1576, + "step": 9900 + }, + { + "epoch": 0.5750094287620761, + "grad_norm": 0.1266362965106964, + "learning_rate": 0.0004177674561868123, + "loss": 3.16, + "step": 9910 + }, + { + "epoch": 0.5755896602744495, + "grad_norm": 0.1305086612701416, + "learning_rate": 0.00041682123224964047, + "loss": 3.1697, + "step": 9920 + }, + { + "epoch": 0.576169891786823, + "grad_norm": 0.12299249321222305, + "learning_rate": 0.0004158753145854051, + "loss": 3.1663, + "step": 9930 + }, + { + "epoch": 0.5767501232991964, + "grad_norm": 0.11096496134996414, + "learning_rate": 0.00041492970667707403, + "loss": 3.1663, + "step": 9940 + }, + { + "epoch": 0.5773303548115698, + "grad_norm": 0.10970742255449295, + "learning_rate": 0.00041398441200647467, + "loss": 3.1617, + "step": 9950 + }, + { + "epoch": 0.5779105863239432, + "grad_norm": 0.12066974490880966, + "learning_rate": 0.0004130394340542813, + "loss": 3.1656, + "step": 9960 + }, + { + "epoch": 0.5784908178363167, + "grad_norm": 0.10806959867477417, + "learning_rate": 0.0004120947763000012, + "loss": 3.1649, + "step": 9970 + }, + { + "epoch": 0.5790710493486901, + "grad_norm": 0.11969128251075745, + "learning_rate": 0.0004111504422219637, + "loss": 3.1675, + "step": 9980 + }, + { + "epoch": 0.5796512808610635, + "grad_norm": 0.11461341381072998, + "learning_rate": 0.0004102064352973054, + "loss": 3.1631, + "step": 9990 + }, + { + "epoch": 0.580231512373437, + "grad_norm": 0.1177605539560318, + "learning_rate": 0.00040926275900195886, + "loss": 3.1583, + "step": 10000 + }, + { + "epoch": 0.580231512373437, + "eval_loss": 3.0971479415893555, + "eval_runtime": 3.2713, + "eval_samples_per_second": 1323.637, + "eval_steps_per_second": 10.393, + "step": 10000 + }, + { + "epoch": 0.5808117438858105, + "grad_norm": 0.13472320139408112, + "learning_rate": 0.00040831941681063926, + "loss": 3.1596, + "step": 10010 + }, + { + "epoch": 0.5813919753981839, + "grad_norm": 0.12773457169532776, + "learning_rate": 0.000407376412196831, + "loss": 3.1751, + "step": 10020 + }, + { + "epoch": 0.5819722069105573, + "grad_norm": 0.11364042013883591, + "learning_rate": 0.0004064337486327761, + "loss": 3.1541, + "step": 10030 + }, + { + "epoch": 0.5825524384229307, + "grad_norm": 0.1128978356719017, + "learning_rate": 0.00040549142958946037, + "loss": 3.1594, + "step": 10040 + }, + { + "epoch": 0.5831326699353042, + "grad_norm": 0.11539763957262039, + "learning_rate": 0.00040454945853660157, + "loss": 3.1708, + "step": 10050 + }, + { + "epoch": 0.5837129014476776, + "grad_norm": 0.13058942556381226, + "learning_rate": 0.00040360783894263536, + "loss": 3.1611, + "step": 10060 + }, + { + "epoch": 0.584293132960051, + "grad_norm": 0.13269387185573578, + "learning_rate": 0.00040266657427470395, + "loss": 3.1631, + "step": 10070 + }, + { + "epoch": 0.5848733644724246, + "grad_norm": 0.11398264765739441, + "learning_rate": 0.00040172566799864264, + "loss": 3.1593, + "step": 10080 + }, + { + "epoch": 0.585453595984798, + "grad_norm": 0.12349914014339447, + "learning_rate": 0.00040078512357896647, + "loss": 3.1585, + "step": 10090 + }, + { + "epoch": 0.5860338274971714, + "grad_norm": 0.12374427914619446, + "learning_rate": 0.0003998449444788589, + "loss": 3.1654, + "step": 10100 + }, + { + "epoch": 0.5866140590095448, + "grad_norm": 0.11344794183969498, + "learning_rate": 0.0003989051341601576, + "loss": 3.1564, + "step": 10110 + }, + { + "epoch": 0.5871942905219183, + "grad_norm": 0.11296453326940536, + "learning_rate": 0.0003979656960833428, + "loss": 3.1632, + "step": 10120 + }, + { + "epoch": 0.5877745220342917, + "grad_norm": 0.11938530951738358, + "learning_rate": 0.00039702663370752393, + "loss": 3.1687, + "step": 10130 + }, + { + "epoch": 0.5883547535466651, + "grad_norm": 0.12476367503404617, + "learning_rate": 0.00039608795049042686, + "loss": 3.1605, + "step": 10140 + }, + { + "epoch": 0.5889349850590385, + "grad_norm": 0.1283896565437317, + "learning_rate": 0.0003951496498883817, + "loss": 3.154, + "step": 10150 + }, + { + "epoch": 0.589515216571412, + "grad_norm": 0.11707280576229095, + "learning_rate": 0.00039421173535630937, + "loss": 3.1675, + "step": 10160 + }, + { + "epoch": 0.5900954480837854, + "grad_norm": 0.11196309328079224, + "learning_rate": 0.0003932742103477098, + "loss": 3.1597, + "step": 10170 + }, + { + "epoch": 0.5906756795961589, + "grad_norm": 0.13069289922714233, + "learning_rate": 0.0003923370783146477, + "loss": 3.162, + "step": 10180 + }, + { + "epoch": 0.5912559111085323, + "grad_norm": 0.11600931733846664, + "learning_rate": 0.0003914003427077418, + "loss": 3.1611, + "step": 10190 + }, + { + "epoch": 0.5918361426209058, + "grad_norm": 0.11921602487564087, + "learning_rate": 0.00039046400697615076, + "loss": 3.1603, + "step": 10200 + }, + { + "epoch": 0.5924163741332792, + "grad_norm": 0.10909148305654526, + "learning_rate": 0.0003895280745675606, + "loss": 3.1651, + "step": 10210 + }, + { + "epoch": 0.5929966056456526, + "grad_norm": 0.1261613517999649, + "learning_rate": 0.0003885925489281729, + "loss": 3.164, + "step": 10220 + }, + { + "epoch": 0.593576837158026, + "grad_norm": 0.1152707114815712, + "learning_rate": 0.00038765743350269047, + "loss": 3.1569, + "step": 10230 + }, + { + "epoch": 0.5941570686703995, + "grad_norm": 0.13062123954296112, + "learning_rate": 0.0003867227317343066, + "loss": 3.1526, + "step": 10240 + }, + { + "epoch": 0.5947373001827729, + "grad_norm": 0.13169212639331818, + "learning_rate": 0.0003857884470646912, + "loss": 3.1584, + "step": 10250 + }, + { + "epoch": 0.5953175316951463, + "grad_norm": 0.1235685646533966, + "learning_rate": 0.0003848545829339781, + "loss": 3.1635, + "step": 10260 + }, + { + "epoch": 0.5958977632075197, + "grad_norm": 0.11871648579835892, + "learning_rate": 0.00038392114278075316, + "loss": 3.1547, + "step": 10270 + }, + { + "epoch": 0.5964779947198933, + "grad_norm": 0.11664935946464539, + "learning_rate": 0.0003829881300420404, + "loss": 3.1553, + "step": 10280 + }, + { + "epoch": 0.5970582262322667, + "grad_norm": 0.10464397817850113, + "learning_rate": 0.0003820555481532908, + "loss": 3.1465, + "step": 10290 + }, + { + "epoch": 0.5976384577446401, + "grad_norm": 0.11757074296474457, + "learning_rate": 0.0003811234005483683, + "loss": 3.1576, + "step": 10300 + }, + { + "epoch": 0.5982186892570136, + "grad_norm": 0.12942548096179962, + "learning_rate": 0.0003801916906595382, + "loss": 3.1582, + "step": 10310 + }, + { + "epoch": 0.598798920769387, + "grad_norm": 0.13089211285114288, + "learning_rate": 0.000379260421917454, + "loss": 3.149, + "step": 10320 + }, + { + "epoch": 0.5993791522817604, + "grad_norm": 0.123594731092453, + "learning_rate": 0.0003783295977511445, + "loss": 3.1622, + "step": 10330 + }, + { + "epoch": 0.5999593837941338, + "grad_norm": 0.12618903815746307, + "learning_rate": 0.0003773992215880022, + "loss": 3.1599, + "step": 10340 + }, + { + "epoch": 0.6005396153065073, + "grad_norm": 0.11297423392534256, + "learning_rate": 0.00037646929685376904, + "loss": 3.1575, + "step": 10350 + }, + { + "epoch": 0.6011198468188808, + "grad_norm": 0.12514062225818634, + "learning_rate": 0.0003755398269725256, + "loss": 3.1549, + "step": 10360 + }, + { + "epoch": 0.6017000783312542, + "grad_norm": 0.11910570412874222, + "learning_rate": 0.00037461081536667743, + "loss": 3.1615, + "step": 10370 + }, + { + "epoch": 0.6022803098436276, + "grad_norm": 0.11765125393867493, + "learning_rate": 0.0003736822654569425, + "loss": 3.1613, + "step": 10380 + }, + { + "epoch": 0.6028605413560011, + "grad_norm": 0.10604594647884369, + "learning_rate": 0.00037275418066233903, + "loss": 3.1475, + "step": 10390 + }, + { + "epoch": 0.6034407728683745, + "grad_norm": 0.1241423636674881, + "learning_rate": 0.00037182656440017207, + "loss": 3.1537, + "step": 10400 + }, + { + "epoch": 0.6040210043807479, + "grad_norm": 0.13135185837745667, + "learning_rate": 0.0003708994200860221, + "loss": 3.1423, + "step": 10410 + }, + { + "epoch": 0.6046012358931213, + "grad_norm": 0.11381290853023529, + "learning_rate": 0.0003699727511337316, + "loss": 3.157, + "step": 10420 + }, + { + "epoch": 0.6051814674054948, + "grad_norm": 0.11703768372535706, + "learning_rate": 0.0003690465609553927, + "loss": 3.15, + "step": 10430 + }, + { + "epoch": 0.6057616989178682, + "grad_norm": 0.11526386439800262, + "learning_rate": 0.0003681208529613348, + "loss": 3.1625, + "step": 10440 + }, + { + "epoch": 0.6063419304302416, + "grad_norm": 0.1294795721769333, + "learning_rate": 0.00036719563056011146, + "loss": 3.1577, + "step": 10450 + }, + { + "epoch": 0.6069221619426151, + "grad_norm": 0.12788033485412598, + "learning_rate": 0.0003662708971584887, + "loss": 3.1549, + "step": 10460 + }, + { + "epoch": 0.6075023934549886, + "grad_norm": 0.11444190889596939, + "learning_rate": 0.00036534665616143157, + "loss": 3.158, + "step": 10470 + }, + { + "epoch": 0.608082624967362, + "grad_norm": 0.12848497927188873, + "learning_rate": 0.00036442291097209245, + "loss": 3.1534, + "step": 10480 + }, + { + "epoch": 0.6086628564797354, + "grad_norm": 0.13192491233348846, + "learning_rate": 0.000363499664991798, + "loss": 3.1647, + "step": 10490 + }, + { + "epoch": 0.6092430879921088, + "grad_norm": 0.1181025505065918, + "learning_rate": 0.0003625769216200362, + "loss": 3.1556, + "step": 10500 + }, + { + "epoch": 0.6098233195044823, + "grad_norm": 0.11332180351018906, + "learning_rate": 0.00036165468425444514, + "loss": 3.1531, + "step": 10510 + }, + { + "epoch": 0.6104035510168557, + "grad_norm": 0.11427458375692368, + "learning_rate": 0.00036073295629079926, + "loss": 3.1441, + "step": 10520 + }, + { + "epoch": 0.6109837825292291, + "grad_norm": 0.1351877599954605, + "learning_rate": 0.00035981174112299774, + "loss": 3.1592, + "step": 10530 + }, + { + "epoch": 0.6115640140416027, + "grad_norm": 0.11437386274337769, + "learning_rate": 0.000358891042143051, + "loss": 3.1508, + "step": 10540 + }, + { + "epoch": 0.6121442455539761, + "grad_norm": 0.1317347139120102, + "learning_rate": 0.00035797086274106917, + "loss": 3.1602, + "step": 10550 + }, + { + "epoch": 0.6127244770663495, + "grad_norm": 0.12212193757295609, + "learning_rate": 0.00035705120630524946, + "loss": 3.1562, + "step": 10560 + }, + { + "epoch": 0.6133047085787229, + "grad_norm": 0.10987838357686996, + "learning_rate": 0.00035613207622186297, + "loss": 3.1498, + "step": 10570 + }, + { + "epoch": 0.6138849400910964, + "grad_norm": 0.1109929159283638, + "learning_rate": 0.00035521347587524324, + "loss": 3.1592, + "step": 10580 + }, + { + "epoch": 0.6144651716034698, + "grad_norm": 0.11722821742296219, + "learning_rate": 0.00035429540864777254, + "loss": 3.1588, + "step": 10590 + }, + { + "epoch": 0.6150454031158432, + "grad_norm": 0.11384609341621399, + "learning_rate": 0.00035337787791987085, + "loss": 3.1563, + "step": 10600 + }, + { + "epoch": 0.6156256346282166, + "grad_norm": 0.13255846500396729, + "learning_rate": 0.0003524608870699826, + "loss": 3.1546, + "step": 10610 + }, + { + "epoch": 0.6162058661405901, + "grad_norm": 0.12805138528347015, + "learning_rate": 0.00035154443947456364, + "loss": 3.1468, + "step": 10620 + }, + { + "epoch": 0.6167860976529635, + "grad_norm": 0.11819039285182953, + "learning_rate": 0.0003506285385080705, + "loss": 3.1436, + "step": 10630 + }, + { + "epoch": 0.617366329165337, + "grad_norm": 0.11611706018447876, + "learning_rate": 0.0003497131875429462, + "loss": 3.153, + "step": 10640 + }, + { + "epoch": 0.6179465606777104, + "grad_norm": 0.12574134767055511, + "learning_rate": 0.0003487983899496092, + "loss": 3.1676, + "step": 10650 + }, + { + "epoch": 0.6185267921900839, + "grad_norm": 0.13298243284225464, + "learning_rate": 0.00034788414909643975, + "loss": 3.1448, + "step": 10660 + }, + { + "epoch": 0.6191070237024573, + "grad_norm": 0.11737950146198273, + "learning_rate": 0.00034697046834976847, + "loss": 3.1603, + "step": 10670 + }, + { + "epoch": 0.6196872552148307, + "grad_norm": 0.11029376089572906, + "learning_rate": 0.0003460573510738638, + "loss": 3.1523, + "step": 10680 + }, + { + "epoch": 0.6202674867272041, + "grad_norm": 0.12390248477458954, + "learning_rate": 0.000345144800630919, + "loss": 3.1591, + "step": 10690 + }, + { + "epoch": 0.6208477182395776, + "grad_norm": 0.11781900376081467, + "learning_rate": 0.00034423282038104064, + "loss": 3.1617, + "step": 10700 + }, + { + "epoch": 0.621427949751951, + "grad_norm": 0.12515197694301605, + "learning_rate": 0.0003433214136822352, + "loss": 3.1418, + "step": 10710 + }, + { + "epoch": 0.6220081812643244, + "grad_norm": 0.10986288636922836, + "learning_rate": 0.0003424105838903978, + "loss": 3.1374, + "step": 10720 + }, + { + "epoch": 0.6225884127766979, + "grad_norm": 0.12500767409801483, + "learning_rate": 0.00034150033435929926, + "loss": 3.1508, + "step": 10730 + }, + { + "epoch": 0.6231686442890714, + "grad_norm": 0.11399463564157486, + "learning_rate": 0.0003405906684405735, + "loss": 3.155, + "step": 10740 + }, + { + "epoch": 0.6237488758014448, + "grad_norm": 0.1243964433670044, + "learning_rate": 0.000339681589483706, + "loss": 3.149, + "step": 10750 + }, + { + "epoch": 0.6243291073138182, + "grad_norm": 0.1269841343164444, + "learning_rate": 0.0003387731008360203, + "loss": 3.157, + "step": 10760 + }, + { + "epoch": 0.6249093388261916, + "grad_norm": 0.11488167196512222, + "learning_rate": 0.0003378652058426672, + "loss": 3.1591, + "step": 10770 + }, + { + "epoch": 0.6254895703385651, + "grad_norm": 0.12310460954904556, + "learning_rate": 0.00033695790784661085, + "loss": 3.1493, + "step": 10780 + }, + { + "epoch": 0.6260698018509385, + "grad_norm": 0.11951915174722672, + "learning_rate": 0.0003360512101886176, + "loss": 3.1519, + "step": 10790 + }, + { + "epoch": 0.6266500333633119, + "grad_norm": 0.11739303171634674, + "learning_rate": 0.0003351451162072435, + "loss": 3.1517, + "step": 10800 + }, + { + "epoch": 0.6272302648756855, + "grad_norm": 0.12451887875795364, + "learning_rate": 0.000334239629238821, + "loss": 3.1437, + "step": 10810 + }, + { + "epoch": 0.6278104963880589, + "grad_norm": 0.10753390938043594, + "learning_rate": 0.0003333347526174484, + "loss": 3.1474, + "step": 10820 + }, + { + "epoch": 0.6283907279004323, + "grad_norm": 0.12157886475324631, + "learning_rate": 0.00033243048967497596, + "loss": 3.1502, + "step": 10830 + }, + { + "epoch": 0.6289709594128057, + "grad_norm": 0.13651184737682343, + "learning_rate": 0.0003315268437409946, + "loss": 3.1553, + "step": 10840 + }, + { + "epoch": 0.6295511909251792, + "grad_norm": 0.12725335359573364, + "learning_rate": 0.00033062381814282367, + "loss": 3.141, + "step": 10850 + }, + { + "epoch": 0.6301314224375526, + "grad_norm": 0.11685140430927277, + "learning_rate": 0.00032972141620549747, + "loss": 3.1451, + "step": 10860 + }, + { + "epoch": 0.630711653949926, + "grad_norm": 0.1115005612373352, + "learning_rate": 0.00032881964125175487, + "loss": 3.1482, + "step": 10870 + }, + { + "epoch": 0.6312918854622994, + "grad_norm": 0.11986386775970459, + "learning_rate": 0.00032791849660202547, + "loss": 3.1434, + "step": 10880 + }, + { + "epoch": 0.6318721169746729, + "grad_norm": 0.11233355104923248, + "learning_rate": 0.00032701798557441833, + "loss": 3.1418, + "step": 10890 + }, + { + "epoch": 0.6324523484870463, + "grad_norm": 0.11507276445627213, + "learning_rate": 0.0003261181114847094, + "loss": 3.1415, + "step": 10900 + }, + { + "epoch": 0.6330325799994198, + "grad_norm": 0.1157032698392868, + "learning_rate": 0.00032521887764632937, + "loss": 3.149, + "step": 10910 + }, + { + "epoch": 0.6336128115117932, + "grad_norm": 0.12391894310712814, + "learning_rate": 0.0003243202873703516, + "loss": 3.1476, + "step": 10920 + }, + { + "epoch": 0.6341930430241667, + "grad_norm": 0.11616963148117065, + "learning_rate": 0.00032342234396547933, + "loss": 3.1522, + "step": 10930 + }, + { + "epoch": 0.6347732745365401, + "grad_norm": 0.113109290599823, + "learning_rate": 0.00032252505073803437, + "loss": 3.1398, + "step": 10940 + }, + { + "epoch": 0.6353535060489135, + "grad_norm": 0.1344575732946396, + "learning_rate": 0.00032162841099194427, + "loss": 3.1388, + "step": 10950 + }, + { + "epoch": 0.6359337375612869, + "grad_norm": 0.1219155341386795, + "learning_rate": 0.0003207324280287307, + "loss": 3.1499, + "step": 10960 + }, + { + "epoch": 0.6365139690736604, + "grad_norm": 0.11315035074949265, + "learning_rate": 0.0003198371051474969, + "loss": 3.152, + "step": 10970 + }, + { + "epoch": 0.6370942005860338, + "grad_norm": 0.1105627492070198, + "learning_rate": 0.000318942445644915, + "loss": 3.1512, + "step": 10980 + }, + { + "epoch": 0.6376744320984072, + "grad_norm": 0.11000196635723114, + "learning_rate": 0.00031804845281521553, + "loss": 3.1464, + "step": 10990 + }, + { + "epoch": 0.6382546636107806, + "grad_norm": 0.11353638023138046, + "learning_rate": 0.0003171551299501734, + "loss": 3.1464, + "step": 11000 + }, + { + "epoch": 0.6382546636107806, + "eval_loss": 3.079380750656128, + "eval_runtime": 3.2712, + "eval_samples_per_second": 1323.663, + "eval_steps_per_second": 10.394, + "step": 11000 + }, + { + "epoch": 0.6388348951231542, + "grad_norm": 0.11272257566452026, + "learning_rate": 0.0003162624803390973, + "loss": 3.1544, + "step": 11010 + }, + { + "epoch": 0.6394151266355276, + "grad_norm": 0.11919167637825012, + "learning_rate": 0.00031537050726881635, + "loss": 3.1495, + "step": 11020 + }, + { + "epoch": 0.639995358147901, + "grad_norm": 0.11367520689964294, + "learning_rate": 0.00031447921402366874, + "loss": 3.1422, + "step": 11030 + }, + { + "epoch": 0.6405755896602745, + "grad_norm": 0.11759908497333527, + "learning_rate": 0.0003135886038854899, + "loss": 3.1414, + "step": 11040 + }, + { + "epoch": 0.6411558211726479, + "grad_norm": 0.11302473396062851, + "learning_rate": 0.0003126986801335995, + "loss": 3.1471, + "step": 11050 + }, + { + "epoch": 0.6417360526850213, + "grad_norm": 0.12279005348682404, + "learning_rate": 0.0003118094460447901, + "loss": 3.1427, + "step": 11060 + }, + { + "epoch": 0.6423162841973947, + "grad_norm": 0.11281714588403702, + "learning_rate": 0.0003109209048933145, + "loss": 3.1327, + "step": 11070 + }, + { + "epoch": 0.6428965157097682, + "grad_norm": 0.10893500596284866, + "learning_rate": 0.0003100330599508745, + "loss": 3.1472, + "step": 11080 + }, + { + "epoch": 0.6434767472221417, + "grad_norm": 0.12594285607337952, + "learning_rate": 0.0003091459144866083, + "loss": 3.146, + "step": 11090 + }, + { + "epoch": 0.6440569787345151, + "grad_norm": 0.1286400705575943, + "learning_rate": 0.0003082594717670781, + "loss": 3.1457, + "step": 11100 + }, + { + "epoch": 0.6446372102468885, + "grad_norm": 0.12681199610233307, + "learning_rate": 0.0003073737350562594, + "loss": 3.1349, + "step": 11110 + }, + { + "epoch": 0.645217441759262, + "grad_norm": 0.12026621401309967, + "learning_rate": 0.00030648870761552693, + "loss": 3.1425, + "step": 11120 + }, + { + "epoch": 0.6457976732716354, + "grad_norm": 0.11217381060123444, + "learning_rate": 0.00030560439270364495, + "loss": 3.1424, + "step": 11130 + }, + { + "epoch": 0.6463779047840088, + "grad_norm": 0.11966854333877563, + "learning_rate": 0.00030472079357675316, + "loss": 3.1477, + "step": 11140 + }, + { + "epoch": 0.6469581362963822, + "grad_norm": 0.10743203014135361, + "learning_rate": 0.0003038379134883563, + "loss": 3.1472, + "step": 11150 + }, + { + "epoch": 0.6475383678087557, + "grad_norm": 0.11516842246055603, + "learning_rate": 0.0003029557556893117, + "loss": 3.1363, + "step": 11160 + }, + { + "epoch": 0.6481185993211291, + "grad_norm": 0.11448100209236145, + "learning_rate": 0.00030207432342781615, + "loss": 3.1397, + "step": 11170 + }, + { + "epoch": 0.6486988308335025, + "grad_norm": 0.12240401655435562, + "learning_rate": 0.0003011936199493962, + "loss": 3.1451, + "step": 11180 + }, + { + "epoch": 0.649279062345876, + "grad_norm": 0.1107584685087204, + "learning_rate": 0.0003003136484968937, + "loss": 3.1516, + "step": 11190 + }, + { + "epoch": 0.6498592938582495, + "grad_norm": 0.11135096102952957, + "learning_rate": 0.0002994344123104561, + "loss": 3.1423, + "step": 11200 + }, + { + "epoch": 0.6504395253706229, + "grad_norm": 0.11470366269350052, + "learning_rate": 0.0002985559146275231, + "loss": 3.1441, + "step": 11210 + }, + { + "epoch": 0.6510197568829963, + "grad_norm": 0.11569292098283768, + "learning_rate": 0.0002976781586828151, + "loss": 3.149, + "step": 11220 + }, + { + "epoch": 0.6515999883953697, + "grad_norm": 0.12388614565134048, + "learning_rate": 0.0002968011477083217, + "loss": 3.1319, + "step": 11230 + }, + { + "epoch": 0.6521802199077432, + "grad_norm": 0.12496737390756607, + "learning_rate": 0.00029592488493328885, + "loss": 3.1391, + "step": 11240 + }, + { + "epoch": 0.6527604514201166, + "grad_norm": 0.1108599305152893, + "learning_rate": 0.00029504937358420803, + "loss": 3.1453, + "step": 11250 + }, + { + "epoch": 0.65334068293249, + "grad_norm": 0.11209242045879364, + "learning_rate": 0.0002941746168848037, + "loss": 3.1468, + "step": 11260 + }, + { + "epoch": 0.6539209144448636, + "grad_norm": 0.10576393455266953, + "learning_rate": 0.0002933006180560217, + "loss": 3.1327, + "step": 11270 + }, + { + "epoch": 0.654501145957237, + "grad_norm": 0.11058243364095688, + "learning_rate": 0.00029242738031601745, + "loss": 3.1378, + "step": 11280 + }, + { + "epoch": 0.6550813774696104, + "grad_norm": 0.10569418221712112, + "learning_rate": 0.00029155490688014343, + "loss": 3.1402, + "step": 11290 + }, + { + "epoch": 0.6556616089819838, + "grad_norm": 0.11297528445720673, + "learning_rate": 0.0002906832009609384, + "loss": 3.1453, + "step": 11300 + }, + { + "epoch": 0.6562418404943573, + "grad_norm": 0.11635693162679672, + "learning_rate": 0.00028981226576811506, + "loss": 3.1323, + "step": 11310 + }, + { + "epoch": 0.6568220720067307, + "grad_norm": 0.11293961852788925, + "learning_rate": 0.0002889421045085475, + "loss": 3.151, + "step": 11320 + }, + { + "epoch": 0.6574023035191041, + "grad_norm": 0.11303776502609253, + "learning_rate": 0.0002880727203862612, + "loss": 3.1461, + "step": 11330 + }, + { + "epoch": 0.6579825350314775, + "grad_norm": 0.10966860502958298, + "learning_rate": 0.0002872041166024194, + "loss": 3.1441, + "step": 11340 + }, + { + "epoch": 0.658562766543851, + "grad_norm": 0.11160997301340103, + "learning_rate": 0.00028633629635531224, + "loss": 3.1488, + "step": 11350 + }, + { + "epoch": 0.6591429980562244, + "grad_norm": 0.1070476621389389, + "learning_rate": 0.0002854692628403446, + "loss": 3.1413, + "step": 11360 + }, + { + "epoch": 0.6597232295685979, + "grad_norm": 0.11823021620512009, + "learning_rate": 0.0002846030192500249, + "loss": 3.145, + "step": 11370 + }, + { + "epoch": 0.6603034610809713, + "grad_norm": 0.11843527853488922, + "learning_rate": 0.0002837375687739525, + "loss": 3.1374, + "step": 11380 + }, + { + "epoch": 0.6608836925933448, + "grad_norm": 0.118824802339077, + "learning_rate": 0.00028287291459880716, + "loss": 3.157, + "step": 11390 + }, + { + "epoch": 0.6614639241057182, + "grad_norm": 0.11628689616918564, + "learning_rate": 0.0002820090599083358, + "loss": 3.1352, + "step": 11400 + }, + { + "epoch": 0.6620441556180916, + "grad_norm": 0.11970434337854385, + "learning_rate": 0.0002811460078833421, + "loss": 3.1468, + "step": 11410 + }, + { + "epoch": 0.662624387130465, + "grad_norm": 0.10809943079948425, + "learning_rate": 0.00028028376170167383, + "loss": 3.1405, + "step": 11420 + }, + { + "epoch": 0.6632046186428385, + "grad_norm": 0.10611239075660706, + "learning_rate": 0.00027942232453821193, + "loss": 3.1449, + "step": 11430 + }, + { + "epoch": 0.6637848501552119, + "grad_norm": 0.11383804678916931, + "learning_rate": 0.0002785616995648579, + "loss": 3.1525, + "step": 11440 + }, + { + "epoch": 0.6643650816675853, + "grad_norm": 0.11580588668584824, + "learning_rate": 0.0002777018899505236, + "loss": 3.1335, + "step": 11450 + }, + { + "epoch": 0.6649453131799588, + "grad_norm": 0.1111442893743515, + "learning_rate": 0.0002768428988611178, + "loss": 3.1467, + "step": 11460 + }, + { + "epoch": 0.6655255446923323, + "grad_norm": 0.11236603558063507, + "learning_rate": 0.0002759847294595357, + "loss": 3.1369, + "step": 11470 + }, + { + "epoch": 0.6661057762047057, + "grad_norm": 0.12457659840583801, + "learning_rate": 0.00027512738490564697, + "loss": 3.1346, + "step": 11480 + }, + { + "epoch": 0.6666860077170791, + "grad_norm": 0.11812961846590042, + "learning_rate": 0.0002742708683562841, + "loss": 3.1479, + "step": 11490 + }, + { + "epoch": 0.6672662392294526, + "grad_norm": 0.1041463240981102, + "learning_rate": 0.0002734151829652304, + "loss": 3.1363, + "step": 11500 + }, + { + "epoch": 0.667846470741826, + "grad_norm": 0.11501579731702805, + "learning_rate": 0.0002725603318832097, + "loss": 3.1286, + "step": 11510 + }, + { + "epoch": 0.6684267022541994, + "grad_norm": 0.11745285987854004, + "learning_rate": 0.00027170631825787294, + "loss": 3.1406, + "step": 11520 + }, + { + "epoch": 0.6690069337665728, + "grad_norm": 0.1133042722940445, + "learning_rate": 0.00027085314523378777, + "loss": 3.1506, + "step": 11530 + }, + { + "epoch": 0.6695871652789464, + "grad_norm": 0.11351029574871063, + "learning_rate": 0.00027000081595242667, + "loss": 3.135, + "step": 11540 + }, + { + "epoch": 0.6701673967913198, + "grad_norm": 0.11267419159412384, + "learning_rate": 0.0002691493335521551, + "loss": 3.131, + "step": 11550 + }, + { + "epoch": 0.6707476283036932, + "grad_norm": 0.10249326378107071, + "learning_rate": 0.00026829870116822085, + "loss": 3.1318, + "step": 11560 + }, + { + "epoch": 0.6713278598160666, + "grad_norm": 0.11103315651416779, + "learning_rate": 0.0002674489219327413, + "loss": 3.1344, + "step": 11570 + }, + { + "epoch": 0.6719080913284401, + "grad_norm": 0.11441586166620255, + "learning_rate": 0.0002665999989746926, + "loss": 3.1352, + "step": 11580 + }, + { + "epoch": 0.6724883228408135, + "grad_norm": 0.11026325076818466, + "learning_rate": 0.00026575193541989795, + "loss": 3.1315, + "step": 11590 + }, + { + "epoch": 0.6730685543531869, + "grad_norm": 0.1115291491150856, + "learning_rate": 0.00026490473439101615, + "loss": 3.1339, + "step": 11600 + }, + { + "epoch": 0.6736487858655603, + "grad_norm": 0.10771960020065308, + "learning_rate": 0.0002640583990075306, + "loss": 3.1238, + "step": 11610 + }, + { + "epoch": 0.6742290173779338, + "grad_norm": 0.11078547686338425, + "learning_rate": 0.00026321293238573614, + "loss": 3.1365, + "step": 11620 + }, + { + "epoch": 0.6748092488903072, + "grad_norm": 0.1105736717581749, + "learning_rate": 0.00026236833763872993, + "loss": 3.1466, + "step": 11630 + }, + { + "epoch": 0.6753894804026807, + "grad_norm": 0.10596097260713577, + "learning_rate": 0.0002615246178763983, + "loss": 3.1442, + "step": 11640 + }, + { + "epoch": 0.6759697119150541, + "grad_norm": 0.11247435957193375, + "learning_rate": 0.00026068177620540536, + "loss": 3.1439, + "step": 11650 + }, + { + "epoch": 0.6765499434274276, + "grad_norm": 0.11171044409275055, + "learning_rate": 0.00025983981572918314, + "loss": 3.1451, + "step": 11660 + }, + { + "epoch": 0.677130174939801, + "grad_norm": 0.12344854325056076, + "learning_rate": 0.0002589987395479175, + "loss": 3.1372, + "step": 11670 + }, + { + "epoch": 0.6777104064521744, + "grad_norm": 0.1220024824142456, + "learning_rate": 0.00025815855075853977, + "loss": 3.1366, + "step": 11680 + }, + { + "epoch": 0.6782906379645478, + "grad_norm": 0.10972929000854492, + "learning_rate": 0.0002573192524547128, + "loss": 3.1299, + "step": 11690 + }, + { + "epoch": 0.6788708694769213, + "grad_norm": 0.11175742000341415, + "learning_rate": 0.00025648084772682056, + "loss": 3.1375, + "step": 11700 + }, + { + "epoch": 0.6794511009892947, + "grad_norm": 0.12257856875658035, + "learning_rate": 0.00025564333966195785, + "loss": 3.1402, + "step": 11710 + }, + { + "epoch": 0.6800313325016681, + "grad_norm": 0.11670278757810593, + "learning_rate": 0.0002548067313439162, + "loss": 3.1357, + "step": 11720 + }, + { + "epoch": 0.6806115640140415, + "grad_norm": 0.11864405870437622, + "learning_rate": 0.0002539710258531759, + "loss": 3.136, + "step": 11730 + }, + { + "epoch": 0.6811917955264151, + "grad_norm": 0.10666168481111526, + "learning_rate": 0.00025313622626689134, + "loss": 3.1374, + "step": 11740 + }, + { + "epoch": 0.6817720270387885, + "grad_norm": 0.10843271762132645, + "learning_rate": 0.00025230233565888267, + "loss": 3.1343, + "step": 11750 + }, + { + "epoch": 0.6823522585511619, + "grad_norm": 0.10990433394908905, + "learning_rate": 0.00025146935709962216, + "loss": 3.1386, + "step": 11760 + }, + { + "epoch": 0.6829324900635354, + "grad_norm": 0.10423313081264496, + "learning_rate": 0.00025063729365622407, + "loss": 3.1382, + "step": 11770 + }, + { + "epoch": 0.6835127215759088, + "grad_norm": 0.11088298261165619, + "learning_rate": 0.00024980614839243364, + "loss": 3.1191, + "step": 11780 + }, + { + "epoch": 0.6840929530882822, + "grad_norm": 0.11372388154268265, + "learning_rate": 0.00024897592436861406, + "loss": 3.1294, + "step": 11790 + }, + { + "epoch": 0.6846731846006556, + "grad_norm": 0.10824663192033768, + "learning_rate": 0.0002481466246417377, + "loss": 3.1291, + "step": 11800 + }, + { + "epoch": 0.6852534161130291, + "grad_norm": 0.10850938409566879, + "learning_rate": 0.00024731825226537293, + "loss": 3.1438, + "step": 11810 + }, + { + "epoch": 0.6858336476254026, + "grad_norm": 0.1074269562959671, + "learning_rate": 0.00024649081028967334, + "loss": 3.1336, + "step": 11820 + }, + { + "epoch": 0.686413879137776, + "grad_norm": 0.11285442113876343, + "learning_rate": 0.00024566430176136756, + "loss": 3.1326, + "step": 11830 + }, + { + "epoch": 0.6869941106501494, + "grad_norm": 0.11877676844596863, + "learning_rate": 0.0002448387297237459, + "loss": 3.1333, + "step": 11840 + }, + { + "epoch": 0.6875743421625229, + "grad_norm": 0.1159949079155922, + "learning_rate": 0.00024401409721665148, + "loss": 3.1271, + "step": 11850 + }, + { + "epoch": 0.6881545736748963, + "grad_norm": 0.11141040176153183, + "learning_rate": 0.00024319040727646752, + "loss": 3.1315, + "step": 11860 + }, + { + "epoch": 0.6887348051872697, + "grad_norm": 0.1103438287973404, + "learning_rate": 0.0002423676629361064, + "loss": 3.1271, + "step": 11870 + }, + { + "epoch": 0.6893150366996431, + "grad_norm": 0.12033682316541672, + "learning_rate": 0.00024154586722499965, + "loss": 3.1317, + "step": 11880 + }, + { + "epoch": 0.6898952682120166, + "grad_norm": 0.10661648213863373, + "learning_rate": 0.00024072502316908428, + "loss": 3.1272, + "step": 11890 + }, + { + "epoch": 0.69047549972439, + "grad_norm": 0.1170666292309761, + "learning_rate": 0.00023990513379079477, + "loss": 3.1398, + "step": 11900 + }, + { + "epoch": 0.6910557312367634, + "grad_norm": 0.11095455288887024, + "learning_rate": 0.00023908620210904947, + "loss": 3.1298, + "step": 11910 + }, + { + "epoch": 0.6916359627491369, + "grad_norm": 0.1100478321313858, + "learning_rate": 0.00023826823113924035, + "loss": 3.1286, + "step": 11920 + }, + { + "epoch": 0.6922161942615104, + "grad_norm": 0.11419103294610977, + "learning_rate": 0.00023745122389322293, + "loss": 3.1343, + "step": 11930 + }, + { + "epoch": 0.6927964257738838, + "grad_norm": 0.11160432547330856, + "learning_rate": 0.00023663518337930256, + "loss": 3.1402, + "step": 11940 + }, + { + "epoch": 0.6933766572862572, + "grad_norm": 0.10984364151954651, + "learning_rate": 0.00023582011260222664, + "loss": 3.1351, + "step": 11950 + }, + { + "epoch": 0.6939568887986306, + "grad_norm": 0.11625155061483383, + "learning_rate": 0.00023500601456317083, + "loss": 3.134, + "step": 11960 + }, + { + "epoch": 0.6945371203110041, + "grad_norm": 0.1080445721745491, + "learning_rate": 0.00023419289225972946, + "loss": 3.1311, + "step": 11970 + }, + { + "epoch": 0.6951173518233775, + "grad_norm": 0.10590895265340805, + "learning_rate": 0.00023338074868590393, + "loss": 3.1371, + "step": 11980 + }, + { + "epoch": 0.6956975833357509, + "grad_norm": 0.11543317884206772, + "learning_rate": 0.0002325695868320919, + "loss": 3.1316, + "step": 11990 + }, + { + "epoch": 0.6962778148481245, + "grad_norm": 0.11939459294080734, + "learning_rate": 0.0002317594096850768, + "loss": 3.1365, + "step": 12000 + }, + { + "epoch": 0.6962778148481245, + "eval_loss": 3.064452648162842, + "eval_runtime": 3.2623, + "eval_samples_per_second": 1327.301, + "eval_steps_per_second": 10.422, + "step": 12000 + }, + { + "epoch": 0.6968580463604979, + "grad_norm": 0.10952937602996826, + "learning_rate": 0.00023095022022801503, + "loss": 3.1378, + "step": 12010 + }, + { + "epoch": 0.6974382778728713, + "grad_norm": 0.11545655131340027, + "learning_rate": 0.00023014202144042744, + "loss": 3.1373, + "step": 12020 + }, + { + "epoch": 0.6980185093852447, + "grad_norm": 0.10757040232419968, + "learning_rate": 0.00022933481629818653, + "loss": 3.137, + "step": 12030 + }, + { + "epoch": 0.6985987408976182, + "grad_norm": 0.11333664506673813, + "learning_rate": 0.00022852860777350593, + "loss": 3.1328, + "step": 12040 + }, + { + "epoch": 0.6991789724099916, + "grad_norm": 0.10705193877220154, + "learning_rate": 0.00022772339883493048, + "loss": 3.1283, + "step": 12050 + }, + { + "epoch": 0.699759203922365, + "grad_norm": 0.11158863455057144, + "learning_rate": 0.00022691919244732307, + "loss": 3.1303, + "step": 12060 + }, + { + "epoch": 0.7003394354347384, + "grad_norm": 0.11993270367383957, + "learning_rate": 0.00022611599157185648, + "loss": 3.1262, + "step": 12070 + }, + { + "epoch": 0.7009196669471119, + "grad_norm": 0.10471105575561523, + "learning_rate": 0.00022531379916600026, + "loss": 3.1397, + "step": 12080 + }, + { + "epoch": 0.7014998984594853, + "grad_norm": 0.11472434550523758, + "learning_rate": 0.00022451261818351082, + "loss": 3.1334, + "step": 12090 + }, + { + "epoch": 0.7020801299718588, + "grad_norm": 0.12063375115394592, + "learning_rate": 0.0002237124515744206, + "loss": 3.1311, + "step": 12100 + }, + { + "epoch": 0.7026603614842322, + "grad_norm": 0.11272242665290833, + "learning_rate": 0.00022291330228502658, + "loss": 3.13, + "step": 12110 + }, + { + "epoch": 0.7032405929966057, + "grad_norm": 0.10762272030115128, + "learning_rate": 0.00022211517325788056, + "loss": 3.1255, + "step": 12120 + }, + { + "epoch": 0.7038208245089791, + "grad_norm": 0.10291790962219238, + "learning_rate": 0.00022131806743177707, + "loss": 3.1284, + "step": 12130 + }, + { + "epoch": 0.7044010560213525, + "grad_norm": 0.10812744498252869, + "learning_rate": 0.00022052198774174327, + "loss": 3.1348, + "step": 12140 + }, + { + "epoch": 0.7049812875337259, + "grad_norm": 0.10778633505105972, + "learning_rate": 0.00021972693711902792, + "loss": 3.1342, + "step": 12150 + }, + { + "epoch": 0.7055615190460994, + "grad_norm": 0.10863006114959717, + "learning_rate": 0.00021893291849109053, + "loss": 3.1319, + "step": 12160 + }, + { + "epoch": 0.7061417505584728, + "grad_norm": 0.11223878711462021, + "learning_rate": 0.00021813993478159128, + "loss": 3.1299, + "step": 12170 + }, + { + "epoch": 0.7067219820708462, + "grad_norm": 0.11001647263765335, + "learning_rate": 0.000217347988910379, + "loss": 3.1256, + "step": 12180 + }, + { + "epoch": 0.7073022135832197, + "grad_norm": 0.11175502091646194, + "learning_rate": 0.00021655708379348144, + "loss": 3.1374, + "step": 12190 + }, + { + "epoch": 0.7078824450955932, + "grad_norm": 0.10740119218826294, + "learning_rate": 0.00021576722234309403, + "loss": 3.1284, + "step": 12200 + }, + { + "epoch": 0.7084626766079666, + "grad_norm": 0.1120908334851265, + "learning_rate": 0.00021497840746756942, + "loss": 3.1225, + "step": 12210 + }, + { + "epoch": 0.70904290812034, + "grad_norm": 0.110707126557827, + "learning_rate": 0.00021419064207140639, + "loss": 3.1256, + "step": 12220 + }, + { + "epoch": 0.7096231396327135, + "grad_norm": 0.11498415470123291, + "learning_rate": 0.00021340392905524002, + "loss": 3.1249, + "step": 12230 + }, + { + "epoch": 0.7102033711450869, + "grad_norm": 0.11245319992303848, + "learning_rate": 0.00021261827131582989, + "loss": 3.135, + "step": 12240 + }, + { + "epoch": 0.7107836026574603, + "grad_norm": 0.10842925310134888, + "learning_rate": 0.00021183367174605006, + "loss": 3.121, + "step": 12250 + }, + { + "epoch": 0.7113638341698337, + "grad_norm": 0.10173554718494415, + "learning_rate": 0.00021105013323487843, + "loss": 3.1246, + "step": 12260 + }, + { + "epoch": 0.7119440656822072, + "grad_norm": 0.10313431173563004, + "learning_rate": 0.00021026765866738578, + "loss": 3.1298, + "step": 12270 + }, + { + "epoch": 0.7125242971945807, + "grad_norm": 0.1053348183631897, + "learning_rate": 0.00020948625092472535, + "loss": 3.1264, + "step": 12280 + }, + { + "epoch": 0.7131045287069541, + "grad_norm": 0.12006527930498123, + "learning_rate": 0.00020870591288412254, + "loss": 3.1306, + "step": 12290 + }, + { + "epoch": 0.7136847602193275, + "grad_norm": 0.1115618497133255, + "learning_rate": 0.00020792664741886368, + "loss": 3.1264, + "step": 12300 + }, + { + "epoch": 0.714264991731701, + "grad_norm": 0.10678742080926895, + "learning_rate": 0.00020714845739828585, + "loss": 3.1337, + "step": 12310 + }, + { + "epoch": 0.7148452232440744, + "grad_norm": 0.11345722526311874, + "learning_rate": 0.00020637134568776615, + "loss": 3.1283, + "step": 12320 + }, + { + "epoch": 0.7154254547564478, + "grad_norm": 0.11007855087518692, + "learning_rate": 0.00020559531514871145, + "loss": 3.124, + "step": 12330 + }, + { + "epoch": 0.7160056862688212, + "grad_norm": 0.1093551367521286, + "learning_rate": 0.00020482036863854708, + "loss": 3.1251, + "step": 12340 + }, + { + "epoch": 0.7165859177811947, + "grad_norm": 0.10831980407238007, + "learning_rate": 0.00020404650901070787, + "loss": 3.122, + "step": 12350 + }, + { + "epoch": 0.7171661492935681, + "grad_norm": 0.1150059625506401, + "learning_rate": 0.00020327373911462572, + "loss": 3.1253, + "step": 12360 + }, + { + "epoch": 0.7177463808059416, + "grad_norm": 0.10329602658748627, + "learning_rate": 0.00020250206179572034, + "loss": 3.1315, + "step": 12370 + }, + { + "epoch": 0.718326612318315, + "grad_norm": 0.10848727822303772, + "learning_rate": 0.00020173147989538853, + "loss": 3.1334, + "step": 12380 + }, + { + "epoch": 0.7189068438306885, + "grad_norm": 0.10476922988891602, + "learning_rate": 0.00020096199625099337, + "loss": 3.1208, + "step": 12390 + }, + { + "epoch": 0.7194870753430619, + "grad_norm": 0.10537194460630417, + "learning_rate": 0.00020019361369585454, + "loss": 3.1265, + "step": 12400 + }, + { + "epoch": 0.7200673068554353, + "grad_norm": 0.10518410056829453, + "learning_rate": 0.00019942633505923703, + "loss": 3.124, + "step": 12410 + }, + { + "epoch": 0.7206475383678087, + "grad_norm": 0.10767358541488647, + "learning_rate": 0.000198660163166341, + "loss": 3.1174, + "step": 12420 + }, + { + "epoch": 0.7212277698801822, + "grad_norm": 0.10856916010379791, + "learning_rate": 0.0001978951008382918, + "loss": 3.124, + "step": 12430 + }, + { + "epoch": 0.7218080013925556, + "grad_norm": 0.1153908297419548, + "learning_rate": 0.0001971311508921288, + "loss": 3.1287, + "step": 12440 + }, + { + "epoch": 0.722388232904929, + "grad_norm": 0.10942938178777695, + "learning_rate": 0.00019636831614079625, + "loss": 3.118, + "step": 12450 + }, + { + "epoch": 0.7229684644173026, + "grad_norm": 0.1065671443939209, + "learning_rate": 0.00019560659939313096, + "loss": 3.1286, + "step": 12460 + }, + { + "epoch": 0.723548695929676, + "grad_norm": 0.10312582552433014, + "learning_rate": 0.0001948460034538543, + "loss": 3.1235, + "step": 12470 + }, + { + "epoch": 0.7241289274420494, + "grad_norm": 0.1132279559969902, + "learning_rate": 0.00019408653112355995, + "loss": 3.128, + "step": 12480 + }, + { + "epoch": 0.7247091589544228, + "grad_norm": 0.10551323741674423, + "learning_rate": 0.00019332818519870453, + "loss": 3.1256, + "step": 12490 + }, + { + "epoch": 0.7252893904667963, + "grad_norm": 0.11312738060951233, + "learning_rate": 0.00019257096847159766, + "loss": 3.1083, + "step": 12500 + }, + { + "epoch": 0.7258696219791697, + "grad_norm": 0.113512322306633, + "learning_rate": 0.00019181488373038992, + "loss": 3.1143, + "step": 12510 + }, + { + "epoch": 0.7264498534915431, + "grad_norm": 0.10724562406539917, + "learning_rate": 0.00019105993375906512, + "loss": 3.1284, + "step": 12520 + }, + { + "epoch": 0.7270300850039165, + "grad_norm": 0.10460355132818222, + "learning_rate": 0.00019030612133742787, + "loss": 3.1162, + "step": 12530 + }, + { + "epoch": 0.72761031651629, + "grad_norm": 0.10553659498691559, + "learning_rate": 0.00018955344924109435, + "loss": 3.1269, + "step": 12540 + }, + { + "epoch": 0.7281905480286635, + "grad_norm": 0.10612872242927551, + "learning_rate": 0.00018880192024148268, + "loss": 3.1362, + "step": 12550 + }, + { + "epoch": 0.7287707795410369, + "grad_norm": 0.11399485170841217, + "learning_rate": 0.00018805153710580054, + "loss": 3.135, + "step": 12560 + }, + { + "epoch": 0.7293510110534103, + "grad_norm": 0.11703281104564667, + "learning_rate": 0.00018730230259703795, + "loss": 3.1188, + "step": 12570 + }, + { + "epoch": 0.7299312425657838, + "grad_norm": 0.11565428972244263, + "learning_rate": 0.00018655421947395425, + "loss": 3.1244, + "step": 12580 + }, + { + "epoch": 0.7305114740781572, + "grad_norm": 0.10253434628248215, + "learning_rate": 0.00018580729049107026, + "loss": 3.1183, + "step": 12590 + }, + { + "epoch": 0.7310917055905306, + "grad_norm": 0.101934053003788, + "learning_rate": 0.0001850615183986567, + "loss": 3.1192, + "step": 12600 + }, + { + "epoch": 0.731671937102904, + "grad_norm": 0.10347875952720642, + "learning_rate": 0.0001843169059427243, + "loss": 3.1201, + "step": 12610 + }, + { + "epoch": 0.7322521686152775, + "grad_norm": 0.10719276964664459, + "learning_rate": 0.00018357345586501468, + "loss": 3.1261, + "step": 12620 + }, + { + "epoch": 0.7328324001276509, + "grad_norm": 0.10952641069889069, + "learning_rate": 0.00018283117090298813, + "loss": 3.1286, + "step": 12630 + }, + { + "epoch": 0.7334126316400243, + "grad_norm": 0.10987886041402817, + "learning_rate": 0.00018209005378981626, + "loss": 3.1325, + "step": 12640 + }, + { + "epoch": 0.7339928631523978, + "grad_norm": 0.1137159988284111, + "learning_rate": 0.00018135010725436968, + "loss": 3.1282, + "step": 12650 + }, + { + "epoch": 0.7345730946647713, + "grad_norm": 0.10682724416255951, + "learning_rate": 0.00018061133402120895, + "loss": 3.1168, + "step": 12660 + }, + { + "epoch": 0.7351533261771447, + "grad_norm": 0.11520636081695557, + "learning_rate": 0.00017987373681057495, + "loss": 3.1311, + "step": 12670 + }, + { + "epoch": 0.7357335576895181, + "grad_norm": 0.107805997133255, + "learning_rate": 0.00017913731833837715, + "loss": 3.1157, + "step": 12680 + }, + { + "epoch": 0.7363137892018915, + "grad_norm": 0.10552658140659332, + "learning_rate": 0.00017840208131618618, + "loss": 3.1206, + "step": 12690 + }, + { + "epoch": 0.736894020714265, + "grad_norm": 0.10237275809049606, + "learning_rate": 0.0001776680284512215, + "loss": 3.1185, + "step": 12700 + }, + { + "epoch": 0.7374742522266384, + "grad_norm": 0.10909226536750793, + "learning_rate": 0.00017693516244634246, + "loss": 3.1108, + "step": 12710 + }, + { + "epoch": 0.7380544837390118, + "grad_norm": 0.10805969685316086, + "learning_rate": 0.00017620348600003898, + "loss": 3.1244, + "step": 12720 + }, + { + "epoch": 0.7386347152513854, + "grad_norm": 0.1112237498164177, + "learning_rate": 0.00017547300180641978, + "loss": 3.1242, + "step": 12730 + }, + { + "epoch": 0.7392149467637588, + "grad_norm": 0.10815447568893433, + "learning_rate": 0.00017474371255520466, + "loss": 3.115, + "step": 12740 + }, + { + "epoch": 0.7397951782761322, + "grad_norm": 0.10469721257686615, + "learning_rate": 0.00017401562093171286, + "loss": 3.1276, + "step": 12750 + }, + { + "epoch": 0.7403754097885056, + "grad_norm": 0.10945837199687958, + "learning_rate": 0.00017328872961685382, + "loss": 3.1234, + "step": 12760 + }, + { + "epoch": 0.7409556413008791, + "grad_norm": 0.11551317572593689, + "learning_rate": 0.00017256304128711807, + "loss": 3.1234, + "step": 12770 + }, + { + "epoch": 0.7415358728132525, + "grad_norm": 0.10662870854139328, + "learning_rate": 0.0001718385586145654, + "loss": 3.1193, + "step": 12780 + }, + { + "epoch": 0.7421161043256259, + "grad_norm": 0.0992654412984848, + "learning_rate": 0.00017111528426681728, + "loss": 3.12, + "step": 12790 + }, + { + "epoch": 0.7426963358379993, + "grad_norm": 0.10338784754276276, + "learning_rate": 0.00017039322090704555, + "loss": 3.1162, + "step": 12800 + }, + { + "epoch": 0.7432765673503728, + "grad_norm": 0.11413225531578064, + "learning_rate": 0.00016967237119396318, + "loss": 3.1261, + "step": 12810 + }, + { + "epoch": 0.7438567988627462, + "grad_norm": 0.12023269385099411, + "learning_rate": 0.00016895273778181426, + "loss": 3.1234, + "step": 12820 + }, + { + "epoch": 0.7444370303751197, + "grad_norm": 0.10315112769603729, + "learning_rate": 0.00016823432332036426, + "loss": 3.1175, + "step": 12830 + }, + { + "epoch": 0.7450172618874931, + "grad_norm": 0.11159830540418625, + "learning_rate": 0.00016751713045489098, + "loss": 3.129, + "step": 12840 + }, + { + "epoch": 0.7455974933998666, + "grad_norm": 0.10925300419330597, + "learning_rate": 0.000166801161826173, + "loss": 3.1211, + "step": 12850 + }, + { + "epoch": 0.74617772491224, + "grad_norm": 0.10250292718410492, + "learning_rate": 0.00016608642007048235, + "loss": 3.1262, + "step": 12860 + }, + { + "epoch": 0.7467579564246134, + "grad_norm": 0.10880285501480103, + "learning_rate": 0.00016537290781957288, + "loss": 3.1129, + "step": 12870 + }, + { + "epoch": 0.7473381879369868, + "grad_norm": 0.10200098901987076, + "learning_rate": 0.00016466062770067124, + "loss": 3.1227, + "step": 12880 + }, + { + "epoch": 0.7479184194493603, + "grad_norm": 0.10475246608257294, + "learning_rate": 0.000163949582336468, + "loss": 3.1347, + "step": 12890 + }, + { + "epoch": 0.7484986509617337, + "grad_norm": 0.10829997807741165, + "learning_rate": 0.00016323977434510594, + "loss": 3.1228, + "step": 12900 + }, + { + "epoch": 0.7490788824741071, + "grad_norm": 0.10326212644577026, + "learning_rate": 0.000162531206340173, + "loss": 3.1217, + "step": 12910 + }, + { + "epoch": 0.7496591139864806, + "grad_norm": 0.10354658216238022, + "learning_rate": 0.0001618238809306906, + "loss": 3.1181, + "step": 12920 + }, + { + "epoch": 0.7502393454988541, + "grad_norm": 0.10224345326423645, + "learning_rate": 0.00016111780072110504, + "loss": 3.12, + "step": 12930 + }, + { + "epoch": 0.7508195770112275, + "grad_norm": 0.10535353422164917, + "learning_rate": 0.00016041296831127756, + "loss": 3.1297, + "step": 12940 + }, + { + "epoch": 0.7513998085236009, + "grad_norm": 0.11045780032873154, + "learning_rate": 0.0001597093862964748, + "loss": 3.1183, + "step": 12950 + }, + { + "epoch": 0.7519800400359744, + "grad_norm": 0.1009131371974945, + "learning_rate": 0.00015900705726735976, + "loss": 3.1174, + "step": 12960 + }, + { + "epoch": 0.7525602715483478, + "grad_norm": 0.10427884012460709, + "learning_rate": 0.00015830598380998134, + "loss": 3.1101, + "step": 12970 + }, + { + "epoch": 0.7531405030607212, + "grad_norm": 0.11507980525493622, + "learning_rate": 0.0001576061685057655, + "loss": 3.1247, + "step": 12980 + }, + { + "epoch": 0.7537207345730946, + "grad_norm": 0.10372275114059448, + "learning_rate": 0.00015690761393150537, + "loss": 3.1183, + "step": 12990 + }, + { + "epoch": 0.7543009660854681, + "grad_norm": 0.09975118935108185, + "learning_rate": 0.00015621032265935203, + "loss": 3.1256, + "step": 13000 + }, + { + "epoch": 0.7543009660854681, + "eval_loss": 3.050916910171509, + "eval_runtime": 3.2621, + "eval_samples_per_second": 1327.375, + "eval_steps_per_second": 10.423, + "step": 13000 + }, + { + "epoch": 0.7548811975978416, + "grad_norm": 0.10384030640125275, + "learning_rate": 0.00015551429725680531, + "loss": 3.1167, + "step": 13010 + }, + { + "epoch": 0.755461429110215, + "grad_norm": 0.10713458061218262, + "learning_rate": 0.00015481954028670342, + "loss": 3.1206, + "step": 13020 + }, + { + "epoch": 0.7560416606225884, + "grad_norm": 0.10227679461240768, + "learning_rate": 0.0001541260543072144, + "loss": 3.1142, + "step": 13030 + }, + { + "epoch": 0.7566218921349619, + "grad_norm": 0.10371891409158707, + "learning_rate": 0.00015343384187182612, + "loss": 3.12, + "step": 13040 + }, + { + "epoch": 0.7572021236473353, + "grad_norm": 0.10193309932947159, + "learning_rate": 0.00015274290552933745, + "loss": 3.1191, + "step": 13050 + }, + { + "epoch": 0.7577823551597087, + "grad_norm": 0.10207639634609222, + "learning_rate": 0.00015205324782384817, + "loss": 3.1159, + "step": 13060 + }, + { + "epoch": 0.7583625866720821, + "grad_norm": 0.1024574562907219, + "learning_rate": 0.00015136487129475046, + "loss": 3.1155, + "step": 13070 + }, + { + "epoch": 0.7589428181844556, + "grad_norm": 0.10583309829235077, + "learning_rate": 0.00015067777847671876, + "loss": 3.1178, + "step": 13080 + }, + { + "epoch": 0.759523049696829, + "grad_norm": 0.10229542851448059, + "learning_rate": 0.00014999197189970065, + "loss": 3.1168, + "step": 13090 + }, + { + "epoch": 0.7601032812092025, + "grad_norm": 0.09870131313800812, + "learning_rate": 0.00014930745408890794, + "loss": 3.1121, + "step": 13100 + }, + { + "epoch": 0.7606835127215759, + "grad_norm": 0.10222458094358444, + "learning_rate": 0.00014862422756480687, + "loss": 3.128, + "step": 13110 + }, + { + "epoch": 0.7612637442339494, + "grad_norm": 0.10863006114959717, + "learning_rate": 0.00014794229484310883, + "loss": 3.1115, + "step": 13120 + }, + { + "epoch": 0.7618439757463228, + "grad_norm": 0.10503353178501129, + "learning_rate": 0.00014726165843476202, + "loss": 3.1222, + "step": 13130 + }, + { + "epoch": 0.7624242072586962, + "grad_norm": 0.09862679988145828, + "learning_rate": 0.0001465823208459407, + "loss": 3.1138, + "step": 13140 + }, + { + "epoch": 0.7630044387710696, + "grad_norm": 0.10148298740386963, + "learning_rate": 0.00014590428457803706, + "loss": 3.1158, + "step": 13150 + }, + { + "epoch": 0.7635846702834431, + "grad_norm": 0.11013112962245941, + "learning_rate": 0.00014522755212765176, + "loss": 3.1157, + "step": 13160 + }, + { + "epoch": 0.7641649017958165, + "grad_norm": 0.10017859935760498, + "learning_rate": 0.00014455212598658447, + "loss": 3.1264, + "step": 13170 + }, + { + "epoch": 0.7647451333081899, + "grad_norm": 0.10435356944799423, + "learning_rate": 0.00014387800864182487, + "loss": 3.1072, + "step": 13180 + }, + { + "epoch": 0.7653253648205635, + "grad_norm": 0.10140141099691391, + "learning_rate": 0.00014320520257554397, + "loss": 3.1145, + "step": 13190 + }, + { + "epoch": 0.7659055963329369, + "grad_norm": 0.09953157603740692, + "learning_rate": 0.000142533710265084, + "loss": 3.1244, + "step": 13200 + }, + { + "epoch": 0.7664858278453103, + "grad_norm": 0.10731340944766998, + "learning_rate": 0.00014186353418295006, + "loss": 3.1164, + "step": 13210 + }, + { + "epoch": 0.7670660593576837, + "grad_norm": 0.097932830452919, + "learning_rate": 0.0001411946767968006, + "loss": 3.1192, + "step": 13220 + }, + { + "epoch": 0.7676462908700572, + "grad_norm": 0.10346731543540955, + "learning_rate": 0.00014052714056943849, + "loss": 3.1211, + "step": 13230 + }, + { + "epoch": 0.7682265223824306, + "grad_norm": 0.10606466978788376, + "learning_rate": 0.0001398609279588024, + "loss": 3.1154, + "step": 13240 + }, + { + "epoch": 0.768806753894804, + "grad_norm": 0.10412232577800751, + "learning_rate": 0.00013919604141795667, + "loss": 3.1164, + "step": 13250 + }, + { + "epoch": 0.7693869854071774, + "grad_norm": 0.09943995624780655, + "learning_rate": 0.0001385324833950833, + "loss": 3.1195, + "step": 13260 + }, + { + "epoch": 0.769967216919551, + "grad_norm": 0.098769411444664, + "learning_rate": 0.00013787025633347239, + "loss": 3.1183, + "step": 13270 + }, + { + "epoch": 0.7705474484319244, + "grad_norm": 0.11205046623945236, + "learning_rate": 0.00013720936267151324, + "loss": 3.12, + "step": 13280 + }, + { + "epoch": 0.7711276799442978, + "grad_norm": 0.10622609406709671, + "learning_rate": 0.00013654980484268598, + "loss": 3.1139, + "step": 13290 + }, + { + "epoch": 0.7717079114566712, + "grad_norm": 0.10065792500972748, + "learning_rate": 0.00013589158527555094, + "loss": 3.1104, + "step": 13300 + }, + { + "epoch": 0.7722881429690447, + "grad_norm": 0.11066281795501709, + "learning_rate": 0.0001352347063937422, + "loss": 3.1149, + "step": 13310 + }, + { + "epoch": 0.7728683744814181, + "grad_norm": 0.10775715857744217, + "learning_rate": 0.0001345791706159562, + "loss": 3.1172, + "step": 13320 + }, + { + "epoch": 0.7734486059937915, + "grad_norm": 0.0999317467212677, + "learning_rate": 0.0001339249803559444, + "loss": 3.118, + "step": 13330 + }, + { + "epoch": 0.7740288375061649, + "grad_norm": 0.10435137152671814, + "learning_rate": 0.0001332721380225042, + "loss": 3.1238, + "step": 13340 + }, + { + "epoch": 0.7746090690185384, + "grad_norm": 0.10001866519451141, + "learning_rate": 0.00013262064601946895, + "loss": 3.1035, + "step": 13350 + }, + { + "epoch": 0.7751893005309118, + "grad_norm": 0.10652778297662735, + "learning_rate": 0.00013197050674570077, + "loss": 3.1129, + "step": 13360 + }, + { + "epoch": 0.7757695320432852, + "grad_norm": 0.09261602908372879, + "learning_rate": 0.00013132172259508058, + "loss": 3.1256, + "step": 13370 + }, + { + "epoch": 0.7763497635556587, + "grad_norm": 0.1106601282954216, + "learning_rate": 0.0001306742959564995, + "loss": 3.1256, + "step": 13380 + }, + { + "epoch": 0.7769299950680322, + "grad_norm": 0.11139431595802307, + "learning_rate": 0.0001300282292138502, + "loss": 3.1171, + "step": 13390 + }, + { + "epoch": 0.7775102265804056, + "grad_norm": 0.09999184310436249, + "learning_rate": 0.00012938352474601805, + "loss": 3.1173, + "step": 13400 + }, + { + "epoch": 0.778090458092779, + "grad_norm": 0.10087510198354721, + "learning_rate": 0.0001287401849268728, + "loss": 3.1224, + "step": 13410 + }, + { + "epoch": 0.7786706896051525, + "grad_norm": 0.10008762031793594, + "learning_rate": 0.0001280982121252585, + "loss": 3.117, + "step": 13420 + }, + { + "epoch": 0.7792509211175259, + "grad_norm": 0.10388967394828796, + "learning_rate": 0.0001274576087049868, + "loss": 3.105, + "step": 13430 + }, + { + "epoch": 0.7798311526298993, + "grad_norm": 0.10136213898658752, + "learning_rate": 0.0001268183770248263, + "loss": 3.1128, + "step": 13440 + }, + { + "epoch": 0.7804113841422727, + "grad_norm": 0.09719151258468628, + "learning_rate": 0.0001261805194384949, + "loss": 3.1094, + "step": 13450 + }, + { + "epoch": 0.7809916156546463, + "grad_norm": 0.10660874843597412, + "learning_rate": 0.00012554403829465155, + "loss": 3.1207, + "step": 13460 + }, + { + "epoch": 0.7815718471670197, + "grad_norm": 0.10400804132223129, + "learning_rate": 0.00012490893593688584, + "loss": 3.1109, + "step": 13470 + }, + { + "epoch": 0.7821520786793931, + "grad_norm": 0.10217728465795517, + "learning_rate": 0.00012427521470371173, + "loss": 3.1128, + "step": 13480 + }, + { + "epoch": 0.7827323101917665, + "grad_norm": 0.10613488405942917, + "learning_rate": 0.0001236428769285569, + "loss": 3.1091, + "step": 13490 + }, + { + "epoch": 0.78331254170414, + "grad_norm": 0.09936373680830002, + "learning_rate": 0.00012301192493975526, + "loss": 3.1107, + "step": 13500 + }, + { + "epoch": 0.7838927732165134, + "grad_norm": 0.1033458337187767, + "learning_rate": 0.00012238236106053852, + "loss": 3.1209, + "step": 13510 + }, + { + "epoch": 0.7844730047288868, + "grad_norm": 0.10691102594137192, + "learning_rate": 0.00012175418760902617, + "loss": 3.1077, + "step": 13520 + }, + { + "epoch": 0.7850532362412602, + "grad_norm": 0.11017080396413803, + "learning_rate": 0.00012112740689821921, + "loss": 3.119, + "step": 13530 + }, + { + "epoch": 0.7856334677536337, + "grad_norm": 0.10583353787660599, + "learning_rate": 0.00012050202123598974, + "loss": 3.1136, + "step": 13540 + }, + { + "epoch": 0.7862136992660071, + "grad_norm": 0.10265874862670898, + "learning_rate": 0.00011987803292507305, + "loss": 3.1122, + "step": 13550 + }, + { + "epoch": 0.7867939307783806, + "grad_norm": 0.09812895208597183, + "learning_rate": 0.00011925544426305996, + "loss": 3.11, + "step": 13560 + }, + { + "epoch": 0.787374162290754, + "grad_norm": 0.10639332979917526, + "learning_rate": 0.00011863425754238655, + "loss": 3.1162, + "step": 13570 + }, + { + "epoch": 0.7879543938031275, + "grad_norm": 0.10056709498167038, + "learning_rate": 0.00011801447505032786, + "loss": 3.1108, + "step": 13580 + }, + { + "epoch": 0.7885346253155009, + "grad_norm": 0.1005856990814209, + "learning_rate": 0.00011739609906898774, + "loss": 3.1051, + "step": 13590 + }, + { + "epoch": 0.7891148568278743, + "grad_norm": 0.10471539199352264, + "learning_rate": 0.00011677913187529126, + "loss": 3.1174, + "step": 13600 + }, + { + "epoch": 0.7896950883402477, + "grad_norm": 0.10265914350748062, + "learning_rate": 0.0001161635757409767, + "loss": 3.1132, + "step": 13610 + }, + { + "epoch": 0.7902753198526212, + "grad_norm": 0.10047253966331482, + "learning_rate": 0.00011554943293258557, + "loss": 3.1144, + "step": 13620 + }, + { + "epoch": 0.7908555513649946, + "grad_norm": 0.10369555652141571, + "learning_rate": 0.00011493670571145665, + "loss": 3.1165, + "step": 13630 + }, + { + "epoch": 0.791435782877368, + "grad_norm": 0.10994482040405273, + "learning_rate": 0.0001143253963337152, + "loss": 3.1099, + "step": 13640 + }, + { + "epoch": 0.7920160143897415, + "grad_norm": 0.10117276012897491, + "learning_rate": 0.00011371550705026673, + "loss": 3.1207, + "step": 13650 + }, + { + "epoch": 0.792596245902115, + "grad_norm": 0.10518882423639297, + "learning_rate": 0.00011310704010678747, + "loss": 3.0989, + "step": 13660 + }, + { + "epoch": 0.7931764774144884, + "grad_norm": 0.10278623551130295, + "learning_rate": 0.00011249999774371621, + "loss": 3.1032, + "step": 13670 + }, + { + "epoch": 0.7937567089268618, + "grad_norm": 0.10157769918441772, + "learning_rate": 0.00011189438219624698, + "loss": 3.1141, + "step": 13680 + }, + { + "epoch": 0.7943369404392353, + "grad_norm": 0.10142907500267029, + "learning_rate": 0.00011129019569431908, + "loss": 3.1123, + "step": 13690 + }, + { + "epoch": 0.7949171719516087, + "grad_norm": 0.10015449672937393, + "learning_rate": 0.00011068744046261098, + "loss": 3.1125, + "step": 13700 + }, + { + "epoch": 0.7954974034639821, + "grad_norm": 0.10350023210048676, + "learning_rate": 0.00011008611872053037, + "loss": 3.1038, + "step": 13710 + }, + { + "epoch": 0.7960776349763555, + "grad_norm": 0.10056246072053909, + "learning_rate": 0.00010948623268220676, + "loss": 3.1087, + "step": 13720 + }, + { + "epoch": 0.796657866488729, + "grad_norm": 0.09896915405988693, + "learning_rate": 0.00010888778455648391, + "loss": 3.1132, + "step": 13730 + }, + { + "epoch": 0.7972380980011025, + "grad_norm": 0.10385739803314209, + "learning_rate": 0.00010829077654690983, + "loss": 3.1183, + "step": 13740 + }, + { + "epoch": 0.7978183295134759, + "grad_norm": 0.09953512251377106, + "learning_rate": 0.000107695210851731, + "loss": 3.1125, + "step": 13750 + }, + { + "epoch": 0.7983985610258493, + "grad_norm": 0.09491749107837677, + "learning_rate": 0.00010710108966388266, + "loss": 3.1131, + "step": 13760 + }, + { + "epoch": 0.7989787925382228, + "grad_norm": 0.09977880120277405, + "learning_rate": 0.00010650841517098115, + "loss": 3.121, + "step": 13770 + }, + { + "epoch": 0.7995590240505962, + "grad_norm": 0.10586149990558624, + "learning_rate": 0.00010591718955531605, + "loss": 3.1175, + "step": 13780 + }, + { + "epoch": 0.8001392555629696, + "grad_norm": 0.10209766030311584, + "learning_rate": 0.0001053274149938419, + "loss": 3.1164, + "step": 13790 + }, + { + "epoch": 0.800719487075343, + "grad_norm": 0.10039076209068298, + "learning_rate": 0.0001047390936581707, + "loss": 3.1094, + "step": 13800 + }, + { + "epoch": 0.8012997185877165, + "grad_norm": 0.10035811364650726, + "learning_rate": 0.00010415222771456307, + "loss": 3.1173, + "step": 13810 + }, + { + "epoch": 0.8018799501000899, + "grad_norm": 0.09645077586174011, + "learning_rate": 0.00010356681932392093, + "loss": 3.1097, + "step": 13820 + }, + { + "epoch": 0.8024601816124634, + "grad_norm": 0.10422459989786148, + "learning_rate": 0.0001029828706417793, + "loss": 3.1142, + "step": 13830 + }, + { + "epoch": 0.8030404131248368, + "grad_norm": 0.10029245167970657, + "learning_rate": 0.0001024003838182982, + "loss": 3.1054, + "step": 13840 + }, + { + "epoch": 0.8036206446372103, + "grad_norm": 0.09637358039617538, + "learning_rate": 0.00010181936099825551, + "loss": 3.1093, + "step": 13850 + }, + { + "epoch": 0.8042008761495837, + "grad_norm": 0.10224739462137222, + "learning_rate": 0.00010123980432103791, + "loss": 3.1085, + "step": 13860 + }, + { + "epoch": 0.8047811076619571, + "grad_norm": 0.09893719106912613, + "learning_rate": 0.00010066171592063377, + "loss": 3.1045, + "step": 13870 + }, + { + "epoch": 0.8053613391743305, + "grad_norm": 0.09696366637945175, + "learning_rate": 0.00010008509792562525, + "loss": 3.1068, + "step": 13880 + }, + { + "epoch": 0.805941570686704, + "grad_norm": 0.09792386740446091, + "learning_rate": 9.950995245918016e-05, + "loss": 3.1193, + "step": 13890 + }, + { + "epoch": 0.8065218021990774, + "grad_norm": 0.09721978008747101, + "learning_rate": 9.893628163904417e-05, + "loss": 3.1135, + "step": 13900 + }, + { + "epoch": 0.8071020337114508, + "grad_norm": 0.102999746799469, + "learning_rate": 9.836408757753363e-05, + "loss": 3.1162, + "step": 13910 + }, + { + "epoch": 0.8076822652238244, + "grad_norm": 0.10018763691186905, + "learning_rate": 9.779337238152697e-05, + "loss": 3.1185, + "step": 13920 + }, + { + "epoch": 0.8082624967361978, + "grad_norm": 0.09860005974769592, + "learning_rate": 9.722413815245717e-05, + "loss": 3.1131, + "step": 13930 + }, + { + "epoch": 0.8088427282485712, + "grad_norm": 0.09546367824077606, + "learning_rate": 9.665638698630442e-05, + "loss": 3.1123, + "step": 13940 + }, + { + "epoch": 0.8094229597609446, + "grad_norm": 0.09807273745536804, + "learning_rate": 9.6090120973588e-05, + "loss": 3.1063, + "step": 13950 + }, + { + "epoch": 0.8100031912733181, + "grad_norm": 0.10019023716449738, + "learning_rate": 9.552534219935844e-05, + "loss": 3.1155, + "step": 13960 + }, + { + "epoch": 0.8105834227856915, + "grad_norm": 0.1006435975432396, + "learning_rate": 9.496205274319069e-05, + "loss": 3.106, + "step": 13970 + }, + { + "epoch": 0.8111636542980649, + "grad_norm": 0.10411754250526428, + "learning_rate": 9.44002546791754e-05, + "loss": 3.1155, + "step": 13980 + }, + { + "epoch": 0.8117438858104383, + "grad_norm": 0.1065743938088417, + "learning_rate": 9.38399500759119e-05, + "loss": 3.1075, + "step": 13990 + }, + { + "epoch": 0.8123241173228118, + "grad_norm": 0.09786444157361984, + "learning_rate": 9.328114099650042e-05, + "loss": 3.1073, + "step": 14000 + }, + { + "epoch": 0.8123241173228118, + "eval_loss": 3.041724681854248, + "eval_runtime": 3.2609, + "eval_samples_per_second": 1327.855, + "eval_steps_per_second": 10.427, + "step": 14000 + }, + { + "epoch": 0.8129043488351853, + "grad_norm": 0.09952949732542038, + "learning_rate": 9.272382949853453e-05, + "loss": 3.1102, + "step": 14010 + }, + { + "epoch": 0.8134845803475587, + "grad_norm": 0.0931655615568161, + "learning_rate": 9.216801763409343e-05, + "loss": 3.1085, + "step": 14020 + }, + { + "epoch": 0.8140648118599321, + "grad_norm": 0.09814907610416412, + "learning_rate": 9.161370744973491e-05, + "loss": 3.1011, + "step": 14030 + }, + { + "epoch": 0.8146450433723056, + "grad_norm": 0.09730423241853714, + "learning_rate": 9.106090098648696e-05, + "loss": 3.1048, + "step": 14040 + }, + { + "epoch": 0.815225274884679, + "grad_norm": 0.0960068479180336, + "learning_rate": 9.05096002798409e-05, + "loss": 3.1144, + "step": 14050 + }, + { + "epoch": 0.8158055063970524, + "grad_norm": 0.0986780971288681, + "learning_rate": 8.995980735974369e-05, + "loss": 3.1092, + "step": 14060 + }, + { + "epoch": 0.8163857379094258, + "grad_norm": 0.09585044533014297, + "learning_rate": 8.941152425059034e-05, + "loss": 3.1125, + "step": 14070 + }, + { + "epoch": 0.8169659694217993, + "grad_norm": 0.10179416090250015, + "learning_rate": 8.886475297121693e-05, + "loss": 3.1041, + "step": 14080 + }, + { + "epoch": 0.8175462009341727, + "grad_norm": 0.0965443029999733, + "learning_rate": 8.831949553489249e-05, + "loss": 3.1132, + "step": 14090 + }, + { + "epoch": 0.8181264324465461, + "grad_norm": 0.0964798629283905, + "learning_rate": 8.777575394931198e-05, + "loss": 3.1103, + "step": 14100 + }, + { + "epoch": 0.8187066639589196, + "grad_norm": 0.11066204309463501, + "learning_rate": 8.723353021658892e-05, + "loss": 3.105, + "step": 14110 + }, + { + "epoch": 0.8192868954712931, + "grad_norm": 0.09815254807472229, + "learning_rate": 8.669282633324776e-05, + "loss": 3.1088, + "step": 14120 + }, + { + "epoch": 0.8198671269836665, + "grad_norm": 0.10448320209980011, + "learning_rate": 8.615364429021722e-05, + "loss": 3.0998, + "step": 14130 + }, + { + "epoch": 0.8204473584960399, + "grad_norm": 0.10330579429864883, + "learning_rate": 8.56159860728215e-05, + "loss": 3.1101, + "step": 14140 + }, + { + "epoch": 0.8210275900084134, + "grad_norm": 0.10102424770593643, + "learning_rate": 8.507985366077493e-05, + "loss": 3.1033, + "step": 14150 + }, + { + "epoch": 0.8216078215207868, + "grad_norm": 0.09644920378923416, + "learning_rate": 8.454524902817312e-05, + "loss": 3.1087, + "step": 14160 + }, + { + "epoch": 0.8221880530331602, + "grad_norm": 0.09765134006738663, + "learning_rate": 8.401217414348611e-05, + "loss": 3.0975, + "step": 14170 + }, + { + "epoch": 0.8227682845455336, + "grad_norm": 0.09793014079332352, + "learning_rate": 8.348063096955188e-05, + "loss": 3.116, + "step": 14180 + }, + { + "epoch": 0.8233485160579072, + "grad_norm": 0.09745863080024719, + "learning_rate": 8.295062146356763e-05, + "loss": 3.1123, + "step": 14190 + }, + { + "epoch": 0.8239287475702806, + "grad_norm": 0.10549872368574142, + "learning_rate": 8.242214757708416e-05, + "loss": 3.1137, + "step": 14200 + }, + { + "epoch": 0.824508979082654, + "grad_norm": 0.10081037133932114, + "learning_rate": 8.18952112559977e-05, + "loss": 3.1119, + "step": 14210 + }, + { + "epoch": 0.8250892105950274, + "grad_norm": 0.10452466458082199, + "learning_rate": 8.136981444054281e-05, + "loss": 3.108, + "step": 14220 + }, + { + "epoch": 0.8256694421074009, + "grad_norm": 0.10242857784032822, + "learning_rate": 8.084595906528574e-05, + "loss": 3.1052, + "step": 14230 + }, + { + "epoch": 0.8262496736197743, + "grad_norm": 0.10497426986694336, + "learning_rate": 8.032364705911665e-05, + "loss": 3.1, + "step": 14240 + }, + { + "epoch": 0.8268299051321477, + "grad_norm": 0.0955236405134201, + "learning_rate": 7.980288034524353e-05, + "loss": 3.1138, + "step": 14250 + }, + { + "epoch": 0.8274101366445211, + "grad_norm": 0.10172264277935028, + "learning_rate": 7.928366084118338e-05, + "loss": 3.0993, + "step": 14260 + }, + { + "epoch": 0.8279903681568946, + "grad_norm": 0.09582705050706863, + "learning_rate": 7.87659904587572e-05, + "loss": 3.1224, + "step": 14270 + }, + { + "epoch": 0.828570599669268, + "grad_norm": 0.10169988870620728, + "learning_rate": 7.824987110408149e-05, + "loss": 3.1154, + "step": 14280 + }, + { + "epoch": 0.8291508311816415, + "grad_norm": 0.09703046083450317, + "learning_rate": 7.773530467756168e-05, + "loss": 3.1, + "step": 14290 + }, + { + "epoch": 0.8297310626940149, + "grad_norm": 0.09659979492425919, + "learning_rate": 7.722229307388551e-05, + "loss": 3.1027, + "step": 14300 + }, + { + "epoch": 0.8303112942063884, + "grad_norm": 0.10455524176359177, + "learning_rate": 7.671083818201502e-05, + "loss": 3.1086, + "step": 14310 + }, + { + "epoch": 0.8308915257187618, + "grad_norm": 0.09594230353832245, + "learning_rate": 7.620094188518112e-05, + "loss": 3.098, + "step": 14320 + }, + { + "epoch": 0.8314717572311352, + "grad_norm": 0.0956626608967781, + "learning_rate": 7.569260606087518e-05, + "loss": 3.0967, + "step": 14330 + }, + { + "epoch": 0.8320519887435086, + "grad_norm": 0.09703920781612396, + "learning_rate": 7.518583258084288e-05, + "loss": 3.1088, + "step": 14340 + }, + { + "epoch": 0.8326322202558821, + "grad_norm": 0.09883217513561249, + "learning_rate": 7.468062331107761e-05, + "loss": 3.1125, + "step": 14350 + }, + { + "epoch": 0.8332124517682555, + "grad_norm": 0.09501095116138458, + "learning_rate": 7.417698011181234e-05, + "loss": 3.1007, + "step": 14360 + }, + { + "epoch": 0.8337926832806289, + "grad_norm": 0.09835023432970047, + "learning_rate": 7.367490483751448e-05, + "loss": 3.103, + "step": 14370 + }, + { + "epoch": 0.8343729147930025, + "grad_norm": 0.10247199982404709, + "learning_rate": 7.317439933687764e-05, + "loss": 3.1054, + "step": 14380 + }, + { + "epoch": 0.8349531463053759, + "grad_norm": 0.10141029953956604, + "learning_rate": 7.267546545281544e-05, + "loss": 3.1124, + "step": 14390 + }, + { + "epoch": 0.8355333778177493, + "grad_norm": 0.09786387532949448, + "learning_rate": 7.217810502245498e-05, + "loss": 3.1143, + "step": 14400 + }, + { + "epoch": 0.8361136093301227, + "grad_norm": 0.10012129694223404, + "learning_rate": 7.168231987712903e-05, + "loss": 3.1133, + "step": 14410 + }, + { + "epoch": 0.8366938408424962, + "grad_norm": 0.10030212253332138, + "learning_rate": 7.118811184237078e-05, + "loss": 3.1001, + "step": 14420 + }, + { + "epoch": 0.8372740723548696, + "grad_norm": 0.09827015548944473, + "learning_rate": 7.069548273790588e-05, + "loss": 3.1031, + "step": 14430 + }, + { + "epoch": 0.837854303867243, + "grad_norm": 0.09829414635896683, + "learning_rate": 7.020443437764629e-05, + "loss": 3.1095, + "step": 14440 + }, + { + "epoch": 0.8384345353796164, + "grad_norm": 0.0953378975391388, + "learning_rate": 6.971496856968351e-05, + "loss": 3.1009, + "step": 14450 + }, + { + "epoch": 0.83901476689199, + "grad_norm": 0.09821732342243195, + "learning_rate": 6.922708711628183e-05, + "loss": 3.1148, + "step": 14460 + }, + { + "epoch": 0.8395949984043634, + "grad_norm": 0.09834583848714828, + "learning_rate": 6.874079181387221e-05, + "loss": 3.1015, + "step": 14470 + }, + { + "epoch": 0.8401752299167368, + "grad_norm": 0.09869256615638733, + "learning_rate": 6.825608445304443e-05, + "loss": 3.1101, + "step": 14480 + }, + { + "epoch": 0.8407554614291102, + "grad_norm": 0.10293745249509811, + "learning_rate": 6.777296681854206e-05, + "loss": 3.1056, + "step": 14490 + }, + { + "epoch": 0.8413356929414837, + "grad_norm": 0.09749376773834229, + "learning_rate": 6.72914406892548e-05, + "loss": 3.1106, + "step": 14500 + }, + { + "epoch": 0.8419159244538571, + "grad_norm": 0.09520737081766129, + "learning_rate": 6.681150783821222e-05, + "loss": 3.1085, + "step": 14510 + }, + { + "epoch": 0.8424961559662305, + "grad_norm": 0.09628592431545258, + "learning_rate": 6.633317003257755e-05, + "loss": 3.1083, + "step": 14520 + }, + { + "epoch": 0.8430763874786039, + "grad_norm": 0.09358352422714233, + "learning_rate": 6.585642903364036e-05, + "loss": 3.1113, + "step": 14530 + }, + { + "epoch": 0.8436566189909774, + "grad_norm": 0.09465614706277847, + "learning_rate": 6.538128659681131e-05, + "loss": 3.1141, + "step": 14540 + }, + { + "epoch": 0.8442368505033508, + "grad_norm": 0.09725864231586456, + "learning_rate": 6.490774447161441e-05, + "loss": 3.1104, + "step": 14550 + }, + { + "epoch": 0.8448170820157243, + "grad_norm": 0.09526196122169495, + "learning_rate": 6.443580440168146e-05, + "loss": 3.1165, + "step": 14560 + }, + { + "epoch": 0.8453973135280977, + "grad_norm": 0.09678266197443008, + "learning_rate": 6.396546812474519e-05, + "loss": 3.1012, + "step": 14570 + }, + { + "epoch": 0.8459775450404712, + "grad_norm": 0.0973704531788826, + "learning_rate": 6.349673737263295e-05, + "loss": 3.1026, + "step": 14580 + }, + { + "epoch": 0.8465577765528446, + "grad_norm": 0.09803210198879242, + "learning_rate": 6.302961387126066e-05, + "loss": 3.1056, + "step": 14590 + }, + { + "epoch": 0.847138008065218, + "grad_norm": 0.09953798353672028, + "learning_rate": 6.256409934062595e-05, + "loss": 3.1067, + "step": 14600 + }, + { + "epoch": 0.8477182395775914, + "grad_norm": 0.10524503886699677, + "learning_rate": 6.2100195494802e-05, + "loss": 3.1031, + "step": 14610 + }, + { + "epoch": 0.8482984710899649, + "grad_norm": 0.09302034974098206, + "learning_rate": 6.163790404193148e-05, + "loss": 3.1096, + "step": 14620 + }, + { + "epoch": 0.8488787026023383, + "grad_norm": 0.09579528868198395, + "learning_rate": 6.117722668421971e-05, + "loss": 3.1069, + "step": 14630 + }, + { + "epoch": 0.8494589341147117, + "grad_norm": 0.09332197159528732, + "learning_rate": 6.071816511792932e-05, + "loss": 3.1117, + "step": 14640 + }, + { + "epoch": 0.8500391656270853, + "grad_norm": 0.09204788506031036, + "learning_rate": 6.0260721033372876e-05, + "loss": 3.0956, + "step": 14650 + }, + { + "epoch": 0.8506193971394587, + "grad_norm": 0.09581390768289566, + "learning_rate": 5.980489611490747e-05, + "loss": 3.098, + "step": 14660 + }, + { + "epoch": 0.8511996286518321, + "grad_norm": 0.09121184796094894, + "learning_rate": 5.935069204092819e-05, + "loss": 3.112, + "step": 14670 + }, + { + "epoch": 0.8517798601642055, + "grad_norm": 0.09455008059740067, + "learning_rate": 5.889811048386201e-05, + "loss": 3.1009, + "step": 14680 + }, + { + "epoch": 0.852360091676579, + "grad_norm": 0.09240598976612091, + "learning_rate": 5.8447153110161524e-05, + "loss": 3.1075, + "step": 14690 + }, + { + "epoch": 0.8529403231889524, + "grad_norm": 0.09437933564186096, + "learning_rate": 5.7997821580299256e-05, + "loss": 3.0997, + "step": 14700 + }, + { + "epoch": 0.8535205547013258, + "grad_norm": 0.0974365696310997, + "learning_rate": 5.755011754876088e-05, + "loss": 3.0986, + "step": 14710 + }, + { + "epoch": 0.8541007862136992, + "grad_norm": 0.0968250036239624, + "learning_rate": 5.710404266403951e-05, + "loss": 3.1132, + "step": 14720 + }, + { + "epoch": 0.8546810177260727, + "grad_norm": 0.09723575413227081, + "learning_rate": 5.665959856862962e-05, + "loss": 3.1009, + "step": 14730 + }, + { + "epoch": 0.8552612492384462, + "grad_norm": 0.09329159557819366, + "learning_rate": 5.621678689902077e-05, + "loss": 3.1138, + "step": 14740 + }, + { + "epoch": 0.8558414807508196, + "grad_norm": 0.09336376190185547, + "learning_rate": 5.57756092856922e-05, + "loss": 3.0973, + "step": 14750 + }, + { + "epoch": 0.856421712263193, + "grad_norm": 0.0969925969839096, + "learning_rate": 5.5336067353105976e-05, + "loss": 3.0939, + "step": 14760 + }, + { + "epoch": 0.8570019437755665, + "grad_norm": 0.09060470759868622, + "learning_rate": 5.489816271970149e-05, + "loss": 3.1113, + "step": 14770 + }, + { + "epoch": 0.8575821752879399, + "grad_norm": 0.09199319034814835, + "learning_rate": 5.4461896997889505e-05, + "loss": 3.1083, + "step": 14780 + }, + { + "epoch": 0.8581624068003133, + "grad_norm": 0.09637030959129333, + "learning_rate": 5.402727179404615e-05, + "loss": 3.1091, + "step": 14790 + }, + { + "epoch": 0.8587426383126867, + "grad_norm": 0.0952460765838623, + "learning_rate": 5.359428870850691e-05, + "loss": 3.1057, + "step": 14800 + }, + { + "epoch": 0.8593228698250602, + "grad_norm": 0.09476283192634583, + "learning_rate": 5.316294933556076e-05, + "loss": 3.1085, + "step": 14810 + }, + { + "epoch": 0.8599031013374336, + "grad_norm": 0.0928397923707962, + "learning_rate": 5.273325526344469e-05, + "loss": 3.0943, + "step": 14820 + }, + { + "epoch": 0.860483332849807, + "grad_norm": 0.09518643468618393, + "learning_rate": 5.230520807433714e-05, + "loss": 3.0964, + "step": 14830 + }, + { + "epoch": 0.8610635643621805, + "grad_norm": 0.09664203971624374, + "learning_rate": 5.187880934435274e-05, + "loss": 3.1037, + "step": 14840 + }, + { + "epoch": 0.861643795874554, + "grad_norm": 0.0946909636259079, + "learning_rate": 5.145406064353631e-05, + "loss": 3.0976, + "step": 14850 + }, + { + "epoch": 0.8622240273869274, + "grad_norm": 0.09507571905851364, + "learning_rate": 5.10309635358569e-05, + "loss": 3.0998, + "step": 14860 + }, + { + "epoch": 0.8628042588993008, + "grad_norm": 0.09289544820785522, + "learning_rate": 5.060951957920257e-05, + "loss": 3.1094, + "step": 14870 + }, + { + "epoch": 0.8633844904116743, + "grad_norm": 0.09258411824703217, + "learning_rate": 5.018973032537411e-05, + "loss": 3.1052, + "step": 14880 + }, + { + "epoch": 0.8639647219240477, + "grad_norm": 0.09440912306308746, + "learning_rate": 4.977159732007941e-05, + "loss": 3.1092, + "step": 14890 + }, + { + "epoch": 0.8645449534364211, + "grad_norm": 0.09835471212863922, + "learning_rate": 4.935512210292814e-05, + "loss": 3.0988, + "step": 14900 + }, + { + "epoch": 0.8651251849487945, + "grad_norm": 0.09666918218135834, + "learning_rate": 4.894030620742545e-05, + "loss": 3.1009, + "step": 14910 + }, + { + "epoch": 0.865705416461168, + "grad_norm": 0.0979539081454277, + "learning_rate": 4.8527151160967286e-05, + "loss": 3.0995, + "step": 14920 + }, + { + "epoch": 0.8662856479735415, + "grad_norm": 0.0943804681301117, + "learning_rate": 4.81156584848334e-05, + "loss": 3.1054, + "step": 14930 + }, + { + "epoch": 0.8668658794859149, + "grad_norm": 0.09513070434331894, + "learning_rate": 4.770582969418319e-05, + "loss": 3.1108, + "step": 14940 + }, + { + "epoch": 0.8674461109982883, + "grad_norm": 0.09076978266239166, + "learning_rate": 4.7297666298049156e-05, + "loss": 3.1028, + "step": 14950 + }, + { + "epoch": 0.8680263425106618, + "grad_norm": 0.09252411872148514, + "learning_rate": 4.6891169799331614e-05, + "loss": 3.117, + "step": 14960 + }, + { + "epoch": 0.8686065740230352, + "grad_norm": 0.09229396283626556, + "learning_rate": 4.648634169479343e-05, + "loss": 3.1078, + "step": 14970 + }, + { + "epoch": 0.8691868055354086, + "grad_norm": 0.09328366816043854, + "learning_rate": 4.60831834750538e-05, + "loss": 3.1087, + "step": 14980 + }, + { + "epoch": 0.869767037047782, + "grad_norm": 0.09177900850772858, + "learning_rate": 4.568169662458377e-05, + "loss": 3.0944, + "step": 14990 + }, + { + "epoch": 0.8703472685601555, + "grad_norm": 0.09426326304674149, + "learning_rate": 4.528188262169991e-05, + "loss": 3.108, + "step": 15000 + }, + { + "epoch": 0.8703472685601555, + "eval_loss": 3.0348994731903076, + "eval_runtime": 3.2633, + "eval_samples_per_second": 1326.859, + "eval_steps_per_second": 10.419, + "step": 15000 + }, + { + "epoch": 0.870927500072529, + "grad_norm": 0.0925714373588562, + "learning_rate": 4.488374293855918e-05, + "loss": 3.104, + "step": 15010 + }, + { + "epoch": 0.8715077315849024, + "grad_norm": 0.0921747237443924, + "learning_rate": 4.448727904115379e-05, + "loss": 3.1142, + "step": 15020 + }, + { + "epoch": 0.8720879630972758, + "grad_norm": 0.09229473024606705, + "learning_rate": 4.4092492389305074e-05, + "loss": 3.0982, + "step": 15030 + }, + { + "epoch": 0.8726681946096493, + "grad_norm": 0.09434866905212402, + "learning_rate": 4.369938443665922e-05, + "loss": 3.1127, + "step": 15040 + }, + { + "epoch": 0.8732484261220227, + "grad_norm": 0.09186001121997833, + "learning_rate": 4.330795663068044e-05, + "loss": 3.1025, + "step": 15050 + }, + { + "epoch": 0.8738286576343961, + "grad_norm": 0.09200981259346008, + "learning_rate": 4.291821041264721e-05, + "loss": 3.0938, + "step": 15060 + }, + { + "epoch": 0.8744088891467695, + "grad_norm": 0.09246696531772614, + "learning_rate": 4.253014721764592e-05, + "loss": 3.1122, + "step": 15070 + }, + { + "epoch": 0.874989120659143, + "grad_norm": 0.09366760402917862, + "learning_rate": 4.214376847456575e-05, + "loss": 3.1114, + "step": 15080 + }, + { + "epoch": 0.8755693521715164, + "grad_norm": 0.09217476844787598, + "learning_rate": 4.1759075606093934e-05, + "loss": 3.1152, + "step": 15090 + }, + { + "epoch": 0.8761495836838898, + "grad_norm": 0.09385888278484344, + "learning_rate": 4.137607002870969e-05, + "loss": 3.1151, + "step": 15100 + }, + { + "epoch": 0.8767298151962634, + "grad_norm": 0.09338943660259247, + "learning_rate": 4.099475315267981e-05, + "loss": 3.1108, + "step": 15110 + }, + { + "epoch": 0.8773100467086368, + "grad_norm": 0.09102658182382584, + "learning_rate": 4.0615126382052945e-05, + "loss": 3.106, + "step": 15120 + }, + { + "epoch": 0.8778902782210102, + "grad_norm": 0.09203559905290604, + "learning_rate": 4.023719111465457e-05, + "loss": 3.1, + "step": 15130 + }, + { + "epoch": 0.8784705097333836, + "grad_norm": 0.09662605822086334, + "learning_rate": 3.986094874208218e-05, + "loss": 3.1095, + "step": 15140 + }, + { + "epoch": 0.8790507412457571, + "grad_norm": 0.09100698679685593, + "learning_rate": 3.9486400649699216e-05, + "loss": 3.0917, + "step": 15150 + }, + { + "epoch": 0.8796309727581305, + "grad_norm": 0.09504050016403198, + "learning_rate": 3.911354821663127e-05, + "loss": 3.1041, + "step": 15160 + }, + { + "epoch": 0.8802112042705039, + "grad_norm": 0.09688866138458252, + "learning_rate": 3.874239281576003e-05, + "loss": 3.0942, + "step": 15170 + }, + { + "epoch": 0.8807914357828773, + "grad_norm": 0.0897068902850151, + "learning_rate": 3.837293581371837e-05, + "loss": 3.1024, + "step": 15180 + }, + { + "epoch": 0.8813716672952508, + "grad_norm": 0.0906522199511528, + "learning_rate": 3.800517857088604e-05, + "loss": 3.103, + "step": 15190 + }, + { + "epoch": 0.8819518988076243, + "grad_norm": 0.0898643210530281, + "learning_rate": 3.763912244138334e-05, + "loss": 3.11, + "step": 15200 + }, + { + "epoch": 0.8825321303199977, + "grad_norm": 0.09033368527889252, + "learning_rate": 3.727476877306751e-05, + "loss": 3.1093, + "step": 15210 + }, + { + "epoch": 0.8831123618323711, + "grad_norm": 0.0916651263833046, + "learning_rate": 3.691211890752688e-05, + "loss": 3.1059, + "step": 15220 + }, + { + "epoch": 0.8836925933447446, + "grad_norm": 0.09232784807682037, + "learning_rate": 3.6551174180076195e-05, + "loss": 3.1066, + "step": 15230 + }, + { + "epoch": 0.884272824857118, + "grad_norm": 0.09441141784191132, + "learning_rate": 3.619193591975195e-05, + "loss": 3.1105, + "step": 15240 + }, + { + "epoch": 0.8848530563694914, + "grad_norm": 0.09673616290092468, + "learning_rate": 3.583440544930672e-05, + "loss": 3.0993, + "step": 15250 + }, + { + "epoch": 0.8854332878818648, + "grad_norm": 0.09390676021575928, + "learning_rate": 3.547858408520538e-05, + "loss": 3.1056, + "step": 15260 + }, + { + "epoch": 0.8860135193942383, + "grad_norm": 0.09317633509635925, + "learning_rate": 3.512447313761946e-05, + "loss": 3.0977, + "step": 15270 + }, + { + "epoch": 0.8865937509066117, + "grad_norm": 0.09400813281536102, + "learning_rate": 3.477207391042253e-05, + "loss": 3.0963, + "step": 15280 + }, + { + "epoch": 0.8871739824189852, + "grad_norm": 0.09057381004095078, + "learning_rate": 3.442138770118547e-05, + "loss": 3.1024, + "step": 15290 + }, + { + "epoch": 0.8877542139313586, + "grad_norm": 0.09377612918615341, + "learning_rate": 3.4072415801171484e-05, + "loss": 3.0959, + "step": 15300 + }, + { + "epoch": 0.8883344454437321, + "grad_norm": 0.09093187749385834, + "learning_rate": 3.3725159495332e-05, + "loss": 3.0976, + "step": 15310 + }, + { + "epoch": 0.8889146769561055, + "grad_norm": 0.09208898991346359, + "learning_rate": 3.3379620062300774e-05, + "loss": 3.1007, + "step": 15320 + }, + { + "epoch": 0.8894949084684789, + "grad_norm": 0.09365664422512054, + "learning_rate": 3.303579877439039e-05, + "loss": 3.1053, + "step": 15330 + }, + { + "epoch": 0.8900751399808524, + "grad_norm": 0.09230521321296692, + "learning_rate": 3.269369689758683e-05, + "loss": 3.1055, + "step": 15340 + }, + { + "epoch": 0.8906553714932258, + "grad_norm": 0.0912981778383255, + "learning_rate": 3.235331569154493e-05, + "loss": 3.0972, + "step": 15350 + }, + { + "epoch": 0.8912356030055992, + "grad_norm": 0.08950291574001312, + "learning_rate": 3.2014656409584174e-05, + "loss": 3.0999, + "step": 15360 + }, + { + "epoch": 0.8918158345179726, + "grad_norm": 0.09219230711460114, + "learning_rate": 3.167772029868321e-05, + "loss": 3.1019, + "step": 15370 + }, + { + "epoch": 0.8923960660303462, + "grad_norm": 0.09009566158056259, + "learning_rate": 3.134250859947635e-05, + "loss": 3.0978, + "step": 15380 + }, + { + "epoch": 0.8929762975427196, + "grad_norm": 0.0936865359544754, + "learning_rate": 3.1009022546248045e-05, + "loss": 3.1021, + "step": 15390 + }, + { + "epoch": 0.893556529055093, + "grad_norm": 0.09219173341989517, + "learning_rate": 3.0677263366928944e-05, + "loss": 3.0984, + "step": 15400 + }, + { + "epoch": 0.8941367605674664, + "grad_norm": 0.09084775298833847, + "learning_rate": 3.0347232283091107e-05, + "loss": 3.1039, + "step": 15410 + }, + { + "epoch": 0.8947169920798399, + "grad_norm": 0.0911671370267868, + "learning_rate": 3.001893050994342e-05, + "loss": 3.0934, + "step": 15420 + }, + { + "epoch": 0.8952972235922133, + "grad_norm": 0.08928447961807251, + "learning_rate": 2.9692359256327628e-05, + "loss": 3.1013, + "step": 15430 + }, + { + "epoch": 0.8958774551045867, + "grad_norm": 0.09330154210329056, + "learning_rate": 2.936751972471313e-05, + "loss": 3.0978, + "step": 15440 + }, + { + "epoch": 0.8964576866169601, + "grad_norm": 0.09146152436733246, + "learning_rate": 2.904441311119321e-05, + "loss": 3.0998, + "step": 15450 + }, + { + "epoch": 0.8970379181293336, + "grad_norm": 0.09131080657243729, + "learning_rate": 2.87230406054802e-05, + "loss": 3.1012, + "step": 15460 + }, + { + "epoch": 0.897618149641707, + "grad_norm": 0.09509788453578949, + "learning_rate": 2.8403403390901305e-05, + "loss": 3.102, + "step": 15470 + }, + { + "epoch": 0.8981983811540805, + "grad_norm": 0.09333484619855881, + "learning_rate": 2.8085502644394355e-05, + "loss": 3.1051, + "step": 15480 + }, + { + "epoch": 0.8987786126664539, + "grad_norm": 0.09036233276128769, + "learning_rate": 2.7769339536503125e-05, + "loss": 3.1117, + "step": 15490 + }, + { + "epoch": 0.8993588441788274, + "grad_norm": 0.08942391723394394, + "learning_rate": 2.745491523137328e-05, + "loss": 3.1117, + "step": 15500 + }, + { + "epoch": 0.8999390756912008, + "grad_norm": 0.0910114273428917, + "learning_rate": 2.7142230886748053e-05, + "loss": 3.0984, + "step": 15510 + }, + { + "epoch": 0.9005193072035742, + "grad_norm": 0.09009117633104324, + "learning_rate": 2.683128765396403e-05, + "loss": 3.0985, + "step": 15520 + }, + { + "epoch": 0.9010995387159476, + "grad_norm": 0.0904909297823906, + "learning_rate": 2.652208667794659e-05, + "loss": 3.0974, + "step": 15530 + }, + { + "epoch": 0.9016797702283211, + "grad_norm": 0.09261862933635712, + "learning_rate": 2.6214629097206345e-05, + "loss": 3.1042, + "step": 15540 + }, + { + "epoch": 0.9022600017406945, + "grad_norm": 0.09034094959497452, + "learning_rate": 2.5908916043834218e-05, + "loss": 3.1026, + "step": 15550 + }, + { + "epoch": 0.902840233253068, + "grad_norm": 0.09254541248083115, + "learning_rate": 2.560494864349766e-05, + "loss": 3.0954, + "step": 15560 + }, + { + "epoch": 0.9034204647654415, + "grad_norm": 0.08995792269706726, + "learning_rate": 2.530272801543654e-05, + "loss": 3.1003, + "step": 15570 + }, + { + "epoch": 0.9040006962778149, + "grad_norm": 0.08986230194568634, + "learning_rate": 2.5002255272458806e-05, + "loss": 3.0967, + "step": 15580 + }, + { + "epoch": 0.9045809277901883, + "grad_norm": 0.08791092783212662, + "learning_rate": 2.4703531520936572e-05, + "loss": 3.0929, + "step": 15590 + }, + { + "epoch": 0.9051611593025617, + "grad_norm": 0.09303991496562958, + "learning_rate": 2.440655786080209e-05, + "loss": 3.0981, + "step": 15600 + }, + { + "epoch": 0.9057413908149352, + "grad_norm": 0.09381508827209473, + "learning_rate": 2.4111335385543387e-05, + "loss": 3.0977, + "step": 15610 + }, + { + "epoch": 0.9063216223273086, + "grad_norm": 0.09249861538410187, + "learning_rate": 2.3817865182200638e-05, + "loss": 3.0969, + "step": 15620 + }, + { + "epoch": 0.906901853839682, + "grad_norm": 0.09130273759365082, + "learning_rate": 2.352614833136174e-05, + "loss": 3.1012, + "step": 15630 + }, + { + "epoch": 0.9074820853520554, + "grad_norm": 0.08810003846883774, + "learning_rate": 2.3236185907158814e-05, + "loss": 3.0956, + "step": 15640 + }, + { + "epoch": 0.908062316864429, + "grad_norm": 0.09278014302253723, + "learning_rate": 2.2947978977263807e-05, + "loss": 3.1024, + "step": 15650 + }, + { + "epoch": 0.9086425483768024, + "grad_norm": 0.09021242707967758, + "learning_rate": 2.266152860288484e-05, + "loss": 3.0915, + "step": 15660 + }, + { + "epoch": 0.9092227798891758, + "grad_norm": 0.08989161998033524, + "learning_rate": 2.2376835838762265e-05, + "loss": 3.0851, + "step": 15670 + }, + { + "epoch": 0.9098030114015492, + "grad_norm": 0.09114709496498108, + "learning_rate": 2.2093901733164612e-05, + "loss": 3.1014, + "step": 15680 + }, + { + "epoch": 0.9103832429139227, + "grad_norm": 0.08926232159137726, + "learning_rate": 2.1812727327884918e-05, + "loss": 3.0965, + "step": 15690 + }, + { + "epoch": 0.9109634744262961, + "grad_norm": 0.09116176515817642, + "learning_rate": 2.1533313658236688e-05, + "loss": 3.1009, + "step": 15700 + }, + { + "epoch": 0.9115437059386695, + "grad_norm": 0.08799432218074799, + "learning_rate": 2.1255661753050492e-05, + "loss": 3.1023, + "step": 15710 + }, + { + "epoch": 0.9121239374510429, + "grad_norm": 0.08743447065353394, + "learning_rate": 2.097977263466966e-05, + "loss": 3.0984, + "step": 15720 + }, + { + "epoch": 0.9127041689634164, + "grad_norm": 0.09166787564754486, + "learning_rate": 2.0705647318946806e-05, + "loss": 3.097, + "step": 15730 + }, + { + "epoch": 0.9132844004757898, + "grad_norm": 0.09091733396053314, + "learning_rate": 2.0433286815240092e-05, + "loss": 3.1049, + "step": 15740 + }, + { + "epoch": 0.9138646319881633, + "grad_norm": 0.08930478990077972, + "learning_rate": 2.0162692126409365e-05, + "loss": 3.0977, + "step": 15750 + }, + { + "epoch": 0.9144448635005367, + "grad_norm": 0.08997286111116409, + "learning_rate": 1.989386424881273e-05, + "loss": 3.1036, + "step": 15760 + }, + { + "epoch": 0.9150250950129102, + "grad_norm": 0.08813077956438065, + "learning_rate": 1.9626804172302447e-05, + "loss": 3.1003, + "step": 15770 + }, + { + "epoch": 0.9156053265252836, + "grad_norm": 0.08898695558309555, + "learning_rate": 1.936151288022181e-05, + "loss": 3.1065, + "step": 15780 + }, + { + "epoch": 0.916185558037657, + "grad_norm": 0.0892912819981575, + "learning_rate": 1.9097991349401156e-05, + "loss": 3.1047, + "step": 15790 + }, + { + "epoch": 0.9167657895500304, + "grad_norm": 0.09048785269260406, + "learning_rate": 1.8836240550154205e-05, + "loss": 3.1035, + "step": 15800 + }, + { + "epoch": 0.9173460210624039, + "grad_norm": 0.08921167254447937, + "learning_rate": 1.8576261446275057e-05, + "loss": 3.1013, + "step": 15810 + }, + { + "epoch": 0.9179262525747773, + "grad_norm": 0.08912645280361176, + "learning_rate": 1.8318054995033805e-05, + "loss": 3.0982, + "step": 15820 + }, + { + "epoch": 0.9185064840871507, + "grad_norm": 0.09258027374744415, + "learning_rate": 1.8061622147173716e-05, + "loss": 3.1059, + "step": 15830 + }, + { + "epoch": 0.9190867155995243, + "grad_norm": 0.08841285109519958, + "learning_rate": 1.7806963846907498e-05, + "loss": 3.095, + "step": 15840 + }, + { + "epoch": 0.9196669471118977, + "grad_norm": 0.09142499417066574, + "learning_rate": 1.7554081031913528e-05, + "loss": 3.1007, + "step": 15850 + }, + { + "epoch": 0.9202471786242711, + "grad_norm": 0.08727526664733887, + "learning_rate": 1.7302974633332968e-05, + "loss": 3.0974, + "step": 15860 + }, + { + "epoch": 0.9208274101366445, + "grad_norm": 0.09390713274478912, + "learning_rate": 1.7053645575765718e-05, + "loss": 3.0998, + "step": 15870 + }, + { + "epoch": 0.921407641649018, + "grad_norm": 0.09036395698785782, + "learning_rate": 1.6806094777267744e-05, + "loss": 3.0948, + "step": 15880 + }, + { + "epoch": 0.9219878731613914, + "grad_norm": 0.08717726916074753, + "learning_rate": 1.656032314934669e-05, + "loss": 3.0995, + "step": 15890 + }, + { + "epoch": 0.9225681046737648, + "grad_norm": 0.08829261362552643, + "learning_rate": 1.631633159695972e-05, + "loss": 3.0997, + "step": 15900 + }, + { + "epoch": 0.9231483361861382, + "grad_norm": 0.09095877408981323, + "learning_rate": 1.6074121018509137e-05, + "loss": 3.099, + "step": 15910 + }, + { + "epoch": 0.9237285676985117, + "grad_norm": 0.08995683491230011, + "learning_rate": 1.5833692305839642e-05, + "loss": 3.0973, + "step": 15920 + }, + { + "epoch": 0.9243087992108852, + "grad_norm": 0.08980005234479904, + "learning_rate": 1.5595046344235143e-05, + "loss": 3.1039, + "step": 15930 + }, + { + "epoch": 0.9248890307232586, + "grad_norm": 0.08741045743227005, + "learning_rate": 1.535818401241479e-05, + "loss": 3.1075, + "step": 15940 + }, + { + "epoch": 0.925469262235632, + "grad_norm": 0.0891076922416687, + "learning_rate": 1.512310618253071e-05, + "loss": 3.0986, + "step": 15950 + }, + { + "epoch": 0.9260494937480055, + "grad_norm": 0.0910383015871048, + "learning_rate": 1.4889813720164013e-05, + "loss": 3.1035, + "step": 15960 + }, + { + "epoch": 0.9266297252603789, + "grad_norm": 0.08668383955955505, + "learning_rate": 1.4658307484321953e-05, + "loss": 3.1023, + "step": 15970 + }, + { + "epoch": 0.9272099567727523, + "grad_norm": 0.08511517196893692, + "learning_rate": 1.4428588327434933e-05, + "loss": 3.0929, + "step": 15980 + }, + { + "epoch": 0.9277901882851257, + "grad_norm": 0.08581209927797318, + "learning_rate": 1.4200657095352676e-05, + "loss": 3.1002, + "step": 15990 + }, + { + "epoch": 0.9283704197974992, + "grad_norm": 0.08814380317926407, + "learning_rate": 1.397451462734206e-05, + "loss": 3.098, + "step": 16000 + }, + { + "epoch": 0.9283704197974992, + "eval_loss": 3.031247854232788, + "eval_runtime": 3.2581, + "eval_samples_per_second": 1328.977, + "eval_steps_per_second": 10.435, + "step": 16000 + }, + { + "epoch": 0.9289506513098726, + "grad_norm": 0.08873005956411362, + "learning_rate": 1.3750161756083234e-05, + "loss": 3.1036, + "step": 16010 + }, + { + "epoch": 0.929530882822246, + "grad_norm": 0.08824755996465683, + "learning_rate": 1.3527599307667005e-05, + "loss": 3.0983, + "step": 16020 + }, + { + "epoch": 0.9301111143346195, + "grad_norm": 0.08939357846975327, + "learning_rate": 1.3306828101591728e-05, + "loss": 3.1033, + "step": 16030 + }, + { + "epoch": 0.930691345846993, + "grad_norm": 0.08686842769384384, + "learning_rate": 1.3087848950759873e-05, + "loss": 3.0965, + "step": 16040 + }, + { + "epoch": 0.9312715773593664, + "grad_norm": 0.08945832401514053, + "learning_rate": 1.2870662661475852e-05, + "loss": 3.0951, + "step": 16050 + }, + { + "epoch": 0.9318518088717398, + "grad_norm": 0.088344506919384, + "learning_rate": 1.2655270033442189e-05, + "loss": 3.1039, + "step": 16060 + }, + { + "epoch": 0.9324320403841133, + "grad_norm": 0.08870889991521835, + "learning_rate": 1.2441671859757143e-05, + "loss": 3.0998, + "step": 16070 + }, + { + "epoch": 0.9330122718964867, + "grad_norm": 0.08570938557386398, + "learning_rate": 1.2229868926911636e-05, + "loss": 3.0957, + "step": 16080 + }, + { + "epoch": 0.9335925034088601, + "grad_norm": 0.0883183628320694, + "learning_rate": 1.201986201478611e-05, + "loss": 3.1008, + "step": 16090 + }, + { + "epoch": 0.9341727349212335, + "grad_norm": 0.09012133628129959, + "learning_rate": 1.1811651896648178e-05, + "loss": 3.0915, + "step": 16100 + }, + { + "epoch": 0.9347529664336071, + "grad_norm": 0.09092947840690613, + "learning_rate": 1.1605239339149199e-05, + "loss": 3.0886, + "step": 16110 + }, + { + "epoch": 0.9353331979459805, + "grad_norm": 0.08645268529653549, + "learning_rate": 1.140062510232187e-05, + "loss": 3.09, + "step": 16120 + }, + { + "epoch": 0.9359134294583539, + "grad_norm": 0.08784764260053635, + "learning_rate": 1.1197809939577197e-05, + "loss": 3.0997, + "step": 16130 + }, + { + "epoch": 0.9364936609707273, + "grad_norm": 0.0903283953666687, + "learning_rate": 1.0996794597701865e-05, + "loss": 3.1108, + "step": 16140 + }, + { + "epoch": 0.9370738924831008, + "grad_norm": 0.0857265442609787, + "learning_rate": 1.0797579816855585e-05, + "loss": 3.1028, + "step": 16150 + }, + { + "epoch": 0.9376541239954742, + "grad_norm": 0.09120402485132217, + "learning_rate": 1.0600166330567761e-05, + "loss": 3.0891, + "step": 16160 + }, + { + "epoch": 0.9382343555078476, + "grad_norm": 0.09053795039653778, + "learning_rate": 1.0404554865735771e-05, + "loss": 3.1013, + "step": 16170 + }, + { + "epoch": 0.938814587020221, + "grad_norm": 0.08675380051136017, + "learning_rate": 1.0210746142621408e-05, + "loss": 3.1088, + "step": 16180 + }, + { + "epoch": 0.9393948185325945, + "grad_norm": 0.08844220638275146, + "learning_rate": 1.0018740874848664e-05, + "loss": 3.1075, + "step": 16190 + }, + { + "epoch": 0.939975050044968, + "grad_norm": 0.09051468223333359, + "learning_rate": 9.828539769401235e-06, + "loss": 3.0957, + "step": 16200 + }, + { + "epoch": 0.9405552815573414, + "grad_norm": 0.08637753129005432, + "learning_rate": 9.640143526619239e-06, + "loss": 3.103, + "step": 16210 + }, + { + "epoch": 0.9411355130697148, + "grad_norm": 0.09072989225387573, + "learning_rate": 9.45355284019761e-06, + "loss": 3.1031, + "step": 16220 + }, + { + "epoch": 0.9417157445820883, + "grad_norm": 0.08837340027093887, + "learning_rate": 9.268768397182715e-06, + "loss": 3.0989, + "step": 16230 + }, + { + "epoch": 0.9422959760944617, + "grad_norm": 0.08710675686597824, + "learning_rate": 9.085790877970234e-06, + "loss": 3.0922, + "step": 16240 + }, + { + "epoch": 0.9428762076068351, + "grad_norm": 0.08685487508773804, + "learning_rate": 8.904620956302512e-06, + "loss": 3.0877, + "step": 16250 + }, + { + "epoch": 0.9434564391192085, + "grad_norm": 0.08923321217298508, + "learning_rate": 8.725259299266209e-06, + "loss": 3.0994, + "step": 16260 + }, + { + "epoch": 0.944036670631582, + "grad_norm": 0.08733417093753815, + "learning_rate": 8.547706567289814e-06, + "loss": 3.0953, + "step": 16270 + }, + { + "epoch": 0.9446169021439554, + "grad_norm": 0.08618912100791931, + "learning_rate": 8.371963414140982e-06, + "loss": 3.1021, + "step": 16280 + }, + { + "epoch": 0.9451971336563288, + "grad_norm": 0.09028159826993942, + "learning_rate": 8.198030486924468e-06, + "loss": 3.1022, + "step": 16290 + }, + { + "epoch": 0.9457773651687024, + "grad_norm": 0.09073447436094284, + "learning_rate": 8.025908426079532e-06, + "loss": 3.1016, + "step": 16300 + }, + { + "epoch": 0.9463575966810758, + "grad_norm": 0.08651082217693329, + "learning_rate": 7.85559786537754e-06, + "loss": 3.1053, + "step": 16310 + }, + { + "epoch": 0.9469378281934492, + "grad_norm": 0.08602554351091385, + "learning_rate": 7.687099431919974e-06, + "loss": 3.0999, + "step": 16320 + }, + { + "epoch": 0.9475180597058226, + "grad_norm": 0.08610852062702179, + "learning_rate": 7.520413746135657e-06, + "loss": 3.1059, + "step": 16330 + }, + { + "epoch": 0.9480982912181961, + "grad_norm": 0.08780808746814728, + "learning_rate": 7.355541421778689e-06, + "loss": 3.1046, + "step": 16340 + }, + { + "epoch": 0.9486785227305695, + "grad_norm": 0.08994690328836441, + "learning_rate": 7.1924830659262916e-06, + "loss": 3.1094, + "step": 16350 + }, + { + "epoch": 0.9492587542429429, + "grad_norm": 0.08709990233182907, + "learning_rate": 7.03123927897642e-06, + "loss": 3.105, + "step": 16360 + }, + { + "epoch": 0.9498389857553163, + "grad_norm": 0.08651293069124222, + "learning_rate": 6.871810654645483e-06, + "loss": 3.0934, + "step": 16370 + }, + { + "epoch": 0.9504192172676899, + "grad_norm": 0.08726444095373154, + "learning_rate": 6.7141977799665685e-06, + "loss": 3.0952, + "step": 16380 + }, + { + "epoch": 0.9509994487800633, + "grad_norm": 0.08613457530736923, + "learning_rate": 6.558401235286615e-06, + "loss": 3.088, + "step": 16390 + }, + { + "epoch": 0.9515796802924367, + "grad_norm": 0.088069386780262, + "learning_rate": 6.404421594264909e-06, + "loss": 3.0973, + "step": 16400 + }, + { + "epoch": 0.9521599118048101, + "grad_norm": 0.08668968081474304, + "learning_rate": 6.252259423870643e-06, + "loss": 3.1089, + "step": 16410 + }, + { + "epoch": 0.9527401433171836, + "grad_norm": 0.08948860317468643, + "learning_rate": 6.10191528438081e-06, + "loss": 3.0993, + "step": 16420 + }, + { + "epoch": 0.953320374829557, + "grad_norm": 0.08769353479146957, + "learning_rate": 5.953389729378256e-06, + "loss": 3.1144, + "step": 16430 + }, + { + "epoch": 0.9539006063419304, + "grad_norm": 0.08861144632101059, + "learning_rate": 5.806683305749682e-06, + "loss": 3.1077, + "step": 16440 + }, + { + "epoch": 0.9544808378543038, + "grad_norm": 0.09008525311946869, + "learning_rate": 5.661796553683541e-06, + "loss": 3.101, + "step": 16450 + }, + { + "epoch": 0.9550610693666773, + "grad_norm": 0.08720948547124863, + "learning_rate": 5.518730006668027e-06, + "loss": 3.1043, + "step": 16460 + }, + { + "epoch": 0.9556413008790507, + "grad_norm": 0.08759420365095139, + "learning_rate": 5.377484191489035e-06, + "loss": 3.1016, + "step": 16470 + }, + { + "epoch": 0.9562215323914242, + "grad_norm": 0.08649755269289017, + "learning_rate": 5.238059628228598e-06, + "loss": 3.0915, + "step": 16480 + }, + { + "epoch": 0.9568017639037976, + "grad_norm": 0.0874122902750969, + "learning_rate": 5.1004568302624456e-06, + "loss": 3.1012, + "step": 16490 + }, + { + "epoch": 0.9573819954161711, + "grad_norm": 0.08565357327461243, + "learning_rate": 4.96467630425862e-06, + "loss": 3.1011, + "step": 16500 + }, + { + "epoch": 0.9579622269285445, + "grad_norm": 0.08730974048376083, + "learning_rate": 4.830718550175139e-06, + "loss": 3.1077, + "step": 16510 + }, + { + "epoch": 0.9585424584409179, + "grad_norm": 0.08632799237966537, + "learning_rate": 4.698584061258559e-06, + "loss": 3.0943, + "step": 16520 + }, + { + "epoch": 0.9591226899532914, + "grad_norm": 0.08760344982147217, + "learning_rate": 4.5682733240418605e-06, + "loss": 3.0995, + "step": 16530 + }, + { + "epoch": 0.9597029214656648, + "grad_norm": 0.08842134475708008, + "learning_rate": 4.439786818342784e-06, + "loss": 3.1084, + "step": 16540 + }, + { + "epoch": 0.9602831529780382, + "grad_norm": 0.08709140866994858, + "learning_rate": 4.313125017262221e-06, + "loss": 3.0968, + "step": 16550 + }, + { + "epoch": 0.9608633844904116, + "grad_norm": 0.08542519807815552, + "learning_rate": 4.188288387182104e-06, + "loss": 3.097, + "step": 16560 + }, + { + "epoch": 0.9614436160027852, + "grad_norm": 0.08856651186943054, + "learning_rate": 4.065277387764077e-06, + "loss": 3.0984, + "step": 16570 + }, + { + "epoch": 0.9620238475151586, + "grad_norm": 0.09005647897720337, + "learning_rate": 3.9440924719473805e-06, + "loss": 3.0914, + "step": 16580 + }, + { + "epoch": 0.962604079027532, + "grad_norm": 0.08719483762979507, + "learning_rate": 3.82473408594769e-06, + "loss": 3.0979, + "step": 16590 + }, + { + "epoch": 0.9631843105399054, + "grad_norm": 0.08669883012771606, + "learning_rate": 3.7072026692550608e-06, + "loss": 3.098, + "step": 16600 + }, + { + "epoch": 0.9637645420522789, + "grad_norm": 0.08828626573085785, + "learning_rate": 3.5914986546323747e-06, + "loss": 3.0995, + "step": 16610 + }, + { + "epoch": 0.9643447735646523, + "grad_norm": 0.08661678433418274, + "learning_rate": 3.4776224681141167e-06, + "loss": 3.0966, + "step": 16620 + }, + { + "epoch": 0.9649250050770257, + "grad_norm": 0.08606945723295212, + "learning_rate": 3.3655745290042117e-06, + "loss": 3.1034, + "step": 16630 + }, + { + "epoch": 0.9655052365893991, + "grad_norm": 0.08614910393953323, + "learning_rate": 3.255355249874914e-06, + "loss": 3.1112, + "step": 16640 + }, + { + "epoch": 0.9660854681017726, + "grad_norm": 0.08819039165973663, + "learning_rate": 3.1469650365652525e-06, + "loss": 3.0944, + "step": 16650 + }, + { + "epoch": 0.9666656996141461, + "grad_norm": 0.08805646747350693, + "learning_rate": 3.0404042881792546e-06, + "loss": 3.0881, + "step": 16660 + }, + { + "epoch": 0.9672459311265195, + "grad_norm": 0.08734241127967834, + "learning_rate": 2.9356733970847817e-06, + "loss": 3.0993, + "step": 16670 + }, + { + "epoch": 0.9678261626388929, + "grad_norm": 0.08561859279870987, + "learning_rate": 2.832772748911916e-06, + "loss": 3.0975, + "step": 16680 + }, + { + "epoch": 0.9684063941512664, + "grad_norm": 0.08738164603710175, + "learning_rate": 2.7317027225516323e-06, + "loss": 3.1009, + "step": 16690 + }, + { + "epoch": 0.9689866256636398, + "grad_norm": 0.08456070721149445, + "learning_rate": 2.632463690154463e-06, + "loss": 3.1, + "step": 16700 + }, + { + "epoch": 0.9695668571760132, + "grad_norm": 0.08673311769962311, + "learning_rate": 2.5350560171287783e-06, + "loss": 3.1015, + "step": 16710 + }, + { + "epoch": 0.9701470886883866, + "grad_norm": 0.09023085236549377, + "learning_rate": 2.439480062139954e-06, + "loss": 3.101, + "step": 16720 + }, + { + "epoch": 0.9707273202007601, + "grad_norm": 0.08622407913208008, + "learning_rate": 2.345736177108537e-06, + "loss": 3.1137, + "step": 16730 + }, + { + "epoch": 0.9713075517131335, + "grad_norm": 0.08697347342967987, + "learning_rate": 2.2538247072094177e-06, + "loss": 3.1018, + "step": 16740 + }, + { + "epoch": 0.971887783225507, + "grad_norm": 0.0895911455154419, + "learning_rate": 2.1637459908702695e-06, + "loss": 3.1039, + "step": 16750 + }, + { + "epoch": 0.9724680147378804, + "grad_norm": 0.08516402542591095, + "learning_rate": 2.075500359770277e-06, + "loss": 3.102, + "step": 16760 + }, + { + "epoch": 0.9730482462502539, + "grad_norm": 0.08854671567678452, + "learning_rate": 1.98908813883919e-06, + "loss": 3.0975, + "step": 16770 + }, + { + "epoch": 0.9736284777626273, + "grad_norm": 0.08487720042467117, + "learning_rate": 1.9045096462558253e-06, + "loss": 3.0992, + "step": 16780 + }, + { + "epoch": 0.9742087092750007, + "grad_norm": 0.08441456407308578, + "learning_rate": 1.8217651934470669e-06, + "loss": 3.1019, + "step": 16790 + }, + { + "epoch": 0.9747889407873742, + "grad_norm": 0.08551038801670074, + "learning_rate": 1.74085508508659e-06, + "loss": 3.098, + "step": 16800 + }, + { + "epoch": 0.9753691722997476, + "grad_norm": 0.08782043308019638, + "learning_rate": 1.6617796190939726e-06, + "loss": 3.1039, + "step": 16810 + }, + { + "epoch": 0.975949403812121, + "grad_norm": 0.08647370338439941, + "learning_rate": 1.5845390866333631e-06, + "loss": 3.104, + "step": 16820 + }, + { + "epoch": 0.9765296353244944, + "grad_norm": 0.08616235107183456, + "learning_rate": 1.5091337721124254e-06, + "loss": 3.0958, + "step": 16830 + }, + { + "epoch": 0.977109866836868, + "grad_norm": 0.08559519797563553, + "learning_rate": 1.4355639531815067e-06, + "loss": 3.1067, + "step": 16840 + }, + { + "epoch": 0.9776900983492414, + "grad_norm": 0.08562770485877991, + "learning_rate": 1.363829900732305e-06, + "loss": 3.1058, + "step": 16850 + }, + { + "epoch": 0.9782703298616148, + "grad_norm": 0.08564095199108124, + "learning_rate": 1.2939318788971477e-06, + "loss": 3.0996, + "step": 16860 + }, + { + "epoch": 0.9788505613739882, + "grad_norm": 0.08510784804821014, + "learning_rate": 1.225870145047936e-06, + "loss": 3.092, + "step": 16870 + }, + { + "epoch": 0.9794307928863617, + "grad_norm": 0.08742259442806244, + "learning_rate": 1.1596449497949802e-06, + "loss": 3.0902, + "step": 16880 + }, + { + "epoch": 0.9800110243987351, + "grad_norm": 0.08536962419748306, + "learning_rate": 1.0952565369864997e-06, + "loss": 3.0974, + "step": 16890 + }, + { + "epoch": 0.9805912559111085, + "grad_norm": 0.08702561259269714, + "learning_rate": 1.0327051437073464e-06, + "loss": 3.1094, + "step": 16900 + }, + { + "epoch": 0.9811714874234819, + "grad_norm": 0.08652474731206894, + "learning_rate": 9.719910002782829e-07, + "loss": 3.1017, + "step": 16910 + }, + { + "epoch": 0.9817517189358554, + "grad_norm": 0.08817258477210999, + "learning_rate": 9.131143302551492e-07, + "loss": 3.1042, + "step": 16920 + }, + { + "epoch": 0.9823319504482289, + "grad_norm": 0.08725294470787048, + "learning_rate": 8.560753504279761e-07, + "loss": 3.1031, + "step": 16930 + }, + { + "epoch": 0.9829121819606023, + "grad_norm": 0.0844888985157013, + "learning_rate": 8.008742708203731e-07, + "loss": 3.1036, + "step": 16940 + }, + { + "epoch": 0.9834924134729757, + "grad_norm": 0.08515673130750656, + "learning_rate": 7.475112946883633e-07, + "loss": 3.0962, + "step": 16950 + }, + { + "epoch": 0.9840726449853492, + "grad_norm": 0.08552297949790955, + "learning_rate": 6.959866185201058e-07, + "loss": 3.0973, + "step": 16960 + }, + { + "epoch": 0.9846528764977226, + "grad_norm": 0.08687438070774078, + "learning_rate": 6.463004320348409e-07, + "loss": 3.1033, + "step": 16970 + }, + { + "epoch": 0.985233108010096, + "grad_norm": 0.0858420580625534, + "learning_rate": 5.984529181822795e-07, + "loss": 3.1055, + "step": 16980 + }, + { + "epoch": 0.9858133395224694, + "grad_norm": 0.08427808433771133, + "learning_rate": 5.524442531419927e-07, + "loss": 3.0978, + "step": 16990 + }, + { + "epoch": 0.9863935710348429, + "grad_norm": 0.0840638130903244, + "learning_rate": 5.08274606322745e-07, + "loss": 3.092, + "step": 17000 + }, + { + "epoch": 0.9863935710348429, + "eval_loss": 3.0300889015197754, + "eval_runtime": 3.2481, + "eval_samples_per_second": 1333.079, + "eval_steps_per_second": 10.468, + "step": 17000 + }, + { + "epoch": 0.9869738025472163, + "grad_norm": 0.08689724653959274, + "learning_rate": 4.6594414036171815e-07, + "loss": 3.1035, + "step": 17010 + }, + { + "epoch": 0.9875540340595897, + "grad_norm": 0.08669095486402512, + "learning_rate": 4.2545301112423274e-07, + "loss": 3.0939, + "step": 17020 + }, + { + "epoch": 0.9881342655719633, + "grad_norm": 0.08550863713026047, + "learning_rate": 3.868013677028048e-07, + "loss": 3.097, + "step": 17030 + }, + { + "epoch": 0.9887144970843367, + "grad_norm": 0.08494267612695694, + "learning_rate": 3.4998935241681295e-07, + "loss": 3.0995, + "step": 17040 + }, + { + "epoch": 0.9892947285967101, + "grad_norm": 0.08708130568265915, + "learning_rate": 3.1501710081199843e-07, + "loss": 3.1011, + "step": 17050 + }, + { + "epoch": 0.9898749601090835, + "grad_norm": 0.08479303121566772, + "learning_rate": 2.8188474165979915e-07, + "loss": 3.1057, + "step": 17060 + }, + { + "epoch": 0.990455191621457, + "grad_norm": 0.08544128388166428, + "learning_rate": 2.505923969571278e-07, + "loss": 3.084, + "step": 17070 + }, + { + "epoch": 0.9910354231338304, + "grad_norm": 0.08571518212556839, + "learning_rate": 2.2114018192553874e-07, + "loss": 3.1036, + "step": 17080 + }, + { + "epoch": 0.9916156546462038, + "grad_norm": 0.08791361004114151, + "learning_rate": 1.9352820501133961e-07, + "loss": 3.1063, + "step": 17090 + }, + { + "epoch": 0.9921958861585772, + "grad_norm": 0.08581080287694931, + "learning_rate": 1.6775656788459158e-07, + "loss": 3.1072, + "step": 17100 + }, + { + "epoch": 0.9927761176709508, + "grad_norm": 0.0846245288848877, + "learning_rate": 1.4382536543922076e-07, + "loss": 3.0959, + "step": 17110 + }, + { + "epoch": 0.9933563491833242, + "grad_norm": 0.08586116135120392, + "learning_rate": 1.217346857924073e-07, + "loss": 3.114, + "step": 17120 + }, + { + "epoch": 0.9939365806956976, + "grad_norm": 0.0862361267209053, + "learning_rate": 1.014846102843081e-07, + "loss": 3.0948, + "step": 17130 + }, + { + "epoch": 0.994516812208071, + "grad_norm": 0.08657976239919662, + "learning_rate": 8.307521347789005e-08, + "loss": 3.1003, + "step": 17140 + }, + { + "epoch": 0.9950970437204445, + "grad_norm": 0.0862279012799263, + "learning_rate": 6.650656315848602e-08, + "loss": 3.0992, + "step": 17150 + }, + { + "epoch": 0.9956772752328179, + "grad_norm": 0.0885181874036789, + "learning_rate": 5.1778720333517383e-08, + "loss": 3.0967, + "step": 17160 + }, + { + "epoch": 0.9962575067451913, + "grad_norm": 0.08494796603918076, + "learning_rate": 3.88917392325494e-08, + "loss": 3.1039, + "step": 17170 + }, + { + "epoch": 0.9968377382575647, + "grad_norm": 0.08858942240476608, + "learning_rate": 2.78456673066807e-08, + "loss": 3.0973, + "step": 17180 + }, + { + "epoch": 0.9974179697699382, + "grad_norm": 0.08530562371015549, + "learning_rate": 1.8640545228820748e-08, + "loss": 3.0982, + "step": 17190 + }, + { + "epoch": 0.9979982012823116, + "grad_norm": 0.08376429975032806, + "learning_rate": 1.1276406893079294e-08, + "loss": 3.0963, + "step": 17200 + }, + { + "epoch": 0.9985784327946851, + "grad_norm": 0.08597017079591751, + "learning_rate": 5.7532794150994e-09, + "loss": 3.1043, + "step": 17210 + }, + { + "epoch": 0.9991586643070585, + "grad_norm": 0.08577102422714233, + "learning_rate": 2.0711831315578524e-09, + "loss": 3.0981, + "step": 17220 + }, + { + "epoch": 0.999738895819432, + "grad_norm": 0.09527655690908432, + "learning_rate": 2.3013160027618442e-10, + "loss": 3.0959, + "step": 17230 + }, + { + "epoch": 0.9999709884243814, + "step": 17234, + "total_flos": 7.47219573773697e+18, + "train_loss": 3.3572154520448043, + "train_runtime": 15550.2697, + "train_samples_per_second": 567.438, + "train_steps_per_second": 1.108 + } + ], + "logging_steps": 10, + "max_steps": 17234, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.47219573773697e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}